# Feature Engineering

## Heart Failure Prediction (Categorical Features)

### EDA

In [57]:
import pandas as pd

df = pd.read_csv('data/heart.csv')

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [59]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [60]:
def get_cat_cols(df):
    return [col for col in df.columns if df[col].dtype == 'object']

In [61]:
cat_cols = get_cat_cols(df)
cat_cols

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

### Dummy Variables

To quote the SMLH, 
> we recommend you use it [dummy variables technique] when your column has fewer than ten categories.

In [62]:
cat_cols_dummy = [col for col in cat_cols if df[col].nunique() < 10]
cat_cols_dummy

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [63]:
df_dummy = pd.get_dummies(df, columns=cat_cols_dummy, drop_first=True)

df_dummy.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def get_logreg_score(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [65]:
from sklearn.model_selection import train_test_split

target = 'HeartDisease'

y = df_dummy[target]
X = df_dummy.drop(target, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

get_logreg_score(X_train, X_test, y_train, y_test)

0.8804347826086957

### Frequency Imputation

In [66]:
df_freq = df.copy()

y = df_freq[target]
X = df_freq.drop(target, axis=1)

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [67]:
import category_encoders as ce

cat_cols = get_cat_cols(df_freq)

encoder = ce.CountEncoder(cols=cat_cols, normalize=True)
encoder.fit(X, y)
X_transformed = encoder.transform(X)

X_transformed.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,0.78976,0.188453,140,289,0,0.601307,172,0.595861,0.0,0.430283
1,49,0.21024,0.221133,160,180,0,0.601307,156,0.595861,1.0,0.501089
2,37,0.78976,0.188453,130,283,0,0.1939,98,0.595861,0.0,0.430283
3,48,0.21024,0.540305,138,214,0,0.601307,108,0.404139,1.5,0.501089
4,54,0.78976,0.221133,150,195,0,0.601307,122,0.595861,0.0,0.430283


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

get_logreg_score(X_train, X_test, y_train, y_test)

0.8442028985507246

It seems that frequency encoding leads to a worse performance for the selected columns than using dummy variables.

#### With label encoding for binary categorical columns

In [69]:
df_freq_2 = df.copy()

cat_cols = get_cat_cols(df_freq_2)
cat_cols_bin = [col for col in cat_cols if df_freq_2[col].nunique() == 2]
cat_cols_bin

['Sex', 'ExerciseAngina']

In [70]:
df_freq_2['Sex'] = df_freq_2['Sex'].astype('category').cat.codes
df_freq_2['ExerciseAngina'] = df_freq_2['ExerciseAngina'].astype('category').cat.codes

X = df_freq_2.drop(target, axis=1)
y = df_freq_2[target]

df_freq_2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [71]:
import category_encoders as ce

cat_cols = get_cat_cols(df_freq_2)

encoder = ce.CountEncoder(cols=cat_cols, normalize=True)
encoder.fit(X, y)
X_transformed = encoder.transform(X)

X_transformed.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,0.188453,140,289,0,0.601307,172,0,0.0,0.430283
1,49,0,0.221133,160,180,0,0.601307,156,0,1.0,0.501089
2,37,1,0.188453,130,283,0,0.1939,98,0,0.0,0.430283
3,48,0,0.540305,138,214,0,0.601307,108,1,1.5,0.501089
4,54,1,0.221133,150,195,0,0.601307,122,0,0.0,0.430283


In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

get_logreg_score(X_train, X_test, y_train, y_test)

0.8478260869565217

This combination of features enginnering and encoding results in a slightly better performance.

### Target Encoding

In [73]:
df_target = df.copy()

y = df_target[target]
X = df_target.drop(target, axis=1)

In [74]:
import category_encoders as ce

cat_cols = get_cat_cols(df_target)

encoder = ce.TargetEncoder(cols=cat_cols)
encoder.fit(X, y)
X_transformed = encoder.transform(X)

X_transformed.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,0.631724,0.138728,140,289,0,0.516304,172,0.351005,0.0,0.197468
1,49,0.259067,0.35468,160,180,0,0.516304,156,0.351005,1.0,0.828261
2,37,0.631724,0.138728,130,283,0,0.657303,98,0.351005,0.0,0.197468
3,48,0.259067,0.790323,138,214,0,0.516304,108,0.851752,1.5,0.828261
4,54,0.631724,0.35468,150,195,0,0.516304,122,0.351005,0.0,0.197468


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

get_logreg_score(X_train, X_test, y_train, y_test)

0.8768115942028986

Target imputation performed better than frequency encoding, but slightly worse than one-hot encoding.

## Electric Motor Temperature (Numerical Features)

### EDA

In [76]:
df = pd.read_csv('data/electric_motor.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3465 entries, 0 to 3464
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   u_q             3465 non-null   float64
 1   coolant         3465 non-null   float64
 2   stator_winding  3465 non-null   float64
 3   u_d             3465 non-null   float64
 4   stator_tooth    3465 non-null   float64
 5   motor_speed     3465 non-null   float64
 6   i_d             3465 non-null   float64
 7   i_q             3465 non-null   float64
 8   pm              3465 non-null   float64
 9   stator_yoke     3465 non-null   float64
 10  ambient         3465 non-null   float64
 11  torque          3465 non-null   float64
 12  profile_id      3465 non-null   int64  
dtypes: float64(12), int64(1)
memory usage: 352.0 KB


In [77]:
df.head()

Unnamed: 0,u_q,coolant,stator_winding,u_d,stator_tooth,motor_speed,i_d,i_q,pm,stator_yoke,ambient,torque,profile_id
0,-0.450682,18.805172,19.08667,-0.350055,18.293219,0.002866,0.004419,0.000328,24.554214,18.316547,19.850691,0.187101,17
1,-0.325737,18.818571,19.09239,-0.305803,18.294807,0.000257,0.000606,-0.000785,24.538078,18.314955,19.850672,0.245417,17
2,-0.440864,18.82877,19.08938,-0.372503,18.294094,0.002355,0.00129,0.000386,24.544693,18.326307,19.850657,0.176615,17
3,-0.327026,18.835567,19.083031,-0.316199,18.292542,0.006105,2.6e-05,0.002046,24.554018,18.330833,19.850647,0.238303,17
4,-0.47115,18.857033,19.082525,-0.332272,18.291428,0.003133,-0.064317,0.037184,24.565397,18.326662,19.850639,0.208197,17


In [78]:
df.describe()

Unnamed: 0,u_q,coolant,stator_winding,u_d,stator_tooth,motor_speed,i_d,i_q,pm,stator_yoke,ambient,torque,profile_id
count,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0,3465.0
mean,87.756427,19.053978,65.129383,-94.165607,51.364837,4953.402704,-138.187562,52.833685,54.777751,37.11215,22.332963,48.053873,17.0
std,7.961513,0.44995,11.892837,8.852112,9.22746,462.598037,13.21244,4.525163,11.484557,5.703823,1.22447,4.298795,0.0
min,-0.974433,17.516184,19.049341,-96.527031,18.276365,-0.004142,-143.617004,-0.000785,24.264736,18.300852,19.85062,0.176615,17.0
25%,87.602745,18.834822,64.872131,-95.91806,50.522747,4999.951172,-140.056564,53.154724,48.560631,36.783073,21.570688,48.415657,17.0
50%,88.054886,19.128372,70.561028,-95.451088,55.392807,4999.954102,-139.11525,53.276806,59.292618,39.449806,22.639578,48.477646,17.0
75%,89.095169,19.359198,71.976898,-94.359489,57.323189,4999.957031,-138.689636,53.341125,63.823906,40.45612,23.273638,48.545872,17.0
max,95.686531,19.986029,72.121681,0.851181,57.653431,4999.971191,0.004419,55.308327,66.281319,41.787533,24.217205,48.845245,17.0


In [79]:
df = df.drop('profile_id', axis=1)

df.head()

Unnamed: 0,u_q,coolant,stator_winding,u_d,stator_tooth,motor_speed,i_d,i_q,pm,stator_yoke,ambient,torque
0,-0.450682,18.805172,19.08667,-0.350055,18.293219,0.002866,0.004419,0.000328,24.554214,18.316547,19.850691,0.187101
1,-0.325737,18.818571,19.09239,-0.305803,18.294807,0.000257,0.000606,-0.000785,24.538078,18.314955,19.850672,0.245417
2,-0.440864,18.82877,19.08938,-0.372503,18.294094,0.002355,0.00129,0.000386,24.544693,18.326307,19.850657,0.176615
3,-0.327026,18.835567,19.083031,-0.316199,18.292542,0.006105,2.6e-05,0.002046,24.554018,18.330833,19.850647,0.238303
4,-0.47115,18.857033,19.082525,-0.332272,18.291428,0.003133,-0.064317,0.037184,24.565397,18.326662,19.850639,0.208197


### Performance without feature scaling

In [80]:
from sklearn.linear_model import LinearRegression

def get_linreg_score(X_train, X_test, y_train, y_test):
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    return linreg.score(X_test, y_test)

In [81]:
from sklearn.model_selection import train_test_split

target = 'pm'
y = df[target]
X = df.drop(target, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

get_linreg_score(X_train, X_test, y_train, y_test)

0.9754316973236408

### Feature Scaling

#### Standardization

In [82]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

get_linreg_score(X_train_std, X_test_std, y_train, y_test)

0.9754316973236503

#### Normalization

In [83]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
minmax.fit(X_train)
X_train_minmax = minmax.transform(X_train)
X_test_minmax = minmax.transform(X_test)

get_linreg_score(X_train_minmax, X_test_minmax, y_train, y_test)

0.9754316973236501

Feature scaling did not have a significant effect on the performance of the model for ths particular data set.

### Box-Cox Transformation

In [84]:
from imperio import BoxCoxTransformer

# X = df.drop(target, axis=1)
# y = df[target]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

boxcox = BoxCoxTransformer()
boxcox.fit(X_train, y_train)
X_transformed = boxcox.transform(X_test)

get_linreg_score(X_train, X_transformed, y_train, y_test)

TypeError: unsupported operand type(s) for ** or pow(): 'str' and 'float'

### ZCA Transformation

In [85]:
from imperio import ZCATransformer

zca = ZCATransformer()
zca.fit(X_train, y_train)
X_transformed = zca.transform(X_test)

get_linreg_score(X_train, X_test, y_train, y_test)

0.9754316973236408