# <ins>Feature Engineering</ins>

In [2]:
# Import libraries
import pandas as pd
import numpy as np

# Read the cleaned heart dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../data/processed/heart_cleaned.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140.0,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160.0,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130.0,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138.0,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150.0,195.0,0,Normal,122,N,0.0,Up,0


<b> Label Encoding: Sex and ExerciseAngina </b>

In [3]:
# Label-encode the two binary categoricals:
#   Sex            -> M: 0, F: 1
#   ExerciseAngina -> N: 0, Y: 1
sex_mapping = {'M': 0, 'F': 1}
exercise_angina_mapping = {'N': 0, 'Y': 1}

for column, mapping in (('Sex', sex_mapping), ('ExerciseAngina', exercise_angina_mapping)):
    stripped = df[column].str.strip()
    encoded = stripped.map(mapping)
    # Series.map turns any label missing from the dict into NaN without warning,
    # so fail loudly if a label outside the mapping shows up.
    unexpected = stripped[encoded.isna() & stripped.notna()].unique()
    if len(unexpected) > 0:
        raise ValueError(f'Unexpected {column} values: {list(unexpected)}')
    df[column] = encoded
    print(f'{column} feature has been label encoded to binary values')

df.head()

Sex feature has been label encoded to binary values
ExerciseAngina feature has been label encoded to binary values


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140.0,289.0,0,Normal,172,0,0.0,Up,0
1,49,1,NAP,160.0,180.0,0,Normal,156,0,1.0,Flat,1
2,37,0,ATA,130.0,283.0,0,ST,98,0,0.0,Up,0
3,48,1,ASY,138.0,214.0,0,Normal,108,1,1.5,Flat,1
4,54,0,NAP,150.0,195.0,0,Normal,122,0,0.0,Up,0


<b> One-hot Encoding: ChestPainType, RestingECG and ST_Slope </b>

In [4]:
# One-hot encode the multi-class categoricals; drop_first=True omits the first
# level of each feature so it serves as the implicit baseline.
categorical_columns = ['ChestPainType', 'RestingECG', 'ST_Slope']
encoded_df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)
encoded_df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,0,140.0,289.0,0,172,0,0.0,0,True,False,False,True,False,False,True
1,49,1,160.0,180.0,0,156,0,1.0,1,False,True,False,True,False,True,False
2,37,0,130.0,283.0,0,98,0,0.0,0,True,False,False,False,True,False,True
3,48,1,138.0,214.0,0,108,1,1.5,1,False,False,False,True,False,True,False
4,54,0,150.0,195.0,0,122,0,0.0,0,False,True,False,True,False,False,True


In [5]:
# Summarize the column dtypes and non-null counts of the encoded frame.
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   Sex                918 non-null    int64  
 2   RestingBP          918 non-null    float64
 3   Cholesterol        918 non-null    float64
 4   FastingBS          918 non-null    int64  
 5   MaxHR              918 non-null    int64  
 6   ExerciseAngina     918 non-null    int64  
 7   Oldpeak            918 non-null    float64
 8   HeartDisease       918 non-null    int64  
 9   ChestPainType_ATA  918 non-null    bool   
 10  ChestPainType_NAP  918 non-null    bool   
 11  ChestPainType_TA   918 non-null    bool   
 12  RestingECG_Normal  918 non-null    bool   
 13  RestingECG_ST      918 non-null    bool   
 14  ST_Slope_Flat      918 non-null    bool   
 15  ST_Slope_Up        918 non-null    bool   
dtypes: bool(7), float64(3), int64(6)

In [6]:
# Cast the boolean dummy columns to integers, leaving every other column as is.
bool_columns = encoded_df.select_dtypes(include='bool').columns
encoded_df = encoded_df.astype({column: int for column in bool_columns})

In [7]:
# Persist the encoded frame to the path the Feature Creation section reads back,
# so a fresh Restart & Run All does not depend on a file left over from an
# earlier session. index=False keeps the row index out of the CSV.
encoded_df.to_csv('../data/processed/heart_encoded.csv', index=False)

# <ins>Feature Creation</ins>

In [11]:
# Reload the encoded dataset from disk as the starting point for feature creation
df = pd.read_csv(filepath_or_buffer='../data/processed/heart_encoded.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,0,140.0,289.0,0,172,0,0.0,0,1,0,0,1,0,0,1
1,49,1,160.0,180.0,0,156,0,1.0,1,0,1,0,1,0,1,0
2,37,0,130.0,283.0,0,98,0,0.0,0,1,0,0,0,1,0,1
3,48,1,138.0,214.0,0,108,1,1.5,1,0,0,0,1,0,1,0
4,54,0,150.0,195.0,0,122,0,0.0,0,0,1,0,1,0,0,1


In [12]:
# Derive two Age interaction terms and a squared Oldpeak term, appended as new
# columns in this order.
df = df.assign(
    Age_x_Oldpeak=df['Age'] * df['Oldpeak'],
    Age_x_Cholesterol=df['Age'] * df['Cholesterol'],
    Oldpeak_sq=df['Oldpeak'] ** 2,
)
print('New features created: Age x Oldpeak, Age x Cholesterol, Oldpeak squared')
df.head()

New features created: Age x Oldpeak, Age x Cholesterol, Oldpeak squared


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up,Age_x_Oldpeak,Age_x_Cholesterol,Oldpeak_sq
0,40,0,140.0,289.0,0,172,0,0.0,0,1,0,0,1,0,0,1,0.0,11560.0,0.0
1,49,1,160.0,180.0,0,156,0,1.0,1,0,1,0,1,0,1,0,49.0,8820.0,1.0
2,37,0,130.0,283.0,0,98,0,0.0,0,1,0,0,0,1,0,1,0.0,10471.0,0.0
3,48,1,138.0,214.0,0,108,1,1.5,1,0,0,0,1,0,1,0,72.0,10272.0,2.25
4,54,0,150.0,195.0,0,122,0,0.0,0,0,1,0,1,0,0,1,0.0,10530.0,0.0


In [13]:
# Export the engineered feature table without the row index. The file name is
# relative, so it is written to the notebook's working directory.
df.to_csv(path_or_buf='01_heart_engineered.csv', index=False)