In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
# Load the dataset from CSV; the path is relative to the current working directory.
df = pd.read_csv('data/heart.csv')

In [67]:
# Target vector: the 'HeartDisease' column, used as the label by every model below.
y = df['HeartDisease']

In [68]:
# Numeric predictors only: the target is dropped first, and 'FastingBS' is
# taken out because the next cell files it under the categorical features.
numerical_features = list(
    df.drop(columns='HeartDisease').select_dtypes(include='number').columns
)
numerical_features.remove('FastingBS')
numerical_features

['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [69]:
# Object-dtype columns plus 'FastingBS', which is stored as a number but is
# handled as a category in this notebook.
categorical_features = [*df.select_dtypes(include='object').columns, 'FastingBS']
categorical_features

['Sex',
 'ChestPainType',
 'RestingECG',
 'ExerciseAngina',
 'ST_Slope',
 'FastingBS']

# 1. Approach - label encoding

In [70]:
# Independent copy of the selected predictors, so the encoding step below
# writes into `train` and leaves `df` untouched.
train = df.loc[:, [*numerical_features, *categorical_features]].copy()

In [71]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column of `train`.
#
# A separate encoder is fitted per column and kept in `label_encoders`, so each
# column's code -> category mapping (`classes_`, `inverse_transform`) remains
# available afterwards. Refitting a single shared encoder would retain only the
# mapping of the last column processed.
label_encoders = {}

for cat in categorical_features:
    # `labelencoder_X` ends up bound to the encoder of the last column.
    labelencoder_X = LabelEncoder()
    train[cat] = labelencoder_X.fit_transform(df[cat])
    label_encoders[cat] = labelencoder_X

In [72]:
# Preview the encoded frame: the categorical columns now hold integer codes.
train.head()

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,FastingBS
0,40,140,289,172,0.0,1,1,1,0,2,0
1,49,160,180,156,1.0,0,2,1,0,1,0
2,37,130,283,98,0.0,1,1,2,0,2,0
3,48,138,214,108,1.5,0,0,1,1,1,0
4,54,150,195,122,0.0,1,2,1,0,2,0


### Logistic Regression

In [73]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
# No `stratify` argument is passed, so class proportions may differ between the splits.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.1, random_state=42)

In [74]:
# Share of each class in the training labels. Informational only: the model
# fitted below is given class_weight='balanced' rather than this dict.
w = {label: (y_train == label).mean() for label in (0, 1)}
w

{0: 0.45036319612590797, 1: 0.549636803874092}

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
# Class-balanced logistic regression fitted on the label-encoded training split.
# NOTE(review): the very high iteration cap presumably compensates for the
# unscaled inputs; confirm whether scaling the features would make it unnecessary.
model = LogisticRegression(
    random_state=42,
    max_iter=100_000_000,
    class_weight='balanced',
).fit(X_train, y_train)



In [77]:
# Fitted coefficients labelled by feature name, most negative first.
# Only the first row of `coef_` is used (a single row for a two-class target).
coef = pd.Series(model.coef_[0], index=train.columns).sort_values()
coef

ST_Slope         -1.557479
ChestPainType    -0.625675
RestingECG       -0.201815
MaxHR            -0.006050
Cholesterol      -0.003872
RestingBP         0.003851
Age               0.021802
Oldpeak           0.431896
FastingBS         1.111409
ExerciseAngina    1.120643
Sex               1.178650
dtype: float64

In [78]:
# Predictions on the training split (not referenced again in the visible cells).
y_train_pred = model.predict(X_train)

In [79]:
# Accuracy on the training split (label-encoded features).
model.score(X_train, y_train)

0.8571428571428571

In [80]:
# Accuracy on the held-out test split (label-encoded features).
model.score(X_test, y_test)

0.8260869565217391

In [81]:
from sklearn.metrics import f1_score

In [82]:
# F1 score on the test split (label-encoded features).
f1_score(y_test, model.predict(X_test))

0.8461538461538461

In [83]:
from sklearn.metrics import confusion_matrix

In [84]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, model.predict(X_test))

array([[32,  6],
       [10, 44]])

# 2. Approach - one-hot encoding

In [85]:
# One-hot encode the categorical block. Only the string columns are expanded;
# 'FastingBS' is already numeric and passes through as a single column.
df_cat = pd.get_dummies(df[categorical_features])

In [86]:
# Preview the dummy columns produced by the one-hot encoding.
df_cat.head()

Unnamed: 0,FastingBS,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [87]:
# Put the dummy columns and the numeric predictors side by side, matching rows
# on the shared index.
X = pd.merge(
    df_cat,
    df[numerical_features],
    left_index=True,
    right_index=True,
)

In [88]:
# Preview the combined design matrix (dummy columns followed by numeric columns).
X.head()

Unnamed: 0,FastingBS,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,40,140,289,172,0.0
1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,49,160,180,156,1.0
2,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,37,130,283,98,0.0
3,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,48,138,214,108,1.5
4,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,54,150,195,122,0.0


### Logistic Regression

In [90]:
from sklearn.model_selection import train_test_split

# Same 10% hold-out and seed as the label-encoding approach, now on the one-hot matrix X.
# No `stratify` argument is passed, so class proportions may differ between the splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [91]:
# Share of each class in the training labels. Informational only: the model
# fitted below is given class_weight='balanced' rather than this dict.
w = {label: (y_train == label).mean() for label in (0, 1)}
w

{0: 0.45036319612590797, 1: 0.549636803874092}

In [92]:
# Class-balanced logistic regression fitted on the one-hot-encoded training split.
# NOTE(review): the very high iteration cap presumably compensates for the
# unscaled numeric inputs; confirm whether scaling would make it unnecessary.
model = LogisticRegression(
    random_state=42,
    max_iter=100_000_000,
    class_weight='balanced',
).fit(X_train, y_train)



In [93]:
# Accuracy on the training split (one-hot features).
model.score(X_train, y_train)

0.8680387409200968

In [94]:
# Accuracy on the held-out test split (one-hot features).
model.score(X_test, y_test)

0.8478260869565217

In [95]:
# F1 score on the test split (one-hot features).
f1_score(y_test, model.predict(X_test))

0.8679245283018868

In [96]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, model.predict(X_test))

array([[32,  6],
       [ 8, 46]])

# 3. Approach - boolean feature 'hasCholesterol'

In [97]:
# Flag rows with a positive cholesterol reading (1) versus all other rows (0).
# A vectorised comparison replaces a per-row Python lambda; the cast to int64
# stores the flag as 0/1 integers rather than booleans.
df['hasCholesterol'] = (df['Cholesterol'] > 0).astype('int64')

In [98]:
# Sanity check: rows where cholesterol is recorded as 0, shown next to the new flag.
df.loc[df['Cholesterol'] == 0, ['Cholesterol', 'hasCholesterol']]

Unnamed: 0,Cholesterol,hasCholesterol
293,0,0
294,0,0
295,0,0
296,0,0
297,0,0
...,...,...
514,0,0
515,0,0
518,0,0
535,0,0


In [99]:
# Register the new flag as a categorical feature.
# The membership guard makes this cell safe to re-run: an unconditional append
# would add 'hasCholesterol' again on every execution and produce duplicate
# columns when the list is used to select from df.
if 'hasCholesterol' not in categorical_features:
    categorical_features.append('hasCholesterol')
categorical_features

['Sex',
 'ChestPainType',
 'RestingECG',
 'ExerciseAngina',
 'ST_Slope',
 'FastingBS',
 'hasCholesterol']

In [100]:
# Rebuild the dummy frame now that 'hasCholesterol' is in the categorical list.
# As with 'FastingBS', the numeric flag passes through as a single column.
df_cat = pd.get_dummies(df[categorical_features])

In [101]:
from sklearn.feature_selection import mutual_info_classif
# Mutual information between each dummy column and the target; every column of
# df_cat is declared discrete via discrete_features=True.
mi = mutual_info_classif(df_cat, df['HeartDisease'], discrete_features=True)

In [102]:
# Label the scores with their column names and list them from highest to lowest
# (head(50) is larger than the number of columns, so everything is shown).
mi_series = pd.Series(data = mi, index=df_cat.columns)
mi_series.sort_values(ascending=False).head(50)

ST_Slope_Up          0.206974
ST_Slope_Flat        0.163076
ChestPainType_ASY    0.139650
ExerciseAngina_Y     0.131680
ExerciseAngina_N     0.131680
ChestPainType_ATA    0.085956
hasCholesterol       0.057659
Sex_M                0.047477
Sex_F                0.047477
FastingBS            0.038040
ChestPainType_NAP    0.022697
ST_Slope_Down        0.008038
RestingECG_ST        0.005351
RestingECG_Normal    0.004212
ChestPainType_TA     0.001490
RestingECG_LVH       0.000057
dtype: float64

In [103]:
# Put the dummy columns (including 'hasCholesterol') and the numeric predictors
# side by side, matching rows on the shared index.
X = pd.merge(
    df_cat,
    df[numerical_features],
    left_index=True,
    right_index=True,
)

In [104]:
# Preview the combined design matrix, which now includes the 'hasCholesterol' column.
X.head()

Unnamed: 0,FastingBS,hasCholesterol,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,...,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,0,1,0,1,0,1,0,0,0,1,...,1,0,0,0,1,40,140,289,172,0.0
1,0,1,1,0,0,0,1,0,0,1,...,1,0,0,1,0,49,160,180,156,1.0
2,0,1,0,1,0,1,0,0,0,0,...,1,0,0,0,1,37,130,283,98,0.0
3,0,1,1,0,1,0,0,0,0,1,...,0,1,0,1,0,48,138,214,108,1.5
4,0,1,0,1,0,0,1,0,0,1,...,1,0,0,0,1,54,150,195,122,0.0


In [105]:
from sklearn.model_selection import train_test_split

# Same 10% hold-out and seed as the earlier approaches, on the matrix that includes 'hasCholesterol'.
# No `stratify` argument is passed, so class proportions may differ between the splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [106]:
# Class-balanced logistic regression fitted on the training split that includes
# the 'hasCholesterol' flag.
# NOTE(review): the very high iteration cap presumably compensates for the
# unscaled numeric inputs; confirm whether scaling would make it unnecessary.
model = LogisticRegression(
    random_state=42,
    max_iter=100_000_000,
    class_weight='balanced',
).fit(X_train, y_train)



In [107]:
# Accuracy on the training split (one-hot features plus 'hasCholesterol').
model.score(X_train, y_train)

0.8777239709443099

In [108]:
# Accuracy on the held-out test split (one-hot features plus 'hasCholesterol').
model.score(X_test, y_test)

0.8695652173913043

In [109]:
# F1 score on the test split (one-hot features plus 'hasCholesterol').
f1_score(y_test, model.predict(X_test))

0.8888888888888888

In [110]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, model.predict(X_test))

array([[32,  6],
       [ 6, 48]])