In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the heart-disease dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/heart.csv')

In [3]:
# Target column used as the label for every classifier fitted below.
y = df['HeartDisease']

In [4]:
# Numeric predictor columns, with the target excluded.
# FastingBS has a numeric dtype but is handled as a categorical feature
# further down, so it is taken out of this list.
numerical_features = (
    df.drop(columns='HeartDisease')
    .select_dtypes(include='number')
    .columns
    .tolist()
)
numerical_features.remove('FastingBS')
numerical_features

['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [5]:
# Object-dtype columns plus FastingBS, which is numeric in the frame but
# treated as a categorical feature.
categorical_features = [*df.select_dtypes(include='object').columns, 'FastingBS']
categorical_features

['Sex',
 'ChestPainType',
 'RestingECG',
 'ExerciseAngina',
 'ST_Slope',
 'FastingBS']

# Approach: add a boolean feature 'hasCholesterol' (1 if Cholesterol > 0, else 0)

In [7]:
# Flag rows whose Cholesterol value is positive (1) versus zero or below (0).
# A vectorized comparison replaces the per-row Python lambda; values and the
# int64 dtype are the same as before.
df['hasCholesterol'] = (df['Cholesterol'] > 0).astype('int64')

In [8]:
# Sanity check: rows with Cholesterol == 0 should all carry hasCholesterol == 0.
df.loc[df['Cholesterol'] == 0, ['Cholesterol', 'hasCholesterol']]

Unnamed: 0,Cholesterol,hasCholesterol
293,0,0
294,0,0
295,0,0
296,0,0
297,0,0
...,...,...
514,0,0
515,0,0
518,0,0
535,0,0


In [9]:
# Register the engineered flag as a categorical feature.
# The membership guard keeps this cell idempotent: re-running it must not add
# the name twice, because a duplicated name would select the column twice
# when the list is used to index the DataFrame.
if 'hasCholesterol' not in categorical_features:
    categorical_features.append('hasCholesterol')
categorical_features

['Sex',
 'ChestPainType',
 'RestingECG',
 'ExerciseAngina',
 'ST_Slope',
 'FastingBS',
 'hasCholesterol']

In [10]:
# One-hot encode the string-valued categorical columns. Columns that are
# already numeric (FastingBS, hasCholesterol) pass through unchanged.
# dtype=int pins the indicator columns to 0/1 integers; without it the dtype
# depends on the installed pandas version (bool by default in pandas >= 2.0).
df_cat = pd.get_dummies(df[categorical_features], dtype=int)

In [11]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each encoded categorical column and the target;
# every column of df_cat is declared discrete.
mi = mutual_info_classif(
    X=df_cat,
    y=df['HeartDisease'],
    discrete_features=True,
)

In [12]:
# Label the mutual-information scores with their column names and show the
# highest-scoring columns first (at most 50).
mi_series = pd.Series(mi, index=df_cat.columns)
mi_series.sort_values(ascending=False).iloc[:50]

ST_Slope_Up          0.206974
ST_Slope_Flat        0.163076
ChestPainType_ASY    0.139650
ExerciseAngina_Y     0.131680
ExerciseAngina_N     0.131680
ChestPainType_ATA    0.085956
hasCholesterol       0.057659
Sex_M                0.047477
Sex_F                0.047477
FastingBS            0.038040
ChestPainType_NAP    0.022697
ST_Slope_Down        0.008038
RestingECG_ST        0.005351
RestingECG_Normal    0.004212
ChestPainType_TA     0.001490
RestingECG_LVH       0.000057
dtype: float64

In [13]:
# Full design matrix: encoded categorical columns followed by the numeric
# columns, aligned row-by-row on the shared index (inner join).
X = df_cat.join(df[numerical_features], how='inner')

In [14]:
# Preview the first rows of the assembled design matrix.
X.head()

Unnamed: 0,FastingBS,hasCholesterol,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,...,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,0,1,0,1,0,1,0,0,0,1,...,1,0,0,0,1,40,140,289,172,0.0
1,0,1,1,0,0,0,1,0,0,1,...,1,0,0,1,0,49,160,180,156,1.0
2,0,1,0,1,0,1,0,0,0,0,...,1,0,0,0,1,37,130,283,98,0.0
3,0,1,1,0,1,0,0,0,0,1,...,0,1,0,1,0,48,138,214,108,1.5
4,0,1,0,1,0,0,1,0,0,1,...,1,0,0,0,1,54,150,195,122,0.0


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [26]:
# Collects the test-set F1 score of each model, keyed by model name.
f1_results = {}

### Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression fitted on the training split.
# The iteration cap is set very high so the solver is not stopped early.
model_log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=100_000_000,
    random_state=42,
).fit(X_train, y_train)



In [30]:
# Mean accuracy on the training split.
model_log_reg.score(X_train, y_train)

0.8777239709443099

In [31]:
# Mean accuracy on the held-out test split.
model_log_reg.score(X_test, y_test)

0.8695652173913043

In [32]:
from sklearn.metrics import confusion_matrix, f1_score

# F1 score of the logistic regression on the held-out test split.
f1_score(y_true=y_test, y_pred=model_log_reg.predict(X_test))

0.8888888888888888

In [45]:
# Record the logistic-regression test F1 for the final comparison.
f1_results['LogisticRegression'] = f1_score(
    y_true=y_test,
    y_pred=model_log_reg.predict(X_test),
)

In [34]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, model_log_reg.predict(X_test))

array([[32,  6],
       [ 6, 48]])

In [53]:
# Logistic-regression coefficient per feature, most negative first.
# Read from model_log_reg, the estimator fitted in this section: no bare
# `model` variable is assigned in the visible cells, so referencing one
# raises NameError on Restart & Run All.
coef = pd.Series(data=model_log_reg.coef_[0], index=X.columns).sort_values(ascending=True)
coef

hasCholesterol      -1.374308
ST_Slope_Up         -1.060808
Sex_F               -0.669277
ChestPainType_ATA   -0.656823
ExerciseAngina_N    -0.482571
ChestPainType_NAP   -0.412732
RestingECG_ST       -0.238408
ST_Slope_Down       -0.216501
RestingECG_Normal   -0.010051
ChestPainType_TA    -0.006253
MaxHR               -0.005024
Cholesterol         -0.000160
RestingBP            0.000154
Age                  0.011988
RestingECG_LVH       0.198001
ExerciseAngina_Y     0.432113
Oldpeak              0.434512
Sex_M                0.618818
ChestPainType_ASY    1.025349
FastingBS            1.050820
ST_Slope_Flat        1.226850
dtype: float64

### DecisionTreeClassifier

In [83]:
from sklearn.tree import DecisionTreeClassifier

In [84]:
# Class-weighted decision tree with default depth settings, fitted on the training split.
model_tree = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

In [85]:
# Mean accuracy on the training split.
model_tree.score(X_train, y_train)

1.0

In [86]:
# Mean accuracy on the held-out test split.
model_tree.score(X_test, y_test)

0.7934782608695652

In [88]:
from sklearn.metrics import confusion_matrix, f1_score

# F1 score of the decision tree on the held-out test split.
f1_score(y_true=y_test, y_pred=model_tree.predict(X_test))

0.8155339805825242

In [90]:
# Record the decision-tree test F1 for the final comparison.
f1_results['DecisionTreeClassifier'] = f1_score(
    y_true=y_test,
    y_pred=model_tree.predict(X_test),
)

In [91]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, model_tree.predict(X_test))

array([[31,  7],
       [12, 42]])

In [92]:
# Decision-tree feature importances labelled by column, largest first.
# (These are importances, not signed coefficients, despite the variable name.)
coef_tree = (
    pd.Series(model_tree.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
coef_tree

ST_Slope_Up          0.392159
Cholesterol          0.113319
MaxHR                0.097329
Oldpeak              0.082499
ChestPainType_ASY    0.066768
RestingBP            0.057882
Age                  0.041163
FastingBS            0.040039
Sex_F                0.030813
ExerciseAngina_N     0.022034
RestingECG_Normal    0.014807
ChestPainType_NAP    0.012506
RestingECG_LVH       0.009456
ExerciseAngina_Y     0.007544
ST_Slope_Down        0.005784
RestingECG_ST        0.003478
Sex_M                0.002421
ChestPainType_TA     0.000000
ChestPainType_ATA    0.000000
ST_Slope_Flat        0.000000
hasCholesterol       0.000000
dtype: float64

### RandomForestClassifier

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
# Class-weighted random forest of 3000 trees, fitted on the training split.
model_forest = RandomForestClassifier(
    n_estimators=3000,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

In [99]:
# Mean accuracy on the training split.
model_forest.score(X_train, y_train)

1.0

In [100]:
# Mean accuracy on the held-out test split.
model_forest.score(X_test, y_test)

0.9021739130434783

In [101]:
from sklearn.metrics import confusion_matrix, f1_score

# F1 score of the random forest on the held-out test split.
f1_score(y_true=y_test, y_pred=model_forest.predict(X_test))

0.918918918918919

In [102]:
# Record the random-forest test F1 for the final comparison.
f1_results['RandomForestClassifier'] = f1_score(
    y_true=y_test,
    y_pred=model_forest.predict(X_test),
)

In [103]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, model_forest.predict(X_test))

array([[32,  6],
       [ 3, 51]])

In [104]:
# Raw importance array, in the column order of X; the next cell shows the
# same values labelled by feature name and sorted.
model_forest.feature_importances_

array([0.0234042 , 0.02866926, 0.02087129, 0.01925947, 0.06857478,
       0.02152325, 0.01205325, 0.00545655, 0.0132472 , 0.01131142,
       0.00869244, 0.05962481, 0.05323382, 0.00601532, 0.10504551,
       0.13343383, 0.07415683, 0.0655991 , 0.08517969, 0.08984877,
       0.09479923])

In [105]:
# Random-forest feature importances labelled by column, largest first.
coef_forest = (
    pd.Series(model_forest.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
coef_forest

ST_Slope_Up          0.133434
ST_Slope_Flat        0.105046
Oldpeak              0.094799
MaxHR                0.089849
Cholesterol          0.085180
Age                  0.074157
ChestPainType_ASY    0.068575
RestingBP            0.065599
ExerciseAngina_N     0.059625
ExerciseAngina_Y     0.053234
hasCholesterol       0.028669
FastingBS            0.023404
ChestPainType_ATA    0.021523
Sex_F                0.020871
Sex_M                0.019259
RestingECG_LVH       0.013247
ChestPainType_NAP    0.012053
RestingECG_Normal    0.011311
RestingECG_ST        0.008692
ST_Slope_Down        0.006015
ChestPainType_TA     0.005457
dtype: float64

### SVM (LinearSVC)

In [109]:
from sklearn.svm import LinearSVC

In [136]:
# Class-weighted linear SVM fitted on the training split.
# The iteration cap is set very high so the solver is not stopped early.
model_svm = LinearSVC(
    class_weight='balanced',
    max_iter=1_000_000,
    random_state=42,
).fit(X_train, y_train)



In [124]:
# Mean accuracy on the training split.
model_svm.score(X_train, y_train)

0.8801452784503632

In [125]:
# Mean accuracy on the held-out test split.
model_svm.score(X_test, y_test)

0.8695652173913043

In [126]:
from sklearn.metrics import confusion_matrix, f1_score

# F1 score of the linear SVM on the held-out test split.
f1_score(y_true=y_test, y_pred=model_svm.predict(X_test))

0.8888888888888888

In [127]:
# Record the linear-SVM test F1 for the final comparison.
f1_results['SVM'] = f1_score(
    y_true=y_test,
    y_pred=model_svm.predict(X_test),
)

In [128]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, model_svm.predict(X_test))

array([[32,  6],
       [ 6, 48]])

In [134]:
# Linear-SVM coefficient per feature, most positive first.
coef_svm = (
    pd.Series(model_svm.coef_[0], index=X.columns)
    .sort_values(ascending=False)
)
coef_svm

ST_Slope_Flat        0.417729
FastingBS            0.331401
ChestPainType_ASY    0.325636
Sex_M                0.217145
ExerciseAngina_Y     0.154634
Oldpeak              0.138871
RestingECG_LVH       0.055569
RestingECG_Normal    0.010165
Age                  0.003853
Cholesterol          0.000507
RestingBP            0.000197
MaxHR               -0.001572
ChestPainType_TA    -0.006112
ST_Slope_Down       -0.052527
RestingECG_ST       -0.067965
ChestPainType_NAP   -0.135774
ExerciseAngina_N    -0.156865
ChestPainType_ATA   -0.185981
Sex_F               -0.219376
ST_Slope_Up         -0.367434
hasCholesterol      -0.634849
dtype: float64

In [135]:
# Side-by-side test F1 scores for all fitted models.
f1_results

{'LogisticRegression': 0.8888888888888888,
 'RandomForestClassifier': 0.918918918918919,
 'DecisionTreeClassifier': 0.8155339805825242,
 'SVM': 0.8888888888888888}