In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the heart dataset from a path relative to the notebook's working directory.
heart_csv_path = 'data/heart.csv'
df = pd.read_csv(heart_csv_path)

In [3]:
# Preview the first rows to check column names and how values are encoded.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Distinct values per column: very low counts flag columns that behave as
# categories even when stored as integers (FastingBS has only 2 values).
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [5]:
# Column dtypes and non-null counts, to spot missing values and to see which
# columns are numeric versus object (string) typed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
Age               918 non-null int64
Sex               918 non-null object
ChestPainType     918 non-null object
RestingBP         918 non-null int64
Cholesterol       918 non-null int64
FastingBS         918 non-null int64
RestingECG        918 non-null object
MaxHR             918 non-null int64
ExerciseAngina    918 non-null object
Oldpeak           918 non-null float64
ST_Slope          918 non-null object
HeartDisease      918 non-null int64
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [25]:
# Continuous predictors: every numeric column except the target.
numerical_features = (
    df.drop(columns='HeartDisease')
      .select_dtypes(include='number')
      .columns
      .tolist()
)
# FastingBS is stored as an integer but only takes two values, so it is
# handled with the categorical features instead.
numerical_features.remove('FastingBS')
numerical_features

['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [26]:
# Categorical predictors: all string-typed columns, plus the binary FastingBS
# flag that was excluded from the numerical features.
categorical_features = [*df.select_dtypes(include='object').columns, 'FastingBS']
categorical_features

['Sex',
 'ChestPainType',
 'RestingECG',
 'ExerciseAngina',
 'ST_Slope',
 'FastingBS']

In [8]:
from sklearn.feature_selection import f_classif

In [27]:
# ANOVA F-test of each numerical feature against the target.
# Rows with Cholesterol == 0 are left out of the test for every feature
# (presumably a zero reading marks a missing measurement — TODO confirm).
df_nonzero_chol = df.loc[df.Cholesterol > 0]
f_values = f_classif(
    df_nonzero_chol[numerical_features],
    df_nonzero_chol['HeartDisease'],
)

In [28]:
# f_classif returned a pair of arrays: F-statistics first, p-values second,
# each ordered like numerical_features.
f_values

(array([ 72.83911596,  23.02030868,   8.11385082, 123.42507848,
        242.36427902]),
 array([7.86107177e-17, 1.93738362e-06, 4.51406911e-03, 1.22692601e-26,
        1.63029397e-47]))

In [29]:
# Second element of the pair: the p-values, one per numerical feature.
f_values[1]

array([7.86107177e-17, 1.93738362e-06, 4.51406911e-03, 1.22692601e-26,
       1.63029397e-47])

In [30]:
# Label the p-values (second array returned by f_classif) with their feature names.
p_values = pd.Series(f_values[1], index=numerical_features)
p_values

Age            7.861072e-17
RestingBP      1.937384e-06
Cholesterol    4.514069e-03
MaxHR          1.226926e-26
Oldpeak        1.630294e-47
dtype: float64

In [31]:
# Significance level: a feature passes the F-test when its p-value is below it.
threshold = 0.05
p_values.lt(threshold)

Age            True
RestingBP      True
Cholesterol    True
MaxHR          True
Oldpeak        True
dtype: bool

In [32]:
from sklearn.feature_selection import mutual_info_classif

In [33]:
# One-hot encode the string-typed categorical columns. FastingBS is already
# numeric, so get_dummies passes it through as a single column.
df_cat = pd.get_dummies(df.loc[:, categorical_features])

In [34]:
# Mutual information between each one-hot column and the target; every column
# is declared discrete.
mi = mutual_info_classif(
    X=df_cat,
    y=df['HeartDisease'],
    discrete_features=True,
)

In [35]:
# Attach the one-hot column names to the scores and list them from most to
# least informative (at most 50 entries shown).
mi_series = pd.Series(mi, index=df_cat.columns)
mi_series.sort_values(ascending=False).iloc[:50]

ST_Slope_Up          0.206974
ST_Slope_Flat        0.163076
ChestPainType_ASY    0.139650
ExerciseAngina_Y     0.131680
ExerciseAngina_N     0.131680
ChestPainType_ATA    0.085956
Sex_M                0.047477
Sex_F                0.047477
FastingBS            0.038040
ChestPainType_NAP    0.022697
ST_Slope_Down        0.008038
RestingECG_ST        0.005351
RestingECG_Normal    0.004212
ChestPainType_TA     0.001490
RestingECG_LVH       0.000057
dtype: float64

In [36]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical feature so that mutual information can be
# computed once per original feature instead of once per one-hot column.
# A separate encoder is fitted for each column and kept in `label_encoders`,
# so the integer codes can be mapped back to the original labels through
# `label_encoders[column].classes_`.
label_encoders = {}

# `labelencoder_X` is kept under its original name; as before, after the loop
# it refers to the encoder fitted on the last categorical column.
labelencoder_X = LabelEncoder()

df_labeled = df[categorical_features].copy()
for cat in categorical_features:
    labelencoder_X = LabelEncoder()
    df_labeled[cat] = labelencoder_X.fit_transform(df[cat])
    label_encoders[cat] = labelencoder_X

df_labeled

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,FastingBS
0,1,1,1,0,2,0
1,0,2,1,0,1,0
2,1,1,2,0,2,0
3,0,0,1,1,1,0
4,1,2,1,0,2,0
...,...,...,...,...,...,...
913,1,3,1,0,1,0
914,1,0,1,0,1,1
915,1,0,1,1,1,0
916,0,1,0,0,1,0


In [37]:
# Mutual information between each label-encoded feature and the target.
# Note: this overwrites the `mi` array computed from the one-hot columns.
mi = mutual_info_classif(
    X=df_labeled,
    y=df['HeartDisease'],
    discrete_features=True,
)

In [38]:
# Per-feature mutual information, ranked from most to least informative
# (at most 50 entries shown). Overwrites the one-hot `mi_series`.
mi_series = pd.Series(mi, index=df_labeled.columns)
mi_series.sort_values(ascending=False).iloc[:50]

ST_Slope          0.207474
ChestPainType     0.155988
ExerciseAngina    0.131680
Sex               0.047477
FastingBS         0.038040
RestingECG        0.006045
dtype: float64