
Heart Disease classification

https://www.kaggle.com/ayushjain001/regularization-model-on-heart-dataset/data


In [None]:
#imports

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix ,  plot_roc_curve , classification_report , accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


In [None]:
# Read the heart-disease CSV from the Kaggle input folder and preview the first rows.
csv_path = "../input/heart-failure-prediction/heart.csv"
df = pd.read_csv(csv_path)
df.head()


Age: age of the patient [years]

Sex: sex of the patient [M: Male, F: Female]

ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]

RestingBP: resting blood pressure [mm Hg]

Cholesterol: serum cholesterol [mg/dl]

FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]

RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions
and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]

MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]

ExerciseAngina: exercise-induced angina [Y: Yes, N: No]

Oldpeak: oldpeak = ST [Numeric value measured in depression]

ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]

HeartDisease: output class [1: heart disease, 0: Normal]


In [None]:
# Report the (rows, columns) shape, then display summary statistics of the numeric columns.
shape_message = "shape of the df :   {}".format(df.shape)
print(shape_message)
df.describe()

In [None]:
# The maximum cholesterol value looks like an outlier; the other features
# do not show comparable extremes.
# Printed explicitly: a bare expression in the middle of a cell displays nothing.
print(df["Cholesterol"].value_counts())
# Keep only rows below 500 and rebuild a contiguous 0..n-1 index. Later cells
# concatenate this frame column-wise with freshly built frames, and pandas
# aligns on index labels, so gaps left by the filter would misalign rows.
df = df[df["Cholesterol"] < 500].reset_index(drop=True)

In [None]:
# Per-column count of missing entries -- there are none.
df.isna().sum()

Data visualisation

Countplots


In [None]:
def countplot(df,max_columns,val,figsize,title_size=20,title=""):
    """Draw one seaborn count plot per low-cardinality column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    max_columns : int
        Number of subplots per row of the grid.
    val : int
        A column is plotted only when it has fewer than ``val`` distinct values.
    figsize : tuple
        Size passed to ``plt.figure``.
    title_size : int, default 20
        Font size of each subplot title (the column name).
    title : str, default ""
        Figure-level title.

    The grid is sized from the columns that are actually plotted, so no
    empty rows are reserved for columns that fail the ``val`` threshold.
    """
    # Select the columns first so the grid can be sized from what is drawn.
    selected = [col for col in df.columns if len(pd.unique(df[col])) < val]

    fig = plt.figure(1, figsize=figsize)
    if selected:
        n_rows = ceil(len(selected) / max_columns)
        for position, col in enumerate(selected, start=1):
            ax = fig.add_subplot(n_rows, max_columns, position)
            ax.set_title(col, fontsize=title_size)
            # Draw on the explicit axes instead of relying on pyplot's current-axes state.
            sns.countplot(data=df, x=col, ax=ax)

    fig.tight_layout(pad=3.0)
    fig.suptitle(title)
    plt.show()
    
title="Different Countplots"
# countplot arguments: dataframe, subplots per row, unique-value threshold,
# figsize, title_size, title.
# The title is passed by keyword so the figure heading is actually rendered.
countplot(df,3,10,(80,80),80,title=title)


We can see that the data are not evenly distributed; for example, there are more men than women.

The most common chest pain type is 'ASY' (asymptomatic).

The most common FastingBS (fasting blood sugar) value is 0, which means FastingBS <= 120 mg/dl.

The most common RestingECG (resting electrocardiogram result) is Normal.

The most common ExerciseAngina value is No.

The most common ST_Slope (slope of the peak exercise ST segment) is Flat.

Most people in the dataframe have heart disease.



In [None]:
# One violin plot per continuous feature on a 3x2 grid (the sixth slot stays empty).
figure2 = plt.figure(2, figsize=(15, 10))

violin_columns = ["Cholesterol", "Age", "Oldpeak", "MaxHR", "RestingBP"]
for slot, column in enumerate(violin_columns, start=1):
    plt.subplot(3, 2, slot)
    sns.violinplot(data=df, x=column)

plt.suptitle("Repartition of Cholesterol,Age,MaxHR,Oldpeak,RestingBP")
figure2.tight_layout(pad=3.0)
plt.show()


Most of the patients are between 50 and 60 years old.

The cholesterol level is concentrated in the 200-300 mg/dl range.



Principal component analysis:

In [None]:

# Encoding: replace each categorical column by its integer category codes.
categorical_columns = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
for column in categorical_columns:
    as_category = df[column].astype('category')
    df[column] = as_category.cat.codes

# Split off the target, then standardise the features.
# After this cell X is a NumPy array, not a DataFrame.
y = df["HeartDisease"]
features = df.drop(["HeartDisease"], axis=1)
X = StandardScaler().fit_transform(features)



Plotting the correlation heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
# get_dummies one-hot encodes any column that is still non-numeric.
figsns = plt.figure(7, figsize=(25, 25))
correlations = pd.get_dummies(df).corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:

#two principal components

# Fit on the features only: the target is excluded so the projection is not
# built from the very label the scatter plot is coloured by.
# The features are used unscaled here, so high-variance columns dominate.
pca_features = df.drop(columns=["HeartDisease"])
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(pca_features)

principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
# principalDf has a fresh 0..n-1 index, whereas df may carry gaps from earlier
# row filtering. concat(axis=1) aligns on index labels, so the target index is
# reset to pair rows by position and avoid NaN-padded rows.
target_column = df[['HeartDisease']].reset_index(drop=True)
finalDf = pd.concat([principalDf, target_column], axis=1)

In [None]:
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

# One scatter series per distinct class. Using the unique class values (not the
# per-row label array) guarantees that every class is drawn exactly once and
# paired with its own colour.
targets = sorted(df["HeartDisease"].unique())
colors = ['r', 'g']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['HeartDisease'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50
               , label = str(target))
# The legend entries come from the per-series labels set above.
ax.legend(title="HeartDisease")
ax.grid()

# Share of the total variance captured by the retained components, in percent.
total_var = pca.explained_variance_ratio_.sum() * 100
print("total variance {:.2f}  \n  explained_var_ratio{}".format(total_var,pca.explained_variance_ratio_))


The first component contains 92% of the variance and the second one 5%.

The HeartDisease class 1 is clearly separated around the value -200 on principal component 1.



three principal component analysis:

In [None]:


pca = PCA(n_components=3)
components = pca.fit_transform(X)

# Recompute the explained variance for THIS 3-component fit; otherwise the
# title would report the value left over from the earlier 2-component PCA.
total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['HeartDisease'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()
print("explained_var_ratio{}".format(pca.explained_variance_ratio_))

The first component contains 25% of the variance.

The second component contains 13% of the variance.

The third component contains 10% of the variance.

Classes 0 and 1 seem relatively separable with these 3 components.

First model

In [None]:
# Reload the raw CSV: this discards the row filtering and integer encoding
# applied to df by the earlier exploration cells.
df=pd.read_csv("../input/heart-failure-prediction/heart.csv")

# One-hot encode the categorical features; the target stays a separate Series.
X=pd.get_dummies(df.drop(["HeartDisease"],axis=1))
y=df["HeartDisease"]
# A fixed random_state makes the split (and every score below) reproducible
# across kernel restarts; stratify keeps the class ratio equal in both splits.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,stratify=y)

In [None]:
# A fractional n_components asks PCA for the minimum number of components
# needed to retain that share of the variance (95% here).
pca = PCA(.95)
pca.fit(X_train)  # fitted on the training split only
retained_message = "number of component retained {}".format(pca.n_components_)
print(retained_message)

# Project both splits with the PCA fitted on the training data.
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

In [None]:
#-----------------------MLP grid search CV-----------------------------
# Pipeline of robust scaling followed by an MLP. make_pipeline names each step
# after its lowercased class name, hence the 'mlpclassifier__' prefixes below.
mlp = make_pipeline(RobustScaler(), MLPClassifier(max_iter=10000))
param_grid = dict(
    mlpclassifier__hidden_layer_sizes=range(1, 4),
    mlpclassifier__solver=['lbfgs', 'sgd'],
    mlpclassifier__activation=["logistic", "tanh", "relu"],
)
# 5-fold cross-validated search over the PCA-reduced training data.
grid = GridSearchCV(mlp, param_grid=param_grid, cv=5)
grid.fit(X_train_pca, y_train)
cv_report = "best train cv score : {:.3f}, with parameters : {}".format(grid.best_score_, grid.best_params_)
print(cv_report)
# Keep the refitted best pipeline and score it on the held-out split.
mlp = grid.best_estimator_
test_report = "test score {:.3f}".format(mlp.score(X_test_pca, y_test))
print(test_report)

In [None]:
#--------------------RandomForest--------------------
rdm=make_pipeline(RobustScaler(),RandomForestClassifier())

# 'auto' was dropped from max_features: for RandomForestClassifier it is an
# alias of 'sqrt', so it only repeated a third of the grid's fits, and recent
# scikit-learn releases no longer accept it.
param_rdm = {'robustscaler__with_centering':[True,False],
             'robustscaler__with_scaling':[True,False],
             'randomforestclassifier__n_estimators': [200, 500],
             'randomforestclassifier__max_features': ['sqrt', 'log2'],
             'randomforestclassifier__max_depth' : [4,5,6,7,8],
             'randomforestclassifier__criterion' :['gini', 'entropy'],
            }

# 5-fold cross-validated search on the PCA-reduced training data.
grid_rdm=GridSearchCV(rdm,param_rdm,cv=5)
grid_rdm.fit(X_train_pca,y_train)
# Keep the refitted best pipeline and score it on the held-out split.
rdm=grid_rdm.best_estimator_
print("best score rdm {:.3f} aw parameters :{}".format(grid_rdm.best_score_,grid_rdm.best_params_))
print("test score {:.3f}".format(rdm.score(X_test_pca,y_test)))


Model analysis

https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

In [None]:
def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """Plot learning-curve diagnostics for ``estimator`` on three axes.

    Panel 0 shows training and cross-validation score against the number of
    training examples, panel 1 shows fit time against the number of training
    examples, and panel 2 shows cross-validation score against fit time.
    Shaded bands span one standard deviation either side of the mean.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    title : str
        Title of the first panel.
    X, y :
        Data and target forwarded to ``learning_curve``.
    axes : indexable of three axes, optional
        Axes to draw on; when None a 1x3 figure of size (20, 5) is created.
    ylim : tuple, optional
        Unpacked into ``set_ylim`` on the first panel when given.
    cv, n_jobs, train_sizes :
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so callers can chain ``.show()``.
    """

    def _mean_and_std(per_split):
        # Aggregate along axis 1 (one column per cross-validation split).
        return np.mean(per_split, axis=1), np.std(per_split, axis=1)

    def _shade(ax, xs, centre, spread, **style):
        # Translucent band of +/- one standard deviation around the mean.
        ax.fill_between(xs, centre - spread, centre + spread, alpha=0.1, **style)

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    sizes, train_scores, cv_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )
    train_mean, train_std = _mean_and_std(train_scores)
    cv_mean, cv_std = _mean_and_std(cv_scores)
    time_mean, time_std = _mean_and_std(fit_times)

    # Panel 0: learning curve (bands first, then the mean lines on top).
    score_ax.grid()
    _shade(score_ax, sizes, train_mean, train_std, color="r")
    _shade(score_ax, sizes, cv_mean, cv_std, color="g")
    score_ax.plot(sizes, train_mean, "o-", color="r", label="Training score")
    score_ax.plot(sizes, cv_mean, "o-", color="g", label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 1: number of samples vs fit time.
    time_ax = axes[1]
    time_ax.grid()
    time_ax.plot(sizes, time_mean, "o-")
    _shade(time_ax, sizes, time_mean, time_std)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 2: fit time vs score, with points ordered by increasing fit time.
    by_time = time_mean.argsort()
    sorted_times = time_mean[by_time]
    sorted_cv_mean = cv_mean[by_time]
    sorted_cv_std = cv_std[by_time]
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(sorted_times, sorted_cv_mean, "o-")
    _shade(perf_ax, sorted_times, sorted_cv_mean, sorted_cv_std)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt

In [None]:

# Evaluate the PCA-based MLP on the held-out split.
y_pred=mlp.predict(X_test_pca)

#confusion matrix
fig3=plt.figure(4,figsize=(10,10))
fig3.suptitle("confusion matrix mlp")
cm = confusion_matrix(y_test,y_pred)
print(cm)
plt.subplot(221)
# Wrap the matrix in a DataFrame so the heatmap ticks show the class labels 0/1.
df_cm = pd.DataFrame(cm, index = list(range(2)),
                  columns = list(range(2)))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='d')
plt.title('confusion matrix')
plt.xlabel('prediction')
plt.ylabel('Actual');

#Roc curve
fig4=plt.figure(5,figsize=(10,10))
fig4.suptitle("roc curve mlp")
# Draw on fig4's own axes: without ax= the helper opens a separate figure and
# the titled figure stays blank.
plot_roc_curve(mlp,X_test_pca,y_test,ax=fig4.add_subplot(1,1,1))

# Learning curves. The keyword is figsize ("fogsize" raised a TypeError).
# The three panels are drawn on this figure instead of an implicit new one.
fig5=plt.figure(6,figsize=(20,5))
plot_learning_curve(mlp,'Learning curve mlp',X_train_pca,y_train,axes=fig5.subplots(1,3))

plt.show()

Models trained on all features

In [None]:
#-----------------------MLP grid search CV-----------------------------

# Same search as for the PCA-based model, but fitted on the full feature set.
mlp2=make_pipeline(RobustScaler(),MLPClassifier(max_iter=10000))
param_grid={
            'mlpclassifier__hidden_layer_sizes':(range(1,4)),
            'mlpclassifier__solver':['lbfgs','sgd'],
            'mlpclassifier__activation':["logistic", "tanh", "relu"]
           }
grid= GridSearchCV(mlp2,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
# The message previously contained a mis-encoded character ("paramÃ¨tres");
# the accented "è" is restored so the printed text is readable.
print("best train cv score : {:.3f}, avec les paramètres : {}".format(grid.best_score_,grid.best_params_))
# Keep the refitted best pipeline and score it on the held-out split.
mlp2=grid.best_estimator_
print("test score {:.3f}".format(mlp2.score(X_test,y_test)))


In [None]:

# Evaluate the all-features MLP on the held-out split.
y_pred=mlp2.predict(X_test)

#confusion matrix
fig3=plt.figure(4,figsize=(10,10))
fig3.suptitle("confusion matrix mlp")
cm = confusion_matrix(y_test,y_pred)
print(cm)
plt.subplot(221)
# Wrap the matrix in a DataFrame so the heatmap ticks show the class labels 0/1.
df_cm = pd.DataFrame(cm, index = list(range(2)),
                  columns = list(range(2)))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='d')
plt.title('confusion matrix')
plt.xlabel('prediction')
plt.ylabel('Actual');

#Roc curve
fig4=plt.figure(5,figsize=(10,10))
fig4.suptitle("roc curve mlp")
# Draw on fig4's own axes: without ax= the helper opens a separate figure and
# the titled figure stays blank.
plot_roc_curve(mlp2,X_test,y_test,ax=fig4.add_subplot(1,1,1))

# Learning curves. The keyword is figsize ("fogsize" raised a TypeError).
# The three panels are drawn on this figure instead of an implicit new one.
fig5=plt.figure(6,figsize=(20,5))
plot_learning_curve(mlp2,'Learning curve mlp',X_train,y_train,axes=fig5.subplots(1,3))

plt.show()