# Imports 

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SMOTENC
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    classification_report,
    f1_score,
    make_scorer,
    plot_confusion_matrix,
    plot_roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    LeaveOneOut,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, OneHotEncoder, MinMaxScaler
from sklearn.svm import SVC


# Read the dataset 

In [2]:
# Load the heart-failure clinical records and preview the first rows.
csv_path = './data/heart_failure_clinical_records_dataset.csv'
dataframe = pd.read_csv(csv_path)
dataframe.head(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
# Shorten the two unwieldy column names in one pass (no in-place mutation).
dataframe = dataframe.rename(
    columns={'DEATH_EVENT': 'label', 'creatinine_phosphokinase': 'CPK'}
)

# Data Cleaning - Missing values and Check of the attributes

In [4]:
# Inspect dtypes and non-null counts to check for missing values.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  299 non-null    float64
 1   anaemia              299 non-null    int64  
 2   CPK                  299 non-null    int64  
 3   diabetes             299 non-null    int64  
 4   ejection_fraction    299 non-null    int64  
 5   high_blood_pressure  299 non-null    int64  
 6   platelets            299 non-null    float64
 7   serum_creatinine     299 non-null    float64
 8   serum_sodium         299 non-null    int64  
 9   sex                  299 non-null    int64  
 10  smoking              299 non-null    int64  
 11  time                 299 non-null    int64  
 12  label                299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [None]:
# Column 6 is `platelets` (see the info() listing above); look at its range for inconsistent values.
min(dataframe.iloc[:, 6]) , max(dataframe.iloc[:, 6])

In [None]:
# Column 8 is `serum_sodium` (see the info() listing above); the isolated minimum of 113 is replaced by 114 below.
min(dataframe.iloc[:, 8]) , max(dataframe.iloc[:, 8])

Replace the serum_sodium value 113 with 114, since 113 occurs only once.

In [5]:
# Overwrite the lone 113 reading in serum_sodium with 114 (column-scoped replacement).
dataframe = dataframe.replace({'serum_sodium': {113: 114}})

In [None]:
# Re-check the serum_sodium range after the replacement: the minimum should now be 114, not 113.
min(dataframe.iloc[:, 8]) , max(dataframe.iloc[:, 8])

# Params

In [6]:
class Params:
    """Notebook-wide configuration switches."""

    # Passed as `drop_first` to pd.get_dummies when the categorical features are encoded.
    drop_first = True

# Manipulation of Categorical Attributes

### First change the 0 and 1 with the corresponding string

In [7]:
# Human-readable names for each binary flag: (name when the value is 0, name otherwise).
binary_flag_names = {
    'anaemia': ('no_anaemia', 'anaemia'),
    'diabetes': ('no_diabetes', 'diabetes'),
    'high_blood_pressure': ('no_HBP', 'HBP'),
    'sex': ('female', 'male'),
    'smoking': ('no_smoking', 'smoking'),
}
for flag_column, (name_if_zero, name_otherwise) in binary_flag_names.items():
    dataframe[flag_column] = np.where(dataframe[flag_column] == 0, name_if_zero, name_otherwise).tolist()
dataframe.head(3)


Unnamed: 0,age,anaemia,CPK,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,label
0,75.0,no_anaemia,582,no_diabetes,20,HBP,265000.0,1.9,130,male,no_smoking,4,1
1,55.0,no_anaemia,7861,no_diabetes,38,no_HBP,263358.03,1.1,136,male,no_smoking,6,1
2,65.0,no_anaemia,146,no_diabetes,20,no_HBP,162000.0,1.3,129,male,smoking,7,1


### Then change the type of Categorical attributes columns into 'category'

In [8]:
# The relabelled flag columns are plain `object` at this point; they are cast to `category` next.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  299 non-null    float64
 1   anaemia              299 non-null    object 
 2   CPK                  299 non-null    int64  
 3   diabetes             299 non-null    object 
 4   ejection_fraction    299 non-null    int64  
 5   high_blood_pressure  299 non-null    object 
 6   platelets            299 non-null    float64
 7   serum_creatinine     299 non-null    float64
 8   serum_sodium         299 non-null    int64  
 9   sex                  299 non-null    object 
 10  smoking              299 non-null    object 
 11  time                 299 non-null    int64  
 12  label                299 non-null    int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 30.5+ KB


In [9]:
# Columns holding class-like values (the target `label` is deliberately last).
categorical_columns = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking', 'label']
dataframe = dataframe.astype({name: 'category' for name in categorical_columns})

In [10]:
# Confirm that the six categorical columns now carry the `category` dtype.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  299 non-null    float64 
 1   anaemia              299 non-null    category
 2   CPK                  299 non-null    int64   
 3   diabetes             299 non-null    category
 4   ejection_fraction    299 non-null    int64   
 5   high_blood_pressure  299 non-null    category
 6   platelets            299 non-null    float64 
 7   serum_creatinine     299 non-null    float64 
 8   serum_sodium         299 non-null    int64   
 9   sex                  299 non-null    category
 10  smoking              299 non-null    category
 11  time                 299 non-null    int64   
 12  label                299 non-null    category
dtypes: category(6), float64(3), int64(4)
memory usage: 19.0 KB


# Drop the Time attribute (no meaning) and rescale the Platelets values.

In [11]:
# Remove `time` and express platelets in thousands, in one chained step.
dataframe = (
    dataframe
    .drop(columns='time')
    .assign(platelets=lambda frame: frame['platelets'] / 1000)
)

In [None]:
# Preview the frame after dropping `time` and dividing platelets by 1000.
dataframe.head(5)

# Outlier Detection - LOF

In [12]:
# Every column that is not categorical is treated as numerical (original column order kept).
columns = dataframe.columns.tolist()
non_numeric = set(categorical_columns)
numerical_columns = [name for name in columns if name not in non_numeric]
numerical_columns

['age',
 'CPK',
 'ejection_fraction',
 'platelets',
 'serum_creatinine',
 'serum_sodium']

In [13]:
# Complement of numerical_columns; note that the target `label` is the last entry.
categorical_columns

['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking', 'label']

## LOF

In [14]:
# Local Outlier Factor on the numerical features only; fit_predict returns 1 / -1 per row.
# NOTE(review): this runs on the full, unscaled dataset before the train/test split — confirm that is intended.
local_outlier = LocalOutlierFactor(n_neighbors=20)
numeric_view = dataframe[numerical_columns]
outlier_mask = local_outlier.fit_predict(numeric_view)

`fit_predict` marks "inliers" with 1 and "outliers" with -1;
below we count the -1s, i.e. the outliers.

In [15]:
# How many rows LOF flagged as outliers (-1).
np.sum(outlier_mask == -1)

37

In [16]:
# Keep only the rows LOF marked as inliers (1).
mask = outlier_mask == 1
# .copy() gives an independent frame: a later cell writes to its `label` column, and writing
# to a boolean-indexed slice triggers SettingWithCopyWarning / undefined write-through behaviour.
df_masked = dataframe[mask].copy()

In [17]:
# Class counts (label 1, label 0) before outlier removal.
dataframe.label.eq(1).sum(), dataframe.label.eq(0).sum()

(96, 203)

In [18]:
# Class counts (label 1, label 0) among the rows kept by LOF.
df_masked.label.eq(1).sum(), df_masked.label.eq(0).sum()

(83, 179)

In [19]:
# Rows removed by the outlier filter, per class (label 1, label 0).
removed_label_1 = dataframe.label.eq(1).sum() - df_masked.label.eq(1).sum()
removed_label_0 = dataframe.label.eq(0).sum() - df_masked.label.eq(0).sum()
removed_label_1, removed_label_0

(13, 24)

In [20]:
# Summary statistics of the numerical columns before the outliers are dropped.
dataframe.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,299.0,60.833893,11.894809,40.0,51.0,60.0,70.0,95.0
CPK,299.0,581.839465,970.287881,23.0,116.5,250.0,582.0,7861.0
ejection_fraction,299.0,38.083612,11.834841,14.0,30.0,38.0,45.0,80.0
platelets,299.0,263.358029,97.804237,25.1,212.5,262.0,303.5,850.0
serum_creatinine,299.0,1.39388,1.03451,0.5,0.9,1.1,1.4,9.4
serum_sodium,299.0,136.628763,4.394854,114.0,134.0,137.0,140.0,148.0


In [21]:
# From here on `dataframe` refers to the outlier-filtered rows (same object as df_masked).
dataframe = df_masked
# Summary statistics after outlier removal, for comparison with the table above.
df_masked.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,262.0,60.764634,11.609995,40.0,51.0,60.0,69.75,95.0
CPK,262.0,459.053435,565.656968,23.0,111.25,216.5,582.0,2794.0
ejection_fraction,262.0,37.89313,11.833618,14.0,30.0,38.0,45.0,80.0
platelets,262.0,255.066865,78.226947,73.0,210.25,255.0,294.75,742.0
serum_creatinine,262.0,1.381718,1.030771,0.5,0.9,1.1,1.4,9.4
serum_sodium,262.0,136.755725,4.29843,114.0,134.0,137.0,140.0,148.0


In [22]:
# Turn the 0/1 target into readable class names.
class2name = {0: 'Alive', 1: 'Dead'}
# Vectorised map instead of a row-wise apply. `label` is a categorical column, so it is cast
# back to int first and the result is a plain string column. Building a new frame with
# assign() avoids writing new values into a categorical column in place (which raises in
# recent pandas) and avoids mutating a slice of the original frame.
dataframe = dataframe.assign(label=dataframe['label'].astype(int).map(class2name))
dataframe.head(2)

Unnamed: 0,age,anaemia,CPK,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,label
0,75.0,no_anaemia,582,no_diabetes,20,HBP,265.0,1.9,130,male,no_smoking,Dead
2,65.0,no_anaemia,146,no_diabetes,20,no_HBP,162.0,1.3,129,male,smoking,Dead


In [None]:
# Box plot of one numerical feature split by outcome (unscaled data).
# `plotly.express` is imported once in the imports cell at the top of the notebook.
box_column = 'serum_sodium'  # was numerical_columns[5]; naming the column avoids a positional magic index
fig = px.box(dataframe, y=box_column, color='label',
             color_discrete_map={'Dead': 'coral', 'Alive': 'rgb(99, 180, 250)'},
             width=400, height=400)
fig.update_layout({'paper_bgcolor' : 'rgba(0, 0, 0, 0)'})  # transparent background
fig.show()

# Train - Test split 

75% - 25% split  

In [23]:
# Separate the target from the predictors.
y = dataframe['label']
X = dataframe.drop(columns='label')
# Class balance as fractions of the filtered dataset.
y.value_counts(normalize=True)

Alive    0.683206
Dead     0.316794
Name: label, dtype: float64

In [24]:
# Stratified 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((196, 11), (196,), (66, 11), (66,))

# Standardization of Numerical Attributes

In [25]:
# The split keeps the original row labels; they are reused below to re-index the scaled arrays.
X_train.index

Int64Index([ 56, 200, 128, 149, 170, 118,  71, 236, 278,  70,
            ...
            274,  59, 253,  57,  79, 145, 211, 252, 250, 163],
           dtype='int64', length=196)

In [26]:
# Fit the scaler on the training rows only, then apply the same transform to both partitions.
scaler = RobustScaler()
fitted_scaler = scaler.fit(X_train[numerical_columns])

X_train_scaled, X_test_scaled = (
    fitted_scaler.transform(partition[numerical_columns])
    for partition in (X_train, X_test)
)

In [27]:
# Wrap the scaled arrays back into DataFrames that share the original row index.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=numerical_columns)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=numerical_columns)
(X_train_scaled.shape, X_test_scaled.shape)

((196, 6), (66, 6))

In [28]:
# Quick look at the scaled training features.
X_train_scaled_df.head(2)

Unnamed: 0,age,CPK,ejection_fraction,platelets,serum_creatinine,serum_sodium
56,0.555556,-0.337259,-0.2,-0.255319,3.2,0.166667
200,0.166667,3.285867,0.466667,-1.851064,-0.8,0.0


Boxplots

In [None]:
# Scaled training features plus the target column, used only for the box plots below.
boxplots = X_train_scaled_df.assign(label = y_train)

In [None]:
# Check that `label` was attached next to the scaled features.
boxplots.head(2)

In [None]:
# Names available for the y-axis of the box plot below.
numerical_columns

In [None]:
import plotly.express as px

# Box plot of one scaled numerical feature split by outcome (training rows only).
box_column = 'serum_sodium'  # was numerical_columns[5]; naming the column avoids a positional magic index
fig = px.box(boxplots, y=box_column, color='label',
             color_discrete_map={'Dead': 'coral', 'Alive': 'rgb(99, 180, 250)'},
             width=400, height=400)
fig.update_layout({'paper_bgcolor' : 'rgba(0, 0, 0, 0)'})  # transparent background
fig.show()

# Encoding of the Categorical Features 

In [27]:
# Re-states the value already set in the Params class definition; edit here to try drop_first=False.
Params.drop_first = True

In [28]:
# `label` is the last entry of categorical_columns, so [:-1] leaves only the categorical predictors.
categorical_columns[:-1], Params.drop_first

(['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'], True)

In [29]:
# One-hot encode the categorical predictors (the target, last in categorical_columns, is excluded).
feature_categoricals = categorical_columns[:-1]
X_train_dummy = pd.get_dummies(X_train.loc[:, feature_categoricals], drop_first=Params.drop_first)
X_test_dummy = pd.get_dummies(X_test.loc[:, feature_categoricals], drop_first=Params.drop_first)
# Train and test are encoded independently, so force the test matrix onto the training
# column layout: any column missing from the test encoding is added as all-zero and the
# column order is guaranteed to match what the models are fitted on.
X_test_dummy = X_test_dummy.reindex(columns=X_train_dummy.columns, fill_value=0)
X_train_dummy.head(5)

Unnamed: 0,anaemia_no_anaemia,diabetes_no_diabetes,high_blood_pressure_no_HBP,sex_male,smoking_smoking
56,0,1,1,1,1
200,0,1,1,1,0
128,1,1,0,1,1
149,1,1,0,1,0
170,0,0,1,1,1


In [30]:
# Final design matrices: encoded categorical columns first, scaled numerical columns after.
X_train_final = pd.concat(objs=[X_train_dummy, X_train_scaled_df], axis='columns')
X_test_final = pd.concat(objs=[X_test_dummy, X_test_scaled_df], axis='columns')
X_train_final.head(2)

Unnamed: 0,anaemia_no_anaemia,diabetes_no_diabetes,high_blood_pressure_no_HBP,sex_male,smoking_smoking,age,CPK,ejection_fraction,platelets,serum_creatinine,serum_sodium
56,0,1,1,1,1,0.555556,-0.337259,-0.2,-0.255319,3.2,0.166667
200,0,1,1,1,0,0.166667,3.285867,0.466667,-1.851064,-0.8,0.0


# PCA

In [None]:
# Full PCA (all components) fitted on the training matrix, used to inspect explained variance.
pca = PCA()
pca.fit(X_train_final)

In [None]:
# Fraction of the total variance carried by each principal component.
pca.explained_variance_ratio_

In [None]:
# 1-based component numbers, one per column of X_train_final (x-axis of the plot below).
list(range(1,len(X_train_final.columns)+1))

In [None]:
# Explained variance per component and its running total, drawn on one labelled axis.
sns.set(style="ticks", rc={'figure.figsize': (11, 7)})

component_numbers = list(range(1, len(X_train_final.columns) + 1))

fig, ax = plt.subplots()
ax.grid(True)
# The two curves show different quantities, so each gets its own legend entry instead of
# sharing a single "cumulative" axis label.
sns.lineplot(x=component_numbers, y=np.cumsum(pca.explained_variance_ratio_),
             ax=ax, label='cumulative explained variance')
sns.lineplot(x=component_numbers, y=pca.explained_variance_ratio_,
             ax=ax, label='explained variance per component')
ax.set_xlabel('number of components')
ax.set_ylabel('explained variance ratio')
fig.tight_layout()
# Save only after all labels and the legend are in place so the PDF matches the notebook figure.
fig.savefig("pca.pdf")

In [None]:
# Per-component explained variance ratios (the values plotted above).
pca.explained_variance_ratio_

In [None]:
# Running total of explained variance as components are added.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Fit PCA (all components kept) on the training matrix and project both partitions with it.
pca = PCA()
fitted_pca = pca.fit(X_train_final)

X_train_pca, X_test_pca = (
    fitted_pca.transform(partition) for partition in (X_train_final, X_test_final)
)
X_train_pca.shape

# Stratified K fold
196 training samples in total.


With a 5-fold split -> about 157 train and 39 validation samples per fold.
With a 3-fold split -> about 130 train and 66 validation samples per fold.

In [None]:
# Stratified 3-fold splitter used by every grid search below (seeded for reproducibility).
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Leave-one-out is kept for comparison only; `LeaveOneOut` is imported in the imports cell at the top.
loo = LeaveOneOut()

# Show both split counts; previously the stratified count was computed and silently discarded.
skf.get_n_splits(X_train_pca, y_train), loo.get_n_splits(X_train_final)


# Run Experiment

In [None]:
def run_experiment(X_train, X_test, y_train, y_test, model, model_name, params, skf, sampling,
                   categorical_features=None, pos_label=None, random_state=42):
    """Grid-search an oversampling + classifier pipeline and report it on the hold-out set.

    Parameters
    ----------
    X_train, y_train : training features / labels used for the cross-validated search.
    X_test, y_test : hold-out features / labels used for the report and the plots.
    model : the classifier placed in the pipeline step named ``'classifier'``.
    model_name : text used as the figure title.
    params : parameter grid; keys address the steps as ``smote__*`` and ``classifier__*``.
    skf : cross-validation splitter (or anything GridSearchCV accepts as ``cv``).
    sampling : one of ``'SMOTENC'``, ``'SMOTE'``, ``'BorderlineSMOTE'``.
    categorical_features : SMOTENC mask/indices of the categorical columns. Defaults to the
        notebook's layout of 5 dummy columns followed by 6 numerical ones. For SMOTENC,
        ``X_train`` must therefore still contain the categorical columns in those positions.
    pos_label : class treated as positive for F1 and the ROC curve. Defaults to the last
        class in sorted order (``1`` for 0/1 labels, ``'Dead'`` for Alive/Dead).
    random_state : seed handed to the sampler so repeated runs give the same result.

    Returns
    -------
    The fitted ``GridSearchCV`` object.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported sampler names.
    """
    supported_samplers = ('SMOTENC', 'SMOTE', 'BorderlineSMOTE')
    # Fail early with a clear message; before, any other value hit an unbound `pipeline`.
    if sampling not in supported_samplers:
        raise ValueError(
            f"Unsupported sampling {sampling!r}; expected one of {supported_samplers}"
        )

    if sampling == 'SMOTENC':
        if categorical_features is None:
            categorical_features = [True] * 5 + [False] * 6
        sampler = SMOTENC(categorical_features=categorical_features, random_state=random_state)
    elif sampling == 'SMOTE':
        sampler = SMOTE(random_state=random_state)
    else:
        sampler = BorderlineSMOTE(random_state=random_state)

    pipeline = imbpipeline(steps=[('smote', sampler), ('classifier', model)])

    # scoring='f1' assumes the positive class is the integer 1 and breaks on string labels,
    # so build the scorer with an explicit positive class.
    if pos_label is None:
        pos_label = np.unique(np.asarray(y_train))[-1]
    f1_scorer = make_scorer(f1_score, pos_label=pos_label)

    # Grid search. refit=True: the best configuration is retrained on the whole training
    # set, and that refitted model is the one evaluated on the test set below.
    grid = GridSearchCV(estimator=pipeline,
                        scoring=f1_scorer,
                        param_grid=params,
                        cv=skf,
                        refit=True,
                        n_jobs=-1)

    results = grid.fit(X_train, y_train)
    report = classification_report(y_test, results.predict(X_test))

    print("========================")
    print(report)
    print("========================")

    # Plots: confusion matrix (row-normalised) and ROC curve of the refitted best model.
    fig, axes = plt.subplots(1, 2, figsize=(25, 7))
    fig.suptitle(f"{model_name}")

    axes[0].grid(False)
    axes[0].set_title("Confusion Matrix")
    # Display.from_estimator replaces the deprecated plot_confusion_matrix / plot_roc_curve helpers.
    ConfusionMatrixDisplay.from_estimator(results.best_estimator_, X_test, y_test, ax=axes[0],
                                          display_labels=['alive', 'dead'],
                                          cmap=plt.cm.Blues,
                                          normalize='true')

    axes[1].set_title("Roc Curve")
    RocCurveDisplay.from_estimator(results.best_estimator_, X_test, y_test, ax=axes[1],
                                   pos_label=pos_label)

    plt.show()
    return results


# Models 

## Random Forest

In [None]:
random_forest = RandomForestClassifier(n_jobs=-1, random_state=10)

# NOTE(review): with a binary target, 'minority', 'all' and 'auto' should all oversample only
# the minority class up to the majority count — confirm whether searching all three is intended.
param_random_forest = {
    'smote__sampling_strategy': ["minority", 'all', 'auto'],
    'smote__k_neighbors': [5, 7, 8, 10],
    'classifier__n_estimators': [200, 100, 150, 50, 5, 15, 30, 40],
    # 'log_loss' is the same criterion as 'entropy' in scikit-learn, so it only repeated fits.
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [10, 15, 20, 25, 30, 35],
}
for m in ['SMOTENC']:
    title = f'Random Forest with {m}'
    # SMOTENC's mask marks the first five columns as categorical. That only holds for the
    # un-rotated X_*_final matrices; after PCA those columns are continuous components.
    # PCA here keeps every component, so no dimensionality reduction is lost.
    results = run_experiment(X_train_final, X_test_final, y_train, y_test, random_forest, title, param_random_forest, skf, m)
    print("-------------------------------------------------------")
    print(f"best Score Random Forest = {results.best_score_}")
    print(f"best parameter Random Forest = {results.best_params_}")

# LOGISTIC REGRESSION

In [None]:
# LogisticRegression is already imported in the imports cell at the top of the notebook.
# liblinear supports both the l1 and l2 penalties searched below.
logistic = LogisticRegression(solver='liblinear', random_state=42)
param_logistic = {
    'smote__sampling_strategy': ["minority", 'all', 'auto'],
    'smote__k_neighbors': [5, 7, 8, 10],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.001, 0.01, 0.02, 0.05, 0.08, .1, .15, .2],
}

for m in ['SMOTENC']:
    title = f'Logistic Regression with {m}'
    # SMOTENC's mask marks the first five columns as categorical. That only holds for the
    # un-rotated X_*_final matrices; after PCA those columns are continuous components.
    results = run_experiment(X_train_final, X_test_final, y_train, y_test, logistic, title, param_logistic, skf, m)
    print("-------------------------------------------------------")
    print(f"best Score Logistic Regression = {results.best_score_}")
    print(f"best parameter Logistic Regression = {results.best_params_}")

# SVM

In [None]:
# SVC is already imported in the imports cell at the top of the notebook.
svm = SVC(kernel='linear', random_state=42)
# `gamma` is not used by the linear kernel, so it is left out of the grid; searching it
# only doubled the number of fits without changing any model.
param_svm = {
    'smote__sampling_strategy': ["minority", 'all'],
    'smote__k_neighbors': [5, 7, 8, 10],
    'classifier__C': [.001, .01, .1, 1, 2, 5, 7, 10],
}

for m in ['SMOTENC']:
    title = f'SVM (linear) with {m}'
    # SMOTENC's mask marks the first five columns as categorical. That only holds for the
    # un-rotated X_*_final matrices; after PCA those columns are continuous components.
    results = run_experiment(X_train_final, X_test_final, y_train, y_test, svm, title, param_svm, skf, m)
    print("-------------------------------------------------------")
    print(f"best Score SVM (linear) = {results.best_score_}")
    print(f"best parameter SVM  (linear) = {results.best_params_}")


In [None]:
# SVC is already imported in the imports cell at the top of the notebook.
svm = SVC(kernel='poly', random_state=42)
param_svm = {
    'smote__sampling_strategy': ["minority", 'all'],
    'smote__k_neighbors': [5, 7, 8, 10],
    'classifier__C': [.001, .01, .1, 1, 2, 5, 7, 10],
    'classifier__gamma': ['auto', 'scale'],
}

for m in ['SMOTENC']:
    title = f'SVM (poly) with {m}'
    # SMOTENC's mask marks the first five columns as categorical. That only holds for the
    # un-rotated X_*_final matrices; after PCA those columns are continuous components.
    results = run_experiment(X_train_final, X_test_final, y_train, y_test, svm, title, param_svm, skf, m)
    print("-------------------------------------------------------")
    print(f"best Score SVM (poly) = {results.best_score_}")
    print(f"best parameter SVM (poly) = {results.best_params_}")


In [None]:
# SVC is already imported in the imports cell at the top of the notebook.
svm = SVC(kernel='rbf', random_state=42)
param_svm = {
    'smote__sampling_strategy': ["minority", 'all'],
    'smote__k_neighbors': [5, 7, 8, 10],
    'classifier__C': [.001, .01, .1, 1, 2, 5, 7, 10],
    'classifier__gamma': ['auto', 'scale'],
}

for m in ['SMOTENC']:
    title = f'SVM (rbf) with {m}'
    # SMOTENC's mask marks the first five columns as categorical. That only holds for the
    # un-rotated X_*_final matrices; after PCA those columns are continuous components.
    results = run_experiment(X_train_final, X_test_final, y_train, y_test, svm, title, param_svm, skf, m)
    print("-------------------------------------------------------")
    print(f"best Score SVM (rbf) = {results.best_score_}")
    print(f"best parameter SVM (rbf) = {results.best_params_}")