Librerie

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, zero_one_loss
from sklearn.metrics import roc_curve, roc_auc_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

Caricamento DataSet

In [None]:
# Location of the heart-failure CSV. Set the HEART_CSV environment variable to
# point at another copy; the default is the path the notebook was written with.
DATA_PATH = os.environ.get('HEART_CSV', 'C:/Users/giaco/Desktop/Icon_HEART/heart.csv')

data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Summary statistics for the columns of the loaded frame.
summary_stats = data.describe()
summary_stats

In [None]:
# Look for outliers: one box plot per numeric feature, each in its own figure.
boxplot_columns = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

for boxplot_column in boxplot_columns:
    sns.boxplot(x=data[boxplot_column], color='red')
    plt.show()

# No outliers for age and time.
# Before dealing with outliers, however, we need to understand the outliers
# themselves, the data set and, where possible, some domain knowledge.
# Removing outliers without a good reason will not always improve accuracy.
# Without a deep understanding of the plausible ranges of each feature,
# removing outliers becomes difficult.

Analisi dei Dati

In [None]:
# Histogram of patient ages.
age_layout = dict(
    title_text='Distribuzione per età',
    xaxis_title_text='Age',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_age = go.Figure(data=[go.Histogram(x=data['age'], marker_color='#6a6fff')])
fig_age.update_layout(**age_layout)
fig_age.show()

Anemia

In [None]:
# Split the patients by the anaemia flag (0 = not anaemic, 1 = anaemic).
anaemia_flag = data['anaemia']

normal = data.loc[anaemia_flag == 0]
anemia = data.loc[anaemia_flag == 1]

In [None]:
# Anaemia, i.e. a low level of haemoglobin in the blood, is often linked to
# heart disease because the heart has to work harder to pump more blood and
# oxygen through the body.

colors = ['#7eff5e', '#ff5e79']
labels = ['Normale', 'Anemia']

# Number of deaths (DEATH_EVENT == 1) in each anaemia group.
values = [int((group['DEATH_EVENT'] == 1).sum()) for group in (normal, anemia)]

anemia_pie = go.Pie(labels=labels, values=values, hole=.4, marker=dict(colors=colors))

fig_anemia = go.Figure(data=[anemia_pie])
fig_anemia.update_layout(
    title_text='Numero Totale di Decessi - Anemia',
    template='plotly_dark',
    width=750,
    height=600,
)
fig_anemia.show()

Creatinina fosfochinasi

In [None]:
# Split patients by creatinine phosphokinase: the 10-120 range (inclusive) is
# treated as normal by this notebook, everything outside it as abnormal.
cpk = data['creatinine_phosphokinase']

normal_cpk_level = data[cpk.ge(10) & cpk.le(120)]
abnormal_cpk_level = data[cpk.lt(10) | cpk.gt(120)]

In [None]:
# Histogram of creatinine phosphokinase values.
cpk_hist_layout = dict(
    title_text='Distribuzione della creatinina fosfochinasi',
    xaxis_title_text='Creatinina Fosfochinasi',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_creatinine = go.Figure(
    data=[go.Histogram(x=data['creatinine_phosphokinase'], marker_color='#6a6fff')]
)
fig_creatinine.update_layout(**cpk_hist_layout)
fig_creatinine.show()

In [None]:
# Share of deaths among patients with normal vs. abnormal CPK levels.
colors = ['#7eff5e', '#ff5e79']
labels = ['Livelli Normali di CPK', 'Livelli Anomali di CPK']

values = [int((group['DEATH_EVENT'] == 1).sum())
          for group in (normal_cpk_level, abnormal_cpk_level)]

cpk_pie = go.Pie(labels=labels, values=values, hole=.4, marker=dict(colors=colors))

fig_creatinine = go.Figure(data=[cpk_pie])
# update_layout returns the figure, so the cell displays it as its last value.
fig_creatinine.update_layout(
    title_text='Numero totale di decessi - CPK',
    template='plotly_dark',
    width=750,
    height=600,
)

Diabete

In [None]:
# Split the patients by the diabetes flag (0 = no diabetes, 1 = diabetes).
diabetes_flag = data['diabetes']

normal = data.loc[diabetes_flag == 0]
diabetes = data.loc[diabetes_flag == 1]

In [None]:
# Share of deaths among non-diabetic vs. diabetic patients.
colors = ['#7eff5e', '#ff5e79']
labels = ['Normale', 'Diabete']

values = [int((group['DEATH_EVENT'] == 1).sum()) for group in (normal, diabetes)]

diabetes_pie = go.Pie(labels=labels, values=values, hole=.4, marker=dict(colors=colors))

fig_diabetes = go.Figure(data=[diabetes_pie])
# update_layout returns the figure, so the cell displays it as its last value.
fig_diabetes.update_layout(
    title_text='Numero totale di decessi - Diabete',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

Frazione di eiezione

In [None]:
# Bucket patients by ejection fraction using this notebook's thresholds:
# >= 55 normal, <= 50 reduced, strictly between 50 and 55 borderline.
# NOTE(review): confirm these cut-offs against the clinical definition in use.
ejection_fraction = data['ejection_fraction']

normal_ejection_fract = data[ejection_fraction.ge(55)]
reduced_ejection_fract = data[ejection_fraction.le(50)]
borderline_ejection_fract = data[ejection_fraction.lt(55) & ejection_fraction.gt(50)]

In [None]:
# Histogram of ejection fraction values.
eject_hist_layout = dict(
    title_text='Distribuzione della frazione di eiezione',
    xaxis_title_text='Frazione di eiezione',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_eject_fract = go.Figure(
    data=[go.Histogram(x=data['ejection_fraction'], marker_color='#6a6fff')]
)
fig_eject_fract.update_layout(**eject_hist_layout)
fig_eject_fract.show()

In [None]:
# Share of deaths per ejection-fraction bucket (normal / reduced / borderline).
colors = ['#7eff5e', '#ff5e79', '#fddb3a']
labels = ['Normale Frazione di Eiezione', 'Frazione di eiezione ridotta',
          'Frazione di eiezione borderline']

ejection_groups = (normal_ejection_fract, reduced_ejection_fract, borderline_ejection_fract)
values = [int((group['DEATH_EVENT'] == 1).sum()) for group in ejection_groups]

eject_pie = go.Pie(labels=labels, values=values, hole=.4, marker=dict(colors=colors))

fig_eject_fract = go.Figure(data=[eject_pie])
# update_layout returns the figure, so the cell displays it as its last value.
fig_eject_fract.update_layout(
    title_text='Numero totale di decessi - Frazione di Eiezione',
    template='plotly_dark',
    width=750,
    height=600,
)

Alta pressione sanguigna

In [None]:
# Split the patients by the high_blood_pressure flag (0 = normal, 1 = high).
pressure_flag = data['high_blood_pressure']

normal_blood_pressure = data.loc[pressure_flag == 0]
high_blood_pressure = data.loc[pressure_flag == 1]

In [None]:
# Share of deaths among patients with normal vs. high blood pressure.
# The palette is bound to `colors`, the name the pie trace actually reads, so
# this cell no longer depends on a list left behind by an earlier cell.
colors = ['#7eff5e', '#ff5e79']

labels = ['Pressione Sanguigna Normale', 'Pressione Sanguigna Alta']

# Number of deaths (DEATH_EVENT == 1) in each blood-pressure group.
values = [len(normal_blood_pressure[normal_blood_pressure['DEATH_EVENT'] == 1]),
          len(high_blood_pressure[high_blood_pressure['DEATH_EVENT'] == 1])]

fig_pressure = go.Figure()

fig_pressure.add_trace(go.Pie(labels=labels, values=values,
                              hole=.4, marker_colors=colors))

# update_layout returns the figure, so the cell displays it as its last value.
fig_pressure.update_layout(
    title_text='Numero Totale di Decessi - Pressione Sanguigna',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600
)

Piastrine

In [None]:
# Split patients by platelet count: 150000-450000 (inclusive) is treated as
# normal by this notebook, everything outside that range as abnormal.
platelets = data['platelets']

normal_platelets_level = data[platelets.ge(150000) & platelets.le(450000)]
abnormal_platelets_level = data[platelets.lt(150000) | platelets.gt(450000)]

In [None]:
# Histogram of platelet counts.
platelets_hist_layout = dict(
    title_text='Distribuzione delle Piastrine',
    xaxis_title_text='Piastrine',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_platelets = go.Figure(
    data=[go.Histogram(x=data['platelets'], marker_color='#6a6fff')]
)
fig_platelets.update_layout(**platelets_hist_layout)
fig_platelets.show()

In [None]:
# Share of deaths among patients with normal vs. abnormal platelet counts.
colors = ['#7eff5e', '#ff5e79']

# Legend labels shown on the chart.
labels = ['Livello Normale di Piastrine', 'Livello Anomalo di Piastrine']

# Number of deaths (DEATH_EVENT == 1) in each platelet group.
values = [len(normal_platelets_level[normal_platelets_level['DEATH_EVENT'] == 1]),
          len(abnormal_platelets_level[abnormal_platelets_level['DEATH_EVENT'] == 1])]

fig_platelets = go.Figure()

fig_platelets.add_trace(go.Pie(labels=labels, values=values,
                               hole=.4, marker_colors=colors))

fig_platelets.update_layout(
    title_text='Numero Totale di Decessi - Piastrine',
    template='plotly_dark',
    width=750,
    height=600
)

fig_platelets.show()

Creatinina Sierica

In [None]:
# Split patients by serum creatinine: 0.7-1.2 (inclusive) is treated as the
# normal range by this notebook, everything outside it as out of range.
serum_creatinine = data['serum_creatinine']

normal_range_creatinine = data[serum_creatinine.ge(0.7) & serum_creatinine.le(1.2)]
out_range_creatinine = data[serum_creatinine.lt(0.7) | serum_creatinine.gt(1.2)]

In [None]:
# Histogram of serum creatinine values.
serum_creatinine_hist_layout = dict(
    title_text='Distribuzione della Creatinina Sierica',
    xaxis_title_text='Creatinina Sierica',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_creatinine = go.Figure(
    data=[go.Histogram(x=data['serum_creatinine'], marker_color='#6a6fff')]
)
fig_creatinine.update_layout(**serum_creatinine_hist_layout)
fig_creatinine.show()

In [None]:
# Share of deaths among patients with in-range vs. out-of-range serum creatinine.
colors = ['#7eff5e', '#ff5e79']
labels = ['Livelli Normali di Creatinina', 'Livelli Anomali di Creatinina']

values = [int((group['DEATH_EVENT'] == 1).sum())
          for group in (normal_range_creatinine, out_range_creatinine)]

serum_creatinine_pie = go.Pie(labels=labels, values=values, hole=.4,
                              marker=dict(colors=colors))

fig_creatinine = go.Figure(data=[serum_creatinine_pie])
fig_creatinine.update_layout(
    title_text='Numero totale di decessi - Creatinina sierica',
    template='plotly_dark',
    width=750,
    height=600,
)
fig_creatinine.show()

Siero di Sodio

In [None]:
# Split patients by serum sodium: 135-145 (inclusive) is treated as normal by
# this notebook, everything outside that range as abnormal.
serum_sodium = data['serum_sodium']

normal_sodium_level = data[serum_sodium.ge(135) & serum_sodium.le(145)]
abnormal_sodium_level = data[serum_sodium.lt(135) | serum_sodium.gt(145)]

In [None]:
# Histogram of serum sodium values.
sodium_hist_layout = dict(
    title_text='Distribuzione Siero di Sodio',
    xaxis_title_text='Siero di Sodio',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_sodium = go.Figure(
    data=[go.Histogram(x=data['serum_sodium'], marker_color='#6a6fff')]
)
fig_sodium.update_layout(**sodium_hist_layout)
fig_sodium.show()

In [None]:
# Share of deaths among patients with normal vs. abnormal serum sodium.
colors = ['#7eff5e', '#ff5e79']

# Legend labels, spelled "siero" like the chart titles in this section.
labels = ['Livello normale di siero di sodio', 'Livello anomalo di siero di sodio']

# Number of deaths (DEATH_EVENT == 1) in each sodium group.
values = [len(normal_sodium_level[normal_sodium_level['DEATH_EVENT'] == 1]),
          len(abnormal_sodium_level[abnormal_sodium_level['DEATH_EVENT'] == 1])]

fig_sodium = go.Figure()

fig_sodium.add_trace(go.Pie(labels=labels, values=values,
                            hole=.4, marker_colors=colors))

fig_sodium.update_layout(
    title_text='Numero totale di decessi - Siero di Sodio',
    template='plotly_dark',
    width=750,
    height=600
)

fig_sodium.show()

Sesso

In [None]:
# Share of deaths by sex; the labels below pair sex == 1 with 'Maschi' and
# sex == 0 with 'Femmine'.
colors = ['#013766', '#bc4558']
labels = ['Maschi', 'Femmine']

sex_of_deceased = data.loc[data['DEATH_EVENT'] == 1, 'sex']
values = [int((sex_of_deceased == 1).sum()),
          int((sex_of_deceased == 0).sum())]

sex_pie = go.Pie(labels=labels, values=values, hole=.4, marker=dict(colors=colors))

fig_sex = go.Figure(data=[sex_pie])
fig_sex.update_layout(
    title_text='Numero totale di decessi - Sesso',
    template='plotly_dark',
    width=750,
    height=600,
)

fig_sex.show()

Fumo

In [None]:
# Share of deaths among smokers (smoking == 1) vs. non-smokers (smoking == 0).
labels = ['Fumatori', 'Non fumatori']

smoking_of_deceased = data.loc[data['DEATH_EVENT'] == 1, 'smoking']
values = [int((smoking_of_deceased == 1).sum()),
          int((smoking_of_deceased == 0).sum())]

# No explicit palette here: the pie uses the template's default colours.
fig_smoking = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig_smoking.update_layout(
    title_text='Numero totale di decessi - Fumo',
    template='plotly_dark',
    width=750,
    height=600,
)

fig_smoking.show()

Tempo

In [None]:
# Histogram of the follow-up time column.
time_hist_layout = dict(
    title_text='Distribuzione del Tempo',
    xaxis_title_text='Tempo giorni di follow up',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)

fig_time = go.Figure(data=[go.Histogram(x=data['time'], marker_color='#6a6fff')])
fig_time.update_layout(**time_hist_layout)
fig_time.show()

EVENTO DI MORTE (DEATH EVENT)

In [None]:
# Split the patients by outcome: DEATH_EVENT == 0 survived, == 1 died.
death_event = data['DEATH_EVENT']

survived = data.loc[death_event == 0]
dead = data.loc[death_event == 1]

In [None]:
# Class balance of the target: one histogram trace per outcome.
target_traces = [
    go.Histogram(x=survived['DEATH_EVENT'], name='Sopravvissuto'),
    go.Histogram(x=dead['DEATH_EVENT'], name='Non Sopravvissuto'),
]

fig_target = go.Figure(data=target_traces)
fig_target.update_layout(
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
    width=750,
    height=600,
)
fig_target.show()

Correlazione Feature

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heat map.
cor = data.corr()

heatmap_options = dict(
    annot=True,
    vmax=0.8,
    cmap='coolwarm',
    fmt='.2f',
    linecolor='green',
    linewidths=0.7,
    square=True,
)

plt.figure(figsize=(8, 9))
sns.heatmap(cor, **heatmap_options)

Preparazione dei dati e Selezione delle feature rilevanti

In [None]:
# Univariate feature selection: ANOVA F-test for the numeric columns (keep 4)
# and chi-squared for the binary columns (keep 1).
# NOTE(review): both selectors are fitted on the full data set, before the
# train/test split below - confirm that this leakage is acceptable.
numerical_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction',
                      'platelets', 'serum_creatinine', 'serum_sodium',
                      'time']
categorical_features = ['anaemia', 'diabetes', 'high_blood_pressure',
                        'sex', 'smoking']

selection_target = data['DEATH_EVENT']

numerical_selector = SelectKBest(f_classif, k=4)
X_numerical = numerical_selector.fit_transform(data[numerical_features], selection_target)

categorical_selector = SelectKBest(chi2, k=1)
X_categorical = categorical_selector.fit_transform(data[categorical_features], selection_target)

# get_support() is a boolean mask aligned with the input column order.
selected_numerical = [name for name, kept
                      in zip(numerical_features, numerical_selector.get_support()) if kept]
selected_categorical = [name for name, kept
                        in zip(categorical_features, categorical_selector.get_support()) if kept]

print('Numerical features selected:', selected_numerical)

print('Categorical features selected:', selected_categorical)

In [None]:
# Define the feature matrix and target used by the machine-learning cells.
selected_feature_names = ['age', 'ejection_fraction', 'serum_creatinine', 'time',
                          'high_blood_pressure']

y = data['DEATH_EVENT']
X_selected = data[selected_feature_names]

In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class ratio. The fixed random_state makes the split (and every
# score computed from it) identical across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Print the DEATH_EVENT class counts for the full data set and for each split.
# The percentages in the row labels are computed from the actual split sizes,
# so they always agree with the test_size used by train_test_split.
num_all = np.unique(y, return_counts=True)
num_train = np.unique(y_train, return_counts=True)
num_test = np.unique(y_test, return_counts=True)

train_pct = round(100 * len(y_train) / len(y))
test_pct = round(100 * len(y_test) / len(y))

print('             The number of DEATH_EVENT')
print('                   0        |      1')
print('------------------------------------------')

summary_rows = [
    ('All dataset', num_all[1]),
    (f'Train set ({train_pct}%)', num_train[1]),
    (f'Test set ({test_pct}%)', num_test[1]),
]
for row_label, class_counts in summary_rows:
    # class_counts[0] / class_counts[1] are the counts of the two class values
    # in sorted order, as returned by np.unique.
    print(f'{row_label:<17}', class_counts[0], ' '*5, '|', ' '*4, class_counts[1])

        Apprendimento Supervisionato

In [None]:
# Random Forest - baseline with hand-picked hyper-parameters.
rfc = RandomForestClassifier(n_estimators=30, max_depth=8, criterion='gini', random_state=42)
rfc.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_rfc = rfc.predict(X_test_scaled)
ncv_score_rfc = rfc.score(X_test_scaled, y_test)
score_train_rfc = rfc.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_rfcl = cross_val_score(rfc, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores; hard 0/1 predictions would collapse the
# curve to a single operating point.
rfc_probs = rfc.predict_proba(X_test_scaled)[:, 1]
rfc_fpr, rfc_tpr, thr = roc_curve(y_test, rfc_probs)
rfc_auc = roc_auc_score(y_test, rfc_probs)

print("Train score: ", score_train_rfc, "\nTest score: ", ncv_score_rfc)
print("Cross Validated Score: ", score_rfcl.mean())
print("Standard Deviation: ", score_rfcl.std())
print("Variance: ", np.var(score_rfcl))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_rfc))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", rfc_auc)
print(classification_report(y_test, y_pred_rfc))


In [None]:
# Confusion matrix of the baseline random forest on the test split.
rfc_class_names = ['Heart Not Failed', 'Heart Fail']

plot_confusion_matrix(rfc, X_test_scaled, y_test,
                      values_format='d', display_labels=rfc_class_names)
plt.title("Random Forest before Opt")

In [None]:
# Hyper-parameter grid for the random forest.
# Number of trees in the forest: 10, 20, ..., 100.
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]

# Number of features to consider at every split.
max_features = ['sqrt', 'log2']

# Maximum number of levels in each tree.
max_depth = range(1, 10)

# Function used to measure the quality of a split.
criterion = ['gini', 'entropy']

# Whether each tree is trained on a bootstrap sample or on the whole set.
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'criterion': criterion,
              'bootstrap': bootstrap}

# The estimator is seeded so that candidates are compared on equal terms and
# best_params_ is the same on every run.
optimal_params = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid,
                              cv=5,  # 5-fold cross-validation
                              scoring='accuracy',
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_train_scaled, y_train)
optimal_params.best_params_

In [None]:
# Random Forest - refit with the hyper-parameters chosen after the grid search.
# NOTE(review): these values are hard-coded; confirm they still match
# optimal_params.best_params_ for the current split.
# Seeded so the printed metrics are reproducible.
rfc_opt = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth=8,
                                 max_features='sqrt', n_estimators=70, random_state=42)
rfc_opt.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_rfc_opt = rfc_opt.predict(X_test_scaled)
ncv_score_rfc_opt = rfc_opt.score(X_test_scaled, y_test)
score_train_rfc_opt = rfc_opt.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_rfcl_opt = cross_val_score(rfc_opt, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
rfc_probs_opt = rfc_opt.predict_proba(X_test_scaled)[:, 1]
rfc_opt_fpr, rfc_opt_tpr, thr = roc_curve(y_test, rfc_probs_opt)
rfc_auc_opt = roc_auc_score(y_test, rfc_probs_opt)

print("Train score: ", score_train_rfc_opt, "\nTest score: ", ncv_score_rfc_opt)
print("Cross Validated Score: ", score_rfcl_opt.mean())
print("Standard Deviation: ", score_rfcl_opt.std())
print("Variance: ", np.var(score_rfcl_opt))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_rfc_opt))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", rfc_auc_opt)
print(classification_report(y_test, y_pred_rfc_opt))

In [None]:
# Confusion matrix of the tuned random forest on the test split. It plots
# rfc_opt - the model whose scores and classification report are printed -
# so the matrix and the report describe the same fitted estimator.
plot_confusion_matrix(rfc_opt, X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=['Heart Not Failed', 'Heart Fail'])
plt.title("Random Forest after Opt")

In [None]:
# Decision Tree - baseline with default depth.
# Seeded so that tie-breaking between equally good splits is reproducible.
dt = DecisionTreeClassifier(criterion='gini', random_state=42)
dt.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_dt = dt.predict(X_test_scaled)
ncv_score_dt = dt.score(X_test_scaled, y_test)
score_train_dt = dt.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_dtl = cross_val_score(dt, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
dt_probs = dt.predict_proba(X_test_scaled)[:, 1]
dt_fpr, dt_tpr, thr = roc_curve(y_test, dt_probs)
dt_auc = roc_auc_score(y_test, dt_probs)

print("Train score: ", score_train_dt, "\nTest score: ", ncv_score_dt)
print("Cross Validated Score: ", score_dtl.mean())
print("Standard Deviation: ", score_dtl.std())
print("Variance: ", np.var(score_dtl))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_dt))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", dt_auc)
print(classification_report(y_test, y_pred_dt))



In [None]:
# Confusion matrix of the baseline decision tree on the test split.
dt_class_names = ['Heart Not Failed', 'Heart Fail']

plot_confusion_matrix(dt, X_test_scaled, y_test,
                      values_format='d', display_labels=dt_class_names)
plt.title("Decision Tree before Opt")

In [None]:
# Grid search over split criterion and tree depth (1..99) for the decision tree.
param_grid_dt = dict(criterion=['gini', 'entropy'],
                     max_depth=range(1, 100))

dt_search_options = dict(cv=5, refit=True, error_score=0,
                         n_jobs=-1, return_train_score=True)

optimal_params_dt = GridSearchCV(DecisionTreeClassifier(random_state=42),
                                 param_grid_dt, **dt_search_options)
optimal_params_dt.fit(X_train_scaled, y_train)
optimal_params_dt.best_params_

In [None]:
# Decision Tree - refit with the hyper-parameters chosen after the grid search.
# NOTE(review): these values are hard-coded; confirm they still match
# optimal_params_dt.best_params_ for the current split.
# Seeded to match the estimator used in the grid search.
dt_opt = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=42)
dt_opt.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_dt_opt = dt_opt.predict(X_test_scaled)
ncv_score_dt_opt = dt_opt.score(X_test_scaled, y_test)
score_train_dt_opt = dt_opt.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_dtl_opt = cross_val_score(dt_opt, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
dt_probs_opt = dt_opt.predict_proba(X_test_scaled)[:, 1]
dt_opt_fpr, dt_opt_tpr, thr = roc_curve(y_test, dt_probs_opt)
dt_auc_opt = roc_auc_score(y_test, dt_probs_opt)

print("Train score: ", score_train_dt_opt, "\nTest score: ", ncv_score_dt_opt)
print("Cross Validated Score: ", score_dtl_opt.mean())
print("Standard Deviation: ", score_dtl_opt.std())
print("Variance: ", np.var(score_dtl_opt))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_dt_opt))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", dt_auc_opt)
print(classification_report(y_test, y_pred_dt_opt))

In [None]:
# Confusion matrix of the tuned decision tree on the test split. It plots
# dt_opt - the model whose scores and classification report are printed -
# so the matrix and the report describe the same fitted estimator.
plot_confusion_matrix(dt_opt, X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=['Heart Not Failed', 'Heart Fail'])
plt.title("Decision Tree after Opt")

In [None]:
# K-Nearest Neighbours - baseline with 2 neighbours.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_knn = knn.predict(X_test_scaled)
ncv_score_knn = knn.score(X_test_scaled, y_test)
score_train_knn = knn.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_knnl = cross_val_score(knn, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
knn_probs = knn.predict_proba(X_test_scaled)[:, 1]
knn_fpr, knn_tpr, thr = roc_curve(y_test, knn_probs)
knn_auc = roc_auc_score(y_test, knn_probs)

print("Train score: ", score_train_knn, "\nTest score: ", ncv_score_knn)
print("Cross Validated Score: ", score_knnl.mean())
print("Standard Deviation: ", score_knnl.std())
print("Variance: ", np.var(score_knnl))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_knn))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", knn_auc)
print(classification_report(y_test, y_pred_knn))

In [None]:
# Confusion matrix of the baseline KNN classifier on the test split.
knn_class_names = ['Heart Not Failed', 'Heart Fail']

plot_confusion_matrix(knn, X_test_scaled, y_test,
                      values_format='d', display_labels=knn_class_names)
plt.title("KNN before Opt")

In [None]:
# Grid search over neighbourhood size, vote weighting, search algorithm and
# Minkowski power (1 = Manhattan, 2 = Euclidean) for KNN.
param_grid_knn = dict(
    n_neighbors=[5, 10, 15, 30, 60, 90, 120],
    weights=['uniform', 'distance'],
    algorithm=['kd_tree', 'ball_tree', 'brute'],
    p=[1, 2],
)

search_knn = GridSearchCV(
    KNeighborsClassifier(),
    param_grid_knn,
    cv=5,
    refit=True,
    error_score=0,
    n_jobs=-1,
    return_train_score=True,
)
search_knn.fit(X_train_scaled, y_train)
search_knn.best_params_

In [None]:
# K-Nearest Neighbours - refit with the hyper-parameters chosen after the grid
# search.
# NOTE(review): these values are hard-coded; confirm they still match
# search_knn.best_params_ for the current split.
knn_opt = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10, p=1, weights='distance')
knn_opt.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_knn_opt = knn_opt.predict(X_test_scaled)
ncv_score_knn_opt = knn_opt.score(X_test_scaled, y_test)
score_train_knn_opt = knn_opt.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_knnl_opt = cross_val_score(knn_opt, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
knn_probs_opt = knn_opt.predict_proba(X_test_scaled)[:, 1]
knn_opt_fpr, knn_opt_tpr, thr = roc_curve(y_test, knn_probs_opt)
knn_auc_opt = roc_auc_score(y_test, knn_probs_opt)

print("Train score: ", score_train_knn_opt, "\nTest score: ", ncv_score_knn_opt)
print("Cross Validated Score: ", score_knnl_opt.mean())
print("Standard Deviation: ", score_knnl_opt.std())
print("Variance: ", np.var(score_knnl_opt))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_knn_opt))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", knn_auc_opt)
print(classification_report(y_test, y_pred_knn_opt))

In [None]:
# Confusion matrix of the tuned KNN classifier on the test split. It plots
# knn_opt - the model whose scores and classification report are printed -
# so the matrix and the report describe the same fitted estimator.
plot_confusion_matrix(knn_opt, X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=['Heart Not Failed', 'Heart Fail'])
plt.title("KNN after Opt")

In [None]:
# Logistic Regression - baseline with C=1.
lr = LogisticRegression(C=1)
lr.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_lr = lr.predict(X_test_scaled)
ncv_score_lr = lr.score(X_test_scaled, y_test)
score_train_lr = lr.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_lrl = cross_val_score(lr, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
lr_probs = lr.predict_proba(X_test_scaled)[:, 1]
lr_fpr, lr_tpr, thr = roc_curve(y_test, lr_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

print("Train score: ", score_train_lr, "\nTest score: ", ncv_score_lr)
print("Cross Validated Score: ", score_lrl.mean())
print("Standard Deviation: ", score_lrl.std())
print("Variance: ", np.var(score_lrl))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_lr))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", lr_auc)
print(classification_report(y_test, y_pred_lr))

In [None]:
# Confusion matrix of the baseline logistic regression on the test split.
lr_class_names = ['Heart Not Failed', 'Heart Fail']

plot_confusion_matrix(lr, X_test_scaled, y_test,
                      values_format='d', display_labels=lr_class_names)
plt.title("Logistic Regression before Opt")

In [None]:
# Grid search over penalty type and inverse regularization strength C.
param_grid_lr = {'penalty': ['l1', 'l2'],
                 'C': [0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 100, 1000]}

# The default 'lbfgs' solver does not support the l1 penalty, so every l1
# candidate would fail to fit and never be scored. 'liblinear' supports both
# l1 and l2, which lets the whole grid be evaluated.
optimal_params_lr = GridSearchCV(LogisticRegression(solver='liblinear'),
                                 param_grid_lr,
                                 cv=5,  # 5-fold cross-validation
                                 scoring='accuracy',
                                 verbose=0,
                                 n_jobs=-1)

optimal_params_lr.fit(X_train_scaled, y_train)
optimal_params_lr.best_params_

In [None]:
# Logistic Regression - refit with the hyper-parameters chosen after the grid
# search.
# NOTE(review): these values are hard-coded; confirm they still match
# optimal_params_lr.best_params_ for the current split.
lr_opt = LogisticRegression(C=0.1, penalty='l2')
lr_opt.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_lr_opt = lr_opt.predict(X_test_scaled)
ncv_score_lr_opt = lr_opt.score(X_test_scaled, y_test)
score_train_lr_opt = lr_opt.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_lrl_opt = cross_val_score(lr_opt, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
lr_probs_opt = lr_opt.predict_proba(X_test_scaled)[:, 1]
lr_opt_fpr, lr_opt_tpr, thr = roc_curve(y_test, lr_probs_opt)
lr_auc_opt = roc_auc_score(y_test, lr_probs_opt)

print("Train score: ", score_train_lr_opt, "\nTest score: ", ncv_score_lr_opt)
print("Cross Validated Score: ", score_lrl_opt.mean())
print("Standard Deviation: ", score_lrl_opt.std())
print("Variance: ", np.var(score_lrl_opt))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_lr_opt))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", lr_auc_opt)
print(classification_report(y_test, y_pred_lr_opt))

In [None]:
# Confusion matrix of the tuned logistic regression on the test split. It plots
# lr_opt - the model whose scores and classification report are printed -
# so the matrix and the report describe the same fitted estimator.
plot_confusion_matrix(lr_opt, X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=['Heart Not Failed', 'Heart Fail'])

plt.title("Logistic Regression after Opt")

In [None]:
# Multi Layer Perceptron - baseline with tanh activation.
# Seeded so weight initialisation (and therefore every score) is reproducible.
mlp = MLPClassifier(activation='tanh', random_state=42)
mlp.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_mlp = mlp.predict(X_test_scaled)
ncv_score_mlp = mlp.score(X_test_scaled, y_test)
score_train_mlp = mlp.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_mlpl = cross_val_score(mlp, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
mlp_probs = mlp.predict_proba(X_test_scaled)[:, 1]
mlp_fpr, mlp_tpr, thr = roc_curve(y_test, mlp_probs)
mlp_auc = roc_auc_score(y_test, mlp_probs)

print("Train score: ", score_train_mlp, "\nTest score: ", ncv_score_mlp)
print("Cross Validated Score: ", score_mlpl.mean())
print("Standard Deviation: ", score_mlpl.std())
print("Variance: ", np.var(score_mlpl))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_mlp))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", mlp_auc)
print(classification_report(y_test, y_pred_mlp))


In [None]:
# Confusion matrix of the baseline multi-layer perceptron on the test split.
mlp_class_names = ['Heart Not Failed', 'Heart Fail']

plot_confusion_matrix(mlp, X_test_scaled, y_test,
                      values_format='d', display_labels=mlp_class_names)
plt.title("MLP before Opt")

In [None]:
# Grid search over the hidden-layer activation function of the MLP.
param_grid_mlp = {'activation': ['relu', 'logistic', 'tanh', 'identity']}

# The estimator is seeded so every activation starts from the same random
# initialisation and best_params_ is the same on every run.
search_mlp = GridSearchCV(MLPClassifier(max_iter=200, random_state=42),
                          param_grid_mlp,
                          cv=5,
                          refit=True,
                          error_score=0,
                          n_jobs=-1)
search_mlp.fit(X_train_scaled, y_train)
search_mlp.best_params_

In [None]:
# Multi Layer Perceptron - refit with the activation chosen after the grid
# search.
# NOTE(review): this value is hard-coded; confirm it still matches
# search_mlp.best_params_ for the current split.
# Seeded so weight initialisation (and therefore every score) is reproducible.
mlp_opt = MLPClassifier(activation='relu', random_state=42)
mlp_opt.fit(X_train_scaled, y_train)

# Hard class predictions plus accuracy on the test and training splits.
y_pred_mlp_opt = mlp_opt.predict(X_test_scaled)
ncv_score_mlp_opt = mlp_opt.score(X_test_scaled, y_test)
score_train_mlp_opt = mlp_opt.score(X_train_scaled, y_train)

# 20-fold cross-validated score on the training split.
score_mlpl_opt = cross_val_score(mlp_opt, X_train_scaled, y_train, cv=20)

# Kept from the original cell; no cell visible in this notebook reads it.
no_skill = [0 for _ in range(len(y_test))]

# Probability of the second class (column 1). The ROC curve and the AUC are
# both computed from these scores rather than from hard 0/1 predictions.
mlp_probs_opt = mlp_opt.predict_proba(X_test_scaled)[:, 1]
mlp_opt_fpr, mlp_opt_tpr, thr = roc_curve(y_test, mlp_probs_opt)
mlp_auc_opt = roc_auc_score(y_test, mlp_probs_opt)

print("Train score: ", score_train_mlp_opt, "\nTest score: ", ncv_score_mlp_opt)
print("Cross Validated Score: ", score_mlpl_opt.mean())
print("Standard Deviation: ", score_mlpl_opt.std())
print("Variance: ", np.var(score_mlpl_opt))
print("0-1 Loss: ", zero_one_loss(y_test, y_pred_mlp_opt))
# This value is the area under the ROC curve, not the accuracy.
print("ROC AUC:", mlp_auc_opt)
print(classification_report(y_test, y_pred_mlp_opt))

In [None]:
# Confusion matrix of the tuned multi-layer perceptron on the test split. It
# plots mlp_opt - the model whose scores and classification report are
# printed - so the matrix and the report describe the same fitted estimator.
plot_confusion_matrix(mlp_opt, X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=['Heart Not Failed', 'Heart Fail'])

plt.title("MLP after Opt")

        Apprendimento non Supervisionato

In [None]:
# K-MEANS - elbow curve: inertia (within-cluster sum of squares) for K = 1..14.
# The curve is computed on the standardized features, the same input the final
# K-Means model is fitted on; on the raw features the large-valued columns
# would dominate the distances. K-Means is unsupervised, so no labels are
# passed to fit. n_init and random_state are fixed for reproducible inertias.
cluster_counts = range(1, 15)

losses = []
for K in cluster_counts:
    kmeans = KMeans(n_clusters=K, n_init=10, random_state=42)
    kmeans.fit(X_train_scaled)
    losses.append(kmeans.inertia_)


plt.plot(cluster_counts, losses, '-o')
plt.xlabel('Number of Clusters')
plt.ylabel('Loss')

In [None]:

# K-Means with 3 clusters on the standardized training features.
kmeans_model = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_model.fit(X_train_scaled)

# Cluster ids (0, 1, 2) are arbitrary and are not DEATH_EVENT labels, so they
# cannot be compared with y_test directly. Each cluster is instead scored by
# the share of its training members with DEATH_EVENT == 1.
y_train_array = np.asarray(y_train)
train_clusters = kmeans_model.labels_
cluster_death_rate = np.zeros(kmeans_model.n_clusters)
for cluster_id in range(kmeans_model.n_clusters):
    member_labels = y_train_array[train_clusters == cluster_id]
    if member_labels.size:  # an empty cluster keeps a death rate of 0
        cluster_death_rate[cluster_id] = member_labels.mean()

# Score every test row with the death rate of the cluster it is assigned to.
kms_scores = cluster_death_rate[kmeans_model.predict(X_test_scaled)]

# Predict death when more than half of the cluster's training members died.
y_pred_dct = (kms_scores > 0.5).astype(int)
kms_fpr, kms_tpr, thr = roc_curve(y_test, kms_scores)


print(classification_report(y_test, y_pred_dct))






In [None]:
# Overlay the ROC curves of the baseline models and of K-Means in one figure.
roc_curves = [
    (lr_fpr, lr_tpr, {'marker': '.'}, 'Logistic Regression'),
    (rfc_fpr, rfc_tpr, {'linestyle': ':'}, 'Random Forest'),
    (dt_fpr, dt_tpr, {'linestyle': '-.'}, 'Decision Tree'),
    (knn_fpr, knn_tpr, {'linestyle': '-.'}, 'K-Neighbors'),
    (mlp_fpr, mlp_tpr, {'linestyle': '-.'}, 'Multi Layer Perceptron'),
    (kms_fpr, kms_tpr, {'linestyle': '-.'}, 'K-Means'),
]

plt.figure(figsize=(5, 5), dpi=100)
for curve_fpr, curve_tpr, curve_style, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, **curve_style)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
# Side-by-side comparison of the baseline supervised models.
# Each value is the area under the ROC curve computed with roc_auc_score on the
# test split, so the column is named accordingly (it is not an accuracy).
models = [('RFC', rfc_auc),
          ('KNN', knn_auc),
          ('DT', dt_auc),
          ('LR', lr_auc),
          ('MLP', mlp_auc)]

model_comparasion = pd.DataFrame(models, columns=['Model', 'ROC AUC'])

model_comparasion.head()