In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import pickle
from joblib import dump, load

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the heart-failure clinical records CSV into the working frame.
df = pd.read_csv(filepath_or_buffer='data/heart_failure_clinical_records_dataset.csv')

In [3]:
# Keep an independent snapshot of the frame exactly as loaded, before any column is changed.
df_c = df.copy(deep=True)

In [4]:
# Remove the 'diabetes' column from the working frame (the snapshot df_c keeps it).
# Rebinding with errors='ignore' instead of inplace=True keeps the cell safe to
# re-run: the in-place version raised KeyError on a second execution.
df = df.drop(columns='diabetes', errors='ignore')

In [5]:
# Class balance of the target column.
df.loc[:, 'DEATH_EVENT'].value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [6]:
# Show every row that exactly repeats an earlier row (empty frame when there are none).
df.loc[df.duplicated(keep='first')]

Unnamed: 0,age,anaemia,creatinine_phosphokinase,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT


In [7]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

0

In [8]:
# Encode the target column with a LabelEncoder, then keep a flat NumPy copy in `y`.
labelencoder = LabelEncoder()
labelencoder.fit(df['DEATH_EVENT'])
df['DEATH_EVENT'] = labelencoder.transform(df['DEATH_EVENT'])
y = np.ravel(df['DEATH_EVENT'].to_numpy())

In [9]:
# Bucket `age` into left-closed groups: [0, 50), [50, 60), ... [80, 90), [90, inf).
# The last edge is open-ended so that an age of 100 or more lands in '90+';
# with an upper edge of 100 and right=False such ages fell outside every bin
# and came back as NaN.
bins = [0, 50, 60, 70, 80, 90, np.inf]
labels = ['<50', '50-60', '60-70', '70-80', '80-90', '90+']
df['AgeGroup'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

In [10]:
# Compare each raw age with the bucket it was assigned.
df.loc[:, ['age', 'AgeGroup']]

Unnamed: 0,age,AgeGroup
0,75.0,70-80
1,55.0,50-60
2,65.0,60-70
3,50.0,50-60
4,65.0,60-70
...,...,...
294,62.0,60-70
295,55.0,50-60
296,45.0,<50
297,45.0,<50


In [11]:
# Names of the columns handled as categorical features.
categorical_features = [
    'anaemia',
    'high_blood_pressure',
    'sex',
    'smoking',
    'AgeGroup',
]

In [12]:
# Preview only the categorical columns.
df.loc[:, categorical_features]

Unnamed: 0,anaemia,high_blood_pressure,sex,smoking,AgeGroup
0,0,1,1,0,70-80
1,0,0,1,0,50-60
2,0,0,1,1,60-70
3,1,0,1,0,50-60
4,1,0,0,0,60-70
...,...,...,...,...,...
294,0,1,1,1,60-70
295,0,0,0,0,50-60
296,0,0,0,0,<50
297,0,0,1,1,<50


In [13]:
# Echo the age-bucket labels in bin order (youngest bucket first).
labels

['<50', '50-60', '60-70', '70-80', '80-90', '90+']

In [14]:
# Replace the age-bucket labels with ordinal codes that follow the order of `labels`
# ('<50' -> 0.0 ... '90+' -> 5.0).
# Note: `categorical_features` is rebound here to just the age bucket column.
categorical_features = ['AgeGroup']
enc = OrdinalEncoder(categories=[labels])
df[categorical_features] = enc.fit(df[categorical_features]).transform(df[categorical_features])

In [15]:
# Check the encoded bucket against the raw age.
df.loc[:, ['age', 'AgeGroup']]

Unnamed: 0,age,AgeGroup
0,75.0,3.0
1,55.0,1.0
2,65.0,2.0
3,50.0,1.0
4,65.0,2.0
...,...,...
294,62.0,2.0
295,55.0,1.0
296,45.0,0.0
297,45.0,0.0


In [16]:
# Same age / encoded-bucket view as the preceding cell.
df.filter(items=['age', 'AgeGroup'])

Unnamed: 0,age,AgeGroup
0,75.0,3.0
1,55.0,1.0
2,65.0,2.0
3,50.0,1.0
4,65.0,2.0
...,...,...
294,62.0,2.0
295,55.0,1.0
296,45.0,0.0
297,45.0,0.0


In [17]:
# Drop the raw `age` column from the working frame; AgeGroup was derived from it
# earlier and the snapshot df_c still holds the raw values.
# Rebinding with errors='ignore' instead of inplace=True keeps the cell safe to
# re-run: the in-place version raised KeyError on a second execution.
df = df.drop(columns='age', errors='ignore')

In [18]:
# Min-max scale the continuous columns to [0, 1].
# The scaler is fit on, and applied to, the untouched snapshot df_c rather than df.
# Fitting on df made the cell non-idempotent: a second run refit the scaler on
# already-scaled values, so the scaler pickled afterwards became an identity map.
# Row order in df and df_c is the same, so the first-run result is unchanged.
# NOTE(review): the scaler is fit on the full dataset, before any train/test or CV split.
numeric_features = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']
scaler = MinMaxScaler()
scaler.fit(df_c[numeric_features])
df[numeric_features] = scaler.transform(df_c[numeric_features])

In [19]:
# Spot check: push one unscaled row (values in `numeric_features` order) through the fitted scaler.
scaler.transform(np.array([[582.0, 20.0, 265000.0, 1.9, 137.0, 4.0]]))

array([[0.07131921, 0.09090909, 0.29082313, 0.15730337, 0.68571429,
        0.        ]])

In [21]:
# Persist the fitted age-bucket encoder.
# The context manager closes (and therefore flushes) the file as soon as the dump
# finishes; the bare open() call left the handle open until garbage collection.
with open('enc/encoder_hd.pkl', 'wb') as encoder_file:
    pickle.dump(enc, encoder_file)

In [22]:
# Persist the fitted MinMaxScaler.
# The context manager closes (and therefore flushes) the file as soon as the dump
# finishes; the bare open() call left the handle open until garbage collection.
with open('enc/scaler_hd.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [23]:
# dump(enc, 'enc/encoder_hd.joblib')

In [24]:
# dump(scaler, 'enc/scaler_hd.joblib')

In [25]:
# First five rows of the fully preprocessed frame.
df.iloc[:5]

Unnamed: 0,anaemia,creatinine_phosphokinase,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,AgeGroup
0,0,0.071319,0.090909,1,0.290823,0.157303,0.485714,1,0,0.0,1,3.0
1,0,1.0,0.363636,0,0.288833,0.067416,0.657143,1,0,0.007117,1,1.0
2,0,0.015693,0.090909,0,0.16596,0.089888,0.457143,1,1,0.010676,1,2.0
3,1,0.011227,0.090909,0,0.224148,0.157303,0.685714,1,0,0.010676,1,1.0
4,1,0.017479,0.090909,0,0.365984,0.247191,0.085714,0,0,0.014235,1,2.0


In [26]:
# How many rows fall into each encoded age bucket.
df.loc[:, 'AgeGroup'].value_counts()

2.0    93
1.0    82
3.0    52
0.0    47
4.0    19
5.0     6
Name: AgeGroup, dtype: int64

In [27]:
# Separate the feature matrix from the target column.
x = df.drop(columns='DEATH_EVENT')
y = df.loc[:, 'DEATH_EVENT']

# Disabled experiment: the stratified 80/20 split and SMOTE oversampling below sit
# inside a triple-quoted string, so none of it executes; the cell only echoes the string.
# NOTE(review): `ros` is a SMOTE instance even though the class imported in the
# snippet is RandomOverSampler.
'''x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size=0.2, stratify=y)

from imblearn.over_sampling import RandomOverSampler
ros = SMOTE()
x_train, y_train = ros.fit_resample(x_train, y_train)

x_train.shape, y_train.shape, x_test.shape, y_test.shape, sum(y_train)/len(y_train), sum(y_test)/len(y_test)'''

'x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size=0.2, stratify=y)\n\nfrom imblearn.over_sampling import RandomOverSampler\nros = SMOTE()\nx_train, y_train = ros.fit_resample(x_train, y_train)\n\nx_train.shape, y_train.shape, x_test.shape, y_test.shape, sum(y_train)/len(y_train), sum(y_test)/len(y_test)'

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate

In [29]:
# Candidate classifiers keyed by display name. The values are the classes
# themselves, not instances; callers instantiate them when needed.
models = dict([
    ('LogisticRegression', LogisticRegression),
    ('SVC', SVC),
    ('Bayes', GaussianNB),
    ('KNeighborsClassifier', KNeighborsClassifier),
    ('RandomForestClassifier', RandomForestClassifier),
    ('GradientBoostingClassifier', GradientBoostingClassifier),
])

In [30]:
# Disabled model comparison: the loop below sits inside a triple-quoted string, so it
# does not execute; the cell only echoes the string. If run, it would instantiate every
# entry of `models` with default arguments and print its mean 7-fold cross-validated
# accuracy as a markdown table row.
# NOTE(review): the accuracy table shown under this cell is presumably left over from an
# earlier run when the code was live - confirm before quoting the numbers.
# NOTE(review): `x` was scaled with a scaler fit on the full dataset, so fold scores
# may be optimistic.
'''print('| Model | Accuracy |')
print('|---|---|')
for name, current_model in models.items():
    model = current_model()
    results = cross_validate(estimator=model, X=x, y=y, scoring='accuracy', n_jobs=4, verbose=False, cv=7)
    print(f"|{name}|{np.mean(results['test_score'])}|")'''

'print(\'| Model | Accuracy |\')\nprint(\'|---|---|\')\nfor name, current_model in models.items():\n    model = current_model()\n    results = cross_validate(estimator=model, X=x, y=y, scoring=\'accuracy\', n_jobs=4, verbose=False, cv=7)\n    print(f"|{name}|{np.mean(results[\'test_score\'])}|")'

| Model | Accuracy |
|---|---|
|LogisticRegression|0.7771904321395534|
|SVC|0.7805603277388663|
|Bayes|0.7248579357737543|
|KNeighborsClassifier|0.6792652306065812|
|RandomForestClassifier|0.8607109818950709|
|GradientBoostingClassifier|0.8115501519756838|

In [31]:
# grid = {
#     'n_estimators': [30, 50, 100, 150],
#     'criterion': ['gini', 'entropy'],
#     'max_features': ['sqrt', 'log2', None],
#     'class_weight': ['balanced', None],
#     'max_leaf_nodes': [None, 50, 100, 150],
#     'min_samples_leaf': [1, 2],
#     'max_depth': [None, 10, 20, 40, 50],
# }

# cross_validation = StratifiedKFold(n_splits=7, shuffle=True)
# model = RandomForestClassifier()
# grid_cv = GridSearchCV(model, grid,
#                            scoring='accuracy',
#                            cv=cross_validation, verbose=2,
#                            n_jobs=4)
# grid_cv.fit(x, y)

In [32]:
# grid_cv.best_params_

In [33]:
# grid_cv.best_score_

In [34]:
# Hard-coded hyperparameters passed to the final random forest.
bp = {
    'max_depth': 10,
    'max_features': 'log2',
    'max_leaf_nodes': 150,
    'n_estimators': 30,
}

In [45]:
# Fit the final random forest on the complete feature matrix with the parameters in `bp`.
# random_state is pinned so that the fitted (and later pickled) model, and every
# prediction made from it, is reproducible across kernel restarts; without it each
# run produced a different forest.
model = RandomForestClassifier(**bp, random_state=42)
model.fit(x, y)

RandomForestClassifier(max_depth=10, max_features='log2', max_leaf_nodes=150,
                       n_estimators=30)

In [46]:
# Persist the fitted random forest.
# The context manager closes (and therefore flushes) the file as soon as the dump
# finishes; the bare open() call left the handle open until garbage collection.
with open('model/model2_rf.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [38]:
# dump(model, 'model/model2_rf.joblib')

In [39]:
# model = load('model/model2_rf.joblib')

In [47]:
# Reload the random forest from disk to confirm the pickle round-trips.
# The file is opened in a context manager so the handle is closed right after
# loading, and the path is a plain string (the f-prefix had no placeholders).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('model/model2_rf.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [48]:
# First feature row as a 2-D NumPy array, i.e. the layout the model expects as input.
x.iloc[:1].to_numpy()

array([[0.        , 0.07131921, 0.09090909, 1.        , 0.29082313,
        0.15730337, 0.48571429, 1.        , 0.        , 0.        ,
        3.        ]])

In [49]:
# One hand-built, already-scaled sample; values follow the column order of `x`.
arr = np.array([
    0.,          # anaemia
    0.07948456,  # creatinine_phosphokinase
    0.16666667,  # ejection_fraction
    0.,          # high_blood_pressure
    0.33931386,  # platelets
    0.17977528,  # serum_creatinine
    0.68571429,  # serum_sodium
    1.,          # sex
    0.,          # smoking
    0.37010676,  # time
    2.,          # AgeGroup
])

In [50]:
# Predicted class for the single sample; the new leading axis turns it into a 1-row batch.
model.predict(arr[np.newaxis, :])

array([0], dtype=int64)

In [51]:
# Class probabilities for the same sample, presented to the model as a 1-row batch.
preds = model.predict_proba(np.atleast_2d(arr))
preds

array([[0.76111111, 0.23888889]])

In [52]:
# Probability of class 1 as a percentage with one decimal place.
# Scale first and round last: rounding before multiplying by 100 can reintroduce
# floating-point noise (e.g. round(0.07, 3) * 100 == 7.000000000000001).
round(preds[0][1] * 100, 1)

23.9

In [53]:
# Pull one random row from the untouched snapshot for a manual spot check.
df_c.sample(n=1)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
54,60.0,1,260,1,38,0,255000.0,2.2,132,0,1,45,1


In [54]:
# Predicted classes for the last three rows of the feature matrix.
model.predict(x.iloc[-3:].to_numpy())

array([0, 0, 0], dtype=int64)

In [55]:
# Class probabilities for the last three rows of the feature matrix.
model.predict_proba(x.iloc[-3:].to_numpy())

array([[0.96666667, 0.03333333],
       [0.9       , 0.1       ],
       [0.97777778, 0.02222222]])