In [1]:
import pickle
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import os

In [2]:
READ_PATH = "../data/heart_2020_for_modelling.pkl"
SAVE_PATH = "../models/logistic_regression.pkl"

In [3]:
heart = pd.read_pickle(READ_PATH)

In [4]:
heart.head()

Unnamed: 0,HeartDisease,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,...,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes
0,0,16.6,3.0,30.0,5.0,0,1,1,0,1,...,0,0,0,1,0,1,1,0,0,1
1,0,20.34,0.0,0.0,7.0,1,0,1,0,0,...,0,0,0,1,1,0,1,0,1,0
2,0,26.58,20.0,30.0,8.0,0,1,1,0,1,...,1,0,0,0,0,1,1,0,1,0
3,0,24.21,0.0,0.0,6.0,1,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1
4,0,23.71,28.0,0.0,8.0,1,0,1,0,1,...,0,0,0,1,1,0,1,0,1,0


In [5]:
y = heart["HeartDisease"]
X = heart.drop("HeartDisease", axis=1)

In [6]:
def model_eval(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[::, 1]
    print(confusion_matrix(y_test, y_pred))
    return classification_report(y_pred, y_test)

## 2. Creating the model

### 2.1 Using the default parameters

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

In [8]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [9]:
print(model_eval(rf_model, X_test, y_test))

[[57137  1464]
 [ 4643   715]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95     61780
           1       0.13      0.33      0.19      2179

    accuracy                           0.90     63959
   macro avg       0.55      0.63      0.57     63959
weighted avg       0.95      0.90      0.92     63959



### 2.2 Using the balanced weights

In [None]:
rf_model_bal = RandomForestClassifier(n_estimators=500, class_weight="balanced")
rf_model_bal.fit(X_train, y_train)

In [None]:
print(model_eval(rf_model_bal, X_test, y_test))

### 2.3 Using the undersampling technique

In [None]:
undersample = RandomUnderSampler(sampling_strategy="majority")

In [None]:
X_over, y_over = undersample.fit_resample(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=2022)

In [None]:
rf_model_under = RandomForestClassifier(n_estimators=500)
rf_model_under.fit(X_train, y_train)

In [None]:
print(model_eval(rf_model_under, X_test, y_test))