In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
stroke_df = pd.read_csv("resources/healthcare-dataset-stroke-data.csv")
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
stroke_df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [5]:
stroke_df = stroke_df.dropna()

In [6]:
stroke_df = stroke_df.drop_duplicates()

In [7]:
stroke_df = stroke_df.drop(stroke_df[stroke_df.gender == 'Other'].index)

In [8]:
stroke_df.drop(['id'], axis=1, inplace=True)

In [9]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4908 non-null   object 
 1   age                4908 non-null   float64
 2   hypertension       4908 non-null   int64  
 3   heart_disease      4908 non-null   int64  
 4   ever_married       4908 non-null   object 
 5   work_type          4908 non-null   object 
 6   Residence_type     4908 non-null   object 
 7   avg_glucose_level  4908 non-null   float64
 8   bmi                4908 non-null   float64
 9   smoking_status     4908 non-null   object 
 10  stroke             4908 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.1+ KB


In [10]:
stroke_df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [11]:
stroke_df['stroke'].value_counts()

0    4699
1     209
Name: stroke, dtype: int64

# Balancing, Encoding, Splitting, Scaling  Data

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [13]:
# Class count
class_count_0, class_count_1 = stroke_df['stroke'].value_counts()
print(class_count_0)
print(class_count_1)

class_0 = stroke_df[stroke_df['stroke'] == 0]
class_1 = stroke_df[stroke_df['stroke'] == 1]

4699
209


In [14]:
# Random Undersampling
class_0_under = class_0.sample(class_count_1)
undersampled_df = pd.concat([class_0_under, class_1], axis=0)
print('Random Under Sampling:')
print(undersampled_df['stroke'].value_counts())

Random Under Sampling:
0    209
1    209
Name: stroke, dtype: int64


In [15]:
# Random Oversampling
class_1_over = class_1.sample(class_count_0,replace=True)
oversampled_df = pd.concat([class_1_over, class_0], axis=0)
print('Random Over Sampling:')
print(oversampled_df['stroke'].value_counts())

Random Over Sampling:
1    4699
0    4699
Name: stroke, dtype: int64


In [16]:
oversampled_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
37,Female,72.0,1,0,Yes,Private,Rural,74.63,23.1,formerly smoked,1
102,Female,74.0,0,0,Yes,Private,Rural,231.61,34.6,formerly smoked,1
98,Male,57.0,1,0,Yes,Govt_job,Urban,78.92,27.7,formerly smoked,1
21,Female,52.0,1,0,Yes,Self-employed,Urban,233.29,48.9,never smoked,1
204,Male,54.0,0,0,Yes,Govt_job,Rural,87.85,31.1,smokes,1


In [17]:
balanced_df = pd.get_dummies(oversampled_df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
balanced_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
37,72.0,1,0,74.63,23.1,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
102,74.0,0,0,231.61,34.6,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
98,57.0,1,0,78.92,27.7,1,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
21,52.0,1,0,233.29,48.9,1,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
204,54.0,0,0,87.85,31.1,1,0,1,0,1,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,1,0,...,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0


In [18]:
balanced_df['stroke'].value_counts()

1    4699
0    4699
Name: stroke, dtype: int64

In [19]:
balanced_df.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Female', 'gender_Male', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')

In [20]:
scaler = StandardScaler()
scaled_num_col = ['age', 'avg_glucose_level', 'bmi']
balanced_df[scaled_num_col] = scaler.fit_transform(balanced_df[scaled_num_col])
balanced_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
37,0.770851,1,0,-0.812493,-0.904586,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
102,0.860771,0,0,1.989055,0.690196,1,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
98,0.096454,1,0,-0.735932,-0.266673,1,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
21,-0.128345,1,0,2.019037,2.673273,1,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
204,-0.038425,0,0,-0.576562,0.204828,1,0,1,0,1,...,0,0,0,0,1,0,0,0,0,1


In [21]:
X = balanced_df.drop(columns='stroke')
y =  balanced_df['stroke']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(7048, 20)

In [23]:
st = SMOTE()
X_train_resampled, y_train_resampled = st.fit_resample(X_train, y_train)
X_train_resampled.shape, y_train_resampled.value_counts()

((7048, 20),
 0    3524
 1    3524
 Name: stroke, dtype: int64)

# Machine Learning Models

In [24]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

## Random Forest

In [25]:
# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=800, random_state=1)
rf_model = rf_model.fit(X_train_resampled, y_train_resampled)
rf_model

In [26]:
# Make predictions 
rf_predictions = rf_model.predict(X_test)
rf_predictions

array([1, 1, 1, ..., 1, 0, 0], dtype=int64)

In [27]:
# Evaluate Model
cm = confusion_matrix(y_test, rf_predictions)
cm_df = pd.DataFrame(
    cm, index=['Actual No Stroke', "Actual Stroke"], columns = ['Predicted No Stroke', 'Predicted Stroke']
)
rf_acc_score = accuracy_score(y_test,rf_predictions)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,1145,30
Actual Stroke,0,1175


Accuracy Score : 0.9872340425531915
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      1175
           1       0.98      1.00      0.99      1175

    accuracy                           0.99      2350
   macro avg       0.99      0.99      0.99      2350
weighted avg       0.99      0.99      0.99      2350



In [28]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3642623353817521, 'age'),
 (0.2034228219612731, 'avg_glucose_level'),
 (0.15775106518969634, 'bmi'),
 (0.03910397050095617, 'hypertension'),
 (0.024408020015171596, 'ever_married_Yes'),
 (0.023862443775789603, 'ever_married_No'),
 (0.020907086622494672, 'heart_disease'),
 (0.01693529551919317, 'smoking_status_never smoked'),
 (0.015446680714029886, 'smoking_status_Unknown'),
 (0.015238882654902985, 'work_type_Self-employed'),
 (0.014543189139771507, 'Residence_type_Urban'),
 (0.01429696764151801, 'Residence_type_Rural'),
 (0.014175542548972637, 'work_type_Private'),
 (0.014100032507542824, 'smoking_status_formerly smoked'),
 (0.01370591834573791, 'gender_Male'),
 (0.013588018037309482, 'gender_Female'),
 (0.012443955207892455, 'smoking_status_smokes'),
 (0.011180825314660933, 'work_type_Govt_job'),
 (0.0105448699258843, 'work_type_children'),
 (8.207899545020204e-05, 'work_type_Never_worked')]

In [29]:
import pickle
with open('model.pkl','wb') as f:
  pickle.dump(rf_model,f)

## Decision Tree

In [33]:
# Train Decision Tree Model
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt_model = dt_model.fit(X_train_resampled, y_train_resampled)
dt_model

DecisionTreeClassifier(criterion='entropy', random_state=1)

In [34]:
# Make predictions 
dt_predictions = dt_model.predict(X_test)
dt_predictions

array([1, 1, 1, ..., 1, 0, 0])

In [35]:
# Evaluate Model
cm_dt = confusion_matrix(y_test, dt_predictions)
tree_df = pd.DataFrame(
    cm_dt, index=['Actual No Stroke', "Actual Stroke"], columns = ['Predicted No Stroke', 'Predicted Stroke']
)
dt_acc_score = accuracy_score(y_test,dt_predictions)
print("Confusion Matrix")
display(tree_df)
print(f"Accuracy Score : {dt_acc_score}")
print("Classification Report")
print(classification_report(y_test, dt_predictions))

Confusion Matrix


Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,1097,78
Actual Stroke,0,1175


Accuracy Score : 0.9668085106382979
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.93      0.97      1175
           1       0.94      1.00      0.97      1175

    accuracy                           0.97      2350
   macro avg       0.97      0.97      0.97      2350
weighted avg       0.97      0.97      0.97      2350

