In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
stroke_df = pd.read_csv("resources/healthcare-dataset-stroke-data.csv")
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [8]:
stroke_df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [9]:
stroke_df = stroke_df.dropna()

In [10]:
stroke_df = stroke_df.drop_duplicates()

In [11]:
stroke_df = stroke_df.drop(stroke_df[stroke_df.gender == 'Other'].index)

In [12]:
stroke_df.drop(['id'], axis=1, inplace=True)

In [13]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4908 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4908 non-null   object 
 1   age                4908 non-null   float64
 2   hypertension       4908 non-null   int64  
 3   heart_disease      4908 non-null   int64  
 4   ever_married       4908 non-null   object 
 5   work_type          4908 non-null   object 
 6   Residence_type     4908 non-null   object 
 7   avg_glucose_level  4908 non-null   float64
 8   bmi                4908 non-null   float64
 9   smoking_status     4908 non-null   object 
 10  stroke             4908 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.1+ KB


In [14]:
stroke_df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [15]:
stroke_df['stroke'].value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

# Balancing, Encoding, Splitting, Scaling  Data

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [17]:
# Class count
class_count_0, class_count_1 = stroke_df['stroke'].value_counts()
print(class_count_0)
print(class_count_1)

class_0 = stroke_df[stroke_df['stroke'] == 0]
class_1 = stroke_df[stroke_df['stroke'] == 1]

4699
209


In [18]:
# Random Undersampling
class_0_under = class_0.sample(class_count_1)
undersampled_df = pd.concat([class_0_under, class_1], axis=0)
print('Random Under Sampling:')
print(undersampled_df['stroke'].value_counts())

Random Under Sampling:
stroke
0    209
1    209
Name: count, dtype: int64


In [19]:
# Random Oversampling
class_1_over = class_1.sample(class_count_0,replace=True)
oversampled_df = pd.concat([class_1_over, class_0], axis=0)
print('Random Over Sampling:')
print(oversampled_df['stroke'].value_counts())

Random Over Sampling:
stroke
1    4699
0    4699
Name: count, dtype: int64


In [20]:
oversampled_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
136,Male,76.0,0,0,Yes,Private,Urban,140.1,29.9,formerly smoked,1
159,Female,81.0,1,0,Yes,Self-employed,Rural,74.02,25.0,never smoked,1
32,Female,82.0,1,0,Yes,Self-employed,Urban,196.92,22.2,never smoked,1
226,Female,81.0,0,0,Yes,Private,Rural,184.4,27.5,never smoked,1
118,Female,38.0,0,0,No,Self-employed,Urban,82.28,24.0,formerly smoked,1


In [21]:
balanced_df = pd.get_dummies(oversampled_df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
balanced_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
136,76.0,0,0,140.10,29.9,1,False,True,False,True,...,False,True,False,False,False,True,False,True,False,False
159,81.0,1,0,74.02,25.0,1,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
32,82.0,1,0,196.92,22.2,1,True,False,False,True,...,False,False,True,False,False,True,False,False,True,False
226,81.0,0,0,184.40,27.5,1,True,False,False,True,...,False,True,False,False,True,False,False,False,True,False
118,38.0,0,0,82.28,24.0,1,True,False,True,False,...,False,False,True,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,True,False,True,False,...,False,False,False,True,True,False,True,False,False,False
5106,81.0,0,0,125.20,40.0,0,True,False,False,True,...,False,False,True,False,False,True,False,False,True,False
5107,35.0,0,0,82.99,30.6,0,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
5108,51.0,0,0,166.29,25.6,0,False,True,False,True,...,False,True,False,False,True,False,False,True,False,False


In [22]:
balanced_df['stroke'].value_counts()

stroke
1    4699
0    4699
Name: count, dtype: int64

In [23]:
balanced_df.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Female', 'gender_Male', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')

In [24]:
scaler = StandardScaler()
scaled_num_col = ['age', 'avg_glucose_level', 'bmi']
balanced_df[scaled_num_col] = scaler.fit_transform(balanced_df[scaled_num_col])
balanced_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
136,0.960218,0,0,0.381697,0.041376,1,False,True,False,True,...,False,True,False,False,False,True,False,True,False,False
159,1.185698,1,0,-0.813458,-0.640094,1,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
32,1.230794,1,0,1.409371,-1.029506,1,True,False,False,True,...,False,False,True,False,False,True,False,False,True,False
226,1.185698,0,0,1.182928,-0.292405,1,True,False,False,True,...,False,True,False,False,True,False,False,False,True,False
118,-0.753433,0,0,-0.664064,-0.77917,1,True,False,True,False,...,False,False,True,False,False,True,False,True,False,False


In [25]:
X = balanced_df.drop(columns='stroke')
y =  balanced_df['stroke']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(7048, 20)

In [27]:
st = SMOTE()
X_train_resampled, y_train_resampled = st.fit_resample(X_train, y_train)
X_train_resampled.shape, y_train_resampled.value_counts()

((7048, 20),
 stroke
 0    3524
 1    3524
 Name: count, dtype: int64)

# Machine Learning Models

In [28]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

## Random Forest

In [29]:
# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=800, random_state=1)
rf_model = rf_model.fit(X_train_resampled, y_train_resampled)
rf_model

RandomForestClassifier(n_estimators=800, random_state=1)

In [30]:
# Make predictions 
rf_predictions = rf_model.predict(X_test)
rf_predictions

array([1, 1, 1, ..., 1, 0, 0])

In [31]:
# Evaluate Model
cm = confusion_matrix(y_test, rf_predictions)
cm_df = pd.DataFrame(
    cm, index=['Actual No Stroke', "Actual Stroke"], columns = ['Predicted No Stroke', 'Predicted Stroke']
)
rf_acc_score = accuracy_score(y_test,rf_predictions)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,1146,29
Actual Stroke,0,1175


Accuracy Score : 0.987659574468085
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1175
           1       0.98      1.00      0.99      1175

    accuracy                           0.99      2350
   macro avg       0.99      0.99      0.99      2350
weighted avg       0.99      0.99      0.99      2350



In [32]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.35907756370158367, 'age'),
 (0.20170883380448448, 'avg_glucose_level'),
 (0.1626032677016524, 'bmi'),
 (0.036273222662264216, 'hypertension'),
 (0.026092355470527368, 'ever_married_Yes'),
 (0.025448234220544555, 'ever_married_No'),
 (0.02132074091749728, 'heart_disease'),
 (0.018026234233911486, 'smoking_status_never smoked'),
 (0.015412649003064579, 'work_type_Self-employed'),
 (0.014760339261686696, 'smoking_status_Unknown'),
 (0.0146137856954824, 'work_type_Private'),
 (0.014419239311913632, 'Residence_type_Rural'),
 (0.014227351657197398, 'Residence_type_Urban'),
 (0.013571488558249085, 'gender_Female'),
 (0.013539741195651227, 'smoking_status_formerly smoked'),
 (0.013432478375449334, 'gender_Male'),
 (0.01293273375372928, 'smoking_status_smokes'),
 (0.011884376062797545, 'work_type_Govt_job'),
 (0.010552876476004646, 'work_type_children'),
 (0.00010248793630861789, 'work_type_Never_worked')]

## Decision Tree

In [33]:
# Train Decision Tree Model
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt_model = dt_model.fit(X_train_resampled, y_train_resampled)
dt_model

DecisionTreeClassifier(criterion='entropy', random_state=1)

In [34]:
# Make predictions 
dt_predictions = dt_model.predict(X_test)
dt_predictions

array([1, 1, 1, ..., 1, 0, 0])

In [35]:
# Evaluate Model
cm_dt = confusion_matrix(y_test, dt_predictions)
tree_df = pd.DataFrame(
    cm_dt, index=['Actual No Stroke', "Actual Stroke"], columns = ['Predicted No Stroke', 'Predicted Stroke']
)
dt_acc_score = accuracy_score(y_test,dt_predictions)
print("Confusion Matrix")
display(tree_df)
print(f"Accuracy Score : {dt_acc_score}")
print("Classification Report")
print(classification_report(y_test, dt_predictions))

Confusion Matrix


Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,1097,78
Actual Stroke,0,1175


Accuracy Score : 0.9668085106382979
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.93      0.97      1175
           1       0.94      1.00      0.97      1175

    accuracy                           0.97      2350
   macro avg       0.97      0.97      0.97      2350
weighted avg       0.97      0.97      0.97      2350

