In [159]:
# usual imports in a classic ML pipeline for Classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# additional metrics ONLY for classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [160]:
# load the raw stroke dataset from the CSV file in the working directory
df = pd.read_csv("healthcare-dataset-stroke-data.csv")


# display the frame; pandas truncates the middle, so only the first and last rows are shown
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [161]:
# Check the shape of the freshly loaded data: (number of records, number of columns)
df.shape


(5110, 12)

In [162]:
# Drop the id field since it is not required for modelling.
# Reassign instead of mutating in place, and pass errors='ignore' so that
# re-running this cell after `id` is already gone does not raise a KeyError.
df = df.drop(columns=['id'], errors='ignore')
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [163]:
# count fully duplicated rows (this cell only checks duplicates, not missing values)
df.duplicated().sum()

np.int64(0)

In [164]:
# do we have missing values? yes: bmi is missing in 201 rows
# dataset can't have any missing values when passing the data
# to the machine learning algorithm, so they must be handled first
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [165]:
# Drop NAs: remove, in place, every row that contains at least one missing value
df.dropna(inplace=True)


In [166]:
# re-check for missing values after dropping the incomplete rows: none remain
# dataset can't have any missing values when passing the data
# to the machine learning algorithm
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [167]:
# Check total no of records after cleaning
# (201 fewer rows than the raw data, and one fewer column because id was dropped)
df.shape

(4909, 11)

In [168]:
# Check value counts for ever_married (a two-level category) before encoding it
df['ever_married'].value_counts()

ever_married
Yes    3204
No     1705
Name: count, dtype: int64

In [169]:
from sklearn.preprocessing import LabelEncoder

# list all variables that can be binary-converted
# (only ever_married here; its two levels are Yes / No)
variables = ['ever_married']

# load the encoder
encoder = LabelEncoder()

# convert the listed variables: apply() calls fit_transform once per column,
# so the encoder is re-fitted for every column in `variables`
# NOTE(review): the 'Yes' rows show as 1 in the output below; presumably
# 'No' became 0 - confirm with encoder.classes_
df[variables] = df[variables].apply(encoder.fit_transform)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,1,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,1,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,1,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,1,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,1,Private,Urban,186.21,29.0,formerly smoked,1


In [170]:
# Check value counts for gender (a nominal category, not an ordinal one) before encoding it
df['gender'].value_counts()

gender
Female    2897
Male      2011
Other        1
Name: count, dtype: int64

In [171]:
# Remove the single row where gender == 'Other'.
# .copy() makes the filtered result an independent frame, so assigning to its
# columns in later cells does not raise SettingWithCopyWarning.
df = df[df['gender'] != 'Other'].copy()
df['gender'].value_counts()


gender
Female    2897
Male      2011
Name: count, dtype: int64

In [172]:
# Encode gender as a binary indicator (nominal, not ordinal): Female=1, Male=0
category_mapper = {
    'Female': 1,
    'Male': 0,
}

# Why not dict-based .replace(): it relies on silent object->int downcasting,
# which pandas has deprecated. Instead:
#   * .map() looks every label up in category_mapper; values that are not keys
#     (e.g. already-encoded 0/1 when the cell is re-run) pass through unchanged,
#     so the cell stays safe to re-run;
#   * .astype('int64') makes the dtype explicit and raises if an unexpected
#     label (such as a leftover string) is still present;
#   * .assign() builds a new frame instead of writing into a filtered slice.
df = df.assign(
    gender=df['gender']
    .map(lambda label: category_mapper.get(label, label))
    .astype('int64')
)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,Urban,228.69,36.6,formerly smoked,1
2,0,80.0,0,1,1,Private,Rural,105.92,32.5,never smoked,1
3,1,49.0,0,0,1,Private,Urban,171.23,34.4,smokes,1
4,1,79.0,1,0,1,Self-employed,Rural,174.12,24.0,never smoked,1
5,0,81.0,0,0,1,Private,Urban,186.21,29.0,formerly smoked,1


In [173]:
# Check value counts for work_type (a nominal category) before cleaning it
df['work_type'].value_counts()

work_type
Private          2810
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: count, dtype: int64

In [174]:
# Remove the Never_worked records: with only 22 rows this is a very rare
# category (a sparse level rather than an outlier in the numeric sense).
# .copy() makes the filtered result an independent frame, so assigning to its
# columns in later cells does not raise SettingWithCopyWarning.
df = df[df['work_type'] != 'Never_worked'].copy()
df['work_type'].value_counts()

work_type
Private          2810
Self-employed     775
children          671
Govt_job          630
Name: count, dtype: int64

In [175]:
# Check value counts for Residence_type (a two-level category) before encoding it
df['Residence_type'].value_counts()

Residence_type
Urban    2475
Rural    2411
Name: count, dtype: int64

In [176]:
# Encode Residence_type as a binary indicator (nominal, not ordinal): Urban=1, Rural=0
category_mapper = {
    'Urban': 1,
    'Rural': 0,
}

# Why not dict-based .replace(): it relies on silent object->int downcasting,
# which pandas has deprecated. Instead:
#   * .map() looks every label up in category_mapper; values that are not keys
#     (e.g. already-encoded 0/1 when the cell is re-run) pass through unchanged,
#     so the cell stays safe to re-run;
#   * .astype('int64') makes the dtype explicit and raises if an unexpected
#     label is still present;
#   * .assign() builds a new frame instead of writing into a filtered slice.
df = df.assign(
    Residence_type=df['Residence_type']
    .map(lambda label: category_mapper.get(label, label))
    .astype('int64')
)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
2,0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
3,1,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
4,1,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
5,0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1


In [177]:
# Check value counts for smoking_status before modifying it
# NOTE(review): 'Unknown' (1475 rows) looks like a placeholder for missing
# information rather than a real smoking habit - confirm how it should be treated
df['smoking_status'].value_counts()

smoking_status
never smoked       1838
Unknown            1475
formerly smoked     836
smokes              737
Name: count, dtype: int64