Predicting the prevalence of stroke using Random Forest.
we will use pipelines

In [25]:
# loading the required libraries
import pandas as pd # loading the data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error

## The Data

In [2]:
# loading the data
stroke = pd.read_csv("data/healthcare-dataset-stroke-data.csv")
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [13]:
# checking for missing data and dropping them
stroke.isna().sum()

# dropna
stroke = stroke.dropna()

# dropping the id
# we do not need it
stroke = stroke.drop('id', axis=1)

stroke.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [14]:
stroke.dtypes
# according the data source, most of the variables are categorical.
# in the previous video i converted them to categorical and then encoded them
# in this video i will label encode them to see if their will be any difference

gender                 int32
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int32
work_type              int32
Residence_type         int32
avg_glucose_level    float64
bmi                  float64
smoking_status         int32
stroke                 int64
dtype: object

In [15]:
### Encofing the data
le = LabelEncoder()

# defining the columns to encode
cols = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']

# encoding
stroke[cols] = stroke[cols].apply(lambda col: le.fit_transform(col))

stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [16]:
# we need to scale the data before feeding it to the random forest model
# we will use a pipeline

# defining the pipeline.
# scaler and random forest classifier
# the pipeline will scale the data before feeding it to the classifier 
pipe = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier())])
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestClassifier())])

In [17]:
stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [21]:
# data splitting

# definig our X and y
X = stroke.loc[:, 'gender':'smoking_status']
y = stroke.stroke

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [23]:
# the rf model using the pipeline
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestClassifier())])

In [24]:
pipe.score(X_test, y_test)

0.9511201629327902

In [26]:
# confusion matrix
confusion_matrix(y_test, pipe.predict(X_test))

array([[1401,    0],
       [  72,    0]], dtype=int64)

# AOB FOR BETTER MODELING

CROSS VALIDATION

CHECK IF THE STROKE CLASSES ARE BALANCED AND EITHER UPSAMPLE/DOWNSAMPLE ACCODINGLY SINCE THIS CAN LEAD TO WRONG CONCLUSIONS.

In [None]:
THANKS FOR WATCHING. ALL THE CODE WILLBE UPLOADED TO GITHUB.