PREDICTING THE CHANCES OF GETTING STROKE USING GRADIENT BOOSTING

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report

In [3]:
# LOADING THE DATA
stroke = pd.read_csv("data/healthcare-dataset-stroke-data.csv")
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
# CHECKING THE DATA TYPES
stroke.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [9]:
# CHECKING FOR MISSING DATA
(stroke.isna().sum()/stroke.shape[0])*100

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64

In [10]:
# DROPPING MISSING DATA BUT YOU CAN IMPUTE THEM
stroke = stroke.dropna()
(stroke.isna().sum()/stroke.shape[0])*100

id                   0.0
gender               0.0
age                  0.0
hypertension         0.0
heart_disease        0.0
ever_married         0.0
work_type            0.0
Residence_type       0.0
avg_glucose_level    0.0
bmi                  0.0
smoking_status       0.0
stroke               0.0
dtype: float64

In [11]:
stroke.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [13]:
# ENCODING THE CATEGORICAL COLUMNS AS PER THE DATA DESCRIPTION.
# YOU CAN GET THIS FROM THE PREVIOUS VIDEO, THE CODE WILL BE ON GITHUB FROM PREVIOUS NOTEBOOKS.
cols = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']

# DEFINING THE ENCODER
le = LabelEncoder()

# ENCODING THE COLUMNS
stroke[cols] = stroke[cols].apply(lambda col: le.fit_transform(col))

stroke.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [16]:
# DEFINING THE PIPELINE
# LETS US FIT WITH SCALLING AND WITHOUT AND COMPARE
pipe = Pipeline([('gradientBoost', GradientBoostingClassifier())])

In [20]:
# DEFING X AND Y
X = stroke.loc[:, 'id':'smoking_status']
y = stroke['stroke']

# SPLITTING THE DATA
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
X_train.shape

(3289, 11)

In [22]:
# FITTING THE PIPE
pipe.fit(X_train, y_train)

Pipeline(steps=[('gradientBoost', GradientBoostingClassifier())])

In [25]:
pred = pipe.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print(mean_squared_error(y_test, pred))

[[1537    3]
 [  80    0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1540
           1       0.00      0.00      0.00        80

    accuracy                           0.95      1620
   macro avg       0.48      0.50      0.49      1620
weighted avg       0.90      0.95      0.93      1620

0.05123456790123457


In [26]:
# FITTING WITH SCALLING
pipe_SCALE = Pipeline([('scaler', StandardScaler()),('gradientBoost', GradientBoostingClassifier())])

In [27]:
pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print(mean_squared_error(y_test, pred))

[[1537    3]
 [  80    0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1540
           1       0.00      0.00      0.00        80

    accuracy                           0.95      1620
   macro avg       0.48      0.50      0.49      1620
weighted avg       0.90      0.95      0.93      1620

0.05123456790123457


In [28]:
# there is no difference in the results.
# for better results you can try converting the cols into categorical and then one hot encode them.
# the compare the results with these.
# thanks for watching.