<a href="https://www.kaggle.com/code/esraamohamedahmed/brain-stroke-prediction-using-random-forest?scriptVersionId=102383704" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix,classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/full-filled-brain-stroke-dataset/full_data.csv
/kaggle/input/full-filled-brain-stroke-dataset/full_filled_stroke_data (1).csv


In [2]:
data = pd.read_csv('../input/full-filled-brain-stroke-dataset/full_data.csv')
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


 ### Data Exploration

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [4]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


In [5]:
data.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
data['stroke'].value_counts()


0    4733
1     248
Name: stroke, dtype: int64

So, data is imbalanced. 

In [7]:
print(data['gender'].unique())
print(data['work_type'].unique())
print(data['Residence_type'].unique())
print(data['smoking_status'].unique())
print(data['ever_married'].unique())

['Male' 'Female']
['Private' 'Self-employed' 'Govt_job' 'children']
['Urban' 'Rural']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']
['Yes' 'No']


In [8]:
marriage_dict = {'Yes': 1,'No': 0}
smoking_dict={'formerly smoked':1,'never smoked':2, 'smokes':3, 'Unknown':4}
residance_dict={'Urban':1, 'Rural':2}
work_dict={'Private':1, 'Self-employed':2, 'Govt_job':3, 'children':4}
gender_dict={'Male':1, 'Female':0}
data["ever_married"] = data.ever_married.map(marriage_dict)
data["smoking_status"] = data.smoking_status.map(smoking_dict)
data["Residence_type"] = data.Residence_type.map(residance_dict)
data["work_type"] = data.work_type.map(work_dict)
data["gender"] = data.gender.map(gender_dict)


In [9]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,1,1,228.69,36.6,1,1
1,1,80.0,0,1,1,1,2,105.92,32.5,2,1
2,0,49.0,0,0,1,1,1,171.23,34.4,3,1
3,0,79.0,1,0,1,2,2,174.12,24.0,2,1
4,1,81.0,0,0,1,1,1,186.21,29.0,1,1


## Model Building

In [10]:
X = data.drop(['stroke'], axis = 1)
y = data['stroke']
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)
X_train.shape, X_test.shape

((3337, 10), (1644, 10))

## Hyperparameter Tunning

In [11]:
parameters ={'max_depth': [50, 90, 130],
     'criterion' : ['gini', 'entropy'],
     'n_estimators': [100,200,400]}
from sklearn.model_selection import ParameterGrid
param_size = ParameterGrid(parameters)
len(param_size)

18

In [12]:
from sklearn.model_selection import GridSearchCV
grid_search = RandomForestClassifier()
grid_search = GridSearchCV(
    grid_search, 
    parameters, 
    cv=5,
    scoring='accuracy',n_jobs=-1)

grid_result= grid_search.fit(X_train, y_train)
print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)


Best Params:  {'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 200}
Best Score:  0.9511540636867195


## Results

In [13]:
y_pred_rfc = grid_search.predict(X_test)
print(confusion_matrix(y_test,y_pred_rfc))
print('The accuracy is: {:.4f}'.format(accuracy_score(y_test,y_pred_rfc)))
print('The classification report is:\n{:}'.format(classification_report(y_test,y_pred_rfc)))

[[1558    1]
 [  85    0]]
The accuracy is: 0.9477
The classification report is:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1559
           1       0.00      0.00      0.00        85

    accuracy                           0.95      1644
   macro avg       0.47      0.50      0.49      1644
weighted avg       0.90      0.95      0.92      1644



In [14]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test,y_pred_rfc)

0.4996792815907633

We need to balance the data.

## Undersampling Dataset

In [15]:
from collections import Counter
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X, y)
print(sorted(Counter(y_under).items()))

[(0, 248), (1, 248)]


In [16]:
X_train1, X_test1, y_train1 , y_test1 = train_test_split(X_under,y_under, test_size = 0.33, random_state = 43)

In [17]:
from sklearn.model_selection import GridSearchCV
grid_search = RandomForestClassifier()
grid_search = GridSearchCV(
    grid_search, 
    parameters, 
    cv=5,
    scoring='accuracy',n_jobs=-1)

grid_result= grid_search.fit(X_train1, y_train1)
print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)


Best Params:  {'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 100}
Best Score:  0.7981908638625057


In [18]:
y_pred_rfc_balance = grid_search.predict(X_test1)
print(confusion_matrix(y_test1,y_pred_rfc_balance))
print('The accuracy is: {:.4f}'.format(accuracy_score(y_test1,y_pred_rfc_balance)))
print('The classification report is:\n{:}'.format(classification_report(y_test1,y_pred_rfc_balance)))

[[65 18]
 [24 57]]
The accuracy is: 0.7439
The classification report is:
              precision    recall  f1-score   support

           0       0.73      0.78      0.76        83
           1       0.76      0.70      0.73        81

    accuracy                           0.74       164
   macro avg       0.75      0.74      0.74       164
weighted avg       0.74      0.74      0.74       164



In [19]:
balanced_accuracy_score(y_test1,y_pred_rfc_balance)

0.7434181169120928