In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): with no category/module argument this filter is global, so it
# also hides pandas and scikit-learn warnings raised by later cells; consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [34]:
# Read the raw stroke dataset from the working directory and preview it.
DATA_PATH = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [35]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [36]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [37]:
# Replace missing BMI readings with the column mean.
# NOTE(review): the mean is taken over the whole dataset, before any
# train/test split — confirm that is acceptable for the evaluation below.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [38]:
# Remove rows that are exact duplicates across all columns.
# NOTE(review): `id` is still present at this point; if ids are unique no row
# can match another, so this may be a no-op — verify.
df = df.drop_duplicates()

In [39]:
# The patient identifier is not a predictor, so drop it.
# errors='ignore' keeps this cell idempotent: re-running it after `id` is
# already gone no longer raises KeyError.
df = df.drop(columns='id', errors='ignore')

In [40]:
# Preview the frame after imputation and removal of the id column.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [41]:
# Separate predictors from the label by column name rather than by position,
# so a change in column order cannot silently pick the wrong target.
# DataFrame.drop returns a new frame, which means the column-wise label
# encoding applied to `features` later operates on its own data instead of on
# an iloc slice of `df`.
features = df.drop(columns='stroke')
target = df['stroke']

In [42]:
# Names of the text-valued (categorical) predictor columns.
categorical_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
# Sub-frame holding just those categorical columns.
cols = features[categorical_cols]

In [44]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column with its OWN encoder.
# Refitting a single LabelEncoder on every column discards all but the last
# mapping, so the codes could not be inverted or re-applied to new rows at
# inference time. `encoders[col]` keeps the fitted mapping for each column.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    features[col] = le.fit_transform(features[col])
    encoders[col] = le
# `le` is left bound to the encoder fitted on 'smoking_status', as before.

In [46]:
# Show the source frame again; the encoding was applied to `features`, not `df`.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [45]:
# Preview the predictors with the categorical columns integer-encoded.
features.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,2,1,228.69,36.6,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2
2,1,80.0,0,1,1,2,0,105.92,32.5,2
3,0,49.0,0,0,1,2,1,171.23,34.4,3
4,0,79.0,1,0,1,3,0,174.12,24.0,2


In [47]:
# Frequency of each encoded work_type category.
features['work_type'].value_counts()

2    2925
3     819
4     687
0     657
1      22
Name: work_type, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed; stratifying on the label keeps the
# stroke / no-stroke ratio the same in both partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=2,
    stratify=target,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the original (imbalanced) data.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the reported scores are reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=2)
rf.fit(xtrain, ytrain)
ypred = rf.predict(xtest)

In [None]:
# Mean accuracy on the training and held-out partitions.
train, test = rf.score(xtrain, ytrain), rf.score(xtest, ytest)

print(f'Training_Acc : {train}\nTesting_Acc : {test}')

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Confusion matrix and per-class precision/recall/F1 on the held-out set.
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
cr = classification_report(y_true=ytest, y_pred=ypred)

print(f'{cm}\n{cr}')

In [None]:
pip install imblearn

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows until both classes are the same size.
# Seeding the sampler keeps the retained rows (and every score computed from
# them) identical from run to run.
# NOTE(review): resampling is done before the train/test split, so the test
# set is balanced too and its metrics will not reflect the real-world class
# ratio — confirm this is intended.
rus = RandomUnderSampler(random_state=2)
x_sample, y_sample = rus.fit_resample(features, target)

In [None]:
# Re-split the under-sampled data. Stratify on the label, as the first split
# does, so both partitions keep the class balance produced by the sampler.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_sample,
    y_sample,
    test_size=0.3,
    random_state=2,
    stratify=y_sample,
)

In [None]:
# Refit the forest on the balanced training partition and predict the held-out
# rows (fit returns the estimator itself, so the calls can be chained).
ypred = rf.fit(xtrain, ytrain).predict(xtest)

In [None]:
# Training accuracy must be measured on the rows the model was fitted on.
# Scoring on x_sample/y_sample mixed the held-out test rows into the
# "training" figure.
train = rf.score(xtrain, ytrain)
test = rf.score(xtest, ytest)

print(f'Training_Acc : {train}\nTesting_Acc : {test}')

In [None]:
# Confusion matrix and classification report for the model trained on the
# under-sampled data.
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
cr = classification_report(y_true=ytest, y_pred=ypred)

print(f'{cm}\n{cr}')

In [None]:
# Hyper-parameter grid for the random forest search:
# both split criteria, tree depths 1-19 and minimum leaf sizes 1-19.
depth_options = [depth for depth in range(1, 20)]
leaf_size_options = [size for size in range(1, 20)]

parameter = dict(
    criterion=['gini', 'entropy'],
    max_depth=depth_options,
    min_samples_leaf=leaf_size_options,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `parameter` (2 x 19 x 19 = 722 candidates), each one
# cross-validated. n_jobs=-1 spreads the fits across all CPU cores, and
# verbose=1 reports progress without printing a line for every single fit.
grid = GridSearchCV(rf, parameter, n_jobs=-1, verbose=1)
grid.fit(xtrain, ytrain)

In [None]:
# Display the estimator built from the best-scoring parameter combination.
grid.best_estimator_

In [None]:
# GridSearchCV refits the winning configuration on all of xtrain/ytrain by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time was redundant work.
rf = grid.best_estimator_
ypred = rf.predict(xtest)

In [None]:
# Training accuracy of the tuned model, measured on the rows it was fitted on.
# Scoring on x_sample/y_sample included the held-out test rows and so did not
# report a true training score.
train = rf.score(xtrain, ytrain)
test = rf.score(xtest, ytest)

print(f'Training_Acc : {train}\nTesting_Acc : {test}')

In [None]:
# Confusion matrix and classification report for the tuned model.
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
cr = classification_report(y_true=ytest, y_pred=ypred)

print(f'{cm}\n{cr}')

In [None]:
import pickle

# Persist the tuned model. The context manager guarantees the file is flushed
# and closed even if pickling raises.
with open('./model.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Reload the saved model to confirm it round-trips. The context manager closes
# the file handle, and the result is bound to a name so it can be reused.
# SECURITY: unpickling executes arbitrary code; only load files you created.
with open('model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model