In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("Stroke_Disease.csv")

In [3]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

In [5]:
num_features=["age","avg_glucose_level","bmi"]
cat_features=["gender","ever_married","work_type","Residence_type","smoking_status"]
binary_features=["hypertension","heart_disease"]

In [6]:
# Numerical Pipeline
numerical_pipeline=Pipeline([
    ("Imputer",SimpleImputer(strategy='median')),
    ("Standardization",StandardScaler())
])

In [8]:
# Categorical pipeline
Categorical_pipeline=Pipeline([
    ("Imputer",SimpleImputer(strategy='most_frequent')),
    ("One_Hot_Encoding",OneHotEncoder(handle_unknown="ignore"))
])


In [9]:
preprocessor=ColumnTransformer([
    ("num",numerical_pipeline,num_features),
    ("cat",Categorical_pipeline,cat_features),
    ("bin","passthrough",binary_features)
])

In [10]:
from sklearn.model_selection import GridSearchCV
params=dict()
params["n_estimators"]=[200,300,350,400]
params["max_depth"]=[20,30,25,40,35]
params["min_samples_split"]=[12,15,16,17,20,25]
model=GridSearchCV(RandomForestClassifier(),param_grid=params,cv=5,n_jobs=-1)

In [11]:
pipe=ImbPipeline([
    ("Preprocess",preprocessor),
    ("Smote",SMOTE(random_state=42)),
    ("Model",model)
])

In [12]:
from sklearn.model_selection import train_test_split
x=df.drop("stroke",axis=1)
y=df["stroke"]

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [14]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((4088, 10), (1022, 10), (4088,), (1022,))

In [15]:
pipe.fit(x_train,y_train)

In [16]:
y_pred=pipe.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score,precision_score,roc_auc_score
print(accuracy_score(y_test,y_pred))


0.9187866927592955


In [18]:
import joblib
joblib.dump(pipe,"Stroke_Trained")

['Stroke_Trained']