
## Stroke Prediction Dataset
### 11 clinical features for predicting stroke events

In [None]:
# importing the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dython.nominal import associations
from dython.model_utils import metric_graph
import os

In [None]:
# Load the training data from the local CSV export and preview the first six rows.
csv_path = r"C:\Users\godwi\Data_Science_ML\stroke_prediction_streamlit_app\stroke_train.csv"
dataset = pd.read_csv(csv_path)
dataset.head(n=6)

In [None]:
# Recode the two indicator columns as "Yes"/"No" labels; any value other than 1 becomes "No".
for flag_col in ("hypertension", "heart_disease"):
    dataset[flag_col] = dataset[flag_col].apply(lambda flag: "No" if flag != 1 else "Yes")
dataset.head(6)

In [None]:
# Show rows that repeat an earlier row on every listed feature plus the target.
feature_and_target_cols = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]
is_repeat = dataset.duplicated(subset=feature_and_target_cols, keep="first")
dataset[is_repeat]

In [None]:
# Count the missing values in each column before any cleaning.
missing_per_column = dataset.isna().sum()
missing_per_column

In [None]:
# Mean BMI over the non-missing values (the value used for imputation below).
dataset["bmi"].mean()

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `dataset.bmi`: that chained in-place call works on an
# intermediate Series, which pandas warns about and which leaves `dataset`
# unchanged once copy-on-write is in effect.
bmi_mean = dataset["bmi"].mean()
dataset["bmi"] = dataset["bmi"].fillna(bmi_mean)

In [None]:
# Repeat of the BMI mean imputation; it changes nothing once no BMI value is missing.
# Assigned back to the column rather than using a chained fillna(inplace=True),
# which pandas warns about and which has no effect on `dataset` under copy-on-write.
dataset["bmi"] = dataset["bmi"].fillna(dataset["bmi"].mean())

In [None]:
# Re-check the missing-value counts after the BMI imputation.
missing_after_bmi = dataset.isna().sum()
missing_after_bmi


In [None]:
# Drop the rows whose smoking status is missing.
# Filling NaN with NaN (what this cell did before) is a no-op, so the rows are
# removed explicitly here.
dataset.dropna(subset=["smoking_status"], inplace=True)

In [None]:
# Remove every row that still has a missing value in any column.
dataset = dataset.dropna(how="any")

In [None]:
# Confirm that no missing values remain after dropping incomplete rows.
missing_after_drop = dataset.isna().sum()
missing_after_drop

In [None]:
# Keep an independent snapshot of the cleaned data and preview it.
shell = dataset.copy(deep=True)
shell.head(5)

In [None]:
# Separate frame holding just the age and gender columns.
age_hyper = dataset[["age", "gender"]].copy()
age_hyper

In [None]:
def _age_bucket(years):
    """Return the age-range label for an age in years, or "not known" if no range matches."""
    if 0 <= years < 2:
        return "0-2"
    if 2 <= years <= 5:
        return "2-5"
    if 5 < years < 13:
        return "6-13"
    if 13 <= years < 18:
        return "13-18"
    if 18 <= years < 30:
        return "18-30"
    if 30 <= years < 40:
        return "30-40"
    if 40 <= years < 50:
        return "40-50"
    if 50 <= years < 65:
        return "50-65"
    if years >= 65:
        return "65+"
    return "not known"


# Label every record with its age bucket.
age_hyper["age_cat"] = age_hyper["age"].apply(_age_bucket)

In [None]:
# Count the records in each age bucket, split by gender.
pivot_age = age_hyper.pivot_table(index="age_cat", columns="gender", values="age", aggfunc="count")
# pivot_table sorts the string labels alphabetically ("0-2", "13-18", "18-30", "2-5", ...),
# so restore ascending age order before the table is displayed or plotted.
age_order = ["0-2", "2-5", "6-13", "13-18", "18-30", "30-40", "40-50", "50-65", "65+", "not known"]
pivot_age = pivot_age.loc[[label for label in age_order if label in pivot_age.index]]
pivot_age

In [None]:
# Grouped bar chart of record counts per age bucket and gender.
pivot_age.plot.bar()

In [None]:
# Number of records per gender, as a one-column frame indexed by gender.
gender_counts = dataset["gender"].value_counts()
gender_stat = gender_counts.to_frame()
gender_stat

In [None]:

# Bar chart of the number of records per gender.
# The counts are taken by position: value_counts().to_frame() names its single
# column "gender" on pandas < 2.0 but "count" on pandas >= 2.0, so looking it up
# as gender_stat['gender'] raises KeyError on current pandas.
sns.barplot(data=gender_stat, x=gender_stat.index, y=gender_stat.iloc[:, 0])

In [None]:
# Persist the cleaned dataset without the index column.
master_csv = "master_df.csv"
dataset.to_csv(master_csv, index=False)

In [None]:
# Pie chart of the heart-disease label shares; the two explode offsets assume two labels.
heart_counts = dataset["heart_disease"].value_counts()
heart_counts.plot.pie(autopct="%.1f%%", figsize=(10, 10), explode=(0.01, 0.05));

In [None]:
# Stroke cases per gender.
# `stroke` is summed rather than counted: count() only tallies the rows per
# gender, regardless of whether a stroke occurred.
# NOTE(review): assumes `stroke` is a 0/1 indicator — confirm against the data.
gender_stroke = dataset.loc[:, ["gender", "stroke"]].groupby("gender").sum()
gender_stroke

In [None]:
# Bar chart of the per-gender `stroke` aggregate on a dedicated axes.
fig, ax1 = plt.subplots(figsize=(10, 4.5))
sns.barplot(data=gender_stroke, x=gender_stroke.index, y="stroke", ax=ax1)

In [None]:
# Scatter of age against BMI, coloured by the stroke label.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data=dataset, x="bmi", y="age", hue="stroke", palette="deep", ax=ax)

In [None]:
# Feature correlation heatmap.
# Only numeric columns are correlated: hypertension/heart_disease were recoded
# to "Yes"/"No" and other object-dtype columns exist, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (numeric_only now defaults to False).
fig, ax = plt.subplots(figsize=(14, 7))
numeric_features = dataset.select_dtypes(include="number")
feature_check = sns.heatmap(numeric_features.corr(), cmap="Blues", annot=True, ax=ax)
feature_check

In [None]:
# Association matrix across all columns via dython.
association_figsize = (15, 10)
associations(dataset, figsize=association_figsize)

In [None]:


from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



# Group the columns into numerical and categorical sets, and split the dataset into X and y

In [None]:
# Categorical (object-dtype) feature names, leaving out Residence_type.
object_columns = dataset.select_dtypes("object").columns
categorical_col = object_columns.drop("Residence_type").tolist()
categorical_col

In [None]:
# Non-object feature names, leaving out the identifier and the target.
non_object_columns = dataset.select_dtypes(exclude="object").columns
numerical_col = non_object_columns.drop(["id", "stroke"]).tolist()
numerical_col

In [None]:
# Feature matrix: everything except the identifier, Residence_type, and the target.
X = dataset.drop(columns=["id", "Residence_type", "stroke"])
X.head()

In [None]:
# Target vector.
y = dataset.loc[:, "stroke"]

## Preprocessing: encode categorical strings as numeric with OneHotEncoder

In [None]:
# Categorical branch: one-hot encode, ignoring categories not seen during fit.
ohe_step = ("ohe", OneHotEncoder(handle_unknown="ignore"))
cat_pipeline = Pipeline(steps=[ohe_step])

# Numeric branch: standardise each column.
scaler_step = ("scaler", StandardScaler())
num_pipeline = Pipeline(steps=[scaler_step])

In [None]:
from sklearn.compose import ColumnTransformer

# Route numeric columns through the scaler pipeline and categorical columns
# through the encoder pipeline; any column not listed is dropped.
column_routes = [
    ("num_pipeline", num_pipeline, numerical_col),
    ("cat_pipeline", cat_pipeline, categorical_col),
]
transformer = ColumnTransformer(transformers=column_routes, remainder="drop")

In [None]:
# Full modelling pipeline: preprocess, oversample with SMOTE, then fit a random forest.
pipeline_steps = [
    ("transformer", transformer),
    ("smote", SMOTE(random_state=42, k_neighbors=10)),
    ("rfc", RandomForestClassifier(random_state=42)),
]
model = imbPipeline(steps=pipeline_steps)

In [None]:
# Train/test split (70/30).
# Stratify on the target so both partitions keep the same class ratio: the
# pipeline oversamples with SMOTE, i.e. the classes are treated as imbalanced,
# and an unstratified split can leave the test set with a distorted share of
# positive cases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Fit the whole pipeline on the training partition.
model.fit(X=X_train, y=y_train)

In [None]:
# Class predictions for the held-out partition.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Report test accuracy as a percentage with two decimals.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f" Accuracy Score is {accuracy_pct:.2f}%")

In [None]:
# Compute the confusion matrix on the test set and export it as a CSV file.
cm = confusion_matrix(y_test, y_pred)
confusion_matrix_dataframe = pd.DataFrame(data=cm)
confusion_matrix_dataframe.to_csv(
    "confusion_matrix_dataframe.csv",
    index=False,
    encoding="utf-8",
)


In [None]:
# Annotated heatmap of the confusion matrix; cell values use general number formatting.
fig, ax = plt.subplots(figsize=(10, 6))
cm_frame = pd.DataFrame(cm)
cm_plot = sns.heatmap(cm_frame, annot=True, fmt="g", cmap="Blues", ax=ax)

In [None]:
# Per-class precision, recall and F1 on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Predicted class probabilities for the held-out partition.
y_pred_proba = model.predict_proba(X=X_test)

In [None]:
import scikitplot as skplt

# ROC plot from the predicted class probabilities on a wide figure.
fig, ax = plt.subplots(figsize=(14, 7))
skplt.metrics.plot_roc(y_test, y_pred_proba, ax=ax)


# Saving model

In [None]:
import joblib

# Serialise the fitted pipeline to disk.
model_path = "model_stroke.joblib"
joblib.dump(model, model_path)

In [None]:
# ROC curve via dython, using the second probability column as the score.
fig, ax = plt.subplots(figsize=(14, 7))
second_column_scores = y_pred_proba[:, 1]
metric_graph(y_test, second_column_scores, metric="roc", ax=ax)

In [None]:
# Shapes of the true labels, the second probability column, and the predictions.
tuple(arr.shape for arr in (y_test, y_pred_proba[:, 1], y_pred))

In [None]:
# Predict for the record labelled 945.
# Selecting with a list keeps a one-row DataFrame with the original column
# dtypes; X.loc[945].to_frame().T would cast every column to object dtype
# before it reaches the pipeline.
model.predict(X.loc[[945]])

In [None]:
# Show the record labelled 945 as a one-row DataFrame.
# A list selector preserves the per-column dtypes, whereas .to_frame().T turns
# every column into object dtype.
X.loc[[945]]

In [None]:
# Display the pipeline object (transformer -> SMOTE -> random forest) as the cell output.
model