# Stroke Risk Assessment (Model Training)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel,VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,classification_report

In [2]:
%%capture

%run '../notebooks/1. EDA-stroke_risk_assessment.ipynb'

In [3]:
df = pd.read_csv("../data/1.0EDA_CleanData.csv")

df.head(2)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1


####  Split the dataset with 20% as the test set with random state 42

In [4]:
X = df.drop('stroke',axis=1)
y = df['stroke']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

#for cross validation
kfold = KFold(n_splits=5 , random_state= 1024, shuffle=True)

#### Column Transformation

#### for the numerical variables we will be using Robust Scaler as discussed in `./1. EDA-stroke_risk_assessment.ipynb`

In [6]:
categorical_data = X_train.select_dtypes(include=['object'])  
numeric_data = X_train.select_dtypes(include=['float64', 'int64'])  

In [7]:
numeric_transformer = RobustScaler()
categorical_transformer = OneHotEncoder(drop='if_binary')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", categorical_transformer, categorical_data.columns),
         ("RobustScaler", numeric_transformer, numeric_data.columns),        
    ]
)

In [8]:
#feature selection
feature_selection_methods = [
    ("RandomForest_Feat", SelectFromModel(estimator=RandomForestClassifier(n_estimators=10,max_depth=5,random_state=1024))),
    ("VarianceThres", VarianceThreshold(threshold=0.1))
]

#classifers
classifiers = [
    ("RandomForest", RandomForestClassifier(n_estimators=10,max_depth=5,random_state=1024)),
    ("NavieBay", GaussianNB()),
    ("SVC",SVC(kernel="rbf", gamma=0.5,C=5,max_iter=1000,random_state=42)),
    ("Adaboost",AdaBoostClassifier(learning_rate=0.1))
]

In [9]:
## Creating pipeline
pipelines :list[tuple[str,Pipeline]] =[]

for feat_sel_name, feat_sel_method in feature_selection_methods:
    for clf_name, clf in classifiers:
        steps = [("preprocessor", preprocessor), ("feat_sel", feat_sel_method), ("clf", clf)]
        pipeline = Pipeline(steps=steps)
        pipelines.append((feat_sel_name + "+" + clf_name, pipeline))

In [10]:
from sklearn.model_selection import cross_val_score

model_name = []
model_score = []

for name, pipeline in pipelines:
    score = cross_val_score(pipeline,X_train,y_train,cv=kfold)

    model_name.append(name)
    model_score.append(score.mean())



In [11]:
models = pd.DataFrame({"name":model_name,"score":model_score})

models

Unnamed: 0,name,score
0,RandomForest_Feat+RandomForest,0.953753
1,RandomForest_Feat+NavieBay,0.914603
2,RandomForest_Feat+SVC,0.954242
3,RandomForest_Feat+Adaboost,0.954242
4,VarianceThres+RandomForest,0.954242
5,VarianceThres+NavieBay,0.724807
6,VarianceThres+SVC,0.948615
7,VarianceThres+Adaboost,0.954242


In [12]:
best_model_name = models[models.score == max(models.score)].iloc[0,0]

best_model_name

'RandomForest_Feat+SVC'

In [13]:
selected_model = None

for name,pipeline in pipelines:
    if best_model_name == name:
        selected_model = pipeline

In [14]:
selected_model.fit(X_train,y_train)



## evaluate it using the test set.

In [15]:
y_predic = selected_model.predict(X_test)
accu_score = accuracy_score(y_test,y_predic)

print(f'The score on test set is {accu_score}')

The score on test set is 0.9393346379647749


### Classification Report 

In [16]:
print(classification_report(y_test, y_predic))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
