# Stroke

In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

## 1. Load the data

In [None]:
# Dataset: Kaggle competition file, obtained via Google Classroom.
csv_path = "../data/brain_stroke.csv"
df = pd.read_csv(csv_path)

In [None]:
# Class distribution of the target before balancing.
df["stroke"].value_counts()

### Balance the class

In [None]:
# Balance the classes by undersampling stroke == 0 down to the number of
# stroke == 1 rows. The sample size is derived from the data rather than
# hard-coded, so the classes stay balanced if the CSV changes.

#1. build one boolean mask per class
cond0 = df.stroke == 0
cond1 = df.stroke == 1

#2. keep every stroke == 1 row and draw an equally sized random sample of
#   stroke == 0 rows (capped at the number of available stroke == 0 rows)
df_1 = df[cond1]
n_per_class = min(len(df_1), int(cond0.sum()))
df_0 = df[cond0].sample(n=n_per_class, random_state=999)

#3. concat these two dfs
df = pd.concat([df_0, df_1])

df.stroke.value_counts()


### Label encoding

This is a useful step to do before EDA, so that categories are turned into numbers.


In [None]:
# Show all column names of the balanced frame.
df.columns

In [None]:
# Show the dtype of every column (object columns are the ones that still need encoding).
df.dtypes

In [None]:
#gender = 2
#ever_married = 2
#work_type = got nan!!
#Residence_type = got 2 + nan
#smoking_status = 4

In [None]:
# Distinct raw values of the three columns that will be label-encoded.
tuple(df[col].unique() for col in ("gender", "ever_married", "Residence_type"))

In [None]:
# Frequency of each Residence_type value before encoding.
df["Residence_type"].value_counts()

In [None]:
# Label-encode the categorical columns gender, ever_married and Residence_type
# so that they become integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column: a single LabelEncoder overwrites its classes_ on
# every fit, so the category <-> integer mapping of the earlier columns could
# not be recovered afterwards (e.g. to encode new rows at inference time).
label_encoders = {}
for col in ['gender', 'ever_married', 'Residence_type']:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Backward compatibility: `le` is still the encoder fitted on Residence_type,
# which is the state it ended up in before.
le = label_encoders['Residence_type']

In [None]:
# Distinct values of the three columns after label encoding.
# TODO: remember to clean up the 2 for Residence_type!
tuple(df[col].unique() for col in ("gender", "ever_married", "Residence_type"))

In [None]:
# Frequency of each Residence_type code after encoding.
df["Residence_type"].value_counts()

### One hot encoding

In [None]:
# Distinct work_type categories that will be one-hot encoded.
df["work_type"].unique()

In [None]:
# Distinct smoking_status categories that will be one-hot encoded.
df["smoking_status"].unique()

In [None]:
# Replace each multi-category column by one indicator column per category.
one_hot_cols = ['work_type', 'smoking_status']
df = pd.get_dummies(df, columns=one_hot_cols)

In [None]:
# Preview the first rows after label and one-hot encoding.
df.head()

## 2. EDA (Exploratory Data Analysis)

In [None]:
# Column names after encoding (includes the new one-hot indicator columns).
df.columns

In [None]:
import seaborn as sns

#1. Split the columns into a numerical and a categorical group by dtype:
#   int64/float64 columns count as numerical, everything else as categorical.
numeric_dtypes = ['int64', 'float64']
num_col = df.select_dtypes(include=numeric_dtypes)
cat_col = df.select_dtypes(exclude=numeric_dtypes)
num_col.columns, cat_col.columns

### Bar plots

In [None]:
#2. for numerical types, let's draw a bar plot of each column against stroke
# for col in num_col.columns:
#     plt.figure(figsize=(3, 2))
#     if col != 'stroke':
        # sns.barplot(x = df['stroke'], y = df[col])
        # plt.show()
        
#so age, hypertension, heart_disease, ever_married, avg_glucose_level, 

### Countplot / Histogram

In [None]:
#3. for categorical types, you may want to use a countplot split by stroke
# for col in cat_col.columns:
#     plt.figure(figsize=(3, 2))
#     sns.countplot(x = df[col], hue = df['stroke'])
#     #similar to hue in scatterplot
#     plt.show()
    # sns.countplot()  #because categorical, you need to count, not simply use the magnitude...so cannot use bar...
    
#work_type_self_employed, children, unknown, formerly smoked, 

### Correlation matrix

In [None]:
# plt.figure(figsize=(10, 12))
# sns.heatmap(df.corr(), annot=True)  #this is only for numeric values....
#same trend as plots above

### Predictive Power Score

In [None]:
# import ppscore as pps

# # before using pps, work on a copy of df
# dfcopy = df.copy()

# #this needs some minor preprocessing because seaborn.heatmap unfortunately does not accept tidy data
# matrix_df = pps.matrix(dfcopy)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')

# #plot
# plt.figure(figsize = (15,8))
# sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)

# same trend as above

## 3. Feature Engineering

Skipped

## 4. Feature Selection

Since our dataset is already split at the dataset level, we do not need to hurry to select the features. We can preprocess first and select later.

**Forward selection**: the process of starting with one feature and adding one feature at a time until the performance no longer improves by a threshold you choose.

In [None]:
# List the available columns before choosing the feature subset.
df.columns

In [None]:
# Features picked from the EDA notes:
#   from the bar plots:   age, hypertension, heart_disease, ever_married, avg_glucose_level
#   from the count plots: work_type_self_employed, children, unknown, formerly smoked
feature_cols = [
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'avg_glucose_level',
    'work_type_Self-employed',
    'work_type_children',
    'smoking_status_formerly smoked',
]
target_cols = ['stroke']

X = df[feature_cols]
y = df[target_cols]

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Shape of the target as currently stored in y.
y.shape

In [None]:
# Make the target a 1-D Series for scikit-learn by selecting the column
# directly. Unlike DataFrame.stack(), this keeps the original row index
# (no extra index level) and the cell can be re-run safely.
y = df['stroke']

In [None]:
# Shape of the target after flattening.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 5. Preprocessing

### Null values

In [None]:
#1. check the missing values in BOTH the training and the test features,
#   side by side (one row per feature, one column per split)
pd.DataFrame({'train': X_train.isna().sum(), 'test': X_test.isna().sum()})

In [None]:
# Impute missing avg_glucose_level values with the median of the TRAINING set.
# The same training median is applied to the test set so that no information
# from the test data leaks into preprocessing.
glucose_median = X_train['avg_glucose_level'].median()

# .assign returns new frames, which avoids the chained `fillna(inplace=True)`
# on a column selection (that pattern is not guaranteed to update the frame).
X_train = X_train.assign(avg_glucose_level=X_train['avg_glucose_level'].fillna(glucose_median))
X_test = X_test.assign(avg_glucose_level=X_test['avg_glucose_level'].fillna(glucose_median))

In [None]:
# Re-check the missing values in both splits after imputation.
pd.DataFrame({'train': X_train.isna().sum(), 'test': X_test.isna().sum()})

In [None]:
# Number of missing labels in the test target.
y_test.isna().sum()

In [None]:
# Number of missing labels in the training target.
y_train.isna().sum()

### Scaling

In [None]:
# Preview the training features before scaling.
X_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize; the remaining features are left untouched.
col_names = ['age', 'avg_glucose_level']

# Work on explicit copies so the column assignments below never write into a
# view of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# A single scaler is fitted on both columns at once. StandardScaler scales
# each column independently, so the values are the same as with one fit per
# column, but `sc` now remembers the statistics of BOTH columns and can be
# reused on new data.
sc = StandardScaler()
X_train[col_names] = sc.fit_transform(X_train[col_names])  # fit on train only
X_test[col_names] = sc.transform(X_test[col_names])        # reuse the train statistics

## 6. Modeling

In [None]:
#3. perform cross validation with a selected set of models, to scope down which model is among the best
from sklearn.linear_model import LogisticRegression  #drawing a line based on linear regression but used for classification
from sklearn.naive_bayes import GaussianNB  #drawing a line based on probability
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier  #using trees to classify
from sklearn.svm import SVC  #drawing a line based on maximum distance
from sklearn.ensemble import GradientBoostingClassifier  #<<-------is the MOST complex

# Candidate models; fixed seeds keep the comparison reproducible.
lr = LogisticRegression(random_state=999)
rf = RandomForestClassifier(random_state=999)
sv = SVC(random_state=999)

models = [lr, rf, sv]

#3.2 perform cross validation using KFold
from sklearn.model_selection import KFold, cross_val_score

# 5 shuffled folds; the same splitter object is reused for every model so
# they are all evaluated on identical folds.
kfold = KFold(n_splits = 5, shuffle = True, random_state=999)

for model in models:
    score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')  #f1, recall, precision, accuracy
    # Prefix each line with the model's class name so the results can be told apart.
    print(type(model).__name__, "- Scores: ", score, "- Scores mean: ", score.mean(), "- Scores std (lower better): ", score.std())  #out of 1 ; 1 means perfect accuracy

In [None]:
#4. grid search to find the best version of that model
#reminder: grid search is basically cross validation repeated for every
#hyper-parameter combination of the same model

from sklearn.model_selection import GridSearchCV

model = LogisticRegression(random_state=999)  #<----this is the model I choose, after cross validation

param_grid = dict()
param_grid['solver'] = ['newton-cg', 'lbfgs', 'liblinear']  #this is listed in the sklearn website

#add more parameters here
#param_grid[parameter] = list of parameters to search

#refit means it will pick the best model, and fit again, so it means grid is already the best model after this line
grid = GridSearchCV(model, param_grid, scoring="accuracy", cv=kfold, refit=True, return_train_score=True)
#scoring = f1, recall, precision, accuracy

#fit the grid, which will basically do cross validation across all combinations, here we only have 3 comb
grid.fit(X_train, y_train)  #remember to use only training set here....

#print the best parameters and accuracy
#this score is the cross-validation score, i.e. the mean accuracy on the validation folds
print("Best params: ", grid.best_params_)
print("Best CV accuracy: ", grid.best_score_)

#compact per-combination summary instead of dumping the raw cv_results_ dict
cv_summary = pd.DataFrame(grid.cv_results_)[
    ['params', 'mean_train_score', 'mean_test_score', 'std_test_score', 'rank_test_score']
]
cv_summary

#this score is cross-validation score, basically the accuracy/precision/etc on the validation set


## 7. Testing

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out test set with the refitted best model and
# summarize precision / recall / f1 per class.
pred_y = grid.predict(X_test)
report = classification_report(y_test, pred_y)
print(report)

## 8. Analysis: Feature Importance

- Basically understanding which features are important for prediction
- Different algorithms have different ways of measuring feature importance
- For Logistic Regression, similar to Linear Regression, you can look at the coefficients/weights/slopes

In [None]:
# The model refitted with the best hyper-parameter combination found by the grid search.
grid.best_estimator_

In [None]:
# Learned coefficients of the best model.
grid.best_estimator_.coef_

In [None]:
# Pair every feature with its logistic-regression coefficient.
# The names are taken from X_train (the frame the model was fitted on) rather
# than from a second hard-coded list, so labels and coefficients cannot drift
# apart when the feature selection changes.
feature_importance = pd.DataFrame({
    'features': list(X_train.columns),
    'importance': grid.best_estimator_.coef_[0],  # coef_ has one row; take it as a 1-D array
})

# Sort ascending so the largest coefficient ends up at the top of the horizontal bar chart.
feature_importance = feature_importance.sort_values(by = ['importance'], ascending=True)

feature_importance.plot.barh(x='features', y='importance')

## 9. Inference

In [None]:
#do by yourself!!