# AI: Machine Learning w/ Imbalanced Classes <a class = "anchor" id = "top"></a>

#### - Zan

---

In [14]:
# Import libraries
import os
os.chdir('/Users/operator/Documents')
from modeler import *
import pandas as pd
from imblearn.over_sampling import SMOTE

In [4]:
# Get
df = pd.read_csv('/Users/operator/Documents/data/stroke_data.csv').drop('id', axis = 1).dropna()

In [5]:
df.head(n = 10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
9,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
10,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
11,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


In [6]:
# Encode categorical columns and split for training
cats = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
data = data_refinery(df, cats, 'stroke')

In [7]:
data.xtrain.head(n = 10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
3956,1,36.0,0,0,0,2,1,200.68,25.8,0
574,1,18.0,0,0,0,2,1,112.17,31.7,0
4013,1,44.0,1,0,1,2,0,91.28,26.5,2
3081,0,72.0,0,0,1,3,0,104.04,34.7,1
4201,1,43.0,0,0,1,2,0,207.37,29.5,1
1412,1,82.0,1,0,1,2,0,227.28,33.3,2
972,0,16.0,0,0,0,2,1,87.98,22.4,2
4526,1,60.0,0,0,1,2,1,80.67,33.5,0
3792,0,74.0,0,0,1,2,1,130.37,26.3,0
2275,0,82.0,0,0,1,3,0,117.75,29.8,2


In [15]:
# Oversampling with SMOTE
sm = SMOTE(random_state = 100)

smote_xtrain, smote_ytrain = sm.fit_resample(data.xtrain, data.ytrain.ravel())
smote_xval, smote_yval = sm.fit_resample(data.xval, data.yval.ravel())

In [29]:
# Function to count values
def counter(x, condition):
    
    x1 = [i for i in x if i == condition]
    
    return len(x1)

print('Overall Target Distribution: ')
print(df['stroke'].value_counts())
print()
print('Distribution for Machine Learning: \n')
print('Training- \n')
print(f'Negative: {counter(data.ytrain, 0)}')
print(f'Positive: {counter(data.ytrain, 1)}')
print('\nValidation- \n')
print(f'Negative: {counter(data.yval, 0)}')
print(f'Positive: {counter(data.yval, 1)}')
print('\nSMOTE Target Distribution:\n')
print('Training- \n')
print(f'Negative, {counter(smote_ytrain, 0)}')
print(f'Positive, {counter(smote_ytrain, 1)}')
print('\nValidation- \n')
print(f'Negative, {counter(smote_yval, 0)}')
print(f'Positive, {counter(smote_yval, 1)}')

Overall Target Distribution: 
0    4700
1     209
Name: stroke, dtype: int64

Distribution for Machine Learning: 

Training- 

Negative: 3752
Positive: 175

Validation- 

Negative: 948
Positive: 34

SMOTE Target Distribution:

Training- 

Negative, 3752
Positive, 3752

Validation- 

Negative, 948
Positive, 948


#### Ground-Truth Model:

In [30]:
pipeline = modeling_pipeline(data.xtrain, data.ytrain, data.xval, data.yval)
pipeline.execute_pipeline_flow()

results = pd.DataFrame()

for k, v in pipeline.scores.items():
    
    results = results.append({'model': k,
                              'acc': v[0],
                              'kfold_acc': v[1],
                              'roc': v[2],
                              'precision': v[3],
                              'recall': v[4],
                              'f1': v[5]}, ignore_index = True)

LogisticRegression :

 [[947   1]
 [ 32   2]]

Accuracy Score:  0.97
K-Fold Validation Mean Accuracy: 96.44%
Standard Deviation: 0.68
ROC AUC Score: 0.53
Precision: 0.67
Recall: 0.06
F1: 0.11
--------------------------------------

SVM :

 [[948   0]
 [ 34   0]]

Accuracy Score:  0.97
K-Fold Validation Mean Accuracy: 96.54%
Standard Deviation: 0.49
ROC AUC Score: 0.50
Precision: 0.00
Recall: 0.00
F1: 0.00
--------------------------------------

KNeighbors :

 [[939   9]
 [ 34   0]]

Accuracy Score:  0.96
K-Fold Validation Mean Accuracy: 96.33%
Standard Deviation: 0.67
ROC AUC Score: 0.50
Precision: 0.00
Recall: 0.00
F1: 0.00
--------------------------------------

GaussianNB :

 [[834 114]
 [ 19  15]]

Accuracy Score:  0.86
K-Fold Validation Mean Accuracy: 88.80%
Standard Deviation: 2.46
ROC AUC Score: 0.66
Precision: 0.12
Recall: 0.44
F1: 0.18
--------------------------------------

BernoulliNB :

 [[936  12]
 [ 32   2]]

Accuracy Score:  0.96
K-Fold Validation Mean Accuracy: 95.93%
S

#### SMOTE Model:

In [32]:
smote_pipeline = modeling_pipeline(smote_xtrain, smote_ytrain, smote_xval, smote_yval)
smote_pipeline.execute_pipeline_flow()

smote_results = pd.DataFrame()

for k, v in smote_pipeline.scores.items():
    
    smote_results = smote_results.append({'model': k,
                                          'acc': v[0],
                                          'kfold_acc': v[1],
                                          'roc': v[2],
                                          'precision': v[3],
                                          'recall': v[4],
                                          'f1': v[5]}, ignore_index = True)

LogisticRegression :

 [[743 205]
 [181 767]]

Accuracy Score:  0.8
K-Fold Validation Mean Accuracy: 84.81%
Standard Deviation: 2.73
ROC AUC Score: 0.80
Precision: 0.79
Recall: 0.81
F1: 0.80
--------------------------------------

SVM :

 [[670 278]
 [240 708]]

Accuracy Score:  0.73
K-Fold Validation Mean Accuracy: 80.64%
Standard Deviation: 2.13
ROC AUC Score: 0.73
Precision: 0.72
Recall: 0.75
F1: 0.73
--------------------------------------

KNeighbors :

 [[794 154]
 [411 537]]

Accuracy Score:  0.7
K-Fold Validation Mean Accuracy: 90.24%
Standard Deviation: 1.95
ROC AUC Score: 0.70
Precision: 0.78
Recall: 0.57
F1: 0.66
--------------------------------------

GaussianNB :

 [[699 249]
 [217 731]]

Accuracy Score:  0.75
K-Fold Validation Mean Accuracy: 82.33%
Standard Deviation: 3.58
ROC AUC Score: 0.75
Precision: 0.75
Recall: 0.77
F1: 0.76
--------------------------------------

BernoulliNB :

 [[487 461]
 [156 792]]

Accuracy Score:  0.67
K-Fold Validation Mean Accuracy: 70.99%
Sta

In [36]:
print(results.sort_values('f1', ascending = False)[['model', 'precision', 'recall', 'roc']])

                model  precision    recall       roc
3          GaussianNB   0.116279  0.441176  0.660462
5        DecisionTree   0.121951  0.147059  0.554542
0  LogisticRegression   0.666667  0.058824  0.528884
4         BernoulliNB   0.142857  0.058824  0.523083
6        RandomForest   0.500000  0.029412  0.514178
7             XGBoost   0.111111  0.029412  0.510486
1                 SVM   0.000000  0.000000  0.500000
2          KNeighbors   0.000000  0.000000  0.495253


In [None]:
print(smote_results.sort_values('f1', ascending = False)[['model', 'precision', 'recall', 'roc']])