# Balancing Effect on Models

In [49]:
import joblib
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, precision_recall_curve, auc, roc_auc_score
from scikitplot.metrics import plot_roc, plot_precision_recall, plot_cumulative_gain, plot_lift_curve
from statistics import mean, mode
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import pandas as pd

#  Plots  
import matplotlib.pyplot as plt

# Note:  May need to use class weight


In [3]:
# Loading data
# file_path = Path("../data/myopia.csv")
file_path = Path("../eda/reduced_filtered_df.csv")
df = pd.read_csv(file_path)
df.head(5)

<IPython.core.display.Javascript object>

Unnamed: 0,ACD,LT,VCD,SPORTHR,DADMY,delta_spheq,total_positive_screen,MYOPIC
0,3.702,3.392,15.29,4,1,1.358,8,0
1,3.462,3.514,15.52,14,0,1.929,10,0
2,3.224,3.556,15.36,10,1,2.494,26,0
3,3.186,3.654,15.49,12,1,1.433,16,0
4,3.732,3.584,15.08,12,0,2.022,8,0


In [4]:
# Check dataset balance
df["MYOPIC"].value_counts()

0    323
1     49
Name: MYOPIC, dtype: int64

In [5]:
# Define X,y
label = df["MYOPIC"]
X = df.iloc[:,:-1].copy()
X.head()

Unnamed: 0,ACD,LT,VCD,SPORTHR,DADMY,delta_spheq,total_positive_screen
0,3.702,3.392,15.29,4,1,1.358,8
1,3.462,3.514,15.52,14,0,1.929,10
2,3.224,3.556,15.36,10,1,2.494,26
3,3.186,3.654,15.49,12,1,1.433,16
4,3.732,3.584,15.08,12,0,2.022,8


In [6]:
# Note the use of strategy since the dataset is imbalanced. 
# I am isolaating the X_test and y_test from the preprossing 
X_train, X_test, y_train, y_test = train_test_split(X, label, random_state=42, test_size=0.1, stratify=label)

<IPython.core.display.Javascript object>

In [7]:
y_train.value_counts()

0    290
1     44
Name: MYOPIC, dtype: int64

In [8]:
y_test.value_counts()

0    33
1     5
Name: MYOPIC, dtype: int64

## Classifier with Stratified Cross Validation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from statistics import mean, mode

In [None]:
model = DecisionTreeClassifier()
# evaluate pipeline
# for 5 fold cv, repeat ~100 times; for 10 fold cv, repeat ~50 times
scoring = ('f1', 'recall', 'precision', 'roc_auc')
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1)

In [None]:
print(f'Mode of Scores: {mode(scores["test_roc_auc"])}')
print("--------"*10)
print('Mean f1: %.3f' % mean(scores['test_f1']))
print('Mean recall: %.3f' % mean(scores['test_recall']))
print('Mean precision: %.3f' % mean(scores['test_precision']))
print('Mean ROC AUC: %.3f' % mean(scores['test_roc_auc']))

In [None]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

## Pipeline with SMOTE and Stratified Cross Validation
- Good explanation of stratified sampling - https://medium.com/sfu-cspmp/surviving-in-a-random-forest-with-imbalanced-datasets-b98b963d52eb

In [None]:
scoring = ('f1', 'recall', 'precision', 'roc_auc')
steps = [('over', SMOTE()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)

# evaluate pipeline
# Note for imbalanced classification don't use k-fold cross-validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_validate(pipeline, X_train, y_train, scoring=scoring, cv=cv, n_jobs=-1)

In [None]:
print('Mean f1: %.3f' % mean(scores['test_f1']))
print('Mean recall: %.3f' % mean(scores['test_recall']))
print('Mean precision: %.3f' % mean(scores['test_precision']))
print('Mean ROC AUC: %.3f' % mean(scores['test_roc_auc']))

Mean f1: 0.305
Mean recall: 0.383
Mean precision: 0.279
Mean ROC AUC: 0.609


In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90        24
           1       0.00      0.00      0.00         3

    accuracy                           0.81        27
   macro avg       0.44      0.46      0.45        27
weighted avg       0.78      0.81      0.80        27



## Pipeline with Stratified Cross Validation and Random Forest (imbalanced)

In [43]:
# may need to use class weight to balance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

In [59]:
irfc = RandomForestClassifier(n_estimators=150, random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [60]:
# Printout of the folds
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
for train_index, test_index in cv.split(X_train_np, y_train_np):
    
    # select rows
    train_X, test_X = X_train_np[train_index], X_train_np[test_index]
    train_y, test_y = y_train_np[train_index], y_train_np[test_index]
    # summarize train and test composition
    train_0, train_1 = len(train_y[train_y==0]), len(train_y[train_y==1])
    test_0, test_1 = len(test_y[test_y==0]), len(test_y[test_y==1])
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=189, 1=27, Test: 0=20, 1=4
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=189, 1=27, Test: 0=20, 1=4
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>Train: 0=188, 1=28, Test: 0=21, 1=3
>

In [61]:
scoring = ('f1', 'recall', 'precision', 'roc_auc')
steps = [('model', irfc)]
pipeline = Pipeline(steps=steps)

In [62]:
#Evaluate irfc model
'''
When using X_train, y_train, the following warning occurs:
UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to 
no predicted samples. Use `zero_division` parameter to control this behavior

This is because some of the models don't classify any minority class.
'''
scores = cross_validate(pipeline, X_train, y_train, scoring=scoring, cv=cv)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
print('Mean f1: %.3f' % mean(scores['test_f1']))
print('Mean recall: %.3f' % mean(scores['test_recall']))
print('Mean precision: %.3f' % mean(scores['test_precision']))
print('AUC: %.3f' % mean(scores['test_roc_auc']))
print('-----'*20)
print('Precision Results for each fold')
scores['test_precision']

Mean f1: 0.162
Mean recall: 0.122
Mean precision: 0.272
AUC: 0.780
----------------------------------------------------------------------------------------------------
Precision Results for each fold


array([0.5       , 0.        , 0.        , 0.33333333, 0.        ,
       0.5       , 0.        , 0.        , 1.        , 0.        ,
       0.        , 1.        , 1.        , 0.33333333, 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.5       ,
       0.        , 0.        , 0.        , 1.        , 0.        ])

In [64]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90        24
           1       0.00      0.00      0.00         3

    accuracy                           0.81        27
   macro avg       0.44      0.46      0.45        27
weighted avg       0.78      0.81      0.80        27



## Pipeline with Balanced Random Forest (imbalanced)

In [51]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [52]:
brfc = BalancedRandomForestClassifier(n_estimators=150, random_state=2)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=2)
scoring1 = ('f1', 'recall', 'precision', 'roc_auc')

steps1 = [('model1', brfc)]
pipeline = Pipeline(steps=steps1)

In [53]:
X_train.values

array([[ 3.43600011,  3.54200005, 16.18000031, ...,  0.        ,
         1.64700002, 11.        ],
       [ 3.31599998,  3.53999996, 15.44999981, ...,  1.        ,
         1.602     , 15.        ],
       [ 3.69199991,  3.37800002, 15.68000031, ...,  1.        ,
         1.588     ,  7.        ],
       ...,
       [ 3.67600012,  3.51399994, 15.05000019, ...,  0.        ,
         1.977     , 16.        ],
       [ 3.99799991,  3.4059999 , 15.67000008, ...,  1.        ,
         1.12000001, 12.        ],
       [ 3.78399992,  3.77999997, 14.85000038, ...,  0.        ,
         2.09000003, 10.        ]])

In [54]:
y_train.shape

(240,)

In [55]:
#Evaluate SRF model
# Note needed to upgrade imbalance-learn to at least 0.9.1 and scikit-learn to at least 1.1.1
scores = cross_validate(pipeline, X_train.values, y_train.values, cv=cv, scoring=scoring1)
#brfc.fit(X_train.values.reshape(-1, 1), y_train)

In [56]:
print('Mean f1: %.3f' % mean(scores['test_f1']))
print('Mean recall: %.3f' % mean(scores['test_recall']))
print('Mean precision: %.3f' % mean(scores['test_precision']))
print('Mean AUC: %.3f' % mean(scores['test_roc_auc']))

Mean f1: 0.432
Mean recall: 0.783
Mean precision: 0.303
Mean AUC: 0.814


In [57]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [58]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82        24
           1       0.14      0.33      0.20         3

    accuracy                           0.70        27
   macro avg       0.52      0.54      0.51        27
weighted avg       0.82      0.70      0.75        27

