In [1]:
from collections import Counter
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [2]:
# Read the myopia CSV into a DataFrame and preview its first five rows
file_path = Path("../data/myopia.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
# Check dataset balance: count the rows in each MYOPIC class
df["MYOPIC"].value_counts()

0    537
1     81
Name: MYOPIC, dtype: int64

In [4]:
# Separate the target column from the feature columns
label = df.loc[:, "MYOPIC"]
# Features are every column except the last one (MYOPIC); copy so later edits
# to X cannot touch df
X = df.iloc[:, :-1].copy()
X.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [5]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    label,
    test_size=0.1,
    random_state=42,
)

<IPython.core.display.Javascript object>

In [6]:
# Class counts in the held-out test labels
y_test.value_counts()

0    54
1     8
Name: MYOPIC, dtype: int64

## Balancing Data - RandomOverSampler

In [None]:
# Over-sample the training split with a seeded RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Compare the class counts before and after random over-sampling
original_counts = Counter(y_train)
resampled_counts = Counter(y_ros)
print('Original dataset shape', original_counts)
print('Resample dataset shape', resampled_counts)

## Balancing Data - SMOTE
- synthetic minority over-sampling technique

In [None]:
# Over-sample the minority class of the training split with SMOTE.
# Only the training rows are resampled so the held-out test rows stay untouched,
# and the result gets its own names so the original feature frame X is not
# overwritten. The sampler is seeded so the synthetic rows are reproducible.
oversample = SMOTE(random_state=42)
X_smote, y_smote = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
Counter(y_smote)

## Balancing Data - SMOTE with Undersampling

In [None]:
# Two-stage resampling: first grow the minority class with SMOTE, then shrink
# the majority class with random under-sampling.
# A float sampling_strategy is the desired minority/majority ratio after the step.
# The training split already has a ratio of about 0.15 (73 vs 483 rows), so the
# SMOTE target must be higher than that; a lower value (such as 0.1) would ask
# SMOTE to remove minority samples and raises a ValueError.
over = SMOTE(sampling_strategy=0.3, random_state=42)
# Under-sample the majority class until minority/majority reaches 0.5
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

In [None]:
# Chain the samplers in order: over-sampling ('o') first, then under-sampling ('u')
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)

In [None]:
# Run the over-/under-sampling pipeline on the training split.
# NOTE(review): this rebinds X (previously the full feature frame) and y to the
# resampled training data; later cells read these names.
X, y = pipeline.fit_resample(X_train, y_train)

In [None]:
# Summarize the class distribution of the resampled labels
counter = Counter(y)
print(counter)

## Balancing Data - Sampling Low Density Areas

In [None]:
# ADASYN over-sampler; seeded so the generated samples are reproducible
oversample = ADASYN(random_state=42)

In [None]:
# Resample the original training split rather than whatever X and y currently
# hold, so the cell gives the same result every time it is run and does not
# stack ADASYN on top of an earlier resampling step.
X_adasyn, y_adasyn = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_adasyn)
print(counter)

## Balancing Data - Oversampling Misclassified

In [None]:
# Borderline-SMOTE over-sampler; seeded so the generated samples are reproducible
oversample = BorderlineSMOTE(random_state=42)

In [None]:
# Resample the training split with the sampler held in `oversample`;
# the result rebinds X and y
X, y = oversample.fit_resample(X_train, y_train)

In [None]:
# Summarize the class distribution of the resampled labels
counter = Counter(y)
print(counter)

### Usage
Often used to identify hard-to-classify data points. The models that work well with it are KNN and SVM.

## Model with Cross Validation

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from statistics import mean

In [13]:
# Baseline: a single decision tree evaluated on the training split without any
# resampling. The tree is seeded so the reported score is reproducible.
model = DecisionTreeClassifier(random_state=42)
# evaluate the model with repeated stratified 10-fold cross-validation (3 repeats)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.635


## Pipeline with Cross Validation
- Good explanation of stratified sampling - https://medium.com/sfu-cspmp/surviving-in-a-random-forest-with-imbalanced-datasets-b98b963d52eb

In [15]:
# SMOTE followed by a decision tree, bundled in one pipeline so both steps are
# cross-validated together. Both steps are seeded so the score is reproducible.
steps = [
    ('over', SMOTE(random_state=42)),
    ('model', DecisionTreeClassifier(random_state=42)),
]
pipeline = Pipeline(steps=steps)
# evaluate pipeline with repeated stratified 10-fold cross-validation (3 repeats)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.667


## Pipeline with Random Forest (imbalanced)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate

In [40]:
# Plain random forest (no resampling) wrapped in a one-step pipeline
irfc = RandomForestClassifier(
    n_estimators=150,
    random_state=0,
)
# Repeated stratified 10-fold CV, 3 repeats
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)
# Metrics collected by cross_validate
scoring = ('f1', 'recall', 'precision')

steps = [
    ('model', irfc),
]
pipeline = Pipeline(steps=steps)

In [41]:
# Cross-validate the random forest pipeline on the training split,
# collecting the metrics listed in `scoring`
scores = cross_validate(pipeline, X_train, y_train, scoring=scoring, cv=cv)

In [42]:
from statistics import mean

# Average each metric over all cross-validation folds, then report
mean_f1 = mean(scores['test_f1'])
mean_recall = mean(scores['test_recall'])
mean_precision = mean(scores['test_precision'])

print('Mean f1: %.3f' % mean_f1)
print('Mean recall: %.3f' % mean_recall)
print('Mean precision: %.3f' % mean_precision)

Mean f1: 0.417
Mean recall: 0.310
Mean precision: 0.768


## Pipeline with Balanced Random Forest

In [8]:
import joblib
from imblearn.ensemble import BalancedRandomForestClassifier

In [11]:
# Balanced random forest from imbalanced-learn, wrapped in a one-step pipeline
brfc = BalancedRandomForestClassifier(
    n_estimators=150,
    random_state=2,
)
# Repeated stratified 10-fold CV, 3 repeats
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=2,
)
# Metrics collected by cross_validate
scoring1 = ('f1', 'recall', 'precision')

steps1 = [
    ('model1', brfc),
]
pipeline = Pipeline(steps=steps1)

In [12]:
# Inspect the training features as a raw NumPy array (the form passed to
# cross_validate for the balanced random forest)
X_train.values

array([[ 6.        ,  1.38399994, 22.63999939, ..., 24.        ,
         1.        ,  0.        ],
       [ 6.        ,  0.97000003, 22.40999985, ..., 19.        ,
         1.        ,  1.        ],
       [ 6.        ,  0.67799997, 22.39999962, ..., 37.        ,
         1.        ,  0.        ],
       ...,
       [ 6.        ,  1.14600003, 23.03000069, ..., 19.        ,
         0.        ,  1.        ],
       [ 8.        ,  0.94400001, 22.23999977, ..., 28.        ,
         0.        ,  0.        ],
       [ 6.        ,  0.62      , 22.60000038, ..., 45.        ,
         0.        ,  1.        ]])

In [13]:
# Number of training labels
y_train.shape

(556,)

In [19]:
# Cross-validate the balanced random forest pipeline on the training split.
# The features and labels are passed as plain NumPy arrays (.values).
# Note: needed to upgrade imbalanced-learn to at least 0.9.1 and scikit-learn to at least 1.1.1
scores = cross_validate(pipeline, X_train.values, y_train.values, cv=cv, scoring=scoring1)

In [20]:
# Average each metric over all cross-validation folds, then report
mean_f1 = mean(scores['test_f1'])
mean_recall = mean(scores['test_recall'])
mean_precision = mean(scores['test_precision'])

print('Mean f1: %.3f' % mean_f1)
print('Mean recall: %.3f' % mean_recall)
print('Mean precision: %.3f' % mean_precision)

Mean f1: 0.457
Mean recall: 0.789
Mean precision: 0.324
