Testing the behaviour of SMOTE oversampling in a cross-validation scenario.

In [183]:
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [184]:
# Build a synthetic, imbalanced binary classification problem.
# weights=[0.8] puts roughly 80% of the samples in class 0; the share of the
# remaining class is inferred by make_classification.
# random_state pins the generated data so that reruns reproduce the same scores.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=8,
    n_repeated=0,
    n_classes=2,
    weights=[0.8],
    random_state=42,
)

In [209]:
# Baseline "guessing" strategies that the real model is compared against.
# Always predicts class 0.
dummy_constant = DummyClassifier(strategy='constant', constant=0)
# The two baselines below draw their predictions at random; they are seeded so
# that their scores are repeatable across runs.
dummy_uniform = DummyClassifier(strategy='uniform', random_state=42)
dummy_strat = DummyClassifier(strategy='stratified', random_state=42)

# The actual model under test: L2-regularised logistic regression.
clf = LogisticRegression(penalty='l2', solver='lbfgs')

In [190]:
# 10-fold stratified cross-validation, shared by every grid search below.
kf = StratifiedKFold(n_splits=10)

# SMOTE generates synthetic samples using random draws; seed it so the
# oversampled training folds are the same on every run.
sm = SMOTE(random_state=42)

# Every estimator gets two pipelines: the estimator alone, and the estimator
# preceded by the SMOTE step.
# NOTE(review): all three dummy pipelines label their step 'dum_strat', even for
# the constant and uniform strategies. The label is kept unchanged so existing
# step lookups keep working; consider renaming it per strategy.
pipeline_constant = Pipeline([('dum_strat', dummy_constant)])
pipeline_constant_oversample = Pipeline([('sm', sm), ('dum_strat', dummy_constant)])

pipeline_uniform = Pipeline([('dum_strat', dummy_uniform)])
pipeline_uniform_oversample = Pipeline([('sm', sm), ('dum_strat', dummy_uniform)])

pipeline_strat = Pipeline([('dum_strat', dummy_strat)])
pipeline_strat_oversample = Pipeline([('sm', sm), ('dum_strat', dummy_strat)])

pipeline_clf = Pipeline([('clf', clf)])
pipeline_clf_oversample = Pipeline([('sm', sm), ('clf', clf)])

In [191]:
# Cross-validate the stratified-guess baseline without oversampling.
# The parameter grid is empty, so exactly one candidate is evaluated: the
# pipeline as configured. For single-label problems micro-averaged F1 equals
# plain accuracy.
grid = GridSearchCV(
    pipeline_strat,
    {},
    cv=kf,
    scoring='f1_micro',
    return_train_score=True,
).fit(X, y)

# Per-fold and aggregated train/test scores.
grid.cv_results_

{'mean_fit_time': array([0.00136323]),
 'std_fit_time': array([0.00061661]),
 'mean_score_time': array([0.0013562]),
 'std_score_time': array([0.0003798]),
 'params': [{}],
 'split0_test_score': array([0.6953047]),
 'split1_test_score': array([0.67932068]),
 'split2_test_score': array([0.66533467]),
 'split3_test_score': array([0.68331668]),
 'split4_test_score': array([0.672]),
 'split5_test_score': array([0.695]),
 'split6_test_score': array([0.68568569]),
 'split7_test_score': array([0.65965966]),
 'split8_test_score': array([0.67467467]),
 'split9_test_score': array([0.68668669]),
 'mean_test_score': array([0.6797]),
 'std_test_score': array([0.01126027]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.67818647]),
 'split1_train_score': array([0.68007556]),
 'split2_train_score': array([0.67774197]),
 'split3_train_score': array([0.67540838]),
 'split4_train_score': array([0.67422222]),
 'split5_train_score': array([0.67655556]),
 'split6_train_score':

In [181]:
# Same evaluation as for the plain stratified baseline, but with the SMOTE step
# placed in front of the dummy classifier.
# The parameter grid is empty, so exactly one candidate is evaluated.
grid = GridSearchCV(
    pipeline_strat_oversample,
    {},
    cv=kf,
    scoring='f1_micro',
    return_train_score=True,
).fit(X, y)

# Per-fold and aggregated train/test scores.
grid.cv_results_

{'mean_fit_time': array([0.07127502]),
 'std_fit_time': array([0.00729249]),
 'mean_score_time': array([0.0012974]),
 'std_score_time': array([0.00022137]),
 'params': [{}],
 'split0_test_score': array([0.4985015]),
 'split1_test_score': array([0.47952048]),
 'split2_test_score': array([0.488]),
 'split3_test_score': array([0.516]),
 'split4_test_score': array([0.508]),
 'split5_test_score': array([0.485]),
 'split6_test_score': array([0.497]),
 'split7_test_score': array([0.473]),
 'split8_test_score': array([0.5035035]),
 'split9_test_score': array([0.51151151]),
 'mean_test_score': array([0.496]),
 'std_test_score': array([0.0135415]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.49916657]),
 'split1_train_score': array([0.49716635]),
 'split2_train_score': array([0.49833333]),
 'split3_train_score': array([0.49444444]),
 'split4_train_score': array([0.50188889]),
 'split5_train_score': array([0.48788889]),
 'split6_train_score': array([0.49844444]),


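Why are the training and testing scores the same when oversampling? The sampler in an imblearn `Pipeline` is applied only during `fit`, so both scores are computed on the original (non-resampled) folds; a stratified dummy fitted on SMOTE-balanced data guesses each class about half of the time and therefore scores roughly 0.5 on either fold.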

In [170]:
# Manual check outside of GridSearchCV: fit the stratified-guess baseline on a
# SMOTE-resampled training split and score it on that same resampled split.
# Both the split and SMOTE are seeded so the printed score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The resampled data gets its own names so the original split stays available
# and is not silently replaced.
X_train_res, y_train_res = SMOTE(random_state=42).fit_resample(X_train, y_train)
y_pred = dummy_strat.fit(X_train_res, y_train_res).predict(X_train_res)

# Micro-averaged F1 of the baseline on the resampled training data.
f1_score(y_train_res, y_pred, average='micro')


0.5031411967959792

In [208]:
# Cross-validate the logistic regression without oversampling, scored by
# precision. The parameter grid is empty, so exactly one candidate is evaluated.
grid = GridSearchCV(
    pipeline_clf,
    {},
    cv=kf,
    scoring='precision',
    return_train_score=True,
).fit(X, y)

# Per-fold and aggregated train/test scores.
grid.cv_results_



{'mean_fit_time': array([0.95241671]),
 'std_fit_time': array([0.37765972]),
 'mean_score_time': array([0.00237074]),
 'std_score_time': array([0.00028251]),
 'params': [{}],
 'split0_test_score': array([0.9039548]),
 'split1_test_score': array([0.87234043]),
 'split2_test_score': array([0.89444444]),
 'split3_test_score': array([0.828125]),
 'split4_test_score': array([0.8556701]),
 'split5_test_score': array([0.78787879]),
 'split6_test_score': array([0.82105263]),
 'split7_test_score': array([0.82439024]),
 'split8_test_score': array([0.83248731]),
 'split9_test_score': array([0.82673267]),
 'mean_test_score': array([0.84472706]),
 'std_test_score': array([0.03438399]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.84006928]),
 'split1_train_score': array([0.84219653]),
 'split2_train_score': array([0.84268363]),
 'split3_train_score': array([0.84677419]),
 'split4_train_score': array([0.8433526]),
 'split5_train_score': array([0.84761357]),
 'split6_t

In [197]:
# Cross-validate the logistic regression with the SMOTE step in front of it,
# scored by precision. The parameter grid is empty, so exactly one candidate is
# evaluated.
grid = GridSearchCV(
    pipeline_clf_oversample,
    {},
    cv=kf,
    scoring='precision',
    return_train_score=True,
).fit(X, y)

# Per-fold and aggregated train/test scores.
grid.cv_results_



{'mean_fit_time': array([0.16963615]),
 'std_fit_time': array([0.02870385]),
 'mean_score_time': array([0.0025419]),
 'std_score_time': array([0.00075992]),
 'params': [{}],
 'split0_test_score': array([0.7]),
 'split1_test_score': array([0.71538462]),
 'split2_test_score': array([0.76348548]),
 'split3_test_score': array([0.71984436]),
 'split4_test_score': array([0.73106061]),
 'split5_test_score': array([0.65156794]),
 'split6_test_score': array([0.72834646]),
 'split7_test_score': array([0.7372549]),
 'split8_test_score': array([0.736]),
 'split9_test_score': array([0.70188679]),
 'mean_test_score': array([0.71848264]),
 'std_test_score': array([0.02836797]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.71665957]),
 'split1_train_score': array([0.71544024]),
 'split2_train_score': array([0.71611253]),
 'split3_train_score': array([0.71887035]),
 'split4_train_score': array([0.71307301]),
 'split5_train_score': array([0.72167593]),
 'split6_train_scor

In [204]:
# Hold-out evaluation of the logistic regression on a single 80/20 split.
# The split is seeded so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set to True to oversample the training split with SMOTE before fitting.
# Only the training split is resampled; the test split is left untouched.
oversample = False
if oversample:
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

y_pred = clf.fit(X_train, y_train).predict(X_test)

# Macro-averaged F1 on the held-out test split.
f1_score(y_test, y_pred, average='macro')

0.8869925306706916

TypeError: 'StratifiedKFold' object is not iterable