Testing the effect of using SMOTE oversampling in a cross-validation scenario.

In [144]:
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [127]:
# Build a synthetic, imbalanced binary classification problem:
# 10,000 samples, 10 features (8 informative, none repeated), with class 0
# given a weight of 0.8. A fixed random_state makes the data set identical on
# every kernel restart, so the scores further down are repeatable.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=8,
    n_repeated=0,
    n_classes=2,
    weights=[0.8],
    random_state=42,
)

In [160]:
# Initialize three guessing strategies to use as baselines:
#   - constant:   always predict class 0
#   - uniform:    pick a class uniformly at random
#   - stratified: pick a class at random following the training class distribution
# The two random strategies are seeded so their predictions are repeatable.
dummy_constant = DummyClassifier(strategy='constant', constant=0)
dummy_uniform = DummyClassifier(strategy='uniform', random_state=42)
dummy_strat = DummyClassifier(strategy='stratified', random_state=42)

# The original cell left this assignment unfinished (`clf = `), which is a
# syntax error and prevented the whole cell from running.
# NOTE(review): LogisticRegression is the only imported estimator that is
# otherwise unused, so it is presumably what was intended here -- confirm.
clf = LogisticRegression()

In [176]:
# 10-fold stratified splitter shared by every grid search below.
kf = StratifiedKFold(n_splits=10)

# One SMOTE sampler reused as the first step of each "*_oversample" pipeline.
# Seeded so the synthetic minority samples are the same on every run.
sm = SMOTE(random_state=42)

# For each dummy strategy, build a plain pipeline and an oversampled variant.
# NOTE(review): every final step is registered under the name 'dum_strat',
# including the constant and uniform classifiers. The name is kept unchanged
# so that any code addressing the step by name keeps working.
pipeline_constant = Pipeline([('dum_strat', dummy_constant)])
pipeline_constant_oversample = Pipeline([('sm', sm), ('dum_strat', dummy_constant)])

pipeline_uniform = Pipeline([('dum_strat', dummy_uniform)])
pipeline_uniform_oversample = Pipeline([('sm', sm), ('dum_strat', dummy_uniform)])

pipeline_strat = Pipeline([('dum_strat', dummy_strat)])
pipeline_strat_oversample = Pipeline([('sm', sm), ('dum_strat', dummy_strat)])

In [182]:
# Baseline run: cross-validate the stratified-guess pipeline without any
# oversampling. The parameter grid is empty, so GridSearchCV is used purely as
# a cross-validation harness that also records train scores.
# NOTE(review): for single-label targets, micro-averaged F1 coincides with
# plain accuracy -- confirm this is the metric you want on imbalanced data.
grid = GridSearchCV(
    pipeline_strat,
    {},
    scoring='f1_micro',
    cv=kf,
    return_train_score=True,
).fit(X, y)

grid.cv_results_

{'mean_fit_time': array([0.0014436]),
 'std_fit_time': array([0.00039714]),
 'mean_score_time': array([0.00171268]),
 'std_score_time': array([0.00051814]),
 'params': [{}],
 'split0_test_score': array([0.68731269]),
 'split1_test_score': array([0.64135864]),
 'split2_test_score': array([0.673]),
 'split3_test_score': array([0.669]),
 'split4_test_score': array([0.674]),
 'split5_test_score': array([0.688]),
 'split6_test_score': array([0.69]),
 'split7_test_score': array([0.671]),
 'split8_test_score': array([0.66766767]),
 'split9_test_score': array([0.66666667]),
 'mean_test_score': array([0.6728]),
 'std_test_score': array([0.01345478]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.67285254]),
 'split1_train_score': array([0.67207467]),
 'split2_train_score': array([0.67377778]),
 'split3_train_score': array([0.67155556]),
 'split4_train_score': array([0.67133333]),
 'split5_train_score': array([0.67655556]),
 'split6_train_score': array([0.66955556]

In [181]:
# Oversampled run: same cross-validation setup as the baseline, but using the
# pipeline that applies SMOTE before the stratified-guess classifier.
# This rebinds `grid`, replacing the result of the baseline run.
grid = GridSearchCV(
    pipeline_strat_oversample,
    {},
    scoring='f1_micro',
    cv=kf,
    return_train_score=True,
).fit(X, y)

grid.cv_results_

{'mean_fit_time': array([0.07127502]),
 'std_fit_time': array([0.00729249]),
 'mean_score_time': array([0.0012974]),
 'std_score_time': array([0.00022137]),
 'params': [{}],
 'split0_test_score': array([0.4985015]),
 'split1_test_score': array([0.47952048]),
 'split2_test_score': array([0.488]),
 'split3_test_score': array([0.516]),
 'split4_test_score': array([0.508]),
 'split5_test_score': array([0.485]),
 'split6_test_score': array([0.497]),
 'split7_test_score': array([0.473]),
 'split8_test_score': array([0.5035035]),
 'split9_test_score': array([0.51151151]),
 'mean_test_score': array([0.496]),
 'std_test_score': array([0.0135415]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.49916657]),
 'split1_train_score': array([0.49716635]),
 'split2_train_score': array([0.49833333]),
 'split3_train_score': array([0.49444444]),
 'split4_train_score': array([0.50188889]),
 'split5_train_score': array([0.48788889]),
 'split6_train_score': array([0.49844444]),


In [170]:
# Manual check of the oversampled score, done outside of GridSearchCV.
# Both the hold-out split and SMOTE are seeded so the split and the synthetic
# samples are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the training portion only. From here on X_train / y_train refer
# to the resampled data; X_test / y_test are left untouched and are not used
# in the rest of this cell.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Fit the stratified dummy on the resampled training data and predict on that
# same data, so the score below is a train score on the resampled set.
# Note that this refits the shared `dummy_strat` instance in place.
y_pred = dummy_strat.fit(X_train, y_train).predict(X_train)

f1_score(y_train, y_pred, average='micro')


0.5031411967959792