In [1]:
%run auxiliary.ipynb

In [2]:
all_df = pd.read_csv('wo_outliers.csv') # Original dataa

In [3]:
results_dict = dict()

In [4]:
k_fold = StratifiedKFold(10)

X_features = ['CBO','CC','LCOM','WMC']

X = pd.DataFrame(all_df.loc[:, X_features])

y = pd.DataFrame(all_df.loc[:, 'will_change'])
X = X.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})

In [5]:
X.head()

Unnamed: 0,f0,f1,f2,f3
0,0.0,0.004098,0.0,0.017857
1,0.0,0.004098,0.0,0.017857
2,0.12963,0.018443,0.0,0.071429
3,0.006173,0.014344,0.0,0.035714
4,0.006173,0.004098,0.0,0.017857


In [6]:
parameters_svc_linear = {
        'clf__kernel':['linear'],
        'clf__C': [0.002, 1, 512, 1024, 2048], 
        'clf__class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'clf__max_iter': [10000]
    }

parameters_svc_rbf = {
        'clf__kernel':['rbf'],
        'clf__C': [0.002, 1, 512, 1024, 2048], 
        'clf__class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'clf__max_iter': [10000]
    }

parameters_dtc_grid = {
    'clf__criterion':('gini', 'entropy'), 
    'clf__min_samples_split':[0.1, 0.2, 0.3], 
    'clf__max_depth': [1, 10, 30, None],
    'clf__class_weight':[{1:1}, {1:10}, {1:20}, 'balanced'],
    'clf__presort':[False, True],
}

parameters_rfc_grid = { 
    'clf__n_estimators': [10, 50, 90],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__max_depth' : [1, 10, 30, None],
    'clf__min_samples_split': [2, 8, 16],
    'clf__criterion' :['gini', 'entropy'],
    'clf__class_weight':[ {1:1}, {1:10}, {1:15}, {1:20}]
}

parameters_ABC_grid = {
    'clf__base_estimator__criterion' : ["gini", "entropy"],
    'clf__base_estimator__splitter' :   ["best", "random"],
    'clf__base_estimator__max_depth' : [1, 10, 30, None], 
    'clf__base_estimator__class_weight': [{1:1}, {1:10}, {1:15}, {1:20}], 
    'clf__n_estimators':  [10, 50, 90],
}


parameters_knn_grid = {'clf__n_neighbors': list(range(1,30))}


penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
parameters_lr_grid = [{'clf__C': np.logspace(0, 10, 25), 
                        'clf__penalty':['l1', 'l2'],
                        }]

parameters_GBC = {
    "clf__loss":["deviance"],
    "clf__learning_rate": [0.01, 0.025, 0.05],
    "clf__max_depth": [None, 2, 4, 6, 10],
    "clf__max_features":["log2","sqrt"],
    "clf__criterion": ["friedman_mse",  "mae"],
    'clf__n_estimators': [64, 128, 256, 512] #[1, 2, 4, 16, 32, 64, 128, 256, 512]
    }


param_xgb_grid = {
           'clf__clf__n_estimators':  [10, 50, 90],
           'clf__clf__max_depth' : [1, 10, 30, None],
           'clf__learning_rate': [0.0001],
           'clf__min_child_weight': [1], #tuning min_child_weight subsample colsample_bytree for fighting against overfit
           'clf__random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    "clf__clf__random_state":[42]
}

# SMOTE

In [7]:
sampler = SMOTE(random_state=42)

## Light GBM

In [None]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', lbm.LGBMClassifier())
    ])


clf = GridSearchCV(base_clf, param_lgm_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

In [None]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [None]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', LogisticRegression(random_state=42))
    ])


clf = GridSearchCV(base_clf, parameters_lr_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

In [None]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [None]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', svm.SVC(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_svc_linear, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

In [None]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [None]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', svm.SVC(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_svc_rbf, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

In [None]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [None]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', DecisionTreeClassifier(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_dtc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

In [None]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

%%time


base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', RandomForestClassifier(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_rfc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [8]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', KNeighborsClassifier())
    ])

clf = GridSearchCV(base_clf, parameters_knn_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6824554755589238
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=42, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
[Confusion Matrix]: [0.8131868131868132, 0.55]
Fold: 2
ROC_AUC: 0.7117279272451686
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=42, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])

In [9]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', AdaBoostClassifier(random_state=42, base_estimator=adaboost_base))
    ])

clf = GridSearchCV(base_clf, parameters_ABC_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', GradientBoostingClassifier(random_state=42))
    ])


clf = GridSearchCV(base_clf, parameters_GBC, cv=10, scoring='roc_auc', n_jobs=-1)


scores_list = list()

results_dict['Gradient Boost - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [10]:
%%time


base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', xgb.XGBClassifier(random_state=42))
    ])


clf = GridSearchCV(base_clf, param_xgb_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6940602500947328
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=42, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsam...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])
[Confusion Matrix]: [0.8708791208791209, 0.52]
Fold: 2
ROC_AUC: 0.779651383099659
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=42, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsam...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=N

In [11]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# ADASYN

In [12]:
sampler = ADASYN(random_state=42)

## Light GBM

In [13]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', lbm.LGBMClassifier())
    ])


clf = GridSearchCV(base_clf, param_lgm_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.624194770746495
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_c...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])
[Confusion Matrix]: [0.9725274725274725, 0.28]
Fold: 2
ROC_AUC: 0.627084122773778
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_c...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_

In [14]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [15]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', LogisticRegression(random_state=42))
    ])


clf = GridSearchCV(base_clf, parameters_lr_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6534672224327397
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])
[Confusion Matrix]: [0.8241758241758241, 0.48]
Fold: 2
ROC_AUC: 0.7846722243273967
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])
[Co

In [16]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [17]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', svm.SVC(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_svc_linear, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6423361121636983
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8708791208791209, 0.41]
Fold: 2
ROC_AUC: 0.7878931413414171
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8516483516483516, 0.72]
Fold: 3
ROC_

In [18]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [19]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', svm.SVC(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_svc_rbf, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6395888594164456
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8653846153846154, 0.41]
Fold: 2
ROC_AUC: 0.7790356195528609
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.7994505494505495, 0.76]
Fold: 3
ROC_

In [20]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [21]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', DecisionTreeClassifier(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_dtc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6824554755589238
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))])
[Confusion Matrix]: [0.8131868131868132, 0.55]
Fold: 2
ROC_AUC: 0.6970917014020462
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_sp

In [22]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

%%time


base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', RandomForestClassifier(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_rfc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [23]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', KNeighborsClassifier())
    ])

clf = GridSearchCV(base_clf, parameters_knn_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6473569533914362
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
[Confusion Matrix]: [0.7774725274725275, 0.52]
Fold: 2
ROC_AUC: 0.6532777567260326
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
[Confusion Matrix]: [0.5824175824175825, 0.72]
Fold: 3
ROC_AUC: 0.5972906403940886
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, rando

In [24]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', AdaBoostClassifier(random_state=42, base_estimator=adaboost_base))
    ])

clf = GridSearchCV(base_clf, parameters_ABC_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', GradientBoostingClassifier(random_state=42))
    ])


clf = GridSearchCV(base_clf, parameters_GBC, cv=10, scoring='roc_auc', n_jobs=-1)


scores_list = list()

results_dict['Gradient Boost - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [25]:
%%time


base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', xgb.XGBClassifier(random_state=42))
    ])


clf = GridSearchCV(base_clf, param_xgb_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - ADASYN'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - ADASYN'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6456517620310724
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])
[Confusion Matrix]: [0.6016483516483516, 0.69]
Fold: 2
ROC_AUC: 0.6917392951875709
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', ADASYN(n_jobs=1, n_neighbors=5, random_state=42, ratio=None,
    sampling_strategy='auto')), ('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=

In [26]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Random OverSampler

In [27]:
sampler = RandomOverSampler(random_state=42)

## Light GBM

In [28]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', lbm.LGBMClassifier())
    ])


clf = GridSearchCV(base_clf, param_lgm_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6002273588480485
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samp...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])
[Confusion Matrix]: [0.8901098901098901, 0.31]
Fold: 2
ROC_AUC: 0.6348522167487685
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samp...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_fo

In [29]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [30]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', LogisticRegression(random_state=42))
    ])


clf = GridSearchCV(base_clf, parameters_lr_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6458412277377794
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])
[Confusion Matrix]: [0.8434065934065934, 0.45]
Fold: 2
ROC_AUC: 0.7817828723001137
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0

In [31]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [32]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', svm.SVC(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_svc_linear, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6278419856006063
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8763736263736264, 0.38]
Fold: 2
ROC_AUC: 0.7741568776051534
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8241758241

In [33]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [34]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', svm.SVC(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_svc_rbf, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6305892383478591
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8818681318681318, 0.38]
Fold: 2
ROC_AUC: 0.7733990147783253
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False))])
[Confusion Matrix]: [0.8571428571

In [35]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [36]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', DecisionTreeClassifier(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_dtc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))])
[Confusion Matrix]: [0.0, 1.0]
Fold: 2
ROC_AUC: 0.7207275483137552
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=N

In [37]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', RandomForestClassifier(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_rfc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [38]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', KNeighborsClassifier())
    ])

clf = GridSearchCV(base_clf, parameters_knn_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6783345964380447
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
[Confusion Matrix]: [0.804945054945055, 0.55]
Fold: 2
ROC_AUC: 0.6854869268662371
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])
[Confusion Matrix]: [0.6813186813186813, 0.69]
Fold: 3
ROC_AUC: 0.584785903751421
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSam

In [39]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', AdaBoostClassifier(random_state=42, base_estimator=adaboost_base))
    ])

clf = GridSearchCV(base_clf, parameters_ABC_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', GradientBoostingClassifier(random_state=42))
    ])


clf = GridSearchCV(base_clf, parameters_GBC, cv=10, scoring='roc_auc', n_jobs=-1)


scores_list = list()

results_dict['Gradient Boost - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [40]:
%%time


base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', xgb.XGBClassifier(random_state=42))
    ])


clf = GridSearchCV(base_clf, param_xgb_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.629973474801061
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])
[Confusion Matrix]: [0.8461538461538461, 0.41]
Fold: 2
ROC_AUC: 0.7111121636983706
Estimator and parameters: Pipeline(memory=None,
     steps=[('sampling', RandomOverSampler(random_state=42, ratio=None, return_indices=False,
         sampling_strategy='auto')), ('clf', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=N

In [41]:
with open('partial_results_oversampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)