In [1]:
%run auxiliary.ipynb

In [2]:
all_df = pd.read_csv('wo_outliers.csv') # Original data

In [3]:
results_dict = dict()

In [4]:
k_fold = StratifiedKFold(10)

X_features = ['CBO','CC','LCOM','WMC']

X = pd.DataFrame(all_df.loc[:, X_features])

y = pd.DataFrame(all_df.loc[:, 'will_change'])

In [5]:
parameters_svc_linear = {
        'kernel':['linear'],
        'C': [0.002, 1, 512, 1024, 2048], 
        'class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
    }

parameters_svc_rbf = {
        'kernel':['rbf'],
        'C': [0.002, 1, 512, 1024, 2048], 
        'class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'max_iter': [10000]
    }

parameters_dtc_grid = {
    'criterion':('gini', 'entropy'), 
    'min_samples_split':[0.1, 0.2, 0.3],  
    'max_depth': [1, 10, 30, None],
    'class_weight':[{1:1}, {1:10}, {1:20}, 'balanced'],
    'presort':[False, True],
}

parameters_rfc_grid = { 
    'n_estimators': [10, 50, 90],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [1, 10, 30, None],
    'min_samples_split': [2, 8, 16],
    'bootstrap': [True, False],
    'criterion' :['gini', 'entropy'],
    'class_weight':[ {1:1}, {1:10}, {1:15}, {1:20}]
}

parameters_ABC_grid = {
    'base_estimator__criterion' : ["gini", "entropy"],
    'base_estimator__splitter' :   ["best", "random"],
    'base_estimator__max_depth' : [1, 10, 30, None], 
    'base_estimator__class_weight': [{1:1}, {1:10}, {1:15}, {1:20}], 
    'n_estimators':  [10, 50, 90],
}


parameters_knn_grid = {'n_neighbors': list(range(1,30))}


penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
parameters_lr_grid = [{'C': np.logspace(0, 10, 25), 
                        'penalty':['l1', 'l2'],
                        }]

parameters_GBC = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05],
    "max_depth": [None, 2, 4, 6, 10],
    "max_features":["log2","sqrt", None],
    "criterion": ["friedman_mse",  "mae"],
    'n_estimators': [64, 128, 256, 512] #[1, 2, 4, 16, 32, 64, 128, 256, 512]
    }

param_xgb_grid = {
           'clf__n_estimators':  [10, 30, 50, 70, 90],
           'clf__max_depth' : [1, 10, 30, None],
           'learning_rate': [0.0001],
           'min_child_weight': [1], #tuning min_child_weight subsample colsample_bytree for fighting against overfit
           'random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    "random_state":[42]
}

# Random UnderSampler

## Light GBM

In [6]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - RandomUnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - RandomUnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6694770746494885
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.7527472527472527, 0.59]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6966180371352785
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_stat

In [7]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [8]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6685771125426297
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8543956043956044, 0.48]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7804092459264873
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8021978021978022, 0.76]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6935865858279651
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, d

In [9]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [10]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6478306176582038
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.8818681318681318, 0.41]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.8037608942781357
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.8489010989010989, 0.76]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.7059492231906025
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, ga

In [11]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [12]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6519514967790829
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.8901098901098901, 0.41]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7810250094732853
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.8379120879120879, 0.72]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6618510799545283
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, ga

In [13]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [14]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6145320197044335
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.6428571428571429, 0.59]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6769136036377416
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.4917582417

In [15]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [16]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6632247063281546
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.7747252747252747, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7103543008715423
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimato

In [17]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [18]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6783345964380447
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.804945054945055, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7047176960970065
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.7197802197802198, 0.69]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5998957938613111
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.717032967032967, 0.48]
1    258
0    258
Name: 

In [19]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [20]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6439939370973855
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.7362637362637363, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7309586964759378
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=N

In [21]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [22]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6803239863584691
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.8434065934065934, 0.52]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7550682076544146
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
     

## XGBoost

In [23]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6499621068586586
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.8516483516483516, 0.45]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6677718832891246
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weigh

# Edited Nearest Neighbour

## Light GBM

In [24]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - EditedNearestNeighbous'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - EditedNearestNeighbous'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6586775293671845
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9725274725274725, 0.34]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.699412656309208
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_s

In [25]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [26]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6338101553618796
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9917582417582418, 0.28]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6696665403561955
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9945054945054945, 0.34]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.613821523304282
Estimator and parameters: LogisticRegression(C=1.0, class_weight=No

In [27]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [28]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7517051913603638
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.6758241758241759, 0.83]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', m

In [29]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [30]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7784198560060629
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.7637362637362637, 0.79]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6604774535809018
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  

In [31]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [32]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6179424024251611
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9945054945054945, 0.24]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6827870405456613
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.986263

In [33]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [34]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6414361500568396
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9725274725274725, 0.31]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7014020462296324
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [35]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [36]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5503505115574081
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9972527472527473, 0.1]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5689655172413793
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [1.0, 0.14]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5317355058734369
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9945054945054945, 0.07]
0    2919
1     258
Name: will_c

In [37]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [38]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6310629026146268
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.9862637362637363, 0.28]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7021599090564608
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [39]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [40]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6324365289882532
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.989010989010989, 0.28]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6904130352406214
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
  

## XGBoost

In [41]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6193160287987874
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9972527472527473, 0.24]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6814134141720348
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

# Tomek Links

## Light GBM

In [42]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6193160287987874
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9972527472527473, 0.24]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6310629026146268
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_

In [43]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [44]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6020746494884426
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9972527472527473, 0.21]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6351837817355059
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9945054945054945, 0.28]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5476032588101554
Estimator and parameters: LogisticRegression(C=1.0, class_weight=N

In [45]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [46]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6325786282682835
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9203296703296703, 0.34]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7491474043198181
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9120879120879121, 0.59]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.7181697612732095
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree

In [47]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [48]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7442686623721106
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9368131868131868, 0.55]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.615479348237969
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  k

In [49]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [50]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6179424024251611
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9945054945054945, 0.24]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7652993558165972
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.771978

In [51]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [52]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6365574081091323
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9972527472527473, 0.28]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6365574081091323
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [53]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [54]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5517241379310345
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [1.0, 0.1]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5517241379310345
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [1.0, 0.1]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5172413793103449
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [1.0, 0.03]
0    3249
1     258
Name: will_change, dtype: int64
Fold: 4
ROC

In [55]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [56]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5514873057976506
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.4478021978021978, 0.66]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6710401667298219
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [57]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [58]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6193160287987874
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.9972527472527473, 0.24]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.567591890867753
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
  

In [59]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [60]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, cv=10, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6020746494884426
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9972527472527473, 0.21]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5689655172413793
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

In [61]:
with open('partial_results_undersampling.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)