In [1]:
%run auxiliary.ipynb

In [2]:
all_df = pd.read_csv('wo_outliers.csv') # Original data

In [3]:
results_dict = dict()

In [4]:
k_fold = StratifiedKFold(10, random_state=42)

X_features = ['CBO','CC','LCOM','WMC']

X = pd.DataFrame(all_df.loc[:, X_features])

y = pd.DataFrame(all_df.loc[:, 'will_change'])

X = X.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})

In [5]:
parameters_svc_linear = {
        'kernel':['linear'],
    'max_iter':[10000],
    }

parameters_svc_rbf = {
        'kernel':['rbf'],
    }

parameters_dtc_grid = {
    'criterion':['gini'], 
    }

parameters_rfc_grid = { 
    'criterion' :['gini'],
}

parameters_ABC_grid = { 
    'n_estimators':  [50],
}


parameters_knn_grid = {'n_neighbors': [5]}

parameters_lr_grid = [{ 
                        'penalty':['l2'],
                        }]

parameters_GBC = {
    "loss":["deviance"],
    }


param_xgb_grid = {
           'random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    "random_state":[42]
}

# Random UnderSampler

## Light GBM

In [6]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - RandomUnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - RandomUnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6694770746494885
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.7527472527472527, 0.59]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6966180371352785
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_stat

In [7]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [8]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.7071807502841986
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8626373626373627, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6996968548692687
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8131868131868132, 0.59]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.7190697233800682
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, d

In [9]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [10]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.7291587722622206
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9065934065934066, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6775767336112164
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.8379120879120879, 0.52]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6948181129215613
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, ga

In [11]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [12]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.7305323986358468
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9093406593406593, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6458412277377794
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.8434065934065934, 0.45]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6775767336112164
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, ga

In [13]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [14]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6647404319818112
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.7087912087912088, 0.62]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.66564039408867
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.607142857142

In [15]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [16]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5904225085259568
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.6291208791208791, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6795187571049641
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimato

In [17]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [18]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6633668056081848
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.7060439560439561, 0.62]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7193539219401288
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.6456043956043956, 0.79]
1    258
0    258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5959170140204623
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.6401098901098901, 0.55]
1    258
0    258
Name

In [19]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [20]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6247631678666161
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.6978021978021978, 0.55]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6801345206517622
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=N

In [21]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [22]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6681034482758621
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.75, 0.59]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7317165593027662
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_i

In [23]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [24]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    258
0    258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6465990905646078
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.8104395604395604, 0.48]
1    258
0    258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.713101553618795
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight

In [25]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Edited Nearest Neighbour

## Light GBM

In [26]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6586775293671845
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9725274725274725, 0.34]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.699412656309208
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_s

In [27]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [28]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5517241379310345
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.1]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5517241379310345
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.1]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5172413793103449
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=T

In [29]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [30]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5172413793103449
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.03]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, pro

In [31]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [32]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, 

In [33]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [34]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6408203865100417
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9368131868131868, 0.34]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6809397499052672
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.879120

In [35]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [36]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6345680181887079
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9587912087912088, 0.31]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6739295187571049
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [37]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [38]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5938328912466844
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9807692307692307, 0.21]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5869647593785525
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.967032967032967, 0.21]
0    2927
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5863489958317545
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9313186813186813, 0.24]
0    2919
1     2

In [39]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [40]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6408203865100417
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.9368131868131868, 0.34]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6429518757104964
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [41]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [42]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6455570291777188
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.9807692307692307, 0.31]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6835449033724895
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
 

In [43]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [44]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2922
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6414361500568396
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9725274725274725, 0.31]
0    2922
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6586775293671845
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

In [45]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Tomek Links

## Light GBM

In [46]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6193160287987874
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9972527472527473, 0.24]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6310629026146268
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_

In [47]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [48]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5344827586206896
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.07]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5517241379310345
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.1]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
         

In [49]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [50]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, 

In [51]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [52]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, 

In [53]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [54]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.613821523304282
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9862637362637363, 0.24]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6469306555513451
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9835164

In [55]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [56]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.613821523304282
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9862637362637363, 0.24]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6310629026146268
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estim

In [57]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [58]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5303618794998105
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9917582417582418, 0.07]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5517241379310345
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [1.0, 0.1]
0    3251
1     258
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5648446381205002
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9917582417582418, 0.14]
0    3249
1     258
Name: will_c

In [59]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [60]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5979537703675635
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.989010989010989, 0.21]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6455570291777188
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spli

In [61]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [62]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6551724137931034
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.31]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6007010231148162
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              mi

In [63]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [64]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3246
1     258
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6379310344827587
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [1.0, 0.28]
0    3250
1     258
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6007010231148162
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
      

In [65]:
with open('base_Under_woo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)