In [72]:
%run auxiliary.ipynb

In [73]:
all_df = pd.read_excel('all_releases_no_repetitions.xlsx') # Original data

In [74]:
results_dict = dict()

In [87]:
k_fold = StratifiedKFold(10, shuffle=True, random_state=42)

X_features = ['CBO','CC','LCOM','WMC']

X = pd.DataFrame(all_df.loc[:, X_features])

y = pd.DataFrame(all_df.loc[:, 'will_change'])

X = X.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})

In [82]:
parameters_svc_linear = {
        'kernel':['linear'],
    'max_iter':[10000],
    }

parameters_svc_rbf = {
        'kernel':['rbf'],
    }

parameters_dtc_grid = {
    'criterion':['gini'], 
    }

parameters_rfc_grid = { 
    'criterion' :['gini'],
}

parameters_ABC_grid = { 
    'n_estimators':  [50],
}


parameters_knn_grid = {'n_neighbors': [5]}

parameters_lr_grid = [{ 
                        'penalty':['l2'],
                        }]

parameters_GBC = {
    "loss":["deviance"],
    }


param_xgb_grid = {
           'random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    "random_state":[42]
}

# Random UnderSampler

## Light GBM

In [83]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - RandomUnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - RandomUnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6736469072164949
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.6597938144329897, 0.69]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.725250322997416
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state

In [7]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [88]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.7495167525773196
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8427835051546392, 0.66]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6984415374677002
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8656330749354005, 0.53]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.7669834125197966
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, d

In [9]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [10]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6605992268041238
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9149484536082474, 0.41]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7347383720930234
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9069767441860465, 0.56]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.492289739101442
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gam

In [11]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [12]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6383698453608248
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.6829896907216495, 0.59]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7309027777777778
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.5555555555555556, 0.91]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6306993415020421
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, ga

In [13]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [14]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.584729381443299
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.7319587628865979, 0.44]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6073158914728682
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.52713178294

In [15]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [16]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6087306701030928
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.6237113402061856, 0.59]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6150678294573644
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimato

In [17]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [18]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5902061855670102
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.6804123711340206, 0.5]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6826146640826873
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.5839793281653747, 0.78]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.637784446111528
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.5658914728682171, 0.71]
1    281
0    281
Name: 

In [19]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [20]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5620167525773196
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.5927835051546392, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6306928294573644
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=N

In [21]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [22]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6560889175257731
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.7809278350515464, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7095041989664083
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
     

In [23]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [24]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6599548969072165
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.788659793814433, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7211321059431525
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight

In [25]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Edited Nearest Neighbour

## Light GBM

In [26]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - EditedNearestNeighbous'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - EditedNearestNeighbous'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6396585051546392
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9355670103092784, 0.34]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6174903100775194
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_

In [27]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [28]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5990657216494845
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.979381443298969, 0.22]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5951631136950905
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9715762273901809, 0.22]
0    2868
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5651412853213303
Estimator and parameters: LogisticRegression(C=1.0, class_weight=No

In [29]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [30]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.3735502577319588
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.028350515463917526, 0.72]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.2835917312661499
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.06718346253229975, 0.5]
0    2868
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.3278319579894974
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degr

In [31]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [32]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5339884020618556
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9742268041237113, 0.09]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5091650516795866
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9870801033591732, 0.03]
0    2868
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.4702842377260982
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree

In [33]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [34]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6087306701030928
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.8737113402061856, 0.34]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6593184754521964
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.881136

In [35]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [36]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5877899484536082
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8943298969072165, 0.28]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5668604651162791
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [37]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [38]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6227448453608248
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9329896907216495, 0.31]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6160771963824289
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9509043927648578, 0.28]
0    2868
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5979828290405935
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.8733850129198967, 0.32]
0    2945
1     

In [39]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [40]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6177512886597938
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.8917525773195877, 0.34]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6267764857881137
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [41]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [42]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6409471649484536
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.9381443298969072, 0.34]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.522327196382429
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
  

In [43]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [44]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2887
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6317654639175256
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9510309278350515, 0.31]
0    2920
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5561611757105943
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

In [45]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Tomek Links

## Light GBM

In [46]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49742268041237114
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9948453608247423, 0.0]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5599160206718347
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_

In [47]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [48]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.546875
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.09]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5286660206718345
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9948320413436692, 0.06]
0    3452
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5270901058597982
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_inte

In [49]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [50]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5716817010309279
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9871134020618557, 0.16]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6694121447028424
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9638242894056848, 0.38]
0    3452
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.39293156622488956
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degre

In [51]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [52]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49742268041237114
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9948453608247423, 0.0]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3452
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.49870801033591733
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
 

In [53]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [54]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49355670103092786
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9871134020618557, 0.0]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5977470930232558
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.976744

In [55]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [56]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5091817010309279
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9871134020618557, 0.03]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5339551033591731
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [57]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [58]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49355670103092786
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9871134020618557, 0.0]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5573320413436692
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9896640826873385, 0.12]
0    3452
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5393431691256148
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9819121447028424, 0.1]
0    3446
1     2

In [59]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [60]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5273840206185567
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.9922680412371134, 0.06]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5652051033591731
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [61]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [62]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49871134020618557
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.9974226804123711, 0.0]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5286660206718345
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
 

In [63]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [64]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'LCOM':'f2', 'WMC':'f3'})
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3444
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49871134020618557
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9974226804123711, 0.0]
0    3451
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5455830103359173
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

In [65]:
with open('base_Under_wo_wfs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)