In [1]:
%run auxiliary.ipynb

In [2]:
all_df = pd.read_excel('all_releases_no_repetitions.xlsx') # Original data

In [3]:
results_dict = dict()

In [4]:
k_fold = StratifiedKFold(10, random_state=42)

X_features = ['CBO','CC','DIT','LCOM','LOC','NOC','RFC','WMC']

X = pd.DataFrame(all_df.loc[:, X_features])

y = pd.DataFrame(all_df.loc[:, 'will_change'])

X = X.rename(columns={'CBO':'f0', 'CC':'f1', 'DIT':'f2', 'LCOM':'f3', 'LOC':'f4', 'NOC':'f5', 'RFC':'f6', 'WMC':'f7'})

In [5]:
parameters_svc_linear = {
        'kernel':['linear'],
    'max_iter':[10000],
    }

parameters_svc_rbf = {
        'kernel':['rbf'],
    }

parameters_dtc_grid = {
    'criterion':['gini'], 
    }

parameters_rfc_grid = { 
    'criterion' :['gini'],
}

parameters_ABC_grid = { 
    'n_estimators':  [50],
}


parameters_knn_grid = {'n_neighbors': [5]}

parameters_lr_grid = [{ 
                        'penalty':['l2'],
                        }]

parameters_GBC = {
    "loss":["deviance"],
    }


param_xgb_grid = {
           'random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    "random_state":[42]
}

# Random UnderSampler

## Light GBM

In [6]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - RandomUnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - RandomUnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6784793814432991
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.7319587628865979, 0.62]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6644056847545221
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_stat

In [7]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [8]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6805734536082474
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.8298969072164949, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7327600129198967
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.7467700258397932, 0.72]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.7082187213470035
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, d

In [9]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [10]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.4178479381443299
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.023195876288659795, 0.81]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7149951550387598
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9612403100775194, 0.47]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.3980995248812203
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, 

In [11]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [12]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6945876288659794
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.6391752577319587, 0.75]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6971899224806202
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.5193798449612403, 0.88]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.626823372509794
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gam

In [13]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [14]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6278994845360825
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.6932989690721649, 0.56]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6123627260981912
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.5684754521

In [15]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [16]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6251610824742269
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.7190721649484536, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6839066537467702
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimato

In [17]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [18]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6510953608247423
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.7396907216494846, 0.56]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6371527777777778
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.5555555555555556, 0.72]
1    281
0    281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.6306993415020421
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.5839793281653747, 0.68]
1    281
0    281
Name

In [19]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [20]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.642235824742268
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.6907216494845361, 0.59]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6500726744186046
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=No

In [21]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [22]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6509342783505155
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.770618556701031, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6890746124031006
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
      

In [23]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [24]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Random UnderSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'DIT':'f2', 'LCOM':'f3', 'LOC':'f4', 'NOC':'f5', 'RFC':'f6', 'WMC':'f7'})

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Random UnderSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

1    280
0    280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.658666237113402
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.7860824742268041, 0.53]
1    280
0    280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.7292474160206719
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight

In [25]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Edited Nearest Neighbour

## Light GBM

In [26]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - EditedNearestNeighbous'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - EditedNearestNeighbous'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6473904639175257
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9510309278350515, 0.34]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6825742894056848
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_

In [27]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [28]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6328930412371134
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9845360824742269, 0.28]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6094961240310077
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9689922480620154, 0.25]
0    2933
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5825623072434776
Estimator and parameters: LogisticRegression(C=1.0, class_weight=N

In [29]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [30]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6148518041237113
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9484536082474226, 0.28]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.38872739018087854
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.1524547803617571, 0.62]
0    2933
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.3735933983495874
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degre

In [31]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [32]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.4961340206185567
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9922680412371134, 0.0]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5117490310077519
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9922480620155039, 0.03]
0    2933
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5109610736017338
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=

In [33]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [34]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6201675257731959
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9278350515463918, 0.31]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6046915374677002
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.865633

In [35]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [36]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6383698453608248
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9329896907216495, 0.34]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5979893410852714
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [37]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [38]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6655927835051547
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9561855670103093, 0.38]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6278262273901809
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9431524547803618, 0.31]
0    2933
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5715595565558057
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.917312661498708, 0.23]
0    2899
1     2

In [39]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [40]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6709085051546392
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.9355670103092784, 0.41]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6045704134366925
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [41]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [42]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6551224226804123
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.9664948453608248, 0.34]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6304102067183462
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
 

In [43]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [44]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Edited Nearest Neighbour'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = EditedNearestNeighbours(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'DIT':'f2', 'LCOM':'f3', 'LOC':'f4', 'NOC':'f5', 'RFC':'f6', 'WMC':'f7'})
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Edited Nearest Neighbour'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    2910
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.6589884020618556
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9742268041237113, 0.34]
0    2909
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.6329941860465117
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

In [45]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Tomek Links

## Light GBM

In [46]:
%%time

base_clf = lbm.LGBMClassifier()


clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Light GBM - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Light GBM - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5117590206185567
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
[Confusion Matrix]: [0.9922680412371134, 0.03]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5286660206718345
Estimator and parameters: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_

In [47]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Logistic Regression

In [48]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Logistic Regression - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Logistic Regression - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5625
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [1.0, 0.12]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5260820413436693
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9896640826873385, 0.06]
0    3438
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5580561807118446
Estimator and parameters: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_interc

In [49]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - Linear

In [50]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM Linear - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM Linear - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.3853092783505155
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.020618556701030927, 0.75]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.3879198966408269
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.025839793281653745, 0.75]
0    3438
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.4348587146786697
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', de

In [51]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM - RBF

In [52]:
%%time

base_clf = svm.SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['SVM RBF - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['SVM RBF - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.49871134020618557
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [0.9974226804123711, 0.0]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)
[Confusion Matrix]: [1.0, 0.0]
0    3438
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.49870801033591733
Estimator and parameters: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
 

In [53]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Decision Tree

In [54]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Decision Tree - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Decision Tree - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5066043814432989
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.9819587628865979, 0.03]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5352470930232558
Estimator and parameters: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
[Confusion Matrix]: [0.976744

In [55]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Random Forest

In [56]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Random Forest - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Random Forest - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5391430412371134
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
[Confusion Matrix]: [0.9845360824742269, 0.09]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5573320413436692
Estimator and parameters: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_esti

In [57]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## KNN

In [58]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['KNN - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['KNN - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5260953608247423
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9896907216494846, 0.06]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5442910206718345
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9948320413436692, 0.09]
0    3438
1     281
Name: will_change, dtype: int64
Fold: 3
ROC_AUC: 0.5393431691256148
Estimator and parameters: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
[Confusion Matrix]: [0.9819121447028424, 0.1]
0    3435
1     2

In [59]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Adaboost

In [60]:
%%time

adaboost_base = DecisionTreeClassifier(random_state = 42)

base_clf = AdaBoostClassifier(random_state=42, base_estimator=adaboost_base)

clf = GridSearchCV(base_clf, parameters_ABC_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Adaboost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Adaboost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5286726804123711
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42)
[Confusion Matrix]: [0.9948453608247423, 0.06]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5391230620155038
Estimator and parameters: AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_spl

In [61]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Gradient Boost

In [62]:
%%time

base_clf = GradientBoostingClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_GBC, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['Gradient Boost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['Gradient Boost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5143363402061856
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
[Confusion Matrix]: [0.9974226804123711, 0.03]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5273740310077519
Estimator and parameters: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
 

In [63]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## XGBoost

In [64]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid, scoring='roc_auc', n_jobs=-1)

scores_list = list()

results_dict['XGBoost - Tomek Links'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    X_resampled, y_resampled = TomekLinks(random_state=42).fit_sample(X.iloc[train], y.iloc[train])
    X_resampled = pd.DataFrame(X_resampled, columns=X_features)
    y_resampled = pd.DataFrame(y_resampled, columns=['will_change'])

    print(y_resampled['will_change'].value_counts())
    
    X_resampled = X_resampled.rename(columns={'CBO':'f0', 'CC':'f1', 'DIT':'f2', 'LCOM':'f3', 'LOC':'f4', 'NOC':'f5', 'RFC':'f6', 'WMC':'f7'})
    
    clf.fit(X_resampled, y_resampled)
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ' + str(clf.get_params()['estimator']))
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    
    results_dict['XGBoost - Tomek Links'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])), 
                                                             str(clf.get_params()['estimator']),
                                                             '[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

0    3429
1     280
Name: will_change, dtype: int64
Fold: 1
ROC_AUC: 0.5143363402061856
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
[Confusion Matrix]: [0.9974226804123711, 0.03]
0    3442
1     280
Name: will_change, dtype: int64
Fold: 2
ROC_AUC: 0.5273740310077519
Estimator and parameters: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_w

In [65]:
with open('base_Under_wo_wofs.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)