In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
import xgboost as xgb

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier


from sklearn.decomposition import pca

from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.preprocessing import MinMaxScaler

import pickle
import warnings
warnings.filterwarnings('ignore') 

from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
def get_scores(y_test, y_pred):
    scores = []
    
    scores.append(f1_score(y_test, y_pred, average='micro'))
    #print("F1-Score(micro): " + str(scores[-1]))
    
    scores.append(f1_score(y_test, y_pred, average='macro'))
    #print("F1-Score(macro): " + str(scores[-1]))
    
    scores.append(f1_score(y_test, y_pred, average=None))
    #print("F1-Score(None): " + str(scores[-1]))
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    #Sensitivity
    sensitivity = tp / (tp+fn)
    scores.append(tp / (tp+fn))
    #print("Sensitivity: " + str(scores[-1]))
    
    #Specificity
    specificity = tn / (tn+fp)
    scores.append (tn / (tn+fp))
    #print("Specificity: " + str(scores[-1]))
    
    #VPP
    scores.append(tp / (tp+fp))
    #print("VPP: " + str(scores[-1]))
    
    #VPN
    scores.append(tn / (tn+fn))
    #print("VPN: " + str(scores[-1]))
    
    #RVP
    scores.append(sensitivity / (1-specificity))
    #print("RVP: " + str(scores[-1]))
    
    #RVN
    scores.append((1 - sensitivity) / specificity)
    #print("RVN: " + str(scores[-1]))
    
    #ROC_AUC
    scores.append(roc_auc_score(y_test, y_pred))
    #print("ROC_AUC: " + str(scores[-1]))
        
    scores.append([tn, fp, fn, tp])
    
    return scores

### Data Set load and Normalization

In [2]:
df = pd.read_excel('all_releases_only_oo.xlsx')

k_fold = StratifiedKFold(10, random_state=42, shuffle=True)

scaler = MinMaxScaler()

X = df.drop(columns=['Path', 'will_change'])
X = pd.DataFrame(scaler.fit_transform(X))
y = pd.DataFrame(df.loc[:,'will_change'])

X = X.rename(columns={0:'f0', 1:'f1', 2:'f2', 3:'f3'})

### GridSearch Parameters

In [3]:
parameters_svc_linear = {
        'clf__kernel':['linear'],
        'clf__C': [0.002, 1, 512, 1024, 2048], 
        'clf__class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'clf__max_iter': [10000]
    }

parameters_svc_rbf = {
        'clf__kernel':['rbf'],
        'clf__C': [0.002, 1, 512, 1024, 2048], 
        'clf__class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'clf__max_iter': [10000]
    }

parameters_dtc_grid = {
    'clf__criterion':('gini', 'entropy'), 
    'clf__min_samples_split':[0.1, 0.2, 0.3], 
    'clf__max_depth': [1, 10, 30, None],
    'clf__class_weight':[{1:1}, {1:10}, {1:20}, 'balanced'],
    'clf__presort':[False, True],
}

parameters_rfc_grid = { 
    'clf__n_estimators': [10, 50, 90],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__max_depth' : [1, 10, 30, None],
    'clf__min_samples_split': [2, 8, 16],
    'clf__criterion' :['gini', 'entropy'],
    'clf__class_weight':[ {1:1}, {1:10}, {1:15}, {1:20}]
}

parameters_ABC_grid = {
    'clf__base_estimator__criterion' : ["gini", "entropy"],
    'clf__base_estimator__splitter' :   ["best", "random"],
    'clf__base_estimator__max_depth' : [1, 10, 30, None], 
    'clf__base_estimator__class_weight': [{1:1}, {1:10}, {1:15}, {1:20}], 
    'clf__n_estimators':  [10, 50, 90],
}


parameters_knn_grid = {'clf__n_neighbors': list(range(1,30))}


parameters_lr_grid = [{ 
                        'clf__penalty':['l2', 'l1'],
                        'clf__C':[0.0001, 0.001, 0.1, 1, 100, 1000],
                        }]

parameters_GBC = {
    "clf__loss":["deviance"],
    "clf__learning_rate": [0.01, 0.025, 0.05],
    "clf__max_depth": [None, 2, 4, 6, 10],
    "clf__max_features":["log2","sqrt"],
    "clf__criterion": ["friedman_mse",  "mae"],
    'clf__n_estimators': [64, 128, 256, 512] #[1, 2, 4, 16, 32, 64, 128, 256, 512]
    }


param_xgb_grid = {
           'clf__clf__n_estimators':  [10, 50, 90],
           'clf__clf__max_depth' : [1, 10, 30, None],
           'clf__learning_rate': [0.0001],
           'clf__min_child_weight': [1], #tuning min_child_weight subsample colsample_bytree for fighting against overfit
           'clf__random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    'clf__learning_rate': [0.005],
    'clf__n_estimators': [40],
    'clf__num_leaves': [6,8,12,16],
    'clf__boosting_type' : ['gbdt'],
    'clf__objective' : ['binary'],
    'clf__random_state' : [501], # Updated from 'seed'
    'clf__colsample_bytree' : [0.65, 0.66],
    'clf__subsample' : [0.7,0.75],
    'clf__reg_alpha' : [1,1.2],
    'clf__reg_lambda' : [1,1.2,1.4],
}
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}

parameters_bag_grid = {
    'clf__base_estimator__max_depth' : [1, 2, 3, 4, 5],
    'clf__max_samples' : [0.05, 0.1, 0.2, 0.5]
}

In [4]:
results_dict = dict()

## Sampler

In [5]:
sampler = RandomOverSampler(random_state=42)

# Classifiers

### Logistic Regression

In [6]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', LogisticRegression(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Logistic Regression - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Logistic Regression - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7393938553682342
Estimator and parameters: 
{'clf__C': 1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8225377107364685, 0.66]

Fold: 2
ROC_AUC: 0.7062721827861579
Estimator and parameters: 
{'clf__C': 1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8500443655723159, 0.56]

Fold: 3
ROC_AUC: 0.7255434782608696
Estimator and parameters: 
{'clf__C': 1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8260869565217391, 0.62]

Fold: 4
ROC_AUC: 0.7055090815332608
Estimator and parameters: 
{'clf__C': 1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8303730017761989, 0.58]

Fold: 5
ROC_AUC: 0.6691113275654615
Estimator and parameters: 
{'clf__C': 1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8543516873889876, 0.48]

Fold: 6
ROC_AUC: 0.7207500143241848
Estimator and parameters: 
{'clf__C': 100, 'clf__penalty': 'l2'}
[Confusion Matrix]: [0.8285968028419183, 0.61]

Fold: 7
ROC_AUC: 0.7766859565690712
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confusion Matr

In [7]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### LightGBM

In [8]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight']))
        ])

clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['LightGBM - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['LightGBM - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7443711180124224
Estimator and parameters: 
{'clf__boosting_type': 'gbdt', 'clf__colsample_bytree': 0.65, 'clf__learning_rate': 0.005, 'clf__n_estimators': 40, 'clf__num_leaves': 16, 'clf__objective': 'binary', 'clf__random_state': 501, 'clf__reg_alpha': 1.2, 'clf__reg_lambda': 1.4, 'clf__subsample': 0.7}
[Confusion Matrix]: [0.8012422360248447, 0.69]

Fold: 2
ROC_AUC: 0.716573314108252
Estimator and parameters: 
{'clf__boosting_type': 'gbdt', 'clf__colsample_bytree': 0.65, 'clf__learning_rate': 0.005, 'clf__n_estimators': 40, 'clf__num_leaves': 12, 'clf__objective': 'binary', 'clf__random_state': 501, 'clf__reg_alpha': 1.2, 'clf__reg_lambda': 1.2, 'clf__subsample': 0.7}
[Confusion Matrix]: [0.839396628216504, 0.59]

Fold: 3
ROC_AUC: 0.7519132653061225
Estimator and parameters: 
{'clf__boosting_type': 'gbdt', 'clf__colsample_bytree': 0.65, 'clf__learning_rate': 0.005, 'clf__n_estimators': 40, 'clf__num_leaves': 12, 'clf__objective': 'binary', 'clf__random_state': 501

In [9]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### SVM - Linear

In [10]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', SVC(random_state=42))])

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['SVM Linear - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['SVM Linear - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7237688553682342
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8225377107364685, 0.62]

Fold: 2
ROC_AUC: 0.7151452972493345
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.867790594498669, 0.56]

Fold: 3
ROC_AUC: 0.7080468056787932
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8535936113575865, 0.56]

Fold: 4
ROC_AUC: 0.7124706354208445
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8765541740674956, 0.55]

Fold: 5
ROC_AUC: 0.6380421704005043
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.88898756660

In [11]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### SVM - RBF

In [12]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', SVC(random_state=42))])

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['SVM RBF - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['SVM RBF - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7237688553682342
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8225377107364685, 0.62]

Fold: 2
ROC_AUC: 0.7151452972493345
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.867790594498669, 0.56]

Fold: 3
ROC_AUC: 0.7080468056787932
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8535936113575865, 0.56]

Fold: 4
ROC_AUC: 0.7124706354208445
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8765541740674956, 0.55]

Fold: 5
ROC_AUC: 0.6380421704005043
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.88898756660

In [13]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Decision Tree

In [14]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', DecisionTreeClassifier(random_state=42))])

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Decision Tree - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Decision Tree - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7034577417923692
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 30, 'clf__min_samples_split': 0.1, 'clf__presort': False}
[Confusion Matrix]: [0.7506654835847383, 0.66]

Fold: 2
ROC_AUC: 0.6681177905944987
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 0.1, 'clf__presort': False}
[Confusion Matrix]: [0.7737355811889973, 0.56]

Fold: 3
ROC_AUC: 0.7411130212954747
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__min_samples_split': 0.3, 'clf__presort': False}
[Confusion Matrix]: [0.7009760425909495, 0.78]

Fold: 4
ROC_AUC: 0.6852117114536184
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 0.2, 'clf__presort': False}
[Confusion Matrix]: [0.6607460035523979, 0.71]

Fold: 5
ROC_AUC: 0.519094

In [15]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Random Forest

In [16]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', RandomForestClassifier(random_state=42))])

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Random Forest - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Random Forest - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'clf__class_weight': {1: 10}, 'clf__criterion': 'entropy', 'clf__max_depth': 1, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 2, 'clf__n_estimators': 50}
[Confusion Matrix]: [0.0, 1.0]

Fold: 2
ROC_AUC: 0.5
Estimator and parameters: 
{'clf__class_weight': {1: 15}, 'clf__criterion': 'entropy', 'clf__max_depth': 1, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 2, 'clf__n_estimators': 90}
[Confusion Matrix]: [0.0, 1.0]

Fold: 3
ROC_AUC: 0.5
Estimator and parameters: 
{'clf__class_weight': {1: 15}, 'clf__criterion': 'gini', 'clf__max_depth': 1, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 2, 'clf__n_estimators': 90}
[Confusion Matrix]: [0.0, 1.0]

Fold: 4
ROC_AUC: 0.5
Estimator and parameters: 
{'clf__class_weight': {1: 15}, 'clf__criterion': 'gini', 'clf__max_depth': 1, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 2, 'clf__n_estimators': 50}
[Confusion Matrix]: [0.0, 1.0]

Fold: 5
ROC_AUC: 0.5
Estimator

In [17]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### KNN

In [18]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', KNeighborsClassifier())])

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['KNN - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['KNN - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.69838342945874
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.80301685891748, 0.59]

Fold: 2
ROC_AUC: 0.6809838065661047
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.7994676131322094, 0.56]

Fold: 3
ROC_AUC: 0.7144520851818988
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8039041703637977, 0.62]

Fold: 4
ROC_AUC: 0.6155388758379648
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8117229129662522, 0.42]

Fold: 5
ROC_AUC: 0.6639259726121584
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8117229129662522, 0.52]

Fold: 6
ROC_AUC: 0.7060963731163696
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.7992895204262878, 0.61]

Fold: 7
ROC_AUC: 0.7241448461582536
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.7708703374777975, 0.68]

Fold: 8
ROC_AUC: 0.6844955022059246
Estimator and pa

In [19]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### XGBoost

In [20]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', xgb.XGBClassifier(random_state=42))])

clf = GridSearchCV(base_clf, param_xgb_grid,  scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['XGBoost - RandomOverSampler'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['XGBoost - RandomOverSampler'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6997143966282166
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.805678793256433, 0.59]

Fold: 2
ROC_AUC: 0.7050382653061225
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.8163265306122449, 0.59]

Fold: 3
ROC_AUC: 0.7434838065661047
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.7994676131322094, 0.69]

Fold: 4
ROC_AUC: 0.6818168796195496
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.7184724689165186, 0.65

In [21]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)