In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
import xgboost as xgb

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier


from sklearn.decomposition import pca

from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.preprocessing import MinMaxScaler

import pickle
import warnings
warnings.filterwarnings('ignore') 

from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
def get_scores(y_test, y_pred):
    scores = []
    
    scores.append(f1_score(y_test, y_pred, average='micro'))
    #print("F1-Score(micro): " + str(scores[-1]))
    
    scores.append(f1_score(y_test, y_pred, average='macro'))
    #print("F1-Score(macro): " + str(scores[-1]))
    
    scores.append(f1_score(y_test, y_pred, average=None))
    #print("F1-Score(None): " + str(scores[-1]))
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    #Sensitivity
    sensitivity = tp / (tp+fn)
    scores.append(tp / (tp+fn))
    #print("Sensitivity: " + str(scores[-1]))
    
    #Specificity
    specificity = tn / (tn+fp)
    scores.append (tn / (tn+fp))
    #print("Specificity: " + str(scores[-1]))
    
    #VPP
    scores.append(tp / (tp+fp))
    #print("VPP: " + str(scores[-1]))
    
    #VPN
    scores.append(tn / (tn+fn))
    #print("VPN: " + str(scores[-1]))
    
    #RVP
    scores.append(sensitivity / (1-specificity))
    #print("RVP: " + str(scores[-1]))
    
    #RVN
    scores.append((1 - sensitivity) / specificity)
    #print("RVN: " + str(scores[-1]))
    
    #ROC_AUC
    scores.append(roc_auc_score(y_test, y_pred))
    #print("ROC_AUC: " + str(scores[-1]))
        
    scores.append([tn, fp, fn, tp])
    
    return scores

### Data Set load and Normalization

In [2]:
df = pd.read_excel('all_releases_only_oo.xlsx')

k_fold = StratifiedKFold(10, random_state=42, shuffle=True)

scaler = MinMaxScaler()

X = df.drop(columns=['Path', 'will_change'])
X = pd.DataFrame(scaler.fit_transform(X))
y = pd.DataFrame(df.loc[:,'will_change'])

X = X.rename(columns={0:'f0', 1:'f1', 2:'f2', 3:'f3'})

In [3]:
X.head()

Unnamed: 0,f0,f1,f2,f3
0,0.0,0.004098,0.0,0.017857
1,0.0,0.004098,0.0,0.017857
2,0.12963,0.018443,0.0,0.071429
3,0.006173,0.014344,0.0,0.035714
4,0.006173,0.004098,0.0,0.017857


### GridSearch Parameters

In [4]:
parameters_svc_linear = {
        'clf__kernel':['linear'],
        'clf__C': [0.002, 1, 512, 1024, 2048], 
        'clf__class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'clf__max_iter': [10000]
    }

parameters_svc_rbf = {
        'clf__kernel':['rbf'],
        'clf__C': [0.002, 1, 512, 1024, 2048], 
        'clf__class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'clf__max_iter': [10000]
    }

parameters_dtc_grid = {
    'clf__criterion':('gini', 'entropy'), 
    'clf__min_samples_split':[0.1, 0.2, 0.3], 
    'clf__max_depth': [1, 10, 30, None],
    'clf__class_weight':[{1:1}, {1:10}, {1:20}, 'balanced'],
    'clf__presort':[False, True],
}

parameters_rfc_grid = { 
    'clf__n_estimators': [10, 50, 90],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__max_depth' : [1, 10, 30, None],
    'clf__min_samples_split': [2, 8, 16],
    'clf__criterion' :['gini', 'entropy'],
    'clf__class_weight':[ {1:1}, {1:10}, {1:15}, {1:20}]
}

parameters_ABC_grid = {
    'clf__base_estimator__criterion' : ["gini", "entropy"],
    'clf__base_estimator__splitter' :   ["best", "random"],
    'clf__base_estimator__max_depth' : [1, 10, 30, None], 
    'clf__base_estimator__class_weight': [{1:1}, {1:10}, {1:15}, {1:20}], 
    'clf__n_estimators':  [10, 50, 90],
}


parameters_knn_grid = {'clf__n_neighbors': list(range(1,30))}


parameters_lr_grid = [{ 
                        'clf__penalty':['l2', 'l1'],
                        'clf__C':[0.0001, 0.001, 0.1, 1, 100, 1000],
                        }]

parameters_GBC = {
    "clf__loss":["deviance"],
    "clf__learning_rate": [0.01, 0.025, 0.05],
    "clf__max_depth": [None, 2, 4, 6, 10],
    "clf__max_features":["log2","sqrt"],
    "clf__criterion": ["friedman_mse",  "mae"],
    'clf__n_estimators': [64, 128, 256, 512] #[1, 2, 4, 16, 32, 64, 128, 256, 512]
    }


param_xgb_grid = {
           'clf__clf__n_estimators':  [10, 50, 90],
           'clf__clf__max_depth' : [1, 10, 30, None],
           'clf__learning_rate': [0.0001],
           'clf__min_child_weight': [1], #tuning min_child_weight subsample colsample_bytree for fighting against overfit
           'clf__random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    'clf__learning_rate': [0.005],
    'clf__n_estimators': [40],
    'clf__num_leaves': [6,8,12,16],
    'clf__boosting_type' : ['gbdt'],
    'clf__objective' : ['binary'],
    'clf__random_state' : [501], # Updated from 'seed'
    'clf__colsample_bytree' : [0.65, 0.66],
    'clf__subsample' : [0.7,0.75],
    'clf__reg_alpha' : [1,1.2],
    'clf__reg_lambda' : [1,1.2,1.4],
}
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}

parameters_bag_grid = {
    'clf__base_estimator__max_depth' : [1, 2, 3, 4, 5],
    'clf__max_samples' : [0.05, 0.1, 0.2, 0.5]
}

In [5]:
results_dict = dict()

## Sampler

In [6]:
sampler = SMOTE(random_state=42)

# Classifiers

### Logistic Regression

In [7]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', LogisticRegression(random_state=42))
    ])

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Logistic Regression - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Logistic Regression - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7380628881987578
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8198757763975155, 0.66]

Fold: 2
ROC_AUC: 0.7040539041703637
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8456078083407276, 0.56]

Fold: 3
ROC_AUC: 0.7237688553682342
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8225377107364685, 0.62]

Fold: 4
ROC_AUC: 0.7050650317996906
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8294849023090586, 0.58]

Fold: 5
ROC_AUC: 0.6785796138199737
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confusion Matrix]: [0.8410301953818827, 0.52]

Fold: 6
ROC_AUC: 0.7189738153899042
Estimator and parameters: 
{'clf__C': 1, 'clf__penalty': 'l2'}
[Confusion Matrix]: [0.8250444049733571, 0.61]

Fold: 7
ROC_AUC: 0.7731335587005099
Estimator and parameters: 
{'clf__C': 0.1, 'clf__penalty': 'l1'}
[Confus

In [8]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### LightGBM

In [9]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight']))
        ])

clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['LightGBM - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['LightGBM - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7135647737355811
Estimator and parameters: 
{'clf__boosting_type': 'gbdt', 'clf__colsample_bytree': 0.65, 'clf__learning_rate': 0.005, 'clf__n_estimators': 40, 'clf__num_leaves': 12, 'clf__objective': 'binary', 'clf__random_state': 501, 'clf__reg_alpha': 1.2, 'clf__reg_lambda': 1.2, 'clf__subsample': 0.75}
[Confusion Matrix]: [0.8021295474711624, 0.62]

Fold: 2
ROC_AUC: 0.7223408385093166
Estimator and parameters: 
{'clf__boosting_type': 'gbdt', 'clf__colsample_bytree': 0.65, 'clf__learning_rate': 0.005, 'clf__n_estimators': 40, 'clf__num_leaves': 16, 'clf__objective': 'binary', 'clf__random_state': 501, 'clf__reg_alpha': 1, 'clf__reg_lambda': 1, 'clf__subsample': 0.7}
[Confusion Matrix]: [0.8509316770186336, 0.59]

Fold: 3
ROC_AUC: 0.7094748225377108
Estimator and parameters: 
{'clf__boosting_type': 'gbdt', 'clf__colsample_bytree': 0.65, 'clf__learning_rate': 0.005, 'clf__n_estimators': 40, 'clf__num_leaves': 12, 'clf__objective': 'binary', 'clf__random_state': 501,

In [10]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### SVM - Linear

In [11]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', SVC(random_state=42))])

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['SVM Linear - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['SVM Linear - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7156860026619343
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8376220053238687, 0.59]

Fold: 2
ROC_AUC: 0.7138143300798581
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.865128660159716, 0.56]

Fold: 3
ROC_AUC: 0.7022792812777285
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.842058562555457, 0.56]

Fold: 4
ROC_AUC: 0.7206067724746462
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8605683836589698, 0.58]

Fold: 5
ROC_AUC: 0.6331576233312325
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.879218472468

In [12]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### SVM - RBF

In [13]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', SVC(random_state=42))])

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['SVM RBF - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['SVM RBF - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7156860026619343
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8376220053238687, 0.59]

Fold: 2
ROC_AUC: 0.7138143300798581
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.865128660159716, 0.56]

Fold: 3
ROC_AUC: 0.7022792812777285
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.842058562555457, 0.56]

Fold: 4
ROC_AUC: 0.7206067724746462
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.8605683836589698, 0.58]

Fold: 5
ROC_AUC: 0.6331576233312325
Estimator and parameters: 
{'clf__C': 1, 'clf__class_weight': {1: 1}, 'clf__kernel': 'linear', 'clf__max_iter': 10000}
[Confusion Matrix]: [0.879218472468

In [14]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Decision Tree

In [15]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', DecisionTreeClassifier(random_state=42))])

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Decision Tree - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Decision Tree - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6593417258207632
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 0.1, 'clf__presort': False}
[Confusion Matrix]: [0.7249334516415262, 0.59]

Fold: 2
ROC_AUC: 0.69084128216504
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 0.1, 'clf__presort': False}
[Confusion Matrix]: [0.7879325643300799, 0.59]

Fold: 3
ROC_AUC: 0.6721106921029281
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__min_samples_split': 0.1, 'clf__presort': False}
[Confusion Matrix]: [0.7817213842058562, 0.56]

Fold: 4
ROC_AUC: 0.6791525812181287
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__min_samples_split': 0.1, 'clf__presort': False}
[Confusion Matrix]: [0.7131438721136767, 0.65]

Fold: 5
ROC_AUC: 0.60740273

In [16]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Random Forest

In [17]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', RandomForestClassifier(random_state=42))])

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Random Forest - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Random Forest - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6587594276841171
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 16, 'clf__n_estimators': 90}
[Confusion Matrix]: [0.9112688553682342, 0.41]

Fold: 2
ROC_AUC: 0.5795530168589175
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 16, 'clf__n_estimators': 90}
[Confusion Matrix]: [0.9716060337178349, 0.19]

Fold: 3
ROC_AUC: 0.6877911490683231
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 16, 'clf__n_estimators': 90}
[Confusion Matrix]: [0.906832298136646, 0.47]

Fold: 4
ROC_AUC: 0.7024150575832235
Estimator and parameters: 
{'clf__class_weight': {1: 1}, 'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf

In [18]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### KNN

In [19]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', KNeighborsClassifier())])

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['KNN - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['KNN - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.6934061668145519
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8243123336291038, 0.56]

Fold: 2
ROC_AUC: 0.6621561668145519
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8243123336291038, 0.5]

Fold: 3
ROC_AUC: 0.7228815439219167
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8207630878438332, 0.62]

Fold: 4
ROC_AUC: 0.644545350369564
Estimator and parameters: 
{'clf__n_neighbors': 28}
[Confusion Matrix]: [0.8374777975133215, 0.45]

Fold: 5
ROC_AUC: 0.6401048530338624
Estimator and parameters: 
{'clf__n_neighbors': 28}
[Confusion Matrix]: [0.8285968028419183, 0.45]

Fold: 6
ROC_AUC: 0.7386552455165301
Estimator and parameters: 
{'clf__n_neighbors': 27}
[Confusion Matrix]: [0.8321492007104796, 0.65]

Fold: 7
ROC_AUC: 0.7861542428235834
Estimator and parameters: 
{'clf__n_neighbors': 29}
[Confusion Matrix]: [0.8303730017761989, 0.74]

Fold: 8
ROC_AUC: 0.6701426688821406
Estimator and 

In [20]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### XGBoost

In [21]:
%%time

base_clf = Pipeline([
        ('sampling', sampler),
        ('clf', xgb.XGBClassifier(random_state=42))])

clf = GridSearchCV(base_clf, param_xgb_grid,  scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['XGBoost - SMOTE'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['XGBoost - SMOTE'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.7001580523513754
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.8065661047027507, 0.59]

Fold: 2
ROC_AUC: 0.6748142191659272
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.6308784383318545, 0.72]

Fold: 3
ROC_AUC: 0.704192546583851
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.6583850931677019, 0.75]

Fold: 4
ROC_AUC: 0.6476107259496935
Estimator and parameters: 
{'clf__clf__max_depth': 1, 'clf__clf__n_estimators': 10, 'clf__learning_rate': 0.0001, 'clf__min_child_weight': 1, 'clf__random_state': 42}
[Confusion Matrix]: [0.5532859680284192, 0.74

In [22]:
with open('partial_results_undersample.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)