In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
import xgboost as xgb

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier


from sklearn.decomposition import pca

from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.preprocessing import MinMaxScaler

import pickle
import warnings
warnings.filterwarnings('ignore') 

In [2]:
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score
def get_scores(y_test, y_pred):
    scores = []
    
    scores.append(f1_score(y_test, y_pred, average='micro'))
    #print("F1-Score(micro): " + str(scores[-1]))
    
    scores.append(f1_score(y_test, y_pred, average='macro'))
    #print("F1-Score(macro): " + str(scores[-1]))
    
    scores.append(f1_score(y_test, y_pred, average=None))
    #print("F1-Score(None): " + str(scores[-1]))
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    #Sensitivity
    sensitivity = tp / (tp+fn)
    scores.append(tp / (tp+fn))
    #print("Sensitivity: " + str(scores[-1]))
    
    #Specificity
    specificity = tn / (tn+fp)
    scores.append (tn / (tn+fp))
    #print("Specificity: " + str(scores[-1]))
    
    #VPP
    scores.append(tp / (tp+fp))
    #print("VPP: " + str(scores[-1]))
    
    #VPN
    scores.append(tn / (tn+fn))
    #print("VPN: " + str(scores[-1]))
    
    #RVP
    scores.append(sensitivity / (1-specificity))
    #print("RVP: " + str(scores[-1]))
    
    #RVN
    scores.append((1 - sensitivity) / specificity)
    #print("RVN: " + str(scores[-1]))
    
    #ROC_AUC
    scores.append(roc_auc_score(y_test, y_pred))
    #print("ROC_AUC: " + str(scores[-1]))
        
    scores.append([tn, fp, fn, tp])
    
    return scores

### Data Set load and Normalization

In [3]:
df = pd.read_excel('all_releases_only_smells.xlsx')

k_fold = StratifiedKFold(10, random_state=42, shuffle=True)

scaler = MinMaxScaler()

X = df.drop(columns=['Path', 'will_change'])
X = pd.DataFrame(scaler.fit_transform(X))
y = pd.DataFrame(df.loc[:,'will_change'])

### GridSearch Parameters

In [4]:
parameters_svc_linear = {
        'kernel':['linear'],
        'C': [0.002, 1, 512, 1024, 2048], 
        'class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'max_iter': [10000]
    }

parameters_svc_rbf = {
        'kernel':['rbf'],
        'C': [0.002, 1, 512, 1024, 2048], 
        'class_weight':[{1:1}, {1:10}, {1:15}, {1:20}],
        'max_iter': [10000]
    }

parameters_dtc_grid = {
    'criterion':('gini', 'entropy'), 
    'min_samples_split':[0.1, 0.2, 0.3], 
    'max_depth': [1, 10, 30, None],
    'class_weight':[{1:1}, {1:10}, {1:20}, 'balanced'],
    'presort':[False, True],
}

parameters_rfc_grid = { 
    'n_estimators': [10, 50, 90],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [1, 10, 30, None],
    'min_samples_split': [2, 8, 16],
    'criterion' :['gini', 'entropy'],
    'class_weight':[ {1:1}, {1:10}, {1:15}, {1:20}]
}

parameters_ABC_grid = {
    'base_estimator__criterion' : ["gini", "entropy"],
    'base_estimator__splitter' :   ["best", "random"],
    'base_estimator__max_depth' : [1, 10, 30, None], 
    'base_estimator__class_weight': [{1:1}, {1:10}, {1:15}, {1:20}], 
    'n_estimators':  [10, 50, 90],
}


parameters_knn_grid = {'n_neighbors': list(range(1,30))}


parameters_lr_grid = [{ 
                        'penalty':['l2', 'l1'],
                        'C':[0.0001, 0.001, 0.1, 1, 100, 1000],
                        }]

parameters_GBC = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05],
    "max_depth": [None, 2, 4, 6, 10],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    'n_estimators': [64, 128, 256, 512] #[1, 2, 4, 16, 32, 64, 128, 256, 512]
    }


param_xgb_grid = {
           'clf__n_estimators':  [10, 50, 90],
           'clf__max_depth' : [1, 10, 30, None],
           'learning_rate': [0.0001],
           'min_child_weight': [1], #tuning min_child_weight subsample colsample_bytree for fighting against overfit
           'random_state': [42] #ensemble xgboost with multiple seeds may reduce variance
             }

param_lgm_grid = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
}
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}

parameters_bag_grid = {
    'base_estimator__max_depth' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]
}

In [5]:
results_dict = dict()

# Classifiers

### Logistic Regression

In [6]:
%%time

base_clf = LogisticRegression(random_state=42)

clf = GridSearchCV(base_clf, parameters_lr_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Logistic Regression - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Logistic Regression - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.515625
Estimator and parameters: 
{'C': 100, 'penalty': 'l1'}
[Confusion Matrix]: [1.0, 0.03]

Fold: 2
ROC_AUC: 0.53125
Estimator and parameters: 
{'C': 1, 'penalty': 'l1'}
[Confusion Matrix]: [1.0, 0.06]

Fold: 3
ROC_AUC: 0.49955634427684115
Estimator and parameters: 
{'C': 100, 'penalty': 'l2'}
[Confusion Matrix]: [0.9991126885536823, 0.0]

Fold: 4
ROC_AUC: 0.49955595026642985
Estimator and parameters: 
{'C': 1, 'penalty': 'l2'}
[Confusion Matrix]: [0.9991119005328597, 0.0]

Fold: 5
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 1, 'penalty': 'l1'}
[Confusion Matrix]: [1.0, 0.0]

Fold: 6
ROC_AUC: 0.5161290322580645
Estimator and parameters: 
{'C': 1, 'penalty': 'l2'}
[Confusion Matrix]: [1.0, 0.03]

Fold: 7
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 1, 'penalty': 'l2'}
[Confusion Matrix]: [1.0, 0.0]

Fold: 8
ROC_AUC: 0.49822380106571934
Estimator and parameters: 
{'C': 100, 'penalty': 'l2'}
[Confusion Matrix]: [0.9964476021314387, 0.0]

Fold: 9
ROC_AUC: 0.4986678

In [7]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### LightGBM

In [8]:
%%time

base_clf = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

clf = GridSearchCV(base_clf, param_lgm_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['LightGBM - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['LightGBM - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.005, 'n_estimators': 40, 'num_leaves': 6, 'objective': 'binary', 'random_state': 501, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.75}
[Confusion Matrix]: [1.0, 0.0]

Fold: 2
ROC_AUC: 0.5
Estimator and parameters: 
{'boosting_type': 'gbdt', 'colsample_bytree': 0.66, 'learning_rate': 0.005, 'n_estimators': 40, 'num_leaves': 6, 'objective': 'binary', 'random_state': 501, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.75}
[Confusion Matrix]: [1.0, 0.0]

Fold: 3
ROC_AUC: 0.5
Estimator and parameters: 
{'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.005, 'n_estimators': 40, 'num_leaves': 8, 'objective': 'binary', 'random_state': 501, 'reg_alpha': 1, 'reg_lambda': 1.2, 'subsample': 0.7}
[Confusion Matrix]: [1.0, 0.0]

Fold: 4
ROC_AUC: 0.5
Estimator and parameters: 
{'boosting_type': 'gbdt', 'colsample_bytree': 0.66, 'learning_rate': 0.005, 'n_estim

In [9]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### SVM - Linear

In [10]:
%%time

base_clf = SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_linear, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['SVM Linear - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['SVM Linear - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.649040594498669
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 20}, 'kernel': 'linear', 'max_iter': 10000}
[Confusion Matrix]: [0.735581188997338, 0.56]

Fold: 2
ROC_AUC: 0.602165594498669
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 20}, 'kernel': 'linear', 'max_iter': 10000}
[Confusion Matrix]: [0.735581188997338, 0.47]

Fold: 3
ROC_AUC: 0.6677711845607809
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 20}, 'kernel': 'linear', 'max_iter': 10000}
[Confusion Matrix]: [0.7417923691215617, 0.59]

Fold: 4
ROC_AUC: 0.6677648541797971
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 20}, 'kernel': 'linear', 'max_iter': 10000}
[Confusion Matrix]: [0.7548845470692718, 0.58]

Fold: 5
ROC_AUC: 0.6692402452300463
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 20}, 'kernel': 'linear', 'max_iter': 10000}
[Confusion Matrix]: [0.7255772646536413, 0.61]

Fold: 6
ROC_AUC: 0.6790093393685899
Estimator and parameters: 
{

In [11]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### SVM - RBF

In [12]:
%%time

base_clf = SVC(random_state=42)

clf = GridSearchCV(base_clf, parameters_svc_rbf, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['SVM RBF - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['SVM RBF - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 10}, 'kernel': 'rbf', 'max_iter': 10000}
[Confusion Matrix]: [1.0, 0.0]

Fold: 2
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 10}, 'kernel': 'rbf', 'max_iter': 10000}
[Confusion Matrix]: [1.0, 0.0]

Fold: 3
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 20}, 'kernel': 'rbf', 'max_iter': 10000}
[Confusion Matrix]: [1.0, 0.0]

Fold: 4
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 10}, 'kernel': 'rbf', 'max_iter': 10000}
[Confusion Matrix]: [1.0, 0.0]

Fold: 5
ROC_AUC: 0.5072480375866614
Estimator and parameters: 
{'C': 1, 'class_weight': {1: 10}, 'kernel': 'rbf', 'max_iter': 10000}
[Confusion Matrix]: [0.9822380106571936, 0.03]

Fold: 6
ROC_AUC: 0.5
Estimator and parameters: 
{'C': 0.002, 'class_weight': {1: 10}, 'kernel': 'rbf', 'max_iter': 10000}
[Confusion Matrix]: [1.0, 0.0]

Fold: 7
ROC_AUC: 0.5
Estimator and parameters: 
{

In [13]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Decision Tree

In [14]:
%%time

base_clf = DecisionTreeClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_dtc_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Decision Tree - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Decision Tree - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 0.2, 'presort': False}
[Confusion Matrix]: [1.0, 0.0]

Fold: 2
ROC_AUC: 0.6136590505767524
Estimator and parameters: 
{'class_weight': {1: 10}, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 0.3, 'presort': False}
[Confusion Matrix]: [0.9148181011535049, 0.31]

Fold: 3
ROC_AUC: 0.7172665261756876
Estimator and parameters: 
{'class_weight': {1: 10}, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 0.2, 'presort': False}
[Confusion Matrix]: [0.9032830523513753, 0.53]

Fold: 4
ROC_AUC: 0.6661605454649631
Estimator and parameters: 
{'class_weight': {1: 10}, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 0.2, 'presort': False}
[Confusion Matrix]: [0.9129662522202486, 0.42]

Fold: 5
ROC_AUC: 0.5
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 0.2, 'presort': False}
[

In [15]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Random Forest

In [16]:
%%time

base_clf = RandomForestClassifier(random_state=42)

clf = GridSearchCV(base_clf, parameters_rfc_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['Random Forest - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['Random Forest - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 16, 'n_estimators': 50}
[Confusion Matrix]: [1.0, 0.0]

Fold: 2
ROC_AUC: 0.5
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 16, 'n_estimators': 50}
[Confusion Matrix]: [1.0, 0.0]

Fold: 3
ROC_AUC: 0.5
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 16, 'n_estimators': 90}
[Confusion Matrix]: [1.0, 0.0]

Fold: 4
ROC_AUC: 0.49955595026642985
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 16, 'n_estimators': 50}
[Confusion Matrix]: [0.9991119005328597, 0.0]

Fold: 5
ROC_AUC: 0.5
Estimator and parameters: 
{'class_weight': {1: 1}, 'criterion': 'entropy', 'max_depth': 10

In [17]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### KNN

In [18]:
%%time

base_clf = KNeighborsClassifier()

clf = GridSearchCV(base_clf, parameters_knn_grid, scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['KNN - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['KNN - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 29}
[Confusion Matrix]: [1.0, 0.0]

Fold: 2
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 29}
[Confusion Matrix]: [1.0, 0.0]

Fold: 3
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 27}
[Confusion Matrix]: [1.0, 0.0]

Fold: 4
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 29}
[Confusion Matrix]: [1.0, 0.0]

Fold: 5
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 29}
[Confusion Matrix]: [1.0, 0.0]

Fold: 6
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 26}
[Confusion Matrix]: [1.0, 0.0]

Fold: 7
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 28}
[Confusion Matrix]: [1.0, 0.0]

Fold: 8
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 29}
[Confusion Matrix]: [1.0, 0.0]

Fold: 9
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 28}
[Confusion Matrix]: [1.0, 0.0]

Fold: 10
ROC_AUC: 0.5
Estimator and parameters: 
{'n_neighbors': 26}
[Confusion Matrix]: [1.0, 0.0]


In [19]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### XGBoost

In [20]:
%%time

base_clf = xgb.XGBClassifier(random_state=42)

clf = GridSearchCV(base_clf, param_xgb_grid,  scoring='roc_auc', n_jobs=-1, cv=10)

scores_list = list()

results_dict['XGBoost - Imbalanced'] = dict()

for k, (train, test) in enumerate(k_fold.split(X, y)):
    clf.fit(X.iloc[train], y.iloc[train])
    
    print('Fold: ' + str(k + 1))
    
    print('ROC_AUC: ' + str(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test]))))
    print('Estimator and parameters: ')
    print(clf.best_params_)
    cnf_matrix = confusion_matrix(y.iloc[test], clf.predict(X.iloc[test]))
    cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
    print('[Confusion Matrix]: [' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']')
    print()
    
    results_dict['XGBoost - Imbalanced'][k + 1] = [get_scores(y.iloc[test], clf.predict(X.iloc[test])),str(clf.best_params_),'[' + str(cnf_matrix[0][0]) + ', ' + str(round(cnf_matrix[1][1],2)) + ']']
    
    scores_list.append(roc_auc_score(y.iloc[test], clf.predict(X.iloc[test])))
    
print('Mean: ' + str(np.mean(scores_list)))
print('Std deviation: ' + str(np.std(scores_list)))

Fold: 1
ROC_AUC: 0.5
Estimator and parameters: 
{'clf__max_depth': 1, 'clf__n_estimators': 10, 'learning_rate': 0.0001, 'min_child_weight': 1, 'random_state': 42}
[Confusion Matrix]: [1.0, 0.0]

Fold: 2
ROC_AUC: 0.49955634427684115
Estimator and parameters: 
{'clf__max_depth': 1, 'clf__n_estimators': 10, 'learning_rate': 0.0001, 'min_child_weight': 1, 'random_state': 42}
[Confusion Matrix]: [0.9991126885536823, 0.0]

Fold: 3
ROC_AUC: 0.515625
Estimator and parameters: 
{'clf__max_depth': 1, 'clf__n_estimators': 10, 'learning_rate': 0.0001, 'min_child_weight': 1, 'random_state': 42}
[Confusion Matrix]: [1.0, 0.03]

Fold: 4
ROC_AUC: 0.5318140147825589
Estimator and parameters: 
{'clf__max_depth': 1, 'clf__n_estimators': 10, 'learning_rate': 0.0001, 'min_child_weight': 1, 'random_state': 42}
[Confusion Matrix]: [0.9991119005328597, 0.06]

Fold: 5
ROC_AUC: 0.5
Estimator and parameters: 
{'clf__max_depth': 1, 'clf__n_estimators': 10, 'learning_rate': 0.0001, 'min_child_weight': 1, 'random_s

In [21]:
with open('partial_results_imbalanced.pickle', 'wb') as handle:
    pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)