Todo:
 1. remove noisy samples: one option - remove rows where the number of NaNs > threshold
 2. feature selection (ANOVA, PCA) 
 3. check CatBoost 
 4. check SMOTETomek and undersample & SMOTENC
 
Must:
 1. XGBOOST with missing ==== DONE GOOD ====
 2. random oversample  ===== DONE - BAD ====
 3. Net
 4. impute missing values with a classifier

In [16]:
# NOTE: the original import order is kept on purpose. `from utils import *` sits in the
# middle of this block, so moving it could change which names it shadows.
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from utils import *
from imblearn.under_sampling import RandomUnderSampler
from fancyimpute import KNN
from hyperopt import hp, fmin, tpe, space_eval, STATUS_OK, Trials
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

In [17]:
%%time
##### load the raw frames and separate the target column
train, test = load_data()
train = train.dropna(how='all', axis=0)  # discard rows in which every column is NaN
y_all = train['Class']
train = train.drop(columns='Class')

#columns_to_drop = ['Pre_L_Pupil_Day', 'T_L_Actual_AblDepth']
#train, test = remove_columns(train, test, columns_to_drop)

print("train.shape without missing rows =", train.shape)
print("class count:\n", y_all.value_counts())

##### preprocessing steps: add new steps to this list
preprocessing = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # =========== ACTIVATE in real time ===========
    # print_interval=10000 -> one progress line per 10000 imputed rows
    ('knn-imputation', KNN(k=5, print_interval=10000)),
])

X_all, X_test = preprocess_data(train, test, preprocessing)

# stratified 80/20 hold-out split of the labelled data
X_train, X_val, y_train, y_val = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    stratify=y_all,
    random_state=42,
)

train.shape without missing rows = (30451, 51)
class count:
 0.0    29949
1.0      502
Name: Class, dtype: int64
Imputing row 1/42903 with 10 missing, elapsed time: 813.829
Imputing row 10001/42903 with 10 missing, elapsed time: 842.423
Imputing row 20001/42903 with 11 missing, elapsed time: 876.213
Imputing row 30001/42903 with 14 missing, elapsed time: 897.794
Imputing row 40001/42903 with 18 missing, elapsed time: 917.867
Wall time: 15min 27s


In [18]:
# %%time
# NOTE: this whole cell is a disabled experiment: a fixed-parameter XGBClassifier wrapped
# in a Pipeline, with the SMOTE / RandomOverSampler steps themselves commented out.
# RandomOverSampler is not imported explicitly in the import cell (unless `utils`
# re-exports it) -- import it before re-enabling that step.
# xg = XGBClassifier(n_estimators=40, 
#                    tree_method='gpu_exact', 
#                    predictor='gpu_predictor', 
#                    random_state=52, 
#                    scale_pos_weight=1.2,
#                    max_depth=4, 
#                    n_jobs=-1)

# evaluation =  Pipeline([
# #                 ('smote', SMOTE(random_state=42, ratio=1, n_jobs=-1, k_neighbors=3)),
# #                 ('random_oversample', RandomOverSampler()),
#                 ('xg',xg)
#                 ])
# print(evaluation.get_params().keys())

In [34]:
%%time
# Search space passed to hyperopt's fmin: two hp.choice dimensions over integer grids
# and three hp.uniform dimensions over float ranges.
xg_hyperparams = dict(
    n_estimators=hp.choice('n_estimators', np.arange(30, 300, 10, dtype=int)),
    learning_rate=hp.uniform('learning_rate', 0.1, 0.7),
    colsample_bytree=hp.uniform('colsample_bytree', 0.7, 1),
    max_depth=hp.choice('max_depth', np.arange(3, 15, dtype=int)),
    subsample=hp.uniform('subsample', 0.7, 1),
)

def objective(param_space, report_threshold=0.6575):
    """Hyperopt objective: fit an XGBoost model and score it on the validation split.

    Trains an ``XGBClassifier`` on the notebook-level ``X_train`` / ``y_train`` using the
    sampled hyperparameters and computes ROC AUC on ``X_val`` / ``y_val``.

    Parameters
    ----------
    param_space : dict
        One sampled configuration with the keys ``n_estimators``, ``learning_rate``,
        ``colsample_bytree``, ``max_depth`` and ``subsample``.
    report_threshold : float, optional
        Configurations whose validation AUC is strictly greater than this value are
        printed. Defaults to 0.6575, the cut-off that used to be hard-coded here.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}``; the loss is one minus the AUC.

    Notes
    -----
    The loss is measured on the same ``X_val`` split that later cells use to report AUC,
    so scores of configurations selected through this function are optimistic.
    """
    # Key order matches the order in which the configuration has always been printed.
    tuned_keys = ('n_estimators', 'colsample_bytree', 'learning_rate', 'max_depth', 'subsample')
    tuned = {key: param_space[key] for key in tuned_keys}

    xgbst = XGBClassifier(**tuned,
                          tree_method='gpu_exact',
                          predictor='gpu_predictor',
                          random_state=52,
                          scale_pos_weight=1.2,
                          n_jobs=-1)

    # Disabled variant with early stopping, kept for reference:
    #   xgbst.fit(X_train, y_train,
    #             eval_set=[(X_train, y_train), (X_val, y_val)],
    #             eval_metric="auc",
    #             early_stopping_rounds=100)
    xgbst.fit(X_train, y_train)

    pred = xgbst.predict_proba(X_val)[:, 1]  # probability of the positive class
    auc = roc_auc_score(y_val, pred)
    if auc > report_threshold:
        print("SCORE: {:.4f}".format(auc))
        print(tuned)

    return {'loss': 1 - auc, 'status': STATUS_OK}


# Run the TPE search over xg_hyperparams, minimising the loss returned by objective().
trials = Trials()
best = fmin(fn=objective,
            space=xg_hyperparams,
            algo=tpe.suggest,
            max_evals=250,
            trials=trials)

# fmin reports hp.choice dimensions (n_estimators, max_depth) as *indices* into their
# option arrays rather than the chosen values. `best` is left in that raw form and
# decoded with space_eval so the printed configuration shows real hyperparameter values.
print(space_eval(xg_hyperparams, best))

SCORE: 0.6603
{'n_estimators': 40, 'colsample_bytree': 0.9550575463726813, 'learning_rate': 0.1622497530312635, 'max_depth': 3, 'subsample': 0.7286986128905933}
SCORE: 0.6703
{'n_estimators': 40, 'colsample_bytree': 0.9869441687261314, 'learning_rate': 0.12308477729361539, 'max_depth': 3, 'subsample': 0.7065671864689577}
SCORE: 0.6685
{'n_estimators': 40, 'colsample_bytree': 0.9113033798358997, 'learning_rate': 0.15679758732676893, 'max_depth': 3, 'subsample': 0.7622984946942586}
SCORE: 0.6661
{'n_estimators': 40, 'colsample_bytree': 0.873212334197526, 'learning_rate': 0.10041851556235826, 'max_depth': 3, 'subsample': 0.7610334888865297}
SCORE: 0.6606
{'n_estimators': 40, 'colsample_bytree': 0.9452656412797206, 'learning_rate': 0.14883357354800392, 'max_depth': 3, 'subsample': 0.7349974507286338}
SCORE: 0.6626
{'n_estimators': 40, 'colsample_bytree': 0.918615308314015, 'learning_rate': 0.14613675248363717, 'max_depth': 3, 'subsample': 0.7380120164430177}
SCORE: 0.6726
{'n_estimators': 

In [40]:
# Hand-picked XGBoost configurations (presumably copied from the search output above).
best = [
    {'n_estimators': 40, 'colsample_bytree': 0.9246637836459941, 'learning_rate': 0.1024409296340219, 'max_depth': 3, 'subsample': 0.731947365299333},
    {'n_estimators': 40, 'colsample_bytree': 0.9461492183502577, 'learning_rate': 0.10208853122888187, 'max_depth': 3, 'subsample': 0.7267248973697573},
    {'n_estimators': 40, 'colsample_bytree': 0.9869441687261314, 'learning_rate': 0.12308477729361539, 'max_depth': 3, 'subsample': 0.7065671864689577},
    {'n_estimators': 40, 'colsample_bytree': 0.9575002350389508, 'learning_rate': 0.11293422216136395, 'max_depth': 3, 'subsample': 0.7109511204369361}
]

for config_idx, config in enumerate(best):
    print("config:\n\t{}".format(config))
    xg = XGBClassifier(tree_method='gpu_exact',
                       predictor='gpu_predictor',
                       random_state=52,
                       scale_pos_weight=1.2,
                       n_jobs=-1,
                       **config)

    # Step 1: fit on the training split and report AUC on the held-out validation split.
    xg.fit(X_train, y_train)
    x_val_pred = xg.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, x_val_pred)
    print('auc score on x_val:\n\t{:.4f}'.format(val_auc))
    print()

    # Step 2: refit on all labelled data and write out the test-set probabilities.
    xg.fit(X_all, y_all)
    pred = xg.predict_proba(X_test)[:, 1]
    run_name = "xg_tune_config{}".format(config_idx)
    to_file(pred, run_name, True, run_name)

config:
	{'n_estimators': 40, 'colsample_bytree': 0.9246637836459941, 'learning_rate': 0.1024409296340219, 'max_depth': 3, 'subsample': 0.731947365299333}
auc score on x_val:
	0.6726

config:
	{'n_estimators': 40, 'colsample_bytree': 0.9461492183502577, 'learning_rate': 0.10208853122888187, 'max_depth': 3, 'subsample': 0.7267248973697573}
auc score on x_val:
	0.6716

config:
	{'n_estimators': 40, 'colsample_bytree': 0.9869441687261314, 'learning_rate': 0.12308477729361539, 'max_depth': 3, 'subsample': 0.7065671864689577}
auc score on x_val:
	0.6703

config:
	{'n_estimators': 40, 'colsample_bytree': 0.9575002350389508, 'learning_rate': 0.11293422216136395, 'max_depth': 3, 'subsample': 0.7109511204369361}
auc score on x_val:
	0.6702



In [None]:
import scikitplot as skplt
import matplotlib.pyplot as plt

# This cell used to score `gs`, which no visible cell defines, so it failed with
# NameError on a fresh kernel. It now scores an XGBoost model built from the first
# hand-picked configuration in `best` and fitted on the training split only.
# `xg` from the previous cell is not reused: its last fit was on X_all, which contains
# the validation rows, so a ROC curve on X_val would be leaky.
roc_model = XGBClassifier(**best[0],
                          tree_method='gpu_exact',
                          predictor='gpu_predictor',
                          random_state=52,
                          scale_pos_weight=1.2,
                          n_jobs=-1)
roc_model.fit(X_train, y_train)

y_true = y_val
y_probas = roc_model.predict_proba(X_val)  # full probability matrix, one column per class
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()