In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from utils import *
from imblearn.under_sampling import RandomUnderSampler
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.externals import joblib
from sklearn.ensemble import VotingClassifier
# from fancyimpute import KNN


import warnings
warnings.filterwarnings('ignore')

In [4]:
%%time
##### load the raw frames, drop empty rows, split off the target
train, test = load_data()

# Discard only the rows in which *every* column is NaN; partially
# filled rows are kept.
train = train.dropna(how='all', axis=0)

# Separate the label column from the feature columns.
y_all = train['Class']
train = train.drop('Class', axis=1)

print("train.shape without missing rows =", train.shape)
print("class count:\n", y_all.value_counts())

train.shape without missing rows = (30451, 51)
class count:
 0.0    29949
1.0      502
Name: Class, dtype: int64
Wall time: 335 ms


In [5]:
##### preprocessing pipeline -- append new steps to the list below
preprocessing = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # ('knn-imputation', KNN(k=5)),   # disabled; needs fancyimpute
])

# NOTE(review): preprocess_data comes from utils; how it fits/applies the
# pipeline to train vs. test is not visible here -- confirm.
X_all, X_test = preprocess_data(train, test, preprocessing)

# Stratified 80/20 hold-out split so the validation part keeps the class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    stratify=y_all,
    random_state=42,
)

In [6]:
# Baseline XGBoost classifier with hand-picked settings; it is later used as
# one member of the soft-voting ensemble.
xg = XGBClassifier(
    n_estimators=40,
    max_depth=4,
    scale_pos_weight=1.2,
    random_state=52,
    tree_method='gpu_exact',
    predictor='gpu_predictor',
    n_jobs=-1,
)

# Disabled experiment: resampling steps in front of the classifier.
# evaluation =  Pipeline([
#                 ('smote', SMOTE(random_state=42, ratio=1, n_jobs=-1, k_neighbors=3)),
#                 ('random_oversample', RandomOverSampler()),
#                 ('xg',xg)
#                 ])

In [8]:
### hyperopt search space for the XGBoost model

# Discrete candidate grids for the hp.choice dimensions.
N_ESTIMATORS_CHOICES = np.arange(30, 300, 10, dtype=int)
MAX_DEPTH_CHOICES = np.arange(3, 15, dtype=int)

xg_hyperparams = {
    'n_estimators': hp.choice('n_estimators', N_ESTIMATORS_CHOICES),
    'learning_rate': hp.uniform('learning_rate', 0.1, 0.7),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1),
    'max_depth': hp.choice('max_depth', MAX_DEPTH_CHOICES),
    'subsample': hp.uniform('subsample', 0.7, 1),
}

# Validation AUC threshold: objective() only prints trials that beat it.
val_last_score = 0.6575

def objective(param_space):
    """Hyperopt objective: fit one XGBoost model and score it on the hold-out split.

    Parameters
    ----------
    param_space : dict
        One sampled configuration. The keys 'n_estimators', 'learning_rate',
        'colsample_bytree', 'max_depth' and 'subsample' are read.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}`` where ``auc`` is the ROC AUC
        on (X_val, y_val); minimising the loss maximises the AUC.

    Notes
    -----
    Reads the notebook-level names X_train, y_train, X_val, y_val and
    val_last_score. When the AUC exceeds val_last_score, the score and the
    sampled configuration are printed.
    """
    tuned_keys = ('n_estimators', 'colsample_bytree', 'learning_rate',
                  'max_depth', 'subsample')
    sampled = {key: param_space[key] for key in tuned_keys}

    # Fixed settings first, then the sampled hyper-parameters.
    model = XGBClassifier(
        tree_method='gpu_exact',
        predictor='gpu_predictor',
        random_state=52,
        scale_pos_weight=1.2,
        n_jobs=-1,
        **sampled
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = model.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, positive_proba)

    if val_auc > val_last_score:
        print("SCORE: {:.4f}".format(val_auc))
        print(sampled)

    return {'loss': 1 - val_auc, 'status': STATUS_OK}

def get_best_params(max_evals=250):
    """Run the TPE search over ``xg_hyperparams`` and return the best configuration.

    Parameters
    ----------
    max_evals : int, optional
        Number of configurations to evaluate with ``objective`` (default 250,
        the value that was previously hard-coded).

    Returns
    -------
    dict
        Best hyper-parameters as *actual values*, ready to be unpacked into
        ``XGBClassifier(**best)``.

    Notes
    -----
    ``fmin`` reports ``hp.choice`` dimensions ('n_estimators', 'max_depth')
    as the *index* of the selected option, not the option itself. Returning
    that raw dict would make callers train with e.g. ``max_depth=0``. The raw
    result is therefore decoded against the search space with ``space_eval``.
    """
    # Local import: the notebook's import cell does not bring in space_eval.
    from hyperopt import space_eval

    trials = Trials()
    best_raw = fmin(fn=objective,
                    space=xg_hyperparams,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials)

    # Map choice indices back to the real parameter values.
    best = space_eval(xg_hyperparams, best_raw)

    # The choice grids are numpy arrays; hand plain Python ints to callers.
    return {name: (int(value) if isinstance(value, np.integer) else value)
            for name, value in best.items()}


In [11]:
# Hyper-parameters hard-coded in this cell (presumably copied from an earlier
# search run -- confirm) so the notebook does not have to repeat the search.
best = {
    'n_estimators': 40,
    'colsample_bytree': 0.9869441687261314,
    'learning_rate': 0.12308477729361539,
    'max_depth': 3,
    'subsample': 0.7065671864689577,
}

# Tuned classifier: values from `best` plus the same fixed settings as the baseline.
xg_tune = XGBClassifier(
    **best,
    scale_pos_weight=1.2,
    random_state=52,
    tree_method='gpu_exact',
    predictor='gpu_predictor',
    n_jobs=-1
)


In [12]:
# Soft-voting ensemble of the baseline and the tuned XGBoost models.
ensemble_members = [('xg', xg), ('xg_tune', xg_tune)]
xg_vote = VotingClassifier(voting='soft', estimators=ensemble_members)

# 1) Fit on the training split and report ROC AUC on the hold-out split.
xg_vote.fit(X_train, y_train)
x_val_pred = xg_vote.predict_proba(X_val)[:, 1]
vote_val_auc = roc_auc_score(y_val, x_val_pred)
print('auc score on x_val:\n\t{:.4f}'.format(vote_val_auc))

# 2) Refit on all labelled data, then predict positive-class probabilities
#    for the test set.
xg_vote.fit(X_all, y_all)
pred = xg_vote.predict_proba(X_test)[:, 1]



auc score on x_val:
	0.6565


In [13]:
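# submit
# NOTE: the call below is disabled. As written it would raise a SyntaxError if
# uncommented (positional argument "submission" follows the keyword argument
# to_kaggle=True); reorder the arguments to match to_file's signature first.
# to_file(pred, "sub", to_kaggle=True, "submission")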