In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

import utils
import skutils
import pandas as pd

# Location of the training set, relative to this notebook
TRAIN_DATA = "../../data/train.csv"

# Read the complete training set into a DataFrame
train = pd.read_csv(filepath_or_buffer=TRAIN_DATA)

In [2]:
# Number of distinct values per training column (a missing value counts as one value)
uniques = train.apply(lambda column: column.unique().size)

def transform(data, threshold_unimportant=0.0000001):
    """Split a raw frame into ids, a reduced feature frame and labels.

    The feature frame is `data` without the 'ID' column, without every column
    that has exactly one distinct value according to the notebook-level
    `uniques` series, without the hand-picked correlated columns listed below
    and, when present, without the 'TARGET' column.

    Parameters
    ----------
    data : DataFrame containing an 'ID' column and all columns named below;
        the 'TARGET' column is optional.
    threshold_unimportant : accepted for interface compatibility; it is not
        used anywhere in the body.

    Returns
    -------
    (ids, X, y) where `ids` is data['ID'], `X` is the reduced feature frame and
    `y` is data['TARGET'], or None when `data` has no 'TARGET' column.
    """
    # Columns that hold a single distinct value in the reference `uniques` series
    constant_columns = uniques[uniques == 1].index.tolist()

    # The correlated columns are listed by hand because one column can be
    # correlated with several others.
    correlated_columns = [
        'ind_var1', 'ind_var1_0',
        'ind_var5',
        'ind_var6', 'ind_var6_0', 'num_var6',
        'ind_var8', 'ind_var8_0',
        'ind_var12',
        'ind_var13_corto', 'ind_var13_corto_0', 'ind_var13_medio', 'ind_var13_medio_0',
        'ind_var18', 'ind_var18_0', 'num_var18', 'num_var18_0',
        'ind_var20', 'ind_var20_0',
        'ind_var29', 'ind_var29_0',
        'ind_var24', 'ind_var24_0',
        'ind_var25_0', 'num_var25_0',
        'ind_var26_0', 'num_var26_0',
        'ind_var32_0', 'num_var32_0',
        'ind_var34', 'ind_var34_0', 'num_var34', 'num_var34_0',
        'ind_var37_0', 'num_var37_0',
        'ind_var39',
        'ind_var40', 'num_var40',
        'ind_var44',
        'num_var29_0',
        'num_var13_medio', 'num_var13_medio_0',
        'num_var7_emit_ult1',
        'num_var39',
        'num_var40_0',
        'num_var44_0',
        'saldo_var6',
        'saldo_var18',
        'saldo_var13_medio',
        'saldo_medio_var17_ult3',
        'saldo_medio_var33_ult1',
        'delta_imp_aport_var13_1y3', 'delta_imp_reemb_var13_1y3', 'delta_num_reemb_var13_1y3',
        'delta_imp_aport_var17_1y3', 'delta_imp_reemb_var17_1y3', 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3',
        'delta_imp_aport_var33_1y3', 'delta_imp_reemb_var33_1y3', 'delta_num_reemb_var33_1y3', 'delta_num_trasp_var33_out_1y3',
        'delta_imp_compra_var44_1y3', 'delta_imp_venta_var44_1y3',
        'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var17_out_1y3',
        'delta_imp_trasp_var33_in_1y3', 'delta_imp_trasp_var33_out_1y3',
        'imp_reemb_var17_hace3',
        'imp_reemb_var33_ult1', 'imp_trasp_var33_out_ult1',
        'num_med_var45_ult3',
        'imp_op_var39_efect_ult1',
        'imp_op_var39_efect_ult3',
        'imp_op_var39_ult1',
        'imp_amort_var18_ult1',
        'saldo_medio_var17_hace3', 'saldo_medio_var17_ult1',
        'saldo_medio_var13_medio_ult3',
    ]

    # Row identifiers are returned separately from the features
    ids = data['ID']

    # Remove the identifier, the constant and the correlated columns in one pass
    columns_to_drop = ['ID', *constant_columns, *correlated_columns]
    X = data.drop(columns_to_drop, axis=1)

    # Labels only exist in frames that carry a 'TARGET' column
    if 'TARGET' in data.columns:
        X = X.drop('TARGET', axis=1)
        y = data['TARGET']
    else:
        y = None

    return ids, X, y

In [3]:
from sklearn.metrics import roc_auc_score

# Learning Rates with decay function
def learning_rates(steps, start=1, end=0, start_exp_x=0.0, end_exp_x=7.0):
    """Return `steps` exponentially decaying learning rates as a numpy array.

    The i-th value is ``end + exp(-x_i) * (start - end)`` where ``x_i`` runs
    linearly from `start_exp_x` to `end_exp_x`. With the default
    `start_exp_x` of 0 the first value equals `start`; the last value is
    ``end + exp(-end_exp_x) * (start - end)``, i.e. it approaches `end`
    without reaching it exactly.
    """
    exponents = np.linspace(start_exp_x, end_exp_x, steps)
    decay = np.exp(-exponents)
    return end + decay * (start - end)

def cross_validate_xgb(X_train, y_train, params, fit_params=None, folds=None, scorer=roc_auc_score):
    """Cross-validate an XGBClassifier and summarise test and train scores.

    Parameters
    ----------
    X_train : feature container exposing `.values` (e.g. a pandas DataFrame).
    y_train : label container exposing `.values` (e.g. a pandas Series).
    params : dict of keyword arguments passed to `xgb.XGBClassifier`.
    fit_params : dict of extra keyword arguments forwarded to `clf.fit`, or
        None (the default) for no extra arguments.
    folds : iterable of `(train_idx, test_idx)` pairs used to index the value
        arrays. When None, `skutils.folds(y_train, n_folds=4, stratified=True)`
        is used.
    scorer : callable `scorer(y_true, y_score)` returning a number.

    Returns
    -------
    tuple `(test_mean, test_std, train_mean, train_std)` of the per-fold scores.

    Notes
    -----
    `xgb` is looked up when the function is called, so xgboost must have been
    imported as `xgb` before the first call.
    """
    # A None default avoids sharing one mutable dict between calls
    if fit_params is None:
        fit_params = {}

    if folds is None:
        folds = skutils.folds(y_train, n_folds=4, stratified=True)

    # Convert to arrays once instead of on every access inside the loop
    features = X_train.values
    labels = y_train.values

    test_scores = []
    train_scores = []

    for train_idx, test_idx in folds:

        # A fresh classifier per fold so that no state leaks between folds
        clf = xgb.XGBClassifier(**params)

        # Needs latest XGBoost from chaosmail
        # https://github.com/chaosmail/xgboost
        # Build Instructions: https://github.com/dmlc/xgboost/blob/master/doc/build.md
        # https://github.com/dmlc/xgboost/pull/1018
        clf.fit(features[train_idx], labels[train_idx], verbose=False, **fit_params)

        # Score the held-out part of the fold on the positive-class probability
        y_predict = clf.predict_proba(features[test_idx])[:, 1]
        test_scores.append(scorer(labels[test_idx], y_predict))

        # Score the training part of the fold the same way
        y_predict = clf.predict_proba(features[train_idx])[:, 1]
        train_scores.append(scorer(labels[train_idx], y_predict))

    return np.mean(test_scores), np.std(test_scores), np.mean(train_scores), np.std(train_scores)

# Create an XGB Classifier

In [4]:
from hyperopt import hp
import xgboost as xgb
import numpy as np

# Fixed random seed shared by the classifier and the fold generation
seed = 42

# The final model will use n_total_estimators estimators;
# cross validation only trains n_cv_estimators of them
n_total_estimators = 500
n_cv_estimators = 100

# Split the training frame into ids, features and labels
Ids_train, X, y = transform(train)

# Keyword arguments for xgb.XGBClassifier
params = dict(
    colsample_bytree=0.98406117320608466,
    max_depth=int(4.0),
    min_child_weight=4.0,
    subsample=0.80576464743565634,
    n_estimators=n_cv_estimators,
    seed=seed,
)

# End points of the learning rate decay schedule
learn_params = dict(
    learning_rate_end=0.08303127679504986,
    learning_rate_start=0.11158443875324964,
)

def iter_learning_rates(i, n):
    """Return the learning rate for index `i` of the decay schedule.

    The schedule always spans `n_total_estimators` steps between the start and
    end rates stored in `learn_params`; the argument `n` is accepted but not used.
    """
    schedule = learning_rates(
        n_total_estimators,
        start=learn_params['learning_rate_start'],
        end=learn_params['learning_rate_end'],
    )
    return schedule[i]
        

# Extra keyword arguments forwarded to the classifier's fit call.
# Currently disabled options:
#   'early_stopping_rounds': 20,
#   'eval_set': [(X_test.values, y_test.values)],
# eval_set: set that is used for watching the training performance
fit_params = dict(
    learning_rates=iter_learning_rates,
    eval_metric='auc',
)

# Search space: a single 'random' method with a sampled seed and `fzeros` value
space = hp.choice(
    'method',
    [
        (
            'random',
            dict(
                seed=hp.quniform('seed', 1, 1000000, 1),
                fzeros=hp.uniform('fzeros', 0.5, 1.0),
            ),
        ),
    ],
)

# Find the best balance in the dataset

In [7]:
from hyperopt import fmin, rand, hp, STATUS_OK, Trials

def objective(args):
    """Hyperopt objective: cross-validate XGBoost on one sampled configuration.

    `args` is a `(method, opt)` pair drawn from the search space. For the
    'random' method the training data is replaced by the result of
    `skutils.random_subset` called with `opt['fzeros']` and `opt['seed']`
    (presumably this keeps that fraction of the class-0 rows and all class-1
    rows -- confirm against skutils); any other method uses the full `X`, `y`.

    Returns a result dict whose 'loss' is `1 - mean test score`, together with
    the squared test standard deviation, the status flag and the raw
    train/test statistics.
    """
    method, opt = args

    # Start from the full data set and narrow it down only when requested
    X_train, y_train = X, y
    if method == 'random':
        X_train, y_train = skutils.random_subset(
            X, y,
            dims=[(0, opt['fzeros']), (1, 1.0)],
            seed=int(opt['seed']),
        )

    # Identical fold seed for every trial, so trials differ only in their data
    folds = skutils.folds(y_train, n_folds=4, stratified=True, random_state=seed)

    cv_stats = cross_validate_xgb(X_train, y_train, params, fit_params, folds=folds)
    test_mean, test_std, train_mean, train_std = cv_stats

    print("Cross validation test-auc-mean score %.8f (+- %.8f)" % (test_mean, test_std))
    print("Cross validation train-auc-mean score %.8f (+- %.8f)" % (train_mean, train_std))

    # Entries read by hyperopt
    result = dict(
        loss=1 - test_mean,
        loss_variance=test_std**2,
        status=STATUS_OK,
    )

    # Custom data kept alongside each trial
    result['acu-test-mean'] = test_mean
    result['acu-test-std'] = test_std
    result['acu-train-mean'] = train_mean
    result['acu-train-std'] = train_std

    return result

# Collects the result dictionary of every evaluated configuration
trials = Trials()

# Needs latest hyperopt from chaosmail
# https://github.com/chaosmail/hyperopt
# https://github.com/hyperopt/hyperopt/issues/234
best = fmin(
    fn=objective,
    space=space,
    algo=rand.suggest,
    max_evals=50,
    trials=trials,
)

# Show the parameter assignment that fmin reports as best
print(best)

Cross validation test-auc-mean score 0.83886516 (+- 0.00580546)
Cross validation train-auc-mean score 0.86692373 (+- 0.00129910)
Cross validation test-auc-mean score 0.83731181 (+- 0.00621860)
Cross validation train-auc-mean score 0.86666106 (+- 0.00072476)
Cross validation test-auc-mean score 0.83836851 (+- 0.00367987)
Cross validation train-auc-mean score 0.86595117 (+- 0.00132393)
Cross validation test-auc-mean score 0.83669359 (+- 0.00791563)
Cross validation train-auc-mean score 0.86522596 (+- 0.00236773)
Cross validation test-auc-mean score 0.83712399 (+- 0.00618422)
Cross validation train-auc-mean score 0.86471783 (+- 0.00217692)
Cross validation test-auc-mean score 0.83852269 (+- 0.00589564)
Cross validation train-auc-mean score 0.86724187 (+- 0.00164061)
Cross validation test-auc-mean score 0.83776485 (+- 0.00717386)
Cross validation train-auc-mean score 0.86684070 (+- 0.00218461)
Cross validation test-auc-mean score 0.83706596 (+- 0.00894993)
Cross validation train-auc-mean s

In [8]:
# Order all evaluated trials by ascending loss; the first entry is the best one
sorted_trials = sorted(trials.trials, key=lambda t: t['result']['loss'])

# Result dictionary of the best trial
scores = sorted_trials[0]['result']

print(
    "Cross validation test-auc-mean score %.8f (+- %.8f)"
    % (scores['acu-test-mean'], scores['acu-test-std'])
)
print(
    "Cross validation train-auc-mean score %.8f (+- %.8f)"
    % (scores['acu-train-mean'], scores['acu-train-std'])
)
print(best)

Cross validation test-auc-mean score 0.83978895 (+- 0.00633829)
Cross validation train-auc-mean score 0.86386899 (+- 0.00234691)
{'method': 0, 'seed': 627297.0, 'fzeros': 0.99387885958024613}
