In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

import utils
import skutils
import pandas as pd

# Relative paths of the training and the testing CSV files
TRAIN_DATA = "../../data/train.csv"
TEST_DATA = "../../data/test.csv"

# Read both files into DataFrames: the training data first, then the testing data
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, TEST_DATA))

In [2]:
# Count the distinct values (NaN included) of every training column.
# Columns with exactly one distinct value are constant; despite its name,
# zeroMeanFeatures holds the names of those constant columns.
u = train.apply(pd.Series.nunique, dropna=False)
zeroMeanFeatures = list(u.index[(u == 1).values])

def transform(data):
    """Split a raw frame into ids, feature matrix and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'ID' column, the columns listed in the module-level
        ``zeroMeanFeatures`` and, optionally, a 'TARGET' column.

    Returns
    -------
    tuple
        ``(ids, X, y)`` where ``ids`` is the 'ID' column, ``X`` is ``data``
        without 'ID', the ``zeroMeanFeatures`` columns and 'TARGET', and
        ``y`` is the 'TARGET' column or ``None`` when that column is absent.
    """
    has_target = 'TARGET' in data.columns

    ids = data['ID']

    # Remove the identifier together with the constant columns
    features = data.drop(['ID'] + zeroMeanFeatures, axis=1)

    # Unlabelled data: nothing else to strip, no labels to hand back
    if not has_target:
        return ids, features, None

    # Labelled data: the label must not stay among the features
    return ids, features.drop('TARGET', axis=1), data['TARGET']

In [3]:
from sklearn.metrics import roc_auc_score

# Learning rates following an exponential decay
def learning_rates(steps, start=1, end=0, start_exp_x=0.0, end_exp_x=7.0):
    """Return ``steps`` learning rates that decay exponentially towards ``end``.

    The decay factor is ``exp(-x)`` with ``x`` evenly spaced between
    ``start_exp_x`` and ``end_exp_x``; each rate is
    ``end + exp(-x) * (start - end)``. With the default ``start_exp_x=0.0``
    the first rate is ``start``.
    """
    exponents = np.linspace(start_exp_x, end_exp_x, steps)
    decay = np.exp(-exponents)
    return end + decay * (start - end)

def cross_validate_xgb(X_train, y_train, params, fit_params=None, scorer=roc_auc_score,
                       n_folds=5, rep_folds=1, stratified=False, random_state=42):
    """Cross-validate an ``xgb.XGBClassifier`` and summarise the fold scores.

    Parameters
    ----------
    X_train, y_train : objects exposing ``.values``
        Features and labels; folds are taken by indexing their ``.values``.
    params : dict
        Keyword arguments for ``xgb.XGBClassifier``.
    fit_params : dict, optional
        Extra keyword arguments for ``clf.fit``. The dict is copied, so the
        caller's object is never modified. 'early_stopping_rounds' (20) and
        'eval_set' (the held-out fold) are always set by this function.
    scorer : callable
        Called as ``scorer(y_true, y_score)`` on the positive-class
        probabilities.
    n_folds, stratified
        Forwarded to ``skutils.folds``.
    rep_folds : int
        Number of times the whole fold split is repeated, each with a fresh
        seed.
    random_state : int
        Seed for the global numpy generator from which the per-repetition
        fold seeds are drawn.

    Returns
    -------
    tuple
        ``(test_mean, test_std, train_mean, train_std)`` over all folds.
    """
    # Work on a private copy: the entries written below must not leak into
    # the caller's dict (nor into a default shared between calls)
    fit_params = dict(fit_params) if fit_params is not None else {}
    fit_params['early_stopping_rounds'] = 20

    test_scores = []
    train_scores = []

    # Convert once instead of on every fold
    X_values = X_train.values
    y_values = y_train.values

    np.random.seed(random_state)

    # Repeat the CV folds *rep_folds* times
    for _ in range(rep_folds):

        # Create a random seed for each iteration
        fold_state = np.random.randint(0, np.iinfo('i').max)
        folds = skutils.folds(y_train, n_folds=n_folds, stratified=stratified, random_state=fold_state)

        # Do the CV
        for train_idx, test_idx in folds:
            X_fit, y_fit = X_values[train_idx], y_values[train_idx]
            X_held, y_held = X_values[test_idx], y_values[test_idx]

            clf = xgb.XGBClassifier(**params)

            # NOTE(review): the held-out fold is also the early-stopping
            # evaluation set, so it is not fully independent of the fit
            fit_params['eval_set'] = [(X_held, y_held)]

            # Needs latest XGBoost from chaosmail
            # https://github.com/chaosmail/xgboost
            # Build Instructions: https://github.com/dmlc/xgboost/blob/master/doc/build.md
            # https://github.com/dmlc/xgboost/pull/1018
            clf.fit(X_fit, y_fit, verbose=False, **fit_params)

            # Score the held-out fold
            test_scores.append(scorer(y_held, clf.predict_proba(X_held)[:, 1]))

            # Score the fold the model was fitted on
            train_scores.append(scorer(y_fit, clf.predict_proba(X_fit)[:, 1]))

    return np.mean(test_scores), np.std(test_scores), np.mean(train_scores), np.std(train_scores)

# Create an XGB Classifier

In [4]:
from hyperopt import hp
import xgboost as xgb
import numpy as np

# In the end we will use this amount of estimators
n_total_estimators = 350

# Hyperopt search space. The quniform entries are sampled in steps of 1;
# 'seed' is a plain constant and therefore identical in every trial.
params = {
    'max_depth': hp.quniform('max_depth', 2, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 3, 12, 1),
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.8, 1.0),
    # hp.uniform takes (label, low, high), so the lower bound comes first
    'learning_rate_start': hp.uniform('learning_rate_start', 0.3, 0.5),
    'learning_rate_end': hp.uniform('learning_rate_end', 0.01, 0.5),
    'seed': 42,
}

# Keyword arguments shared by every call to the classifier's fit method
fit_params = {
    'eval_metric': 'auc',
}

In [5]:
# Transform the training data: split it into the ID column, the feature
# frame (ID, TARGET and the constant columns removed) and the TARGET labels
Ids_train, X_train, y_train = transform(train)

# Tune the parameters of XGB

In [6]:
from hyperopt import fmin, rand, hp, STATUS_OK, Trials

def objective(params):
    """Hyperopt objective: cross-validate one sampled parameter set.

    Parameters
    ----------
    params : dict
        One sample of the search space; must provide 'max_depth',
        'min_child_weight', 'subsample', 'colsample_bytree',
        'learning_rate_start', 'learning_rate_end' and 'seed'.

    Returns
    -------
    dict
        Hyperopt result with 'loss' (``1 - mean test score``),
        'loss_variance', 'status' and the mean / standard deviation of the
        test and train scores as custom entries.
    """
    cv_params = {
        # hyperopt's quniform yields floats; the tree depth must be an integer
        'max_depth': int(params['max_depth']),
        'min_child_weight': params['min_child_weight'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'n_estimators': n_total_estimators,
        'seed': params['seed'],
    }

    def iter_learning_rates(i, n):
        # Entry i of an n-step decay schedule between the sampled start and
        # end rates (presumably i is the boosting round and n the number of
        # rounds - verify against the patched XGBoost API)
        return learning_rates(
            n, start=params['learning_rate_start'], end=params['learning_rate_end']
        )[i]

    # Build the fit arguments for this trial on a copy, so the module-level
    # fit_params dict is not modified from inside the objective
    trial_fit_params = dict(fit_params, learning_rates=iter_learning_rates)

    test_mean, test_std, train_mean, train_std = cross_validate_xgb(
        X_train, y_train, cv_params, trial_fit_params, rep_folds=1, stratified=True)

    print("Cross validation test-auc-mean score %.8f (+- %.8f)" % (test_mean, test_std))
    print("Cross validation train-auc-mean score %.8f (+- %.8f)" % (train_mean, train_std))
    print("---")

    return {
        # hyperopt minimises the loss, so a higher score means a lower loss
        'loss': 1 - test_mean,
        'loss_variance': test_std**2,
        'status': STATUS_OK,

        # Custom data (the 'acu' spelling is kept: the reporting code reads
        # these exact keys)
        'acu-test-mean': test_mean,
        'acu-test-std': test_std,
        'acu-train-mean': train_mean,
        'acu-train-std': train_std,
    }

# Collects the result dict of every evaluated trial
trials = Trials()

# Random search over the parameter space, 100 evaluations of the objective
# Needs latest hyperopt from chaosmail
# https://github.com/chaosmail/hyperopt
# https://github.com/hyperopt/hyperopt/issues/234
best = fmin(
    fn=objective,
    space=params,
    algo=rand.suggest,
    max_evals=100,
    trials=trials,
)

# Show the parameter values of the best trial
print(best)

Cross validation test-auc-mean score 0.83512188 (+- 0.00185135)
Cross validation train-auc-mean score 0.85290478 (+- 0.00317449)
---
Cross validation test-auc-mean score 0.83753255 (+- 0.00256667)
Cross validation train-auc-mean score 0.88938963 (+- 0.00645788)
---
Cross validation test-auc-mean score 0.81103834 (+- 0.03073157)
Cross validation train-auc-mean score 0.86585047 (+- 0.01469914)
---
Cross validation test-auc-mean score 0.83749120 (+- 0.00293573)
Cross validation train-auc-mean score 0.87985870 (+- 0.00450913)
---
Cross validation test-auc-mean score 0.83732807 (+- 0.00231204)
Cross validation train-auc-mean score 0.89280251 (+- 0.00735566)
---
Cross validation test-auc-mean score 0.83665169 (+- 0.00303986)
Cross validation train-auc-mean score 0.84906622 (+- 0.00168656)
---
Cross validation test-auc-mean score 0.83885734 (+- 0.00390286)
Cross validation train-auc-mean score 0.86367710 (+- 0.00384097)
---
Cross validation test-auc-mean score 0.80875608 (+- 0.03119224)
Cross

In [7]:
# Order the recorded trials by ascending loss, so the best one comes first
sorted_trials = list(trials.trials)
sorted_trials.sort(key=lambda trial: trial['result']['loss'])

# Result dict returned by the objective for the best trial
scores = sorted_trials[0]['result']

test_score = (scores['acu-test-mean'], scores['acu-test-std'])
train_score = (scores['acu-train-mean'], scores['acu-train-std'])

# One standard deviation around the mean test score
interval = (scores['acu-test-mean']-scores['acu-test-std'], scores['acu-test-mean']+scores['acu-test-std'])

print("Cross validation test-auc-mean score %.8f (+- %.8f)" % test_score)
print("Cross validation train-auc-mean score %.8f (+- %.8f)" % train_score)
print("---")
print("Public score should be in interval [%.8f, %.8f]" % interval)
print("---")
print(best)

Cross validation test-auc-mean score 0.83949041 (+- 0.00377496)
Cross validation train-auc-mean score 0.86452560 (+- 0.00305782)
---
Public score should be in interval [0.83571545, 0.84326537]
---
{'min_child_weight': 8.0, 'subsample': 0.85950852308428294, 'learning_rate_start': 0.49052436785544767, 'learning_rate_end': 0.28108248900522836, 'max_depth': 4.0, 'colsample_bytree': 0.99488319773882816}


# Make Submission

In [8]:
# Split the testing data into ids, features and labels (the labels are None
# when the frame has no TARGET column)
Ids_test_, X_test_, y_test_ = transform(test)

# Seriously, this must go in the preprocessing
# X_, y_ = skutils.random_subset(X_train, y_train, dims=[(0, fzeros), (1, 1.0)], seed=subset_seed)
X_, y_ = X_train, y_train

# Classifier settings of the best trial, cast to plain Python numbers
best_params = {
    'max_depth': int(best['max_depth']),
    'min_child_weight': int(best['min_child_weight']),
    'subsample': float(best['subsample']),
    'colsample_bytree': float(best['colsample_bytree']),
    'n_estimators': n_total_estimators,
    'seed': params['seed'],
}

def iter_learning_rates(i, n):
    """Return entry *i* of the *n*-step decay schedule of the best trial."""
    return learning_rates(
        n, start=best['learning_rate_start'], end=best['learning_rate_end']
    )[i]

# Fit arguments for the final model, built as a separate dict so the shared
# fit_params is left untouched. The model is fitted on all training rows, so
# there is no held-out fold: early stopping and the eval set are disabled.
final_fit_params = dict(
    fit_params,
    learning_rates=iter_learning_rates,
    early_stopping_rounds=None,
    eval_set=None,
)

clf = xgb.XGBClassifier(**best_params)
clf.fit(X_.values, y_.values, verbose=False, **final_fit_params)

# Column 1 of predict_proba is used as the TARGET score
y_pred = clf.predict_proba(X_test_.values)

submission = pd.DataFrame({"ID":Ids_test_, "TARGET":y_pred[:,1]})
submission.to_csv("submissions/submission_%s.csv" % utils.timestamp(), index=False)

# Public Score: 0.836987