## 1. Load and encode datasets with the CESAMOEncoder

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load dataset, dropping id column
df = pd.read_csv('train.csv').iloc[0:,1:]
original_cols = df.columns
df.columns = range(len(df.columns))

# Set which variables are categorical
cat_cols = df.columns[:-1]

# Print summary of df
print(df.shape)
df.head()

(300000, 24)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [2]:
# Load encoded dataset if already exists or encode directly
import os.path
import time

if os.path.isfile('encoded_CESAMO.csv'):
    encoded = pd.read_csv('encoded_CESAMO.csv', header=None)
else:
    """ Fitting the cesamo encoder with 50% of the data takes 
        about 5 hours on an AMD Ryzen-5 U3500 CPU 
    """ 
    # Import encoder
    import sys
    sys.path.append('../encoders')
    from cesamo import CESAMOEncoder
    cesamo = CESAMOEncoder(max_sampling=300)
    
    # Prepare sample to fit encoder
    sample = df.sample(frac=0.5, replace=False).reset_index(drop=True)
    X = sample[sample.columns[:-1]]
    y = sample[sample.columns[-1]]
    
    # Fit encoder
    tic = time.perf_counter()
    cesamo.fit(X, y, cat_cols)
    toc = time.perf_counter()
    print('Fitting completed in', round(toc-tic,1), 'seconds')
    
    # Encode whole dataset, and save for future use
    tic = time.perf_counter()
    encoded = cesamo.transform(df[df.columns[:-1]])
    toc = time.perf_counter()
    print('Transform full dataset in', round(toc-tic,1), 'seconds')
    encoded[encoded.shape[1]] = df[df.columns[-1]]
    encoded.to_csv('encoded_CESAMO.csv', index=None, header=None)
    
    # Encode test dataset
    test = pd.read_csv('test.csv', low_memory=False)
    test.columns = range(test.shape[1])
    test = test.loc[0:,1:]    
    test.columns = range(test.shape[1])
    tic = time.perf_counter()
    test = cesamo.transform(test)
    toc = time.perf_counter()
    print('Transform test dataset in', round(toc-tic,1), 'seconds')
    test.to_csv('test_CESAMO.csv', index=None, header=None)
    
encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.899031,0.561248,0.766722,0.216364,0.892412,0.895969,0.581203,0.969714,0.683569,0.096444,...,0.670115,0.392172,0.882236,0.154645,0.640249,0.918886,0.634161,0.373925,0.741033,0
1,0.899031,0.563185,0.766722,0.216364,0.892412,0.895969,0.39641,0.714208,0.947631,0.01864,...,0.27976,0.586907,0.882236,0.367212,0.572052,0.808215,0.272389,0.339732,0.82418,0
2,0.899031,0.561248,0.766722,0.216644,0.892412,0.954332,0.39641,0.840209,0.947631,0.417157,...,0.928569,0.586907,0.745638,0.055358,0.640249,0.622023,0.30883,0.339732,0.741033,0
3,0.899031,0.563185,0.766722,0.216644,0.892412,0.944538,0.39641,0.969714,0.719615,0.173263,...,0.563623,0.586907,0.882236,0.397769,0.257676,0.918886,0.367397,0.373925,0.248084,1
4,0.899031,0.561248,0.766722,0.216644,0.892781,0.944538,0.39641,0.840209,0.719615,0.173263,...,0.202355,0.586907,0.882236,0.104607,0.572052,0.622023,0.319622,0.339732,0.82418,0


## 2. Search for best xgboost parameters with Hyperopt

In [3]:
# Define objective function for Hyperopt (credit to Leonardo Ferreira, 
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)
from sklearn.model_selection import KFold
import xgboost as xgb
import time
from hyperopt import hp, fmin, tpe, space_eval

def objective(params):
    tic = time.perf_counter()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n>> Hyperopt New Run")
    print(f"params = {params}")
    FOLDS = 8
    count=1
    max_rounds = 500
    kf = KFold(n_splits=FOLDS, shuffle=False)
    
    mean_auc = 0.0
    for train_idx, val_idx in kf.split(X, y):
        # Prepare data for xgboost
        dtrain = xgb.DMatrix(X.iloc[train_idx, :], label=y.iloc[train_idx])
        dval = xgb.DMatrix(X.iloc[val_idx, :], label=y.iloc[val_idx])
        evallist = [(dtrain, 'train'), (dval, 'eval')]
        
        params['objective']= 'binary:logistic'
        params['eval_metric']= 'auc'
        params['n_thread'] = 7
        evals_result={}
        bst = xgb.train(params, dtrain, max_rounds, evallist, 
                        early_stopping_rounds=30, verbose_eval=False,
                        evals_result=evals_result) 
        
        auc = evals_result['eval']['auc'][-1]
        tauc = evals_result['train']['auc'][-1]
        mean_auc += auc/FOLDS
        print(f'Fold {count} - train_auc {round(tauc, 5)} - val_auc: {round(auc, 5)}')
        count += 1
        del dtrain, dval, bst
    toc = time.perf_counter()
    print('Run time:', round(toc-tic, 1), 'seconds.')
    print('Mean val_auc:', round(mean_auc,5))
    
    return 1-mean_auc

# Define search space for Hyperopt (based on the one by Leonardo Ferreira, 
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)
space = {
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7]),
    'reg_alpha':  hp.uniform('reg_alpha', 0.05, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.05, 0.4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),
    'gamma': hp.uniform('gamma', 0.01, 0.7),
    'subsample': hp.choice('subsample', [0.5, 0.6, 0.7, 0.8]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 0.8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 0.9) }

In [4]:
# Take sample for xgboost parameter optimisation
sample = encoded.sample(frac=1, replace=False)
X = sample[sample.columns[:-1]]
y = sample[sample.columns[-1]]

# Run hyperopt 
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=15)

# Set best params
best_params = space_eval(space, best)
best_params 

                                                      
>> Hyperopt New Run
params = {'max_depth': 6, 'gamma': '0.566', 'subsample': '0.50', 'reg_alpha': '0.172', 'reg_lambda': '0.288', 'learning_rate': '0.060', 'colsample_bytree': '0.655', 'feature_fraction': '0.477', 'bagging_fraction': '0.780'}
Fold 1 - train_auc 0.81135 - val_auc: 0.76891         
Fold 2 - train_auc 0.81181 - val_auc: 0.76707         
Fold 3 - train_auc 0.81092 - val_auc: 0.76922         
Fold 4 - train_auc 0.81101 - val_auc: 0.7628          
Fold 5 - train_auc 0.81068 - val_auc: 0.76754         
Fold 6 - train_auc 0.81086 - val_auc: 0.76507         
Fold 7 - train_auc 0.81085 - val_auc: 0.77056         
Fold 8 - train_auc 0.8111 - val_auc: 0.76697          
Run time:                                             
491.6                                                 
seconds.                                              
Mean val_auc:                                         
0.76727                                   

{'bagging_fraction': 0.6426262038736648,
 'colsample_bytree': 0.4036760584598187,
 'feature_fraction': 0.6727088496122846,
 'gamma': 0.15290992137796441,
 'learning_rate': 0.13456641357810958,
 'max_depth': 6,
 'reg_alpha': 0.36324121397028947,
 'reg_lambda': 0.15784757773508043,
 'subsample': 0.7}

## Train best model further and make predictions to submit

In [5]:
# Train xgboost model further with best params and full dataset
from sklearn.model_selection import train_test_split
train, val = train_test_split(encoded, train_size=0.75, random_state=0)
dtrain = xgb.DMatrix(train[train.columns[:-1]], label=train[train.columns[-1]])
dval = xgb.DMatrix(val[val.columns[:-1]], label=val[val.columns[-1]])

evallist = [(dtrain, 'train'), (dval, 'eval')]
evals_result = {}
best_params['objective']= 'binary:logistic'
best_params['eval_metric']= 'auc'
best_params['n_thread'] = 7
bst = xgb.train(best_params, dtrain, 4000, evallist, early_stopping_rounds=50, 
                verbose_eval=False, evals_result=evals_result) 

print('Train auc '+str(evals_result['train']['auc'][-1])+', Test auc '+str(evals_result['eval']['auc'][-1]))

Train auc 0.873787, Test auc 0.773439


In [6]:
# Load test set
test = pd.read_csv('test_CESAMO.csv', header=None, low_memory=False)
index = pd.read_csv('test.csv', low_memory=False)['id'].values
dtest = xgb.DMatrix(test)
# Make predictions and save
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

if not os.path.exists('submissions'):
    os.mkdir('submissions')
pd.DataFrame({'id':index, 'target':ypred}, index=None).to_csv('./submissions/predicted_CESAMO.csv', index=None)