## 1. Load and encode datasets with the TargetEncoder

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append('../encoders/pattern_preserving/')
from utilities import *

# Load dataset, dropping id column
df = pd.read_csv('train.csv').iloc[0:,1:]
original_cols = df.columns
df.columns = range(len(df.columns))

# Set which variables are categorical
cat_cols = df.columns[:-1]

# Print summary of df
print(df.shape)
df.head()

(300000, 24)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [2]:
# Load encoded dataset if already exists or encode directly
import os.path
import time

if os.path.isfile('encoded_Target.csv'):
    encoded = pd.read_csv('encoded_Target.csv', header=None)
else:
    # Import encoder
    from category_encoders import TargetEncoder
    target = TargetEncoder()
  
    # Introduce categories for numerical features
    temp = set_categories(df[df.columns[:-1]], df.columns[:-1])
    temp[len(temp.columns)] = df[df.columns[-1]]
    df = temp
    
    # Prepare sample to fit encoder
    X = df[df.columns[:-1]]
    y = df[df.columns[-1]]
    
    # Fit encoder
    tic = time.perf_counter()
    target.fit(X, y)
    toc = time.perf_counter()
    print('Fitting completed in', round(toc-tic,1), 'seconds')
    
    # Encode whole dataset, and save for future use
    tic = time.perf_counter()
    encoded = target.transform(df[df.columns[:-1]])
    toc = time.perf_counter()
    print('Transformed full dataset in', round(toc-tic,1), 'seconds')
    encoded[encoded.shape[1]] = df[df.columns[-1]]
    encoded.to_csv('encoded_Target.csv', index=None, header=None)
    
    # Encode test dataset
    test = pd.read_csv('test.csv', low_memory=False)
    test.columns = range(test.shape[1])
    test = test.loc[0:,1:]    
    test.columns = range(test.shape[1])
    test = set_categories(test, test.columns)
    tic = time.perf_counter()
    test = target.transform(test)
    toc = time.perf_counter()
    print('Transformed test dataset in', round(toc-tic,1), 'seconds')
    test.to_csv('test_Target.csv', index=None, header=None)
    
encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.305495,0.329946,0.301541,0.302537,0.290107,0.327145,0.360978,0.307162,0.242813,0.237743,...,0.368421,0.334688,0.403885,0.257877,0.306993,0.208354,0.401186,0.322048,0.244432,0
1,0.305495,0.236159,0.301541,0.302537,0.290107,0.327145,0.290054,0.359209,0.289954,0.304164,...,0.076924,0.278168,0.403885,0.326315,0.206599,0.186877,0.30388,0.340292,0.327496,0
2,0.305495,0.329946,0.301541,0.309384,0.290107,0.24179,0.290054,0.293085,0.289954,0.353951,...,0.172414,0.278168,0.317175,0.403126,0.306993,0.351864,0.206843,0.340292,0.244432,0
3,0.305495,0.236159,0.301541,0.309384,0.290107,0.351052,0.290054,0.307162,0.339793,0.329472,...,0.227273,0.278168,0.403885,0.360961,0.330148,0.208354,0.355985,0.322048,0.255729,1
4,0.305495,0.329946,0.301541,0.309384,0.333773,0.351052,0.290054,0.293085,0.339793,0.329472,...,0.2,0.278168,0.403885,0.225214,0.206599,0.351864,0.404345,0.340292,0.327496,0


## 2. Search for best xgboost parameters with Hyperopt

In [3]:
# Define objective function for Hyperopt (credit to Leonardo Ferreira, 
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)
from sklearn.model_selection import KFold
import xgboost as xgb
import time
from hyperopt import hp, fmin, tpe, space_eval

def objective(params):
    tic = time.perf_counter()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n>> Hyperopt New Run")
    print(f"params = {params}")
    FOLDS = 8
    count=1
    max_rounds = 500
    kf = KFold(n_splits=FOLDS, shuffle=False)
    
    mean_auc = 0.0
    for train_idx, val_idx in kf.split(X, y):
        # Prepare data for xgboost
        dtrain = xgb.DMatrix(X.iloc[train_idx, :], label=y.iloc[train_idx])
        dval = xgb.DMatrix(X.iloc[val_idx, :], label=y.iloc[val_idx])
        evallist = [(dtrain, 'train'), (dval, 'eval')]
        
        params['objective']= 'binary:logistic'
        params['eval_metric']= 'auc'
        params['n_thread'] = 7
        evals_result={}
        bst = xgb.train(params, dtrain, max_rounds, evallist, 
                        early_stopping_rounds=30, verbose_eval=False,
                        evals_result=evals_result) 
        
        auc = evals_result['eval']['auc'][-1]
        tauc = evals_result['train']['auc'][-1]
        mean_auc += auc/FOLDS
        print(f'Fold {count} - train_auc {round(tauc, 5)} - val_auc: {round(auc, 5)}')
        count += 1
        del dtrain, dval, bst
    toc = time.perf_counter()
    print('Run time:', round(toc-tic, 1), 'seconds.')
    print('Mean val_auc:', round(mean_auc,5))
    
    return 1-mean_auc

# Define search space for Hyperopt (based on the one by Leonardo Ferreira, 
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)
space = {
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7]),
    'reg_alpha':  hp.uniform('reg_alpha', 0.05, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.05, 0.4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),
    'gamma': hp.uniform('gamma', 0.01, 0.7),
    'subsample': hp.choice('subsample', [0.5, 0.6, 0.7, 0.8]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 0.8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 0.9) }

In [4]:
# Take sample for xgboost parameter optimisation
sample = encoded.sample(frac=1, replace=False)
X = sample[sample.columns[:-1]]
y = sample[sample.columns[-1]]

# Run hyperopt 
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=15)

# Set best params
best_params = space_eval(space, best)
best_params 

                                                      
>> Hyperopt New Run
params = {'max_depth': 5, 'gamma': '0.059', 'subsample': '0.70', 'reg_alpha': '0.290', 'reg_lambda': '0.172', 'learning_rate': '0.030', 'colsample_bytree': '0.434', 'feature_fraction': '0.688', 'bagging_fraction': '0.539'}
Fold 1 - train_auc 0.84334 - val_auc: 0.83229         
Fold 2 - train_auc 0.84387 - val_auc: 0.82864         
Fold 3 - train_auc 0.8434 - val_auc: 0.8319           
Fold 4 - train_auc 0.84342 - val_auc: 0.8322          
Fold 5 - train_auc 0.84368 - val_auc: 0.82997         
Fold 6 - train_auc 0.84368 - val_auc: 0.82902         
Fold 7 - train_auc 0.84335 - val_auc: 0.83264         
Fold 8 - train_auc 0.84274 - val_auc: 0.83672         
Run time:                                             
431.2                                                 
seconds.                                              
Mean val_auc:                                         
0.83167                                   

{'bagging_fraction': 0.4132451357093028,
 'colsample_bytree': 0.48780293211242975,
 'feature_fraction': 0.795201840658105,
 'gamma': 0.07753569755270401,
 'learning_rate': 0.12535983773459997,
 'max_depth': 3,
 'reg_alpha': 0.14311556668829817,
 'reg_lambda': 0.36067718575997043,
 'subsample': 0.7}

## Train best model further and make predictions to submit

In [5]:
# Train xgboost model further with best params and full dataset
from sklearn.model_selection import train_test_split
train, val = train_test_split(encoded, train_size=0.75, random_state=0)
dtrain = xgb.DMatrix(train[train.columns[:-1]], label=train[train.columns[-1]])
dval = xgb.DMatrix(val[val.columns[:-1]], label=val[val.columns[-1]])

evallist = [(dtrain, 'train'), (dval, 'eval')]
evals_result = {}
best_params['objective']= 'binary:logistic'
best_params['eval_metric']= 'auc'
best_params['n_thread'] = 7
bst = xgb.train(best_params, dtrain, 4000, evallist, early_stopping_rounds=50, 
                verbose_eval=False, evals_result=evals_result) 

print('Train auc '+str(evals_result['train']['auc'][-1])+', Test auc '+str(evals_result['eval']['auc'][-1]))

Train auc 0.840344, Test auc 0.83017


In [6]:
# Load test set
test = pd.read_csv('test_Target.csv', header=None, low_memory=False)
index = pd.read_csv('test.csv', low_memory=False)['id'].values
dtest = xgb.DMatrix(test)
# Make predictions and save
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

if not os.path.exists('submissions'):
    os.mkdir('submissions')
pd.DataFrame({'id':index, 'target':ypred}, index=None).to_csv('./submissions/predicted_Target.csv', index=None)