## 1. Load and encode datasets with the EntityEmbeddingEncoder

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load dataset, dropping id column
df = pd.read_csv('train.csv').iloc[0:,1:]
original_cols = df.columns
df.columns = range(len(df.columns))

# Set which variables are categorical
cat_cols = df.columns[:-1]

# Print summary of df
print(df.shape)
df.head()

(300000, 24)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [2]:
# Load test dataset (only for this encoder)
test = pd.read_csv('test.csv', low_memory=False)
test.columns = range(test.shape[1])
test = test.loc[0:,1:]    
test.columns = range(test.shape[1])
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0,0,1,T,Y,Blue,Triangle,Axolotl,Finland,Piano,...,9d117320c,3c49b42b8,2,Novice,Warm,j,P,be,5,11
1,0,0,0,T,N,Red,Square,Lion,Canada,Piano,...,46ae3059c,285771075,1,Master,Lava Hot,l,A,RP,7,5
2,1,0,1,F,Y,Blue,Square,Dog,China,Piano,...,b759e21f0,6f323c53f,2,Expert,Freezing,a,G,tP,1,12
3,0,0,1,T,Y,Red,Star,Cat,China,Piano,...,0b6ec68ff,b5de3dcc4,1,Contributor,Lava Hot,b,Q,ke,2,3
4,0,1,1,F,N,Red,Trapezoid,Dog,China,Piano,...,f91f3b1ee,967cfa9c9,3,Grandmaster,Lava Hot,l,W,qK,4,11


In [3]:
# Load encoded dataset if already exists or encode directly
import os.path
import time

if os.path.isfile('encoded_EntityEmbedding.csv'):
    encoded = pd.read_csv('encoded_EntityEmbedding.csv', header=None, low_memory=False)
else:
    # Import encoder
    import sys
    sys.path.append('../encoders')
    from entity_embedding import EntityEmbeddingEncoder
    entity = EntityEmbeddingEncoder(epochs=10, dense_layers_sizes=(200,100), dropout=True)
    
    # Prepare data to fit encoder
    X = df[df.columns[:-1]]
    y = df[df.columns[-1]]
    
    # Fit encoder
    tic = time.perf_counter()
    entity.fit(X, y, cat_cols=cat_cols, verbose=True, test=test)
    toc = time.perf_counter()
    print('Fitting completed in', round(toc-tic,1), 'seconds')
    
    # Encode whole dataset, and save for future use
    tic = time.perf_counter()
    encoded = entity.transform(df[df.columns[:-1]])
    toc = time.perf_counter()
    print('Transformed full dataset in', round(toc-tic,1), 'seconds')
    encoded[encoded.shape[1]] = df[df.columns[-1]]
    encoded.to_csv('encoded_EntityEmbedding.csv', index=None, header=None)
    
    # Encode test dataset
    tic = time.perf_counter()
    test = entity.transform(test)
    toc = time.perf_counter()
    print('Transformed test dataset in', round(toc-tic,1), 'seconds')
    test.to_csv('test_EntityEmbedding.csv', index=None, header=None)
    
encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,219,220,221,222,223,224,225,226,227,228
0,0.010083,0.054558,0.035305,0.017554,0.072943,0.045411,-0.003575,-0.125848,0.043817,0.046323,...,0.012554,-0.020566,-0.001665,0.037157,0.028859,0.032746,-0.031049,0.053178,-0.116814,0
1,0.010083,-0.161606,0.035305,0.017554,0.072943,0.045411,-0.003575,0.024588,-0.001618,-0.04175,...,-0.004317,-0.058213,0.032167,0.065228,0.008721,-0.010989,0.015391,-0.027246,0.039602,0
2,0.010083,0.054558,0.035305,-0.030663,0.072943,-0.114537,0.089466,0.024588,-0.001618,-0.04175,...,-0.069591,-0.058213,0.032167,0.065228,0.028859,0.032746,-0.031049,0.053178,-0.116814,0
3,0.010083,-0.161606,0.035305,-0.030663,0.072943,0.049918,-0.090379,0.024588,-0.001618,-0.04175,...,-0.010577,-0.020566,-0.001665,0.037157,0.02845,0.078041,-0.048122,0.04596,-0.074072,1
4,0.010083,0.054558,0.035305,-0.030663,-0.074199,0.049918,-0.090379,0.024588,-0.001618,-0.04175,...,-0.029268,-0.058213,0.032167,0.065228,0.008721,-0.010989,0.015391,-0.027246,0.039602,0


## 2. Search for best xgboost parameters with Hyperopt

In [12]:
# Define objective function for Hyperopt (credit to Leonardo Ferreira, 
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)
from sklearn.model_selection import KFold
import xgboost as xgb
import time
from hyperopt import hp, fmin, tpe, space_eval

def objective(params):
    tic = time.perf_counter()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n>> Hyperopt New Run")
    print(f"params = {params}")
    FOLDS = 8
    count=1
    max_rounds = 600
    kf = KFold(n_splits=FOLDS, shuffle=False)
    
    mean_auc = 0.0
    for train_idx, val_idx in kf.split(X, y):
        # Prepare data for xgboost
        dtrain = xgb.DMatrix(X.iloc[train_idx, :], label=y.iloc[train_idx])
        dval = xgb.DMatrix(X.iloc[val_idx, :], label=y.iloc[val_idx])
        evallist = [(dtrain, 'train'), (dval, 'eval')]
        
        params['objective']= 'binary:logistic'
        params['eval_metric']= 'auc'
        params['n_thread'] = 7
        evals_result={}
        bst = xgb.train(params, dtrain, max_rounds, evallist, 
                        early_stopping_rounds=30, verbose_eval=False,
                        evals_result=evals_result) 
        
        auc = evals_result['eval']['auc'][-1]
        mean_auc += auc/FOLDS
        print(f'Fold {count} - val_auc: {round(auc, 5)}')
        count += 1
        del dtrain, dval, bst
    toc = time.perf_counter()
    print('Run time:', round(toc-tic, 1), 'seconds.')
    print('Mean val_auc:', round(mean_auc,5))
    
    return 1-mean_auc

# Define search space for Hyperopt (based on the one by Leonardo Ferreira, 
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)
space = {
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7]),
    'reg_alpha':  hp.uniform('reg_alpha', 0.05, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.05, 0.4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8),
    'gamma': hp.uniform('gamma', 0.01, 0.7),
    'subsample': hp.choice('subsample', [0.5, 0.6, 0.7, 0.8]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 0.8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 0.9) }

In [13]:
# Take sample for xgboost parameter optimisation
sample = encoded.sample(frac=0.5, replace=False)
X = sample[sample.columns[:-1]]
y = sample[sample.columns[-1]]

# Run hyperopt 
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10)

# Set best params
best_params = space_eval(space, best)
best_params 

                                                      
>> Hyperopt New Run
params = {'max_depth': 3, 'gamma': '0.580', 'subsample': '0.60', 'reg_alpha': '0.304', 'reg_lambda': '0.295', 'learning_rate': '0.046', 'colsample_bytree': '0.417', 'feature_fraction': '0.651', 'bagging_fraction': '0.504'}
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

job exception: name 'X' is not defined



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


NameError: name 'X' is not defined

## Train further best model and make predictions to submit

In [5]:
# Train xgboost model further with best params and full dataset
from sklearn.model_selection import train_test_split
train, val = train_test_split(encoded, train_size=0.75, random_state=0)
dtrain = xgb.DMatrix(train[train.columns[:-1]], label=train[train.columns[-1]])
dval = xgb.DMatrix(val[val.columns[:-1]], label=val[val.columns[-1]])

evallist = [(dtrain, 'train'), (dval, 'eval')]
evals_result = {}
best_params['objective']= 'binary:logistic'
best_params['eval_metric']= 'auc'
best_params['n_thread'] = 7
bst = xgb.train(best_params, dtrain, 4000, evallist, early_stopping_rounds=50, 
                verbose_eval=False, evals_result=evals_result) 

print('Train auc '+str(evals_result['train']['auc'][-1])+', Test auc '+str(evals_result['eval']['auc'][-1]))

Train auc 0.871617, Test auc 0.823029


In [6]:
# Load test set
test = pd.read_csv('test_EntityEmbedding.csv', header=None, low_memory=False)
index = pd.read_csv('test.csv', low_memory=False)['id'].values
dtest = xgb.DMatrix(test)
# Make predictions and save
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

if not os.path.exists('submissions'):
    os.mkdir('submissions')
pd.DataFrame({'id':index, 'target':ypred}, index=None).to_csv('./submissions/predicted_EntityEmbedding.csv', index=None)