In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import time
import sys
# Make the modules under ./encoders/ importable from this notebook.
sys.path.append('./encoders/')

# Load files. header=None: the first CSV row is read as data, so the
# columns are labelled with integer positions.
train = pd.read_csv(
    './unshuffled/train_unshuffled.csv',
    header=None,
)
test = pd.read_csv(
    './unshuffled/test_unshuffled.csv',
    header=None,
)

In [9]:
# Metric
def mape(y_true, y_pred):
    """Return the Mean Absolute Percentage Error (MAPE) as a fraction.

    Computes ``sum(|y_true - y_pred| / y_true) / n``. The result is a
    fraction (0.25 means 25 %), it is not multiplied by 100.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets. Every value must be strictly positive.
    y_pred : array-like
        Predictions, one per target. Column-vector input such as shape
        ``(n, 1)`` is flattened before the comparison.

    Returns
    -------
    numpy.float64
        The mean relative error.

    Raises
    ------
    ValueError
        If the inputs are empty or do not hold the same number of values.
    AssertionError
        If any target is not strictly positive.
    """
    # Flatten both inputs: without this, an (n, 1) prediction array
    # broadcasts against an (n,) target into an (n, n) matrix and the
    # returned score is silently wrong.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    if y_true.size == 0:
        raise ValueError("mape() requires at least one target value")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must hold the same number of values, got "
            + str(y_true.size) + " and " + str(y_pred.size)
        )
    # Raised explicitly (instead of a bare `assert`) so the check survives
    # `python -O`; the exception type is kept for backward compatibility.
    if not np.all(y_true > 0):
        raise AssertionError("all y_true values must be strictly positive")

    relative_error = np.abs((y_true - y_pred) / y_true)
    return np.sum(relative_error) / y_true.size

In [14]:
# Column indices 0-6 are the ones handed to the encoder as `cat_cols`.
cat_cols = list(range(7))

# Features are every column except the last; the target is the last column.
Xtrain, ytrain = train[train.columns[:-1]], train[train.columns[-1]]
Xtest, ytest = test[test.columns[:-1]], test[test.columns[-1]]

# Entity Embedding Encoder
from entities_embedding import EntityEmbeddingEncoder
entity = EntityEmbeddingEncoder(epochs=10)

# Fit encoder on the training split; only the fit_transform call is timed.
tic = time.perf_counter()
Xtrain = entity.fit_transform(Xtrain, ytrain, cat_cols)
toc = time.perf_counter()
print("Fitting completed in ", round(toc - tic, 2), "sec")

# Encode the test split with the encoder fitted above.
Xtest = entity.transform(Xtest, cat_cols)

Fitting completed in  330.94 sec


### XGBoost

In [15]:
# Training matrix for XGBoost.
dtrain = xgb.DMatrix(Xtrain, label=ytrain)

# NOTE: `dtest` is an alias of `Xtest` here (no copy), so assigning the
# column labels below also relabels `Xtest` for every later cell.
dtest = Xtest
# Relabel the 54 columns as the strings '0' .. '53'.
dtest.columns = [str(position) for position in range(54)]

# From here on `dtest` is the DMatrix, while `Xtest` stays a DataFrame.
dtest = xgb.DMatrix(dtest, label=ytest)

In [16]:
# Parameters, taken from 'Entity Embeddings of Categorical Variables'
param = {
    'max_depth': 10,
    'eta': 0.02,
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'nthread': 6,
    'eval_metric': 'mae',
    'verbosity': 0,
}

# Evaluation sets; defined for optional monitoring but not passed to
# xgb.train below.
evallist = [(dtest, 'eval'), (dtrain, 'train')]

# Training
num_round = 3000
bst = xgb.train(param, dtrain, num_round)
# To persist the trained booster: bst.save_model('xgboost_model')

In [17]:
# Load model if trained earlier
#bst = xgb.Booster({'nthread': 4})  # init model
#bst.load_model('xgboost_model')  # load data

# Evaluation on test data: append the XGBoost MAPE to the results log.
# The `with` block closes the file even if predict() or mape() raises,
# which the previous open()/close() pair did not guarantee.
with open('results.txt', 'a+') as file:
    file.write(' xgboost: '+str(mape(ytest, bst.predict(dtest)))+'\n')

### KNN

In [18]:
from sklearn.neighbors import KNeighborsRegressor

# 10 neighbours, Minkowski metric with p=1, neighbours weighted by
# distance; n_jobs=-1 requests all available cores.
KNN = KNeighborsRegressor(
    n_neighbors=10,
    weights='distance',
    p=1,
    n_jobs=-1,
)
KNN.fit(Xtrain, ytrain)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
                    weights='distance')

In [19]:
# Append the KNN MAPE on the test split to the results log. The `with`
# block closes the file even if predict() or mape() raises, which the
# previous open()/close() pair did not guarantee.
with open('results.txt', 'a+') as file:
    file.write(' KNN: '+str(mape(ytest, KNN.predict(Xtest)))+'\n')

### Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

# 200 trees with depth capped at 35; n_jobs=-1 requests all available
# cores. No random_state is set, so results vary from run to run.
RFR = RandomForestRegressor(
    n_estimators=200,
    max_depth=35,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
)
RFR.fit(Xtrain, ytrain)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=35, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [21]:
# Append the random-forest MAPE on the test split to the results log. The
# `with` block closes the file even if predict() or mape() raises, which
# the previous open()/close() pair did not guarantee.
with open('results.txt', 'a+') as file:
    file.write(' RFR: '+str(mape(ytest, RFR.predict(Xtest)))+'\n')

## Neural network: following "Entity Embeddings of Categorical Variables", the final prediction is the average of the predictions of 5 neural networks

In [3]:
# Entity Embedding Encoder
from entities_embedding import EntityEmbeddingEncoder
def train_and_predict(X,y,Xt):
    """Fit a fresh EntityEmbeddingEncoder on (X, y) and predict for Xt.

    A new encoder is created with ``epochs=10`` on every call, so repeated
    calls train independent models. `Xt` is passed through the encoder's
    ``burst_and_ohencode`` and then through ``encoder.model.predict``; the
    resulting items are joined along axis 0.

    NOTE(review): presumably ``model.predict`` yields one row per sample
    (e.g. shape (n, 1)), in which case the concatenation produces a flat
    vector of n predictions -- confirm against EntityEmbeddingEncoder.
    """
    encoder = EntityEmbeddingEncoder(epochs=10)
    encoder.fit(X, y)
    encoded_inputs = encoder.burst_and_ohencode(Xt)
    raw_outputs = encoder.model.predict(encoded_inputs)
    return np.concatenate(raw_outputs, axis=0)

# Datasets: rebuild frames/series from the raw values, so index and column
# labels are reset to default integer ranges. Features are every column
# except the last; the target is the last column.
X, y = (
    pd.DataFrame(train[train.columns[:-1]].values),
    pd.Series(train[train.columns[-1]].values),
)
Xt, yt = (
    pd.DataFrame(test[test.columns[:-1]].values),
    pd.Series(test[test.columns[-1]].values),
)

# Get the predictions of five models, each trained from scratch
predictions = [train_and_predict(X, y, Xt) for _ in range(5)]

In [13]:
# Calculate mean predictions as final predictions (element-wise average
# over the models collected in `predictions`).
mean_predictions = np.mean(predictions, axis=0)

# Append the ensemble MAPE to the results log. The `with` block closes the
# file even if mape() raises, which the previous open()/close() pair did
# not guarantee.
with open('results.txt', 'a+') as file:
    file.write(' NN: '+str(mape(yt, mean_predictions))+'\n')