In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd

In [2]:
# Metric
def mape(y_true, y_pred):
    """Return the Mean Absolute Percentage Error (MAPE) as a fraction.

    Computes ``sum(|y_true - y_pred| / y_true) / len(y_true)``; the result is
    NOT multiplied by 100.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets. Every value must be strictly positive, since
        each error is divided by the corresponding target.
    y_pred : array-like
        Predictions. When it holds the same number of elements as ``y_true``
        but in a different shape (e.g. an ``(n, 1)`` column against an
        ``(n,)`` target), it is reshaped to ``y_true``'s shape so the
        subtraction stays element-wise instead of silently broadcasting to an
        ``(n, n)`` matrix and producing a wrong score.

    Returns
    -------
    float
        The mean relative absolute error.

    Raises
    ------
    AssertionError
        If any value of ``y_true`` is not strictly positive.
    ValueError
        If ``y_true`` is empty or the inputs cannot be broadcast together.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # Vectorized positivity check (avoids a Python-level loop over the data).
    assert y_true.min() > 0, "mape requires strictly positive y_true"
    # Same element count but different layout: align shapes explicitly rather
    # than letting NumPy broadcast the two operands against each other.
    if y_pred.shape != y_true.shape and y_pred.size == y_true.size:
        y_pred = y_pred.reshape(y_true.shape)
    relative_error = np.abs((y_true - y_pred) / y_true)
    return np.sum(relative_error) / len(y_true)

In [3]:
import time

# Load the pre-split train/test sets. The CSVs are read without a header row;
# the last column is used as the target and every other column as a feature.
train = pd.read_csv('./unshuffled/train_unshuffled.csv', header=None)
test = pd.read_csv('./unshuffled/test_unshuffled.csv', header=None)
Xtrain = train[train.columns[:-1]]
ytrain = train[train.columns[-1]]
Xtest = test[test.columns[:-1]]
ytest = test[test.columns[-1]]

# Target Encoder
# Only ImportError is caught, and it is re-raised with a clear message:
# continuing without the encoder would just fail a few lines below with a
# NameError on `target`, and a bare `except:` would also hide unrelated errors.
try:
    from category_encoders import TargetEncoder
except ImportError as err:
    raise ImportError("Module category_encoders not available") from err
target = TargetEncoder()

# Fit encoder on the training split only (timed); the test split is then
# transformed with the already-fitted encoder.
tic = time.perf_counter()
Xtrain = target.fit_transform(Xtrain, ytrain)
toc = time.perf_counter()
print("Fitting completed in ",round(toc-tic,2), "sec")
Xtest = target.transform(Xtest)

Fitting completed in  0.8 sec


### XGBOOST 

In [10]:
# Wrap the training split in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(Xtrain, label=ytrain)

# NOTE: `dtest` is the very same object as `Xtest` at this point, so the
# column rename below also renames Xtest's columns in place.
dtest = Xtest
# Rename the test columns to '0', '1', ... The count is taken from the frame
# itself instead of a hard-coded 7, so the cell keeps working if the number of
# features changes.
# NOTE(review): presumably done so the test feature names match those of
# dtrain -- confirm against the encoder's output column names.
cols = [str(i) for i in range(len(dtest.columns))]
dtest.columns = cols
dtest = xgb.DMatrix(dtest, label=ytest)

In [11]:
# Hyper-parameters, taken from 'Entity Embeddings of Categorical Variables'
param = {
    'max_depth': 10,
    'eta': 0.02,
    'objective': 'reg:linear',
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'nthread': 8,
    'eval_metric': 'mae',
    'verbosity': 1,
}
# (DMatrix, name) pairs for monitoring. Neither this list nor early stopping
# is passed to xgb.train below.
evallist = [(dtest, 'eval'), (dtrain, 'train')]
# Training
num_round = 3000
bst = xgb.train(param, dtrain, num_boost_round=num_round)
#bst.save_model('xgboost_model')



In [12]:
# Load model if trained earlier
#bst = xgb.Booster({'nthread': 4})  # init model
#bst.load_model('xgboost_model')  # load data
# Evaluation on test data
# Append the XGBoost MAPE to the shared results file. The with-block closes
# the handle even if prediction, scoring or the write itself raises.
with open('results.txt', 'a+') as file:
    file.write(' xgboost: '+str(mape(ytest, bst.predict(dtest)))+'\n')

### KNN

In [13]:
from sklearn.neighbors import KNeighborsRegressor

# 10-nearest-neighbour regressor: neighbours are weighted by distance, the
# Minkowski metric is used with p=1, and n_jobs=-1 parallelises the search.
KNN = KNeighborsRegressor(
    n_neighbors=10,
    weights='distance',
    p=1,
    n_jobs=-1,
)
# Kept as the last expression so the notebook displays the fitted estimator.
KNN.fit(Xtrain, ytrain)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
                    weights='distance')

In [14]:
# Append the KNN MAPE to the shared results file. The with-block closes the
# handle even if prediction, scoring or the write itself raises.
with open('results.txt', 'a+') as file:
    file.write(' KNN: '+str(mape(ytest, KNN.predict(Xtest)))+'\n')

### Random Forest

In [15]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 200 trees, each limited to depth 35, trained with n_jobs=-1.
# No random_state is passed, so the forest is not seeded.
RFR = RandomForestRegressor(
    n_estimators=200,
    max_depth=35,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
)
# Kept as the last expression so the notebook displays the fitted estimator.
RFR.fit(Xtrain, ytrain)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=35, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [16]:
# Append the random-forest MAPE to the shared results file. The with-block
# closes the handle even if prediction, scoring or the write itself raises.
with open('results.txt', 'a+') as file:
    file.write(' RFR: '+str(mape(ytest, RFR.predict(Xtest)))+'\n')

### Neural Networks

In [17]:
# Neural network: fully connected regressor, input -> 1000 -> 500 -> 1
import tensorflow as tf

# Number of input features (one per column of the encoded training frame).
s = Xtrain.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1000, activation='relu', input_shape=(s,)))
model.add(tf.keras.layers.Dense(500, activation='relu'))
# NOTE(review): the sigmoid output bounds predictions to (0, 1) -- presumably
# the targets are scaled into that range; confirm against the data.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Optimise mean absolute error with Adam.
model.compile(optimizer='adam', loss=tf.keras.losses.mean_absolute_error)
model.build()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              8000      
_________________________________________________________________
dense_1 (Dense)              (None, 500)               500500    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 501       
Total params: 509,001
Trainable params: 509,001
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train for 10 epochs with mini-batches of 64 on the NumPy arrays behind the
# pandas objects. Kept as the last expression so the History object is shown.
model.fit(
    x=Xtrain.values,
    y=ytrain.values,
    batch_size=64,
    epochs=10,
)

Train on 200000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9749dedb10>

In [19]:
# Append the neural-network MAPE to the shared results file.
# np.concatenate(..., axis=0) joins the per-row outputs of model.predict into
# a single 1-D array before scoring.
# The trailing '\n' matters: the file is opened in append mode, so without it
# the first line of the next run would be glued onto this one.
# The with-block closes the handle even if prediction or scoring raises.
with open('results.txt', 'a+') as file:
    file.write(' NN: '+str(mape(ytest, np.concatenate(model.predict(Xtest.values), axis=0)))+'\n')