In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

from datetime import datetime

from sklearn.preprocessing import LabelEncoder

-----
## Preprocessing

In [2]:
train = pd.read_csv("data/ech_apprentissage.csv", sep = ';', index_col = 'id', low_memory = False)
X = train.drop('prime_tot_ttc', axis = 1)
X_test = pd.read_csv('data/ech_test.csv', sep = ';', index_col = 'id', low_memory = False)
y = train.prime_tot_ttc

df = pd.concat([X, X_test], axis=0)
print(X.shape, X_test.shape)
# df.dtypes

# vars_to_drop = ['var1', 'var3', 'var11', 'var14', 'codepostal']
# vars_to_drop = ['var1', 'var3', 'var11', 'var14', 'codepostal', 'annee_naissance'] # bad idea: +0.034%
vars_to_drop = ['var1', 'var3', 'var11', 'var14', 'codepostal', 'var12'] 
# vars_to_drop = ['var1', 'var3', 'var11', 'codepostal'] # bad idea: +0.8%
# vars_to_drop = ['var1', 'var11', 'codepostal'] # bad idea: +1.06%
df.drop(vars_to_drop, axis = 1, inplace=True)

(300000, 32) (30000, 32)


In [3]:
df.replace('NR', 0, inplace=True)

In [4]:
df.var6.replace('N', 0, inplace=True)

In [5]:
cat_variables = [i for i in df.columns if df[i].dtype == 'O']
print(cat_variables)

['marque', 'energie_veh', 'profession', 'var6', 'var7', 'var8', 'var16']


In [6]:
encods = [LabelEncoder() for col in cat_variables]
for i, col in enumerate(cat_variables):
    df[col] = encods[i].fit_transform(df[col].astype(str))

In [7]:
df.fillna(0, inplace=True)

X = df[:300000]
X_test = df[300000:]

------
## XGBoost

In [8]:
# человеческая метрика
def mape2(preds, dtrain):
    y_true = dtrain.get_label()
    return 'mape2', np.mean(np.abs((y_true - preds) / y_true))

In [9]:
dtrain = xgb.DMatrix(X, y, missing=0.0)
dtest = xgb.DMatrix(X_test)

In [12]:
param = { 'max_depth' : 7 } # -0.12%
# num_round = 250 # +0.02%
num_round = 100
bst = xgb.train(param, dtrain, num_round)

pred = bst.predict(dtest)

Will train until train error hasn't decreased in 25 rounds.
[0]	train-mape2:0.691577
[1]	train-mape2:0.476839
[2]	train-mape2:0.327271
[3]	train-mape2:0.227430
[4]	train-mape2:0.167077
[5]	train-mape2:0.134763
[6]	train-mape2:0.118724
[7]	train-mape2:0.111318
[8]	train-mape2:0.106217
[9]	train-mape2:0.104183
[10]	train-mape2:0.101883
[11]	train-mape2:0.100529
[12]	train-mape2:0.099663
[13]	train-mape2:0.098825
[14]	train-mape2:0.098449
[15]	train-mape2:0.096692
[16]	train-mape2:0.096290
[17]	train-mape2:0.095178
[18]	train-mape2:0.094897
[19]	train-mape2:0.094357
[20]	train-mape2:0.093958
[21]	train-mape2:0.093518
[22]	train-mape2:0.093180
[23]	train-mape2:0.093006
[24]	train-mape2:0.092732
[25]	train-mape2:0.092516
[26]	train-mape2:0.092424
[27]	train-mape2:0.092141
[28]	train-mape2:0.091736
[29]	train-mape2:0.091574
[30]	train-mape2:0.091504
[31]	train-mape2:0.091348
[32]	train-mape2:0.091215
[33]	train-mape2:0.091068
[34]	train-mape2:0.090979
[35]	train-mape2:0.090864
[36]	train-map

In [13]:
pred

array([ 313.0140686 ,  220.15553284,  264.82012939, ...,  404.96298218,
        365.11065674,  360.31872559], dtype=float32)

In [14]:
pred = pred * 0.99 # multiplying the answers by 0.99 is a nice hack

In [15]:
filename = 'submissions/xgb_' + datetime.now().strftime(format = '%d%m_%H%M') + '.csv'
print('saving to %s...' % filename)
pd.DataFrame({'id': X_test.index, 'pred': pred}).to_csv(filename,
                                                        index=False,
                                                        sep=';')

saving to submissions/xgb_2308_0043.csv...
