In [None]:
import pandas as pd
import numpy as np

import xgboost as xgb

In [2]:
DATA_PATH = 'data/prepared'
SUBMISSION_PATH = 'data/submission'

USE_LOG = True

In [3]:
# Import pickled data
train_df = pd.read_pickle(f'{DATA_PATH}/train_df.pkl')
test_df = pd.read_pickle(f'{DATA_PATH}/test_df.pkl')

In [4]:
# Split the last 6 weeks data as hold-out set 
# (https://www.kaggle.com/c/rossmann-store-sales/discussion/18024)
train_df = train_df.sort_values(['Date'], ascending = False)

valid_mask = 6 * 7 * 1115
valid_df = train_df[:valid_mask]
train_df = train_df[valid_mask:]

In [9]:
# Create the datasets for XGB
X_train_df = train_df.drop(['Sales', 'Customers', 'Date'], axis = 1)
X_valid_df = valid_df.drop(['Sales', 'Customers', 'Date'], axis = 1)
X_test_df = test_df.drop(['Date', 'Id'], axis = 1)

y_train = train_df['Sales'].values
y_valid = valid_df['Sales'].values

if USE_LOG:
    y_train = np.log1p(y_train)
    y_valid = np.log1p(y_valid)

train_dm = xgb.DMatrix(X_train_df, y_train)
valid_dm = xgb.DMatrix(X_valid_df, y_valid)

In [17]:
# Parameters and model training
params = {"objective": "reg:squarederror",
          "booster" : "gbtree",
          "eta": 0.03,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 10
          }
num_boost_round = 6000
watchlist = [(train_dm, 'train'), (valid_dm, 'eval')]

model = xgb.train(params,
                  train_dm,
                  num_boost_round,
                  evals=watchlist,
                  early_stopping_rounds=100,
                  verbose_eval=True)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:7.30399	eval-rmse:7.44709
[1]	train-rmse:7.08560	eval-rmse:7.22407
[2]	train-rmse:6.87417	eval-rmse:7.00914
[3]	train-rmse:6.66944	eval-rmse:6.80002
[4]	train-rmse:6.47241	eval-rmse:6.61223
[5]	train-rmse:6.27945	eval-rmse:6.41574
[6]	train-rmse:6.09161	eval-rmse:6.22454
[7]	train-rmse:5.90937	eval-rmse:6.03870
[8]	train-rmse:5.73262	eval-rmse:5.85858
[9]	train-rmse:5.56119	eval-rmse:5.68383
[10]	train-rmse:5.39494	eval-rmse:5.51447
[11]	train-rmse:5.23348	eval-rmse:5.35052
[12]	train-rmse:5.07720	eval-rmse:5.19145
[13]	train-rmse:4.92822	eval-rmse:5.05335
[14]	train-rmse:4.78108	eval-rmse:4.90303
[15]	train-rmse:4.63833	eval-rmse:4.75751
[16]	train-rmse:4.49962	e

In [18]:
# Predict test data
y_hat = model.predict(xgb.DMatrix(X_test_df))

# And create the submission file 
submission_df = test_df.copy()
submission_df['Sales'] = np.expm1(y_hat)
submission_df = submission_df[['Id', 'Sales']]

# Quick check
assert submission_df.shape[0] == test_df.shape[0]

submission_df.to_csv(f'{SUBMISSION_PATH}/submission_xgb.csv', index=False)