In [1]:
import numpy as np
import pandas as pd
import random as rd
import datetime
import os

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.sparse.csr

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression, Ridge

from xgboost import XGBRegressor

import importlib
import kaggle_forecast 
importlib.reload( kaggle_forecast )

from kaggle_forecast import *


In [2]:
# Instantiate the project's data container (KaggleData comes from kaggle_forecast).
# NOTE(review): presumably this loads the raw competition files — confirm in kaggle_forecast.
kg = KaggleData()

In [3]:
# Feature specification handed to get_labels_and_features.
# 'lag' and 'mean' carry lists of integers (presumably month offsets / window
# lengths — confirm in kaggle_forecast); the remaining keys take an empty list.
features = {
    'lag' : [1, 2, 3, 12],
    'mean' : [2, 3, 6],
    'shop_id' : [],
    'item_id' : [],
    'shop_item_id' : [],
    'cat_id' : [],
    'date_block_num' : [],
    'month' : [],
    'year' : [],
}

X, Y = get_labels_and_features( kg, features=features )

# Combine the features and labels into matrix form, keyed by data set
# (they are indexed below with TRAIN, VALID, TEST and DESC)
xx_raw, yy_raw = combine_features( X, Y )


In [9]:
# Build one DataFrame per data set: the feature matrix with the target
# stacked on as the last column.
df_dict = dict()
labels = xx_raw[DESC]
column_names = labels + [TARGET_COL]
for ds in ( TRAIN, VALID, TEST ):
    features_and_target = np.hstack( ( xx_raw[ds], yy_raw[ds] ) )
    df_dict[ds] = pd.DataFrame( features_and_target, columns=column_names )


In [10]:
# Restrict every data set to recent observations. The exact rule applied to
# date_block_cutoff (inclusive or exclusive) is defined by get_recent_data.
date_block_cutoff = 10
for ds in ( TRAIN, VALID, TEST ):
    recent_rows = get_recent_data( df_dict[ds], date_block_cutoff=date_block_cutoff )
    df_dict[ds] = recent_rows


In [11]:
# Clamp the target of the train and validation sets to [lower_limit, upper_limit].
# The test set is left untouched by this cell.
lower_limit = 0
upper_limit = 20
for ds in ( TRAIN, VALID ):
    df_dict[ds][TARGET_COL] = df_dict[ds][TARGET_COL].clip( lower=lower_limit, upper=upper_limit )


In [12]:
# Mean-encode each target column by each grouping column and store the result
# as '<target_col>_mean_<group_col>' in the train, validation and test frames.
#
# - TRAIN: encoded with encode_means_with_cv using n_splits folds.
# - VALID: encoded from TRAIN via encode_means_from_test_train_split.
# - TEST:  encoded from TRAIN + VALID via the same helper.
#
# Existing columns are skipped, so re-running the cell does not recompute them.
alpha = 5       # NOTE(review): not used in this cell — confirm whether a later cell needs it
n_splits = 5
target_cols = [ 'sales', 'sales_mean_06' ]
group_cols = [ 'item_id', 'shop_id', 'cat_id', 'shop_item_id', 'date_block_num', 'month' ]

for target_col in target_cols:
    for group_col in group_cols:

        new_col_name = target_col + '_mean_' + group_col
        # Timestamped progress line; each encoding takes a while to compute
        print('{}'.format(datetime.datetime.now()) + ' : ' + new_col_name )

        if new_col_name not in df_dict[TRAIN]:
            df_dict[TRAIN][new_col_name] = encode_means_with_cv(
                df_dict[TRAIN],
                target_col=target_col, group_col=group_col, n_splits=n_splits ).to_numpy()

        if new_col_name not in df_dict[VALID]:
            df_dict[VALID][new_col_name] = encode_means_from_test_train_split(
                df_dict[TRAIN], df_dict[VALID],
                target_col=target_col, group_col=group_col )

        if new_col_name not in df_dict[TEST]:
            # Despite its name, test_data is the data the test encoding is
            # learned from (train + validation), not the test set itself.
            test_data = pd.concat( [ df_dict[TRAIN], df_dict[VALID] ] )
            # Encode the TEST rows. The original passed df_dict[VALID] here,
            # which stored validation-row encodings in the test frame.
            df_dict[TEST][new_col_name] = encode_means_from_test_train_split(
                test_data, df_dict[TEST],
                target_col=target_col, group_col=group_col )


2019-06-27 15:11:54.612267 : sales_mean_item_id
2019-06-27 15:12:29.191426 : sales_mean_shop_id
2019-06-27 15:12:56.848402 : sales_mean_cat_id
2019-06-27 15:13:23.691993 : sales_mean_shop_item_id
2019-06-27 15:15:45.395135 : sales_mean_date_block_num
2019-06-27 15:16:15.460565 : sales_mean_month
2019-06-27 15:16:46.645818 : sales_mean_06_mean_item_id
2019-06-27 15:17:24.166455 : sales_mean_06_mean_shop_id
2019-06-27 15:17:56.965197 : sales_mean_06_mean_cat_id
2019-06-27 15:18:30.102644 : sales_mean_06_mean_shop_item_id
2019-06-27 15:20:58.841546 : sales_mean_06_mean_date_block_num
2019-06-27 15:21:34.597509 : sales_mean_06_mean_month


In [13]:
# XGBoost hyperparameters (values unchanged)
n_estimators = 50
max_depth = 4           # No lower than 3. Increase until performance stops improving
learning_rate = 0.05    # Keep in the range of 0.01 and 0.1
gamma = 5               # Regularization parameter: use value 0, 1, or 5
colsample_bytree = 0.2  # Guideline: 0.3 to 0.8 with many columns; this value is below that range


def model_constructor_fun():
    """Return a fresh, unfitted XGBRegressor.

    The hyperparameters are read from the notebook-level variables at call
    time, so each call reflects their current values.
    """
    return XGBRegressor(
        n_estimators=n_estimators,
        gamma=gamma,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        learning_rate=learning_rate,
    )


# Timestamp the start of the fit, together with the number of estimators
print('{} : {}'.format(datetime.datetime.now(), n_estimators  ) )

model_train = fit_model( model_constructor_fun, TRAIN, df_dict )

# Forecast the validation set, clipping forecasts to (0,20); the recorded
# output shows a 'valid RMSE' line printed during this step
yhat_valid = predict_model( model_train, VALID, df_dict, clip_forecasts=(0,20) )

# Timestamp the end of the fit/predict step
print('{}'.format(datetime.datetime.now()  ) )

2019-06-27 15:22:10.898715 : 50
valid RMSE 0.7439604919385264
2019-06-27 15:33:45.926359


In [14]:
# Fit a second model for the TEST data set using the same constructor.
# NOTE(review): which rows fit_model trains on for TEST is decided inside
# kaggle_forecast (presumably train + validation) — confirm.
model_test = fit_model( model_constructor_fun, TEST, df_dict )

# Produce forecasts for the test set, clipped to (0,20).
# The recorded output is "test RMSE nan", so no out-of-sample score is available here
# (presumably because the test set has no known target — confirm).
yhat_test = predict_model( model_test, TEST, df_dict, clip_forecasts=(0,20) )


test RMSE nan


In [16]:
# Write the test-set forecasts to a CSV file. The path is relative to the
# notebook's working directory.
# NOTE(review): presumably the file is in the competition's submission format — confirm in write_forecast_to_csv.
output_file = '../forecasts/xgboost_03.csv'
write_forecast_to_csv( kg, yhat_test, output_file )