In [37]:
import numpy as np
import pandas as pd
import random as rd
import datetime
import os

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.sparse.csr

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression, Ridge

from xgboost import XGBRegressor

import importlib
import kaggle_forecast 
import config 

importlib.reload( kaggle_forecast )
importlib.reload( config )

from kaggle_forecast import *
from config import *

In [2]:
%%time
# Build the project data container used by the feature and submission helpers below.
# The recorded cell output shows this step took roughly 3m47s of wall time.
# NOTE(review): KaggleData comes from the star-import of kaggle_forecast; what it
# loads is not visible here - presumably the raw competition data, confirm.
kg = KaggleData()

CPU times: user 3min 39s, sys: 3.56 s, total: 3min 43s
Wall time: 3min 47s


In [4]:
# Feature specification handed to get_labels_and_features. 'lag' and 'mean'
# carry lists of integers; every other entry is requested with an empty list.
# NOTE(review): the integers look like look-back horizons in months - confirm
# against kaggle_forecast.get_labels_and_features.
features = {
    'lag'            : [1, 2, 3, 4, 5, 6, 9, 12],
    'mean'           : [2, 3, 4, 5, 6, 9, 12],
    'shop_id'        : [],
    'item_id'        : [],
    'shop_item_id'   : [],
    'cat_id'         : [],
    'date_block_num' : [],
    'month'          : [],
    'year'           : [],
}

X, Y = get_labels_and_features( kg, features=features )

# Merge the individual features into one matrix per data set
# (train, validation and test), together with the matching targets.
xx_raw, yy_raw = combine_features( X, Y )


In [16]:
# Turn the numpy matrices into one pandas DataFrame per data set:
# feature columns first, the target as the last column.
labels = xx_raw[DESC]
df_dict = {}
for ds in ( TRAIN, VALID, TEST ):
    df_dict[ds] = pd.DataFrame(
        np.hstack( ( xx_raw[ds], yy_raw[ds] ) ),
        columns=labels + [TARGET_COL],
    )


In [17]:
# Restrict every data set to the observations selected by get_recent_data
# for the given cutoff month.
# NOTE(review): whether the cutoff itself is inclusive is decided inside
# get_recent_data, which is not visible here - confirm.
date_block_cutoff = 30
for ds in ( TRAIN, VALID, TEST ):
    df_dict[ds] = get_recent_data(
        df_dict[ds],
        date_block_cutoff=date_block_cutoff,
    )


In [18]:
# Bound the historical sales targets to the interval [0, 20].
# Only TRAIN and VALID are clipped; the TEST frame is left untouched.
lower_limit = 0
upper_limit = 20
for ds in ( TRAIN, VALID ):
    df_dict[ds][TARGET_COL] = df_dict[ds][TARGET_COL].clip( lower=lower_limit, upper=upper_limit )


['sales_lag_01',
 'sales_lag_02',
 'sales_lag_03',
 'sales_lag_04',
 'sales_lag_05',
 'sales_lag_06',
 'sales_lag_09',
 'sales_lag_12',
 'sales_mean_02',
 'sales_mean_03',
 'sales_mean_04',
 'sales_mean_05',
 'sales_mean_06',
 'sales_mean_09',
 'sales_mean_12']

In [20]:
# Mean-encode every 'sales_*' feature by each grouping column and append the
# result as '<target_col>_mean_<group_col>' to the TRAIN, VALID and TEST frames.
#
#   TRAIN : encode_means_with_cv on TRAIN itself, using n_splits splits.
#   VALID : encoding fitted on TRAIN, applied to VALID.
#   TEST  : encoding fitted on TRAIN + VALID, applied to TEST.
alpha = 5       # NOTE(review): not used anywhere in this cell - confirm whether it is still needed
n_splits = 5
group_cols = [ 'item_id', 'shop_id', 'cat_id', 'shop_item_id', 'date_block_num', 'month' ]

# Columns produced by this cell also start with 'sales_'. Skip them so that
# re-running the cell does not encode its own output a second time.
encoded_suffixes = tuple( '_mean_' + group_col for group_col in group_cols )
target_cols = [ col for col in df_dict[TRAIN].columns
                if col.startswith( 'sales_' ) and not col.endswith( encoded_suffixes ) ]

for target_col in target_cols:
    for group_col in group_cols:

        new_col_name = target_col + '_mean_' + group_col
        print('{}'.format(datetime.datetime.now()) + ' : ' + new_col_name )

        if new_col_name not in df_dict[TRAIN]:
            df_dict[TRAIN][new_col_name] = encode_means_with_cv( df_dict[TRAIN], \
                                        target_col=target_col, group_col=group_col, n_splits=n_splits ).to_numpy()

        if new_col_name not in df_dict[VALID]:
            df_dict[VALID][new_col_name] = encode_means_from_test_train_split( df_dict[TRAIN], df_dict[VALID], \
                                        target_col=target_col, group_col=group_col)

        if new_col_name not in df_dict[TEST]:
            # Fit on all labelled data (train + validation) and encode the TEST
            # rows. The original passed df_dict[VALID] as the frame to encode,
            # so the TEST column received encodings computed for VALID rows.
            train_valid_data = pd.concat( [ df_dict[TRAIN], df_dict[VALID] ] )
            df_dict[TEST][new_col_name] = encode_means_from_test_train_split( train_valid_data, df_dict[TEST], \
                                        target_col=target_col, group_col=group_col)


2019-07-03 07:39:22.763513 : sales_lag_01_mean_item_id
2019-07-03 07:39:25.416380 : sales_lag_01_mean_shop_id
2019-07-03 07:39:26.891755 : sales_lag_01_mean_cat_id
2019-07-03 07:39:28.364106 : sales_lag_01_mean_shop_item_id
2019-07-03 07:39:44.981384 : sales_lag_01_mean_date_block_num
2019-07-03 07:39:46.497446 : sales_lag_01_mean_month
2019-07-03 07:39:48.025670 : sales_lag_02_mean_item_id
2019-07-03 07:39:50.339913 : sales_lag_02_mean_shop_id
2019-07-03 07:39:51.947855 : sales_lag_02_mean_cat_id
2019-07-03 07:39:53.555338 : sales_lag_02_mean_shop_item_id
2019-07-03 07:40:10.225440 : sales_lag_02_mean_date_block_num
2019-07-03 07:40:11.866850 : sales_lag_02_mean_month
2019-07-03 07:40:13.545082 : sales_lag_03_mean_item_id
2019-07-03 07:40:15.979912 : sales_lag_03_mean_shop_id
2019-07-03 07:40:17.733602 : sales_lag_03_mean_cat_id
2019-07-03 07:40:19.496733 : sales_lag_03_mean_shop_item_id
2019-07-03 07:40:36.355415 : sales_lag_03_mean_date_block_num
2019-07-03 07:40:38.149744 : sales_l

In [21]:
# XGBoost hyper-parameters
n_estimators = 50
max_depth = 4           # Not below 3; raise until the score stops improving
learning_rate = 0.05    # Stay within 0.01 .. 0.1
gamma = 5               # Regularization strength: pick 0, 1 or 5
colsample_bytree = 0.2  # Guideline: 0.3 .. 0.8 for wide data sets
                        # NOTE(review): 0.2 is below that guideline - confirm it is intended

print( '{} : {}'.format( datetime.datetime.now(), n_estimators ) )


def model_constructor_fun():
    """Return a new, unfitted XGBRegressor built from the hyper-parameters above."""
    return XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        gamma=gamma,
        colsample_bytree=colsample_bytree,
    )


# Fit a model for the TRAIN data set
model_train = fit_model( model_constructor_fun, TRAIN, df_dict )

# Out-of-sample check on the validation set, forecasts clipped to [0, 20]
yhat_valid = predict_model( model_train, VALID, df_dict, clip_forecasts=(0,20) )

print( '{}'.format( datetime.datetime.now() ) )

2019-07-03 07:47:32.483530 : 50
valid RMSE 0.7388026153270856
2019-07-03 07:47:58.999150


In [22]:
# Build a second model via the same constructor, this time keyed on TEST.
# NOTE(review): presumably fit_model(…, TEST, …) trains on train + validation
# data so the model can forecast the test month - confirm in kaggle_forecast.
model_test = fit_model( model_constructor_fun, TEST, df_dict )

# Check the out-of-sample fit for the test set
# Forecasts are clipped to [0, 20]. The recorded output is "test RMSE nan";
# presumably the TEST frame carries no known targets - TODO confirm.
yhat_test = predict_model( model_test, TEST, df_dict, clip_forecasts=(0,20) )


test RMSE nan


In [38]:
# Write the test-set forecasts to a CSV file.
# NOTE(review): the target directory and file layout are decided inside
# write_forecast_to_csv, which is not visible here - confirm before relying on the path.
output_file = 'xgboost_04.csv'
write_forecast_to_csv( kg, yhat_test, output_file )

ImportError: /home/ubuntu/projects/.env/lib/python3.6/site-packages/catboost/_catboost.so: cannot make segment writable for relocation: Cannot allocate memory