In [1]:
import pandas as pd
import os
from fastai.imports import *
from fastai.structured import *

from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics

In [4]:
PATH = "data/grocery_store/favorita-grocery-sales-forecasting/"
!ls {PATH}

holidays_events.csv.7z   stores.csv.7z            train.csv.7z
items.csv.7z             test.csv                 train_shuffled.csv
oil.csv.7z               test.csv.7z              transactions.csv.7z
sample_submission.csv.7z train.csv


# Read data

In [5]:
types = {'id': 'int64', 'item_nbr':'int32',
        'store_nbr':'int8','unit_sales':'float32',
        'onpromotion':'object'}

In [29]:
df_all = pd.read_csv(f'{PATH}train_shuffled.csv', 
                     parse_dates=['date'],
                     dtype=types,
                     infer_datetime_format=True)

In [30]:
# export to raw
#os.makedirs('tmp', exist_ok=True)
df_all.to_feather(f'{PATH}/grocerystores-raw-shuffled')

In [None]:
#df_all = pd.read_feather('tmp/grocerystores-raw')

In [31]:
df_all.onpromotion.fillna(False, inplace=True)
df_all.onpromotion = df_all.onpromotion.map({'False':
                                            False,
                                            'True':
                                            True})
df_all.onpromotion = df_all.onpromotion.astype(bool)

%time df_all.to_feather('tmp/raw_groceries')

CPU times: user 57.3 ms, sys: 64 ms, total: 121 ms
Wall time: 157 ms


In [32]:
%time df_all.describe(include='all')

CPU times: user 1.17 s, sys: 180 ms, total: 1.35 s
Wall time: 1.37 s


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
count,3370664.0,3370664,3370664.0,3370664.0,3370664.0,3370664
unique,,1684,,,,2
top,,2017-06-03 00:00:00,,,,False
freq,,3254,,,,2579429
first,,2013-01-01 00:00:00,,,,
last,,2017-08-15 00:00:00,,,,
mean,62744360.0,,27.44079,972777.5,8.543539,
std,36231490.0,,16.33039,520636.3,21.0826,
min,18.0,,1.0,96995.0,-478.0,
25%,31387810.0,,12.0,522383.0,2.0,


In [33]:
df_all.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
3370659,2452620,2013-03-02,45,562568,2.0,True
3370660,8301409,2013-07-14,28,872306,3.0,True
3370661,109715614,2017-03-19,29,890825,2.0,False
3370662,90206633,2016-09-06,28,1239916,6.0,False
3370663,78716992,2016-05-09,46,213788,22.0,False


In [34]:
df_all.unit_sales = np.log1p(np.clip(df_all.unit_sales, 
                                    0, None))

In [35]:
add_datepart(df_all, 'date')

In [36]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

In [37]:
# read test
df_test = pd.read_csv(f'{PATH}test.csv', 
                     parse_dates=['date'],
                     dtype=types,
                     infer_datetime_format=True)
df_test.onpromotion.fillna(False, inplace=True)
df_test.onpromotion = df_test.onpromotion.map({'False':
                                            False,
                                            'True':
                                            True})
df_test.onpromotion = df_test.onpromotion.astype(bool)

%time df_test.to_feather(f'{PATH}/raw_groceries-test')

CPU times: user 22.7 ms, sys: 42.3 ms, total: 65 ms
Wall time: 89.4 ms


In [38]:
n_valid = len(df_test)
n_trn = len(df_all) - n_valid
train, valid = split_vals(df_all, n_trn)
train.shape, valid.shape

((200, 18), (3370464, 18))

In [39]:
df_test.shape, df_all.shape, n_valid

((3370464, 5), (3370664, 18), 3370464)

In [None]:
# train_cats(raw_train)
# apply_cats(raw_valid, raw_train)

In [41]:
%%time
trn, y, nas_trn = proc_df(train, 'unit_sales')
val, y_val, nas_val = proc_df(valid, 'unit_sales')

CPU times: user 1.23 s, sys: 463 ms, total: 1.69 s
Wall time: 1.84 s


In [43]:
trn.shape, y.shape, nas_trn

((200, 17), (200,), {})

# Models

In [44]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(x),y), rmse(m.predict(val), y_val),
           m.score(x,y), m.score(val, y_val)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print (res)

In [47]:
set_rf_samples(1_000_000)
# Avoid using oob_score in large datasets since it takes very long

In [46]:
# Do that because RandomForestRegressor does that anyway,
# so do it
%time x = np.array(trn, dtype=np.float32)
%prun x = np.array(trn, dtype=np.float32)

CPU times: user 1.44 ms, sys: 804 µs, total: 2.25 ms
Wall time: 2.53 ms
 

In [53]:
def run_random_forest(n_estimators,
                     min_samples_leaf):
    m = RandomForestRegressor(n_estimators=n_estimators,
                              min_samples_leaf=min_samples_leaf,
                              n_jobs=-1)
    %time m.fit(x,y)
    print_score(m)
    return m

In [54]:
#m = RandomForestRegressor(n_estimators=20,
#                          min_samples_leaf=100, n_jobs=-1)
#%time m.fit(x,y)
run_random_forest(20, 100)

CPU times: user 398 ms, sys: 34.5 ms, total: 433 ms
Wall time: 222 ms
[0.7710754927673557, 0.8886755628721202, 0.0023175321775951163, -0.016947218978810463]


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=100, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [55]:
run_random_forest(20, 10)

CPU times: user 428 ms, sys: 52.3 ms, total: 480 ms
Wall time: 228 ms
[0.6169397697089277, 0.9415453737102303, 0.3613188856914188, -0.14154870239937734]


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [56]:
run_random_forest(20, 3)

CPU times: user 432 ms, sys: 51.7 ms, total: 484 ms
Wall time: 332 ms
[0.37719530633262544, 1.0324726416010253, 0.7612567508091869, -0.3726791662563016]


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# Group averages

In [58]:
# One kernel uses the average of last 2 weeks
# using average sales by item, store_number, on_promotion

In [59]:
# Nice thing is to compare predictions of different models
# by plotting.

In [60]:
# Average number of sales on holidays -> way to
# tackle this problem is to create additional columns

In [62]:
# Look at Rossmann store kaggle competition
# Use test set to calibrate validation set.
# Understand the data -> test set begins on payday,
# ends on after payday.