In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from datetime import datetime, timedelta
import gc

from fastai import *
from fastai.vision import *
from fastai.tabular import *
from scripts import m5_common
from tqdm.notebook import tqdm

In [3]:
# Render up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

## Setup variables

In [4]:
# Root folder used for the data and for the exported models; fail fast if it is missing.
path = Path('/kaggle/m5_forecasting/')
assert path.exists()

In [5]:
# Passed as `first_day=` to m5_common.create_dt for both the training and the test frame.
FIRST_DAY = 1

In [6]:
# Number of days forecast by the prediction loop (one output column F1..F28 per day).
h = 28
# Passed to m5_common.create_dt as `tr_last` when the training frame is built.
tr_last = 1913
# Number of days of history kept in front of each predicted day when features are rebuilt.
max_lags = 2 * h + 1
# First day that gets predicted; the loop adds 0..h-1 days to it.
fday = datetime(year=2016, month=4, day=25)

## Read Data

In [7]:
%%time

# Project helper: reads from `path` and returns the two tables bound to `prices` and `cal`.
prices, cal = m5_common.prepare_tables(path)

CPU times: user 1.42 s, sys: 160 ms, total: 1.58 s
Wall time: 1.58 s


In [8]:
# Project helper applied to the calendar table; it returns two objects named as maps
# (presumably original event name/type -> code — TODO confirm in m5_common).
# Neither of them is used again in the visible part of this notebook.
event_name_1_map, event_type_1_map = m5_common.replace_cal_cols(cal)

In [9]:
cal[(cal.date > '2012-01-01') & (cal.date < '2012-01-05')]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
338,2012-01-02,11149,1,3,1,2012,d_339,0,0,0,0,1.0,0.0,1.0
339,2012-01-03,11149,5,4,1,2012,d_340,0,0,0,0,1.0,1.0,1.0
340,2012-01-04,11149,6,5,1,2012,d_341,0,0,0,0,1.0,0.0,0.0


In [10]:
# Calendar columns handed to m5_common.convert_uint8.
uint8_types = [
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'month', 'wday', 'weekday',
    'snap_CA', 'snap_TX', 'snap_WI',
]
m5_common.convert_uint8(cal, uint8_types)

In [11]:
# Project helper applied to the calendar table. Its return value is ignored, so it presumably
# adds its columns to `cal` in place (e.g. `before_christmas`, listed in cat_feats) — TODO confirm.
m5_common.add_days_before(cal)

In [12]:
%%time

# Build the training frame with the project helper from the calendar and price tables,
# passing the FIRST_DAY / tr_last bounds defined above.
df = m5_common.create_dt(cal, prices, is_train=True, first_day=FIRST_DAY, tr_last=tr_last, path=path)

CPU times: user 42 s, sys: 5.93 s, total: 47.9 s
Wall time: 47.9 s


In [13]:
df.date.min(), df.date.max()

(Timestamp('2011-01-29 00:00:00'), Timestamp('2016-04-24 00:00:00'))

## Create features

In [14]:
def create_fea(dt):
    """Add exponentially-weighted-mean and plain lag features of `sales` to `dt`.

    `dt` is modified in place and nothing is returned. It needs the columns `id`
    and `sales`; every feature is computed per `id`, following the row order of `dt`.

    Columns added, in this order:
      * for each window w in (7, 28) and each shift s in (7, 28):
          `emean_{w}_{s}`  - EWM mean with span=w, shifted s rows within the id
          `esmean_{w}_{s}` - EWM mean with alpha=1/w, shifted s rows within the id
      * `lag_1`, `lag_2`   - `sales` shifted 1 and 2 rows within the id
    """
    windows = [7, 28]
    shifts = [7, 28]

    sales_by_id = dt[["id", "sales"]].groupby("id")["sales"]

    for win in tqdm(windows, total=len(windows)):
        # Scratch columns holding the unshifted means; dropped once the shifted copies exist.
        mean_col = f'mean_{win}'
        span_col = f'e{mean_col}'
        alpha_col = f'es{mean_col}'
        dt[span_col] = sales_by_id.transform(lambda s: s.ewm(span=win, adjust=False).mean())
        dt[alpha_col] = sales_by_id.transform(lambda s: s.ewm(alpha=1/win, adjust=False).mean())

        for shift in shifts:
            dt[f'emean_{win}_{shift}'] = dt[["id", span_col]].groupby("id").shift(shift)
            dt[f'esmean_{win}_{shift}'] = dt[["id", alpha_col]].groupby("id").shift(shift)

        dt.drop(columns=[span_col, alpha_col], inplace=True)

    # Plain one- and two-row lags of the target itself.
    for step in (1, 2):
        dt[f'lag_{step}'] = sales_by_id.shift(step)

In [None]:
%%time

create_fea(df)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

In [None]:
# The shifted features are NaN for the first rows of every id (there is no history to shift
# from yet). Drop every row that contains a NaN, in place.
df.dropna(inplace = True)

In [None]:
df.info()

## Training Preparation

In [None]:
# Columns passed to fastai as categorical variables.
cat_feats = [
    'wday', 'month', 'year', 'Dayofyear', 'before_christmas', 'week', 'mday',
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
    'snap_CA', 'snap_TX', 'snap_WI',
]
# Columns excluded from the model inputs (identifiers, the target `sales`, ...).
useless_cols = ["id", "date", "sales", "sales_positive", "d", "wm_yr_wk", "weekday", "revenue"]

# Every remaining column is a model input; the non-categorical ones are continuous.
train_cols = df.columns[~df.columns.isin(useless_cols)]
cont_names = [name for name in train_cols if name not in cat_feats]

In [None]:
# Preprocessing steps handed to TabularList.from_df through `procs=`.
procs = [FillMissing, Categorify, Normalize]

In [None]:
def convert_float32(df, cols=None):
    """Cast columns of `df` to float32, in place.

    Args:
        df: DataFrame to modify.
        cols: iterable of column names to cast. When omitted, the price columns and the
            lag / EWM feature columns used in this notebook are cast.

    Returns:
        None. `df` itself is updated.

    Raises:
        KeyError: if one of the columns to cast is not present in `df`.
    """
    if cols is None:
        cols = ['sell_price', 'lag_price_1', 'emean_7_7', 'esmean_7_7', 'emean_7_28', 'esmean_7_28',
                'emean_28_7', 'esmean_28_7', 'emean_28_28', 'esmean_28_28', 'lag_1', 'lag_2']
    # Assign column by column so that each column really gets the new dtype.
    for c in cols:
        df[c] = df[c].astype('float32')
        
convert_float32(df)

In [None]:
# Build the fastai DataBunch: categorical / continuous columns as listed above, `sales` as a
# float regression target, and a random 10% of the rows held out for validation.
# NOTE(review): no seed is passed to split_by_rand_pct, so the split differs between runs; a
# random row split on per-id time series with lag features may also give an optimistic
# validation score — confirm this is intended.
data = (TabularList.from_df(df, path=path, cat_names=cat_feats, cont_names=cont_names, procs=procs)
                           .split_by_rand_pct(valid_pct=0.1)
                           .label_from_df(cols='sales', label_cls=FloatList)
                           .databunch())
# The batch size is set after the DataBunch has been created.
data.batch_size=2048

In [None]:
data.show_batch(rows=10)

In [None]:
data.x.cont_names

In [None]:
# Upper bound for the network output: the largest `sales` value scaled by 1.2 (20% headroom).
max_y = np.max(df['sales'] * 1.2)
# [0, max_y] on the default fastai device; passed to tabular_learner as `y_range` below.
y_range = torch.tensor([0., max_y], device=defaults.device)

In [None]:
del df
gc.collect()

## Training

In [None]:
# Reference (C++): per-point Tweedie loss. `tweedie_loss`, defined later in this notebook,
# applies the same formula element-wise and averages the result.
# inline static double LossOnPoint(label_t label, double score, const Config& config) {
#     const double rho = config.tweedie_variance_power;
#     const double eps = 1e-10f;
#     if (score < eps) {
#       score = eps;
#     }
#     const double a = label * std::exp((1 - rho) * std::log(score)) / (1 - rho);
#     const double b = std::exp((2 - rho) * std::log(score)) / (2 - rho);
#     return -a + b;
#   }

In [None]:
def mse_loss(pred, targ):
    """Mean squared error between `pred` and `targ`, both flattened to 1-D first."""
    flat_pred = pred.contiguous().view(-1)
    flat_targ = targ.contiguous().view(-1)
    return F.mse_loss(flat_pred, flat_targ)

In [None]:
tweedie_variance_power = 1.1
# Plain Python floats rather than CUDA tensors: they combine with tensors on any device, so
# the losses below no longer require a GPU just to be defined or called.
rho = tweedie_variance_power
eps = 1e-10

def tweedie_loss(pred, targ):
    """Mean Tweedie loss with variance power `rho`.

    Both inputs are flattened to 1-D. Per element the loss is
    ``-targ * pred**(1 - rho) / (1 - rho) + pred**(2 - rho) / (2 - rho)``,
    with the powers evaluated as ``exp(k * log(pred))``.

    Args:
        pred: predicted values; anything below `eps` is raised to `eps` before the log.
        targ: target values, same number of elements as `pred`.

    Returns:
        A 0-dim tensor holding the mean loss, on the device of `pred`.
    """
    pred, targ = pred.contiguous().view(-1), targ.contiguous().view(-1)
    # log() needs a strictly positive argument.
    pred = pred.clamp(min=eps)
    log_pred = torch.log(pred)
    a = targ * torch.exp((1 - rho) * log_pred) / (1 - rho)
    b = torch.exp((2 - rho) * log_pred) / (2 - rho)
    return torch.mean(-a + b)

def mse_tweedie_loss(pred, targ):
    """Blend of `mse_loss` (weight 0.7) and `tweedie_loss` (weight 0.3), divided by 2."""
    return (mse_loss(pred, targ) * 0.7 + tweedie_loss(pred, targ) * 0.3) / 2.

In [None]:


def mqe_loss(pred, targ):
    """Mean quartic error: the mean of (pred - targ)**4 over the flattened inputs."""
    diff = pred.contiguous().view(-1) - targ.contiguous().view(-1)
    return diff.pow(4).mean()

def mae_loss(pred, targ):
    """Mean absolute error over the flattened inputs."""
    diff = pred.contiguous().view(-1) - targ.contiguous().view(-1)
    return diff.abs().mean()

def mape_loss(pred, targ):
    """Mean absolute percentage error over the flattened inputs.

    1e-5 is added to the target in the denominator, so a target of exactly zero does not
    divide by zero (such elements still contribute a very large value).
    """
    flat_pred = pred.contiguous().view(-1)
    flat_targ = targ.contiguous().view(-1)
    ratio = (flat_targ - flat_pred) / (flat_targ + 1e-5)
    return ratio.abs().mean()

def poisson_loss(pred, targ):
    """Poisson negative log-likelihood of `targ` given `pred`.

    Both tensors go through `flatten_check` first; `F.poisson_nll_loss` is then called
    with its default settings.
    """
    flat_pred, flat_targ = flatten_check(pred, targ)
    nll = F.poisson_nll_loss(flat_pred, flat_targ)
    return nll

In [None]:
class ExportModelCallback(callbacks.TrackerCallback):
    """A `TrackerCallback` that exports the learner whenever the monitored quantity improves.

    With `every='improvement'` (default) the learner is exported to
    `<learn.path>/m5_model_{epoch}_export.pkl` each time `monitor` beats its best value.
    With `every='epoch'` the weights are saved as `{name}_{epoch}` after every epoch.
    """
    def __init__(self, learn:Learner, monitor:str='valid_loss', mode:str='auto', every:str='improvement', name:str='bestmodel'):
        super().__init__(learn, monitor=monitor, mode=mode)
        self.every,self.name = every,name
        if self.every not in ['improvement', 'epoch']:
            warn(f'SaveModel every {self.every} is invalid, falling back to "improvement".')
            self.every = 'improvement'

    def jump_to_epoch(self, epoch:int)->None:
        "Try to reload the checkpoint `{name}_{epoch-1}`; print a message instead of raising on failure."
        try:
            self.learn.load(f'{self.name}_{epoch-1}', purge=False)
            print(f"Loaded {self.name}_{epoch-1}")
        # `Exception` rather than a bare except, so KeyboardInterrupt / SystemExit still propagate.
        except Exception: print(f'Model {self.name}_{epoch-1} not found.')

    def on_epoch_end(self, epoch:int, **kwargs:Any)->None:
        "Compare the value monitored to its best score and maybe export the learner."
        if self.every=="epoch": self.learn.save(f'{self.name}_{epoch}')
        else: #every="improvement"
            current = self.get_monitor_value()
            if current is not None and self.operator(current, self.best):
                print(f'Better model found at epoch {epoch} with {self.monitor} value: {current}.')
                self.best = current
                # Export the learner this callback is attached to, under that learner's own
                # path, instead of relying on the notebook globals `learn` and `path`.
                self.learn.export(file=str(self.learn.path/f'm5_model_{epoch}_export.pkl'))

    def on_train_end(self, **kwargs):
        "Load `{name}.pth` from the learner's model directory if such a file exists."
        # NOTE(review): in "improvement" mode this callback exports the learner but never saves
        # `{name}.pth`, so this only loads something when that file already exists from
        # elsewhere — confirm this is intended.
        if self.every=="improvement" and (self.learn.path/f'{self.learn.model_dir}/{self.name}.pth').is_file():
            self.learn.load(f'{self.name}', purge=False)

In [None]:
# Tabular model with layers=[1500, 750]; predictions are bounded by y_range and RMSE is
# tracked as a metric (the training callbacks below monitor it).
learn = tabular_learner(data, layers=[1500, 750], y_range=y_range, metrics=rmse)

In [None]:
# Train with the Tweedie loss defined above instead of the loss the learner was created with.
learn.loss_func = tweedie_loss

In [None]:
learn.model

In [None]:
learn.lr_find(num_it=400)

In [None]:
learn.recorder.plot()

In [None]:
lr = 1e-3

In [None]:
%%time
# ExportModelCallback is the class defined in this notebook; it is not a member of fastai's
# `callbacks` module, so it is referenced directly.
# NOTE(review): patience=30 is larger than the 10 epochs trained here, so early stopping is
# unlikely to ever trigger — confirm the intended value.
learn.fit_one_cycle(10, lr, callbacks=[
    callbacks.EarlyStoppingCallback(learn, monitor="root_mean_squared_error", mode="min", patience=30),
    ExportModelCallback(learn, monitor='root_mean_squared_error', mode='min', name='m5_best_1'),
])

In [None]:
%%time

# Second, fine-tuning pass with a much smaller learning rate.
lr = 1e-5
# ExportModelCallback is the class defined in this notebook; it is not a member of fastai's
# `callbacks` module, so it is referenced directly.
learn.fit_one_cycle(10, lr, callbacks=[
    callbacks.EarlyStoppingCallback(learn, monitor="root_mean_squared_error", mode="min", patience=30),
    ExportModelCallback(learn, monitor='root_mean_squared_error', mode='min', name='m5_best_2'),
])

In [None]:
!ls {path}

In [None]:
# Export the trained learner to the file `m5_model` inside `path`.
learn.export(file=str(path/'m5_model'))

In [None]:
# Reload the exported learner; from here on `learn` is the reloaded object.
# NOTE(review): the export is presumably a pickle, so only load files you created yourself.
learn = load_learner(path=str(path), file='m5_model')

## Prediction

In [None]:
from tqdm.notebook import tqdm

In [None]:
def fastai_tabular_predict(tst):
    '''
    Predict `sales` for every row of `tst` with the current `learn`.

    tst - a dataframe without the label column, holding the categorical (`cat_feats`) and
          continuous (`cont_names`) feature columns the learner was trained on.

    Returns a 1-D numpy array with one prediction per row, in the row order of `tst`.

    The rows are attached to the learner's own data as a test set. They therefore go through
    the preprocessing state fitted on the training data (category codes, normalisation
    statistics, fill values) instead of state re-fitted on `tst`, and they are read through
    the test dataloader, so the output lines up with the rows of `tst`.
    `tst` itself is not modified.
    '''
    # No `procs` here: add_test re-creates the items with the processors of the learner's data.
    test_items = TabularList.from_df(tst, path=path, cat_names=cat_feats, cont_names=cont_names)
    learn.data.add_test(test_items)
    # get_preds iterates over the whole test set and returns (predictions, targets).
    preds, _ = learn.get_preds(ds_type=DatasetType.Test)
    # Flatten to 1-D so that a single-row frame also yields an array of length one.
    return to_np(preds.view(-1))

In [None]:
%%time

# Recursive day-by-day forecast: the prediction for each day is written back into `te`,
# so it is part of the history when the features of the following days are rebuilt.
max_lags = 2 * h + 1
sub = 0.
cols = [f"F{i}" for i in range(1, 29)]
te = m5_common.create_dt(cal, prices, False, first_day=FIRST_DAY, path=path)

learn.model.eval()

for tdelta in tqdm(range(h), total=h):
    day = fday + timedelta(days=tdelta)
    print(tdelta, day)
    # Only the last `max_lags` days are needed to rebuild the features for `day`.
    window_start = day - timedelta(days=max_lags)
    in_window = (te.date >= window_start) & (te.date <= day)
    tst = te[in_window].copy()
    create_fea(tst)
    # Keep just the rows of the day being predicted, restricted to the model inputs.
    is_day = tst.date == day
    tst = tst.loc[is_day, train_cols]
    convert_float32(tst)
    te.loc[te.date == day, "sales"] = fastai_tabular_predict(tst)


In [None]:
%%time

# Keep only the forecast rows and reshape them to one row per id with columns F1..F28.
te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()

# Rows dated after the first `h` forecast days get their id suffix switched to "evaluation".
# regex=True is spelled out because "validation$" is a regular expression and the default of
# `regex` differs between pandas versions.
later_rows = te.date >= fday + timedelta(days=h)
te_sub.loc[later_rows, "id"] = te_sub.loc[later_rows, "id"].str.replace("validation$", "evaluation", regex=True)

# Number the rows of every id 1, 2, ... in their current order; these become the F-columns.
te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
te_sub.fillna(0., inplace = True)
te_sub.sort_values("id", inplace = True)
te_sub.reset_index(drop=True, inplace = True)
sub = te_sub

In [None]:
%time

sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

In [None]:
!wc -l submission.csv

In [None]:
pd.read_csv('submission.csv')