In [None]:
# default_exp modelling

# Modelling & submitting

> Playing with different models and submitting predictions over the test set to kaggle.

## TODOs
* remove the timestampElapsed field and see if the kaggle private score improves => it does a bit, reached 1.5 private loss
* feature importance https://scikit-learn.org/stable/modules/permutation_importance.html
* predict values one or two years into the future (the patterns should remain very similar) using the `timestampElapsed` field

Finding: make sure your test set values are not out of domain $\Rightarrow$ `timestampYear` was included as a feature in this notebook, but in the training set it only takes the value 2016.0, whereas in the test set it is 2017.0 or 2018.0. This caused the predictions to zero out everywhere.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
from ashrae import preprocessing

import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle


from sklearn import linear_model, tree, model_selection, ensemble

from fastai.tabular.all import *

import ipywidgets as widgets

In [None]:
pd.options.plotting.backend = "plotly"

In [None]:
# predict_test = True
# do_test: when True, the test set is loaded and test predictions are computed.
do_test = False
# do_submit: together with do_test, gates building and uploading the kaggle submission.
do_submit = False
# Directory holding the input files read below and the submission csv written at the end.
data_path = Path("../data")

In [None]:
#export
def evaluate_torch(y_true:torch.Tensor, y_pred:torch.Tensor):
    "Root mean squared error between `y_true` and `y_pred`."
    mean_squared_error = F.mse_loss(y_true, y_pred)
    return mean_squared_error.sqrt()

## Loading

In [None]:
%%time
# Load the pickled variable-name definitions and display them for inspection.
# NOTE(review): presumably the column names consumed by get_tabular_object further down — confirm.
var_names = preprocessing.load_var_names(data_path/'var_names.pckl')
var_names

In [None]:
%%time
# Training data frame read from X.parquet.
df = preprocessing.load_df(data_path/'X.parquet')

# The test frame is only loaded when test predictions are requested.
if do_test:
    df_test = preprocessing.load_df(data_path/'X_test.parquet')

## Sampling `df`

In [None]:
%%time
# Row count before sampling; used only for the percentage printed below.
# NOTE: `df` is overwritten by this cell, so re-running it without reloading
# samples from the already-sampled frame and reports a misleading percentage.
n = len(df)

# The two `if True` / `if False` blocks are manual switches between sampling strategies.
if True: # per building_id and meter sampling
    n_sample_per_bid = 50
    replace = True

    # Draw a fixed number of rows per (building_id, meter) group. Because
    # sampling is done with replacement, the same row (and therefore the same
    # index label) can appear more than once in the result.
    df = (df.groupby(['building_id', 'meter'])
         .sample(n=n_sample_per_bid, replace=replace))

    print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

if False: # general sampling
    frac_samples = .5
    replace = False

    # Plain random subset of the whole frame, ignoring the grouping.
    df = (df.sample(frac=frac_samples, replace=replace))

    print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

## Split

In [None]:
#export
def split_dataset(X:pd.DataFrame, split_kind:str='random',
                  train_frac:float=.8):
    """Split the rows of `X` into training and validation row positions.

    Parameters
    ----------
    X : pd.DataFrame
        Data to split. For `split_kind='time'` it must contain the column
        `timestampElapsed`.
    split_kind : str
        `'random'` assigns rows to the training set uniformly at random;
        `'time'` puts the earliest rows (by `timestampElapsed`) into the
        training set and the rest into the validation set.
    train_frac : float
        Fraction of rows, between 0 and 1, that should end up in the
        training set.

    Returns
    -------
    tuple of two lists
        `(train_idx, valid_idx)`: positional (`iloc`-style) row indices.

    Raises
    ------
    AssertionError
        If `split_kind` is not one of the supported kinds.
    ValueError
        If `train_frac` lies outside `[0, 1]`.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f'train_frac must be within [0, 1], got {train_frac}')

    n_rows = len(X)
    n_train = int(n_rows * train_frac)

    def random_split():
        # Draw row *positions* rather than index labels: the index of `X` may
        # contain duplicated labels (e.g. after sampling with replacement), and
        # a label-based `isin` would then pull in more rows than requested.
        train_bool = np.zeros(n_rows, dtype=bool)
        train_bool[np.random.permutation(n_rows)[:n_train]] = True
        return train_bool

    def time_split():
        time_col = 'timestampElapsed'
        # Everything is training data; there is no threshold row to look up.
        if n_train >= n_rows:
            return np.ones(n_rows, dtype=bool)
        # Rows strictly before the threshold time go to training. Rows sharing
        # the threshold timestamp all land in validation, so a timestamp never
        # straddles both sets (the training set can be slightly smaller than
        # `n_train` because of this).
        threshold_t = np.sort(X[time_col].values)[n_train]
        return (X[time_col] < threshold_t).values

    split_funs = {
        'random': random_split,
        'time': time_split,
    }

    assert split_kind in split_funs
    train_bool = split_funs[split_kind]()

    train_idx = np.where(train_bool)[0]
    valid_idx = np.where(~train_bool)[0]

    return (list(train_idx), list(valid_idx))

In [None]:
%%time
# Choose how rows are divided into training and validation: 'random' or 'time'.
split_kind = 'random'
#split_kind = 'time'
# `splits` is a (train positions, validation positions) pair.
splits = split_dataset(df, split_kind=split_kind, train_frac=.8)
#splits=None

In [None]:
%%time
# Build the tabular object from `df`, using the train/validation positions in `splits`.
to = preprocessing.get_tabular_object(df, var_names, splits=splits)

In [None]:
%%time
# Batch sizes for the training and the validation loader (2048 rows each).
train_bs = 256*8
val_bs = 256*8

dls = to.dataloaders(bs=train_bs, val_bs=val_bs)

In [None]:
%%time
# Batch size for the test loader (4096 rows).
test_bs = 1024*4

# The test loader is derived from `dls` and only built when test predictions are requested.
if do_test:
    test_dl = dls.test_dl(df_test, bs=test_bs) 

## Modelling with

### `sklearn`

In [None]:
%%time
# Estimator class and its keyword arguments are kept as two separate names so
# the commented-out alternative can be swapped in by toggling these lines.
params = {'n_estimators': 20, 'max_features': 'sqrt'}
model = ensemble.RandomForestRegressor
# params = {}
# model = linear_model.LinearRegression

# Instantiate the chosen estimator with the chosen parameters.
m = model(**params)

In [None]:
%%time
# Fit on plain numpy arrays; the targets are flattened with ravel() before fitting.
m.fit(to.train.xs.values, to.train.ys.values.ravel())

In [None]:
# Validation predictions of the fitted sklearn model.
y_valid_pred = m.predict(to.valid.xs.values)

if do_test:
    # Hand over a plain numpy array, exactly as for fitting and validation, so
    # the model receives the same input type everywhere.
    y_test_pred = m.predict(test_dl.xs.values)

In [None]:
# Score the sklearn predictions with evaluate_torch, the same function that is
# passed to the fastai learner as its loss.
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true), 
                          torch.from_numpy(y_valid_pred)).item()
print(f'sklearn loss {nb_score:.4f}')

### `fastai`

In [None]:
# Output range handed to the learner: from 0 up to the largest target value
# found in either the training or the validation split.
y_range = [0,
           np.max([to.train.ys.values.max(), to.valid.ys.values.max()]),]
y_range

In [None]:
#export
class Swish(nn.ReLU):
    "Activation computing `sigmoid(x) * x`; inherits the `inplace` flag from `nn.ReLU`."
    def forward(self, input:Tensor) -> Tensor:
        if not self.inplace:
            return torch.sigmoid(input) * input
        # In-place variant: compute the gate on a copy, then scale `input` itself.
        gate = input.clone()
        torch.sigmoid_(gate)
        input *= gate
        return input
    
class Sine(nn.ReLU):
    "Activation computing `sin(x)`; inherits the `inplace` flag from `nn.ReLU`."
    def forward(self, input:Tensor) -> Tensor:
        # Pick the in-place or the out-of-place sine depending on the flag.
        sine_fn = torch.sin_ if self.inplace else torch.sin
        return sine_fn(input)

In [None]:
# Layer sizes passed to tabular_learner.
layers = [50, 25]

# `config = None` is the active setting; the commented lines below are
# alternative activation settings.
config = None
# config = tabular_config(act_cls=nn.ReLU(inplace=True))
# config = tabular_config(act_cls=Swish(inplace=True))
# config = tabular_config(act_cls=Sine(inplace=True))

# evaluate_torch (RMSE) is used directly as the training loss.
learn = tabular_learner(dls, y_range=y_range, layers=layers,
                        n_out=1, config=config, loss_func=evaluate_torch)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(5, 5e-2)

In [None]:
%%time
# NOTE(review): get_preds() without a `dl` presumably evaluates the validation set — confirm.
# These names overwrite the sklearn results computed earlier in the notebook.
y_valid_pred, y_valid_true = learn.get_preds()

# For the test loader only the predictions are kept; the second return value is discarded.
if do_test:
    y_test_pred, _ = learn.get_preds(dl=test_dl)

In [None]:
# Score of the fastai model; overwrites the `nb_score` of the sklearn model.
nb_score = evaluate_torch(y_valid_true, 
                          y_valid_pred).item()
print(f'fastai loss {nb_score:.4f}')

In [None]:
# clone numpy ravel
def cnr(x):
    "Copy tensor `x` and return the copy as a flattened numpy array."
    copied = x.clone()
    return copied.numpy().ravel()

## Inspecting

### `dep_var` distribution

Train vs validation distributions

In [None]:
def pick_random(x, size=5000):
    """Return up to `size` values drawn from `x` without replacement.

    `x` is flattened first, so column-shaped inputs of shape `(n, 1)` are
    accepted as well as 1-D ones. When `x` holds fewer than `size` values, all
    of them are returned (in random order) instead of raising.
    NOTE(review): non-numpy inputs are converted with `np.asarray`; this is
    expected to work for CPU tensors without gradients — confirm for other inputs.
    """
    flat = np.asarray(x).ravel()
    n_pick = min(size, len(flat))
    return np.random.choice(flat, size=n_pick, replace=False)

In [None]:
# Truth and prediction are sub-sampled independently of each other, so the plot
# compares the two value distributions rather than paired observations.
preprocessing.hist_plot_preds(pick_random(y_valid_true), 
                              pick_random(y_valid_pred), 
                              label0='truth', label1='prediction')

In [None]:
# Compare the validation truth with the test-set predictions.
# Both are sub-sampled independently, so only the distributions are compared.
if do_test:
    preprocessing.hist_plot_preds(pick_random(y_valid_true), 
                                  pick_random(y_test_pred), 
                                  label0='truth (validation)', 
                                  label1='prediction (test set)')

### Boldly wrong predictions

In [None]:
%%time
# `splits[1]` holds the validation row positions, hence `iloc` to pick the
# timestamps that belong to the validation rows; a copy is handed over.
bwt = preprocessing.BoldlyWrongTimeseries(to.valid.xs, y_valid_true, y_valid_pred,
                                          t=df.iloc[splits[1]].loc[:,['timestampElapsed']].copy())

In [None]:
bwt.run_boldly()

Finding fastai:
- sudden jumps between 0 and 8 or so cause issues for the model. For the most serious cases, like bid 79 and meter 2, the predicted values lie between the true values
- there are oddly frequent values at 3.766 for meter 2 bid 76, 7.6 for meter 3 bid 1219, and 3.7, 4.39 and 5.76 for meter 3 bid 1257


## Submission to kaggle

In [None]:
# Build the submission frame; only done when both test predictions and a submission are requested.
if do_test and do_submit:
    # exp(y) - 1 is the inverse of log(1 + y).
    # NOTE(review): presumably the target was log1p-transformed during preprocessing — confirm.
    y_test_pred_original = torch.exp(tensor(y_test_pred)) - 1

    # One `meter_reading` column, indexed like the test frame.
    y_out = pd.DataFrame(cnr(y_test_pred_original),
                         columns=['meter_reading'],
                         index=df_test.index)
#     y_out.index.rename('row_id', inplace=True) # TODO: make sure the row_id value is actually correct
    display(y_out.head())

    # NOTE(review): 41697600 is presumably the number of rows kaggle expects in the submission — confirm.
    assert len(y_out) == 41697600

In [None]:
%%time
# Write the submission csv only when test predictions exist and a submission is
# requested. The guard uses `do_test`: the previously referenced `predict_test`
# is never defined in this notebook and raised a NameError.
if do_test and do_submit:
    y_out.to_csv(data_path/'my_submission.csv')

`kaggle competitions submit -c ashrae-energy-prediction -f submission.csv -m "Message"`

In [None]:
# Submission note: pick the line matching the model that produced `nb_score`.
message = ['random forest', '50 obs/bid', f'nb score {nb_score:.4f}']
# message = ['linear model', '50 obs/bid', f'nb score {nb_score:.4f}']
# message = ['tabular_learner', '50 obs/bid', f'nb score {nb_score:.4f}']
# Join the parts into the single string passed to the kaggle CLI.
message = ' + '.join(message)
message

In [None]:
# Upload my_submission.csv through the kaggle command line tool, using
# `message` as the submission note; requires both flags to be set.
if do_test & do_submit:
    print('Submitting...')
    !kaggle competitions submit -c ashrae-energy-prediction -f '{data_path}/my_submission.csv' -m '{message}'

**submission scores**

random forest:
- 5 obs per building ID, .75 max_features, 100 estimators: 
    - nb score = 2.37
    - kaggle score = 1.68 / 1.86
    
tabular learner:
- 5 obs per building ID, layers=[500,250], lr = 2e-3: 
    - nb score = 1.55
    - kaggle score = 1.8 / 2.13
- 5 obs per building ID, layers=[500,250], second run with lr = 1e-3: 
    - nb score = 1.57
    - kaggle score = 1.846 / 2.13
- 50 obs per building ID, layers=[500,250], 2 rounds: 
    - nb score = 1.39
    - kaggle score = 1.722 / 2.51
- 50 obs per building ID, layers=[500,250], 2 rounds: 
    - nb score = 1.34
    - kaggle score = 1.641 / 2.266
- 50 obs per building ID, layers=[500,250], 2 rounds, bs=256: 
    - nb score = 1.32
    - kaggle score = 1.643 / 1.926
- 500 obs per building ID, layers=[500,250], 3 rounds: 
    - nb score = 1.19
    - kaggle score = 1.62 / 2.55

Finding:
- nb scores are lower than the kaggle scores
- random forest seems to have public and private score closer to each other than tabular learner

**randomly splitting**
    
Finding (modified target values, all info = info except time):
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100k: 2.3
    - all info incl time @100k: 2.32
    - all info incl time + ids @100k: 2.32
- RandomForest:
    - meter only @100k: 2.2
    - all info minus time @100k: 2.7
    - all info incl time @100k: 2.74
    - all info incl time + ids @100k: 2.82
- tabular_learner:
    - meter only @100k: 2.1
    - all info minus time @100k: 1.56
    - all info incl time @100k: 1.52
    - all info incl time + ids @100k: 0.96
    
**splitting along time**
Finding:
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.2
    - all info incl time @100k: 2.3
    - all info incl time + ids @100k: 2.29
- RandomForest:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.7
    - all info incl time @100k: 2.52
    - all info incl time + ids @100k: 2.62
- tabular_learner:
    - meter only @100k: 2.06
    - all info minus time @100K: 1.62
    - all info incl time @100k: 1.62
    - all info incl time + ids @100k: 1.31