In [None]:
# default_exp modelling

# Modelling & submitting

> Playing with different models and submitting predictions over the test set to kaggle. Predicting each meter individually. 

The current implementation of this notebook achieves the following private leaderboard scores:
- baseline (linear regression on dep_var_stats and meter 1hot): 1.7
- RandomForest: 1.65
- tabular_learner: 1.55
- lgbm: 1.67

These scores correspond to a validation set error (`nb score`) of 0.8 to 1.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
from ashrae import preprocessing

import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle


from sklearn import linear_model, tree, model_selection, ensemble

from fastai.tabular.all import *

import lightgbm as lgb

import ipywidgets as widgets

In [None]:
# Make pandas' `.plot` accessor render with plotly.
pd.options.plotting.backend = "plotly"

In [None]:
# Notebook switches.
do_test = True  # load the test set and compute test set predictions
do_submit = True  # write the submission csv and submit it to kaggle
data_path = Path("../data")  # directory with the parquet / pickle inputs and the submission file

In [None]:
#export
def evaluate_torch(y_true:torch.Tensor, y_pred:torch.Tensor):
    'Root mean squared error between `y_true` and `y_pred` (square root of `F.mse_loss`).'
    mean_squared_error = F.mse_loss(y_true, y_pred)
    return mean_squared_error.sqrt()

In [None]:
#export
def cnr(x):
    """Clone-numpy-ravel: return the values of tensor `x` as a flat numpy array.

    The tensor is detached and moved to the CPU first, so tensors that require
    grad or live on an accelerator are accepted as well. The result is a copy:
    writing to it never modifies `x`.
    """
    return x.detach().cpu().clone().numpy().ravel()

## Loading

In [None]:
%%time
# Variable-name groups; later cells read the keys 'dep_var', 'conts' and 'cats'.
var_names = preprocessing.load_var_names(data_path/'var_names.pckl')
var_names

In [None]:
%%time
# Training frame; the commented-out `.sample(...)` is a switch for quick experiments.
df = preprocessing.load_df(data_path/'X.parquet')#.sample(100000)

if do_test:
    # Test frame, only needed when test set predictions are requested.
    df_test = preprocessing.load_df(data_path/'X_test.parquet')#.sample(100000)

## Sampling `df`

In [None]:
%%time
# Row count before any sampling; used for the percentage printed below and
# again in the submission message.
n = len(df)

# Both sampling variants are disabled (`if False`); flip one to `True` to subsample `df`.
if False: # per building_id and meter sampling
    n_sample_per_bid = 500
    replace = True

    df = (df.groupby(['building_id', 'meter'])
         .sample(n=n_sample_per_bid, replace=replace))

if False: # general sampling
    frac_samples = .9
    replace = False

    df = (df.sample(frac=frac_samples, replace=replace))

print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

## Split

In [None]:
var_names

In [None]:
# Reduced variable selection: continuous variables whose name contains
# 'meter_reading' and categorical variables whose name starts with 'meter_'.
var_names_no_anns = dict(
    dep_var=var_names['dep_var'],
    conts=[name for name in var_names['conts'] if 'meter_reading' in name],
    cats=[name for name in var_names['cats'] if name.startswith('meter_')],
)
var_names_no_anns

In [None]:
# Full variable selection minus the categorical variables starting with 'meter_'
# (each meter gets its own model, see the split below).
var_names_anns = dict(
    dep_var=var_names['dep_var'],
    conts=var_names['conts'],
    cats=[name for name in var_names['cats'] if not name.startswith('meter_')],
)
var_names_anns

In [None]:
%%time
# Handed to `preprocessing.split_dataset` as `t_train` in the split cell below.
t_train = pd.read_parquet(data_path/'t_train.parquet')

In [None]:
%%time
# Split strategy handed to `preprocessing.split_dataset`.
split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
# split_kind = 'time_split_day'

# t_train = None
train_frac = .8
meter_train_samples = {}  # meter -> number of training rows

all_splits = {}  # meter -> splits as returned by `preprocessing.split_dataset`
for meter, _df in df.groupby('meter'):
    splits = preprocessing.split_dataset(_df, split_kind=split_kind,
                                         train_frac=train_frac,
                                         t_train=t_train)
    all_splits[meter] = splits
    meter_train_samples[meter] = len(splits[0])
    # Both fractions are reported relative to this meter's rows (`_df`), so that
    # train and valid shares are comparable and add up per meter.
    print(f'meter: {meter} ⇒ sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(_df):.4f}, valid {len(splits[1])} = {len(splits[1])/len(_df):.4f}')

In [None]:
%%time

# procs = [] 
procs = [FillMissing, Normalize, Categorify]

# One tabular object per meter, built from that meter's rows and its splits.
# `var_names_no_anns` and `var_names` are alternative variable selections.
tos = {}
for meter, _df in df.groupby('meter'):
    print(meter)
    display(_df.head())
    splits = all_splits[meter]
    tos[meter] = preprocessing.get_tabular_object(
        _df, var_names_anns, splits=splits, procs=procs,
    )

In [None]:
meter_train_samples

In [None]:
%%time
# train_bs = 256
# val_bs = 256
# Batch size per meter; training and validation share the same mapping object.
val_bs = train_bs = {0: 100000, 1: 40000, 2: 20000, 3: 10000}

all_dls = {
    m: tab.dataloaders(bs=train_bs[m], val_bs=val_bs[m])
    for m, tab in tos.items()
}

Warning: Takes about 12min with the test set

In [None]:
%%time
test_bs = 4 * 4096

if do_test:
    # One test dataloader per meter, restricted to the test rows of that meter.
    all_test_dls = {
        m: dl.test_dl(df_test[df_test['meter'] == m], bs=test_bs)
        for m, dl in all_dls.items()
    }

## Modelling with `sklearn`, `fastai` and `lightgbm`

In [None]:
def sort_ys(ys:typing.List[tuple]):
    '''Concatenate `ys`, a list of `(index, values)` tuples, into one `pd.Series`.

    The parts are stacked in the order given; no sorting by index is performed.
    '''
    values = [part for (_, part) in ys]
    labels = [ix for (ix, _) in ys]
    return pd.Series(np.concatenate(values), index=np.concatenate(labels))

### `sklearn`

In [None]:
%%time
# Estimator class and its keyword arguments; for the linear baseline use
# `model = linear_model.LinearRegression` with `params = {}`.
model = ensemble.RandomForestRegressor
params = dict(n_estimators=20, max_features='sqrt')

# One fresh, unfitted estimator per meter.
ms = {m: model(**params) for m in tos}

In [None]:
%%time
# Fit each meter's estimator on that meter's training features and flattened targets.
for meter, reg in ms.items():
    reg.fit(tos[meter].train.xs.values,
            tos[meter].train.ys.values.ravel())

In [None]:
%%time
# Per meter: (validation index, predictions on the validation features).
y_valid_preds = [
    (tab.valid.xs.index, ms[m].predict(tab.valid.xs.values))
    for m, tab in tos.items()
]

In [None]:
# Stack the per-meter validation predictions into one index-labelled Series.
y_valid_pred = sort_ys(y_valid_preds)
y_valid_pred

In [None]:
%%time
if do_test:
    # Predict on plain arrays (`.values`), matching how the estimators were fitted
    # and how the validation predictions are made; passing the DataFrame itself
    # makes recent scikit-learn versions warn about unexpected feature names.
    y_test_preds = [(all_test_dls[meter].xs.index,
                     ms[meter].predict(all_test_dls[meter].xs.values))
                     for meter in all_test_dls]
    y_test_pred = sort_ys(y_test_preds)

In [None]:
# Validation targets of all meters, stacked in the same meter order as the predictions.
y_valid_true = sort_ys([
    (tab.valid.ys.index, tab.valid.ys.values.ravel())
    for tab in tos.values()
])

In [None]:
y_valid_true

In [None]:
# Validation RMSE over all meters for the sklearn models.
valid_true_t = torch.from_numpy(y_valid_true.values)
valid_pred_t = torch.from_numpy(y_valid_pred.values)
nb_score = evaluate_torch(valid_true_t, valid_pred_t).item()
print(f'sklearn loss {nb_score:.4f}')

### `fastai`

Fastai finding: make sure the test set values are not out of domain. In this notebook `timestampYear` is included in the training features, but in the training set it only takes the value 2016.0, whereas the test set contains 2017.0 and 2018.0. This causes the predictions to zero out everywhere.

In [None]:
# Output range per meter: from 0 up to the largest target seen in train or valid.
y_ranges = {
    m: [0, np.maximum(tab.train.ys.values.max(), tab.valid.ys.values.max())]
    for m, tab in tos.items()
}
y_ranges

In [None]:
#export
class Swish(nn.ReLU):
    'Swish activation `x * sigmoid(x)`; reuses the `inplace` flag of `nn.ReLU`.'
    def forward(self, input:Tensor) -> Tensor:
        if not self.inplace:
            return torch.sigmoid(input) * input
        # In-place variant: compute the gate on a copy, then scale `input` itself.
        gate = input.clone()
        torch.sigmoid_(gate)
        input *= gate
        return input
    
class Sine(nn.ReLU):
    'Sine activation `sin(x)`; reuses the `inplace` flag of `nn.ReLU`.'
    def forward(self, input:Tensor) -> Tensor:
        # `torch.sin_` overwrites `input`, `torch.sin` returns a new tensor.
        sine = torch.sin_ if self.inplace else torch.sin
        return sine(input)

In [None]:
layers = [500, 250, 125]

# Dropout settings: `embed_p` for the embeddings, one entry in `ps` per layer
# (0.2 for the first layer, 0.1 for the rest).
embed_p = .1
ps = [.2] + [.1] * (len(layers) - 1)

# Other activations (`nn.ReLU`, `Swish`, `Sine`) can be tried via `act_cls`.
config = tabular_config(embed_p=embed_p, ps=ps)

# One tabular learner per meter, trained directly on the RMSE.
learners = {
    m: tabular_learner(dl, y_range=y_ranges[m],
                       layers=layers, n_out=1,
                       config=config, loss_func=evaluate_torch)
    for m, dl in all_dls.items()
}

In [None]:
# learners[meter] = tabular_learner(all_dls[meter], y_range=y_ranges[meter], 
#                                   layers=layers, n_out=1, 
#                                   config=config, loss_func=evaluate_torch)

In [None]:
meter = 3  # meter whose learner the following cells operate on

In [None]:
# Learning-rate finder for the selected meter's learner.
learners[meter].lr_find()

In [None]:
# Train every per-meter learner. Fitting only `learners[meter]` would leave the
# other learners untrained on a fresh "Restart & Run All", although the cells
# below predict with all of them.
for _meter, _learner in learners.items():
    print(f'fitting learner of meter {_meter}')
    _learner.fit_one_cycle(12, lr_max=7e-2)

In [None]:
# Losses recorded while training the selected meter's learner.
learners[meter].recorder.plot_loss()

0: .42
1: .87
2: 1.1
3: 1.12

In [None]:
%%time
# Per meter: validation predictions and targets, each paired with the validation index.
y_valid_preds = []
y_valid_true = []
for meter, learner in learners.items():
    preds, targs = learner.get_preds()
    valid_index = tos[meter].valid.xs.index
    y_valid_preds.append((valid_index, cnr(preds)))
    y_valid_true.append((valid_index, cnr(targs)))

In [None]:
# Stack the per-meter validation predictions into one index-labelled Series.
y_valid_pred = sort_ys(y_valid_preds)
y_valid_pred

In [None]:
# Stack the per-meter validation targets into one index-labelled Series
# (rebinds `y_valid_true` from the list of tuples to the Series).
y_valid_true = sort_ys(y_valid_true)
y_valid_true

In [None]:
%%time
if do_test:
    # Per meter: (test index, flattened learner predictions on the test dataloader).
    y_test_preds = [
        (dl.xs.index, cnr(learners[m].get_preds(dl=dl)[0]))
        for m, dl in all_test_dls.items()
    ]
    y_test_pred = sort_ys(y_test_preds)

In [None]:
# Validation RMSE over all meters for the fastai learners.
valid_true_t = torch.from_numpy(y_valid_true.values)
valid_pred_t = torch.from_numpy(y_valid_pred.values)
nb_score = evaluate_torch(valid_true_t, valid_pred_t).item()
print(f'fastai loss {nb_score:.4f}')

In [None]:
y_valid_pred, y_valid_true

### `lightgbm`

In [None]:
%%time
# Per-meter lightgbm datasets; each validation set references its training set.
lgb_trains = {
    m: lgb.Dataset(tab.train.xs.values, label=tab.train.ys.values.ravel())
    for m, tab in tos.items()
}
lgb_evals = {
    m: lgb.Dataset(tab.valid.xs.values, label=tab.valid.ys.values.ravel(),
                   reference=lgb_trains[m])
    for m, tab in tos.items()
}

In [None]:
# specify your configurations as a dict
# Training configuration shared by the lightgbm models of all meters.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='l2',
    num_leaves=42,
    learning_rate=0.5,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
%%time
# One booster per meter. Early stopping is requested through the
# `lgb.early_stopping` callback, because the `early_stopping_rounds` keyword of
# `lgb.train` is no longer available in current lightgbm releases.
gbms = {meter: lgb.train(params, lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_evals[meter]],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
        for meter, lgb_train in lgb_trains.items()}

In [None]:
%%time
# Per meter: (validation index, booster predictions at its best iteration).
y_valid_preds = [
    (tab.valid.xs.index,
     gbms[m].predict(tab.valid.xs.values, num_iteration=gbms[m].best_iteration))
    for m, tab in tos.items()
]

In [None]:
# Stack the per-meter validation predictions into one index-labelled Series.
y_valid_pred = sort_ys(y_valid_preds)
y_valid_pred

In [None]:
%%time
if do_test:
    # Per meter: (test index, booster predictions at its best iteration).
    y_test_preds = [
        (dl.xs.index,
         gbms[m].predict(dl.xs.values, num_iteration=gbms[m].best_iteration))
        for m, dl in all_test_dls.items()
    ]
    y_test_pred = sort_ys(y_test_preds)

In [None]:
# Validation targets of all meters, stacked in the same meter order as the predictions.
y_valid_true = sort_ys([
    (tab.valid.ys.index, tab.valid.ys.values.ravel())
    for tab in tos.values()
])

In [None]:
y_valid_true

In [None]:
# Validation RMSE over all meters for the lightgbm boosters.
valid_true_t = torch.from_numpy(y_valid_true.values)
valid_pred_t = torch.from_numpy(y_valid_pred.values)
nb_score = evaluate_torch(valid_true_t, valid_pred_t).item()
print(f'lightgbm loss {nb_score:.4f}')

## Inspecting

### `dep_var` distribution

Truth vs. prediction distributions: first on the validation set, then validation truth against the test set predictions.

In [None]:
#export
def pick_random(x, size:int=5000):
    """Draw up to `size` elements from `x` at random, without replacement.

    `x` is anything `np.random.choice` accepts (array-like, or an int `k`
    standing for `np.arange(k)`). If `x` holds fewer than `size` elements, all
    of them are returned in random order instead of raising a `ValueError`.
    """
    n_available = int(x) if isinstance(x, (int, np.integer)) else len(x)
    return np.random.choice(x, size=min(size, n_available), replace=False)

In [None]:
# Compare random subsamples of validation truth and validation predictions.
# The two `pick_random` calls draw independently, so the samples are not paired row-wise.
preprocessing.hist_plot_preds(pick_random(y_valid_true), 
                              pick_random(y_valid_pred), 
                              label0='truth', label1='prediction')

In [None]:
if do_test:
    # Same comparison, but with test set predictions against validation truth
    # (independently drawn subsamples).
    preprocessing.hist_plot_preds(pick_random(y_valid_true), 
                                  pick_random(y_test_pred), 
                                  label0='truth (validation)', 
                                  label1='prediction (test set)').show()

### Boldly wrong predictions

In [None]:
%%time
# Identifier columns wanted for the inspection, and those of them that are absent
# from the validation features (checked on the tabular object of meter 0).
base_cols = ['building_id', 'meter','timestamp']
valid_columns = tos[0].valid.xs.columns
miss_cols = [col for col in base_cols if col not in valid_columns]
miss_cols

In [None]:
if miss_cols:
    # Drop the base columns already present, then join all base columns from `df`
    # by index so that every one of them comes from the original frame.
    present_cols = set(base_cols).difference(miss_cols)
    tmp = pd.concat([
        tab.valid.xs.drop(columns=present_cols).join(df.loc[:, base_cols])
        for tab in tos.values()
    ])
else:
    tmp = pd.concat([tab.valid.xs for tab in tos.values()])

In [None]:
# NOTE(review): presumably pairs validation rows with their truth / prediction to
# find the worst predicted time series — confirm in `preprocessing`.
bwt = preprocessing.BoldlyWrongTimeseries(tmp, y_valid_true, y_valid_pred)

In [None]:
# Run the inspection; its behaviour is defined by `preprocessing.BoldlyWrongTimeseries`.
bwt.run_boldly()

Finding:
- lgbm makes predictions of negative values!

## Submission to kaggle

In [None]:
# `y_test_pred` only exists when the test path is enabled, so guard like the
# other test set cells do.
if do_test:
    y_test_pred = y_test_pred.sort_index()
    display(y_test_pred)

In [None]:
%%time
if do_test:
    # Align the predictions with the rows of `df_test` by index label instead of
    # relying on the sorted predictions and `df_test` sharing the same row order.
    y_test_aligned = y_test_pred.loc[df_test.index]

    # Undo the log transform of the target: `expm1(x)` equals `exp(x) - 1` but
    # stays accurate for small `x`.
    y_test_pred_original = torch.expm1(tensor(y_test_aligned))

    y_out = pd.DataFrame(cnr(y_test_pred_original),
                         columns=['meter_reading'],
                         index=df_test.index)
    display(y_out.head())

    # Number of rows the submission is expected to have.
    assert len(y_out) == 41697600, f'expected 41697600 rows, got {len(y_out)}'

In [None]:
%%time
# `y_out` is only built when `do_test` is set, so require both switches
# (same condition as the kaggle submit cell).
if do_test and do_submit:
    y_out.to_csv(data_path/'my_submission.csv')

Command line template for submitting the file written above: `kaggle competitions submit -c ashrae-energy-prediction -f ../data/my_submission.csv -m "Message"`

In [None]:
# Free-text description attached to the kaggle submission: model, data volume,
# split, features and the validation score of the last evaluated model.
act = 'ReLu'

score_msg = f'nb score {nb_score:.4f}'
features_msg = f'train_bs = {train_bs} dep_var_stats and 1hot meter and remove leading empty weeks and us_holidays and fix bid 363'
samples_msg = f'num samples {len(df)} = {len(df)/n*100:.2f} %'
split_msg = f'split kind "{split_kind}" train_frac {train_frac}'
# For the other models swap in e.g. f'baseline (linear regression)'.
model_msg = f'tabular_learner (one per meter): act {act}, layers {layers}, ps {ps}, embed_p {embed_p}'

message_parts = [model_msg, samples_msg, split_msg, features_msg, score_msg]
message = ' + '.join(message_parts)
message

In [None]:
if do_test and do_submit:
    print('Submitting...')
    # `{data_path}` and `{message}` are interpolated by IPython before the shell runs.
    # `message` sits inside single quotes, so it must not contain a single quote itself.
    !kaggle competitions submit -c ashrae-energy-prediction -f '{data_path}/my_submission.csv' -m '{message}'

**submission scores**

random forest:
- 5 obs per building ID, .75 max_features, 100 estimators: 
    - nb score = 2.37
    - kaggle score = 1.68 / 1.86
    
tabular learner:
- 5 obs per building ID, layers=[500,250], lr = 2e-3: 
    - nb score = 1.55
    - kaggle score = 1.8 / 2.13
- 5 obs per building ID, layers=[500,250], second run with lr = 1e-3: 
    - nb score = 1.57
    - kaggle score = 1.846 / 2.13
- 50 obs per building ID, layers=[500,250], 2 rounds: 
    - nb score = 1.39
    - kaggle score = 1.722 / 2.51
- 50 obs per building ID, layers=[500,250], 2 rounds: 
    - nb score = 1.34
    - kaggle score = 1.641 / 2.266
- 50 obs per building ID, layers=[500,250], 2 rounds, bs=256: 
    - nb score = 1.32
    - kaggle score = 1.643 / 1.926
- 500 obs per building ID, layers=[500,250], 3 rounds: 
    - nb score = 1.19
    - kaggle score = 1.62 / 2.55

Finding:
- nb scores are lower than the kaggle scores
- random forest seems to have public and private score closer to each other than tabular learner

**randomly splitting**
    
Finding (modified target values, all info = info except time):
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100k: 2.3
    - all info incl time @100k: 2.32
    - all info incl time + ids @100k: 2.32
- RandomForest:
    - meter only @100k: 2.2
    - all info minus time @100k: 2.7
    - all info incl time @100k: 2.74
    - all info incl time + ids @100k: 2.82
- tabular_learner:
    - meter only @100k: 2.1
    - all info minus time @100k: 1.56
    - all info incl time @100k: 1.52
    - all info incl time + ids @100k: 0.96
    
**splitting along time**
Finding:
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.2
    - all info incl time @100k: 2.3
    - all info incl time + ids @100k: 2.29
- RandomForest:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.7
    - all info incl time @100k: 2.52
    - all info incl time + ids @100k: 2.62
- tabular_learner:
    - meter only @100k: 2.06
    - all info minus time @100K: 1.62
    - all info incl time @100k: 1.62
    - all info incl time + ids @100k: 1.31