In [None]:
# default_exp modelling

# Modelling meter readings

> First building a base model predicting medians/means, then linear, tree-based and ANN-based models.

Finding: make sure the feature values in your test set are not out of domain. In this notebook `timestampYear` is used as a training feature, but in the training set it only ever takes the value 2016.0, whereas in the test set it is 2017.0 or 2018.0. This causes the predictions to zero out everywhere.

In [None]:
#export
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle


from sklearn import linear_model, tree, model_selection, ensemble

from fastai.tabular.all import *

In [None]:
# Route pandas' `.plot` calls through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"

In [None]:
# Directory holding the pickled variable types and the parquet inputs read below.
base_path = Path("../data")

In [None]:
#export
def evaluate_torch(y_true:torch.Tensor, y_pred:torch.Tensor):
    "Root mean squared error between `y_pred` and `y_true`, returned as a scalar tensor."
    squared_error = (y_pred - y_true).pow(2)
    return squared_error.mean().sqrt()

## Loading

In [None]:
%%time
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
with open(base_path/'var_types.pckl', 'rb') as f:
    var_types = pickle.load(f)

In [None]:
# Split the loaded mapping into the continuous and categorical variable lists;
# the bare tuple on the last line displays both.
continuous, categorical = var_types['cont'], var_types['cat']
continuous, categorical

In [None]:
# Keep only the first nine continuous variables plus the very last one.
# NOTE(review): the 9 is a positional magic number — presumably it drops
# unwanted variables in between; confirm which ones against `var_types`.
continuous = continuous[:9] + [continuous[-1]]
continuous

## Loading `X` and `X_test`

In [None]:
%%time
# Read the complete training frame; the commented-out `.sample` is a leftover
# switch for working on a smaller random subset.
X_all = pd.read_parquet(base_path/'X.parquet') #.sample(100000)

In [None]:
# Summary statistics of the number of rows available per building.
X_all.groupby('building_id').size().describe()

In [None]:
# Sub-sampling settings: rows drawn per building, and whether to draw with replacement.
n_sample_per_bid = 50
replace = True

In [None]:
# Draw a fixed number of rows per building. The fixed `random_state` makes the
# sub-sample (and every score computed from it) reproducible across kernel restarts.
# NOTE(review): with `replace=True` a row can be drawn more than once, so the
# sampled frame may contain duplicated rows and duplicated index labels.
X = (X_all.groupby('building_id')
     .sample(n=n_sample_per_bid, replace=replace, random_state=42))

In [None]:
# Report how large the sub-sample is relative to the full training frame.
print(f'using {len(X)} samples = {len(X)/len(X_all)*100:.2f} %')

In [None]:
# Row count and first rows of the sub-sample.
display(len(X), X.head())

In [None]:
%%time
# Read the complete test frame; the commented-out `.sample` is a leftover
# switch for working on a smaller random subset.
X_test = pd.read_parquet(base_path/'X_test.parquet') #.sample(10000)

In [None]:
# Row count and first rows of the test frame.
display(len(X_test), X_test.head())

In [None]:
#export
def split_dataset(X:pd.DataFrame, split_kind:str='random',
                  train_frac:float=.8):
    """Split `X` into training and validation row positions.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to split. A `'time'` split reads its `timestampElapsed` column.
    split_kind : str
        `'random'` draws `int(len(X)*train_frac)` rows uniformly at random;
        `'time'` puts every row strictly older than the row found at that
        position of the time-sorted frame into the training set.
    train_frac : float
        Fraction of rows for the training set, within `[0, 1]`.

    Returns
    -------
    tuple
        `(train_idx, valid_idx)`, two lists of positional row indices
        (usable with `.iloc`).

    Raises
    ------
    ValueError
        If `train_frac` lies outside `[0, 1]`.
    AssertionError
        If `split_kind` is not a known split.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f'train_frac must be within [0, 1], got {train_frac}')

    n_rows = len(X)
    n_train = int(n_rows*train_frac)

    def random_split():
        # Sample row *positions*, not index labels: labels may be duplicated
        # (e.g. after sampling with replacement), in which case a label-based
        # `isin` marks more than `n_train` rows as training rows.
        train_bool = np.zeros(n_rows, dtype=bool)
        train_bool[np.random.choice(n_rows, size=n_train, replace=False)] = True
        return train_bool

    def time_split():
        time_col = 'timestampElapsed'
        # No row is left for validation, so there is no threshold to look up.
        if n_train >= n_rows:
            return np.ones(n_rows, dtype=bool)
        ts = X[time_col].sort_values(ascending=True)
        threshold_t = ts.iloc[n_train]
        # Strict comparison: rows tied with the threshold go to validation, so
        # the training set never shares a timestamp with the validation set.
        return (X[time_col] < threshold_t).values

    split_funs = {
        'random': random_split,
        'time': time_split,
    }

    assert split_kind in split_funs, f'unknown split_kind {split_kind!r}'
    train_bool = split_funs[split_kind]()

    train_idx = np.where(train_bool)[0]
    valid_idx = np.where(~train_bool)[0]

    return (list(train_idx), list(valid_idx))

In [None]:
%%time
# Choose how to split; the commented-out assignment switches to the time-based split.
split_kind = 'random'
#split_kind = 'time'
# `splits` is a (train positions, validation positions) pair.
splits = split_dataset(X, split_kind=split_kind, train_frac=.8)
#splits=None

In [None]:
# Which months are represented in the training part of the split.
sorted(X.iloc[splits[0]].loc[:, 'timestampMonth'].unique())

Super simplistic input data

In [None]:
# %%time
# procs = [Categorify, FillMissing, Normalize]
# to = TabularPandas(X, procs, ['meter'],
#                    [], y_names='meter_reading', splits=splits)

All input data

In [None]:
%%time
# Preprocessing steps handed to fastai's TabularPandas.
procs = [Categorify, FillMissing, Normalize]
# Wrap the sub-sample with the categorical / continuous variable lists, the
# `meter_reading` target and the train/validation positions computed above.
to = TabularPandas(X, procs, categorical,
                   continuous, 
                   y_names='meter_reading', splits=splits)

# Batch sizes for the training and validation dataloaders.
train_bs = 256
val_bs = 256
dls = to.dataloaders(bs=train_bs, val_bs=val_bs)

In [None]:
%%time
# Build a dataloader for the test frame from the training dataloaders.
test_dl = dls.test_dl(X_test, bs=1024)

In [None]:
# Inspect the processed training features.
to.train.xs

In [None]:
# Inspect the training targets.
to.train.ys

## Modelling

In [None]:
def hist_plot_preds(y_valid_pred, y_test_pred):
    """Overlay density histograms (with marginal box plots) of two prediction sets.

    The first argument is labelled `'valid'` and the second `'test'` in the
    plot; the plotly figure is returned.
    """
    labelled = (('valid', y_valid_pred), ('test', y_test_pred))
    # One long frame: prediction value in `y`, originating set in `set`.
    frames = [
        pd.DataFrame({'y': preds, 'set': [label]*len(preds)})
        for label, preds in labelled
    ]
    res = pd.concat(frames, ignore_index=True)

    fig = px.histogram(res, x='y', color='set', marginal='box',
                       barmode='overlay', histnorm='probability density')
    return fig

### super simple base model

In [None]:
class SimpleEstimator:
    """Baseline that predicts a per-group summary statistic of the target.

    Rows are grouped by whichever of `building_id` and `meter` exist in the
    training frame; `predict` looks up the requested statistic of the group
    each row belongs to.
    """
    # Grouping columns found in the training frame; set by `fit`.
    cols = []

    def fit(self, X, y):
        """Compute `describe()` statistics of `y` per group of `X`.

        Parameters
        ----------
        X : pd.DataFrame
            Features; must contain `building_id` and/or `meter`.
        y : array-like
            Target values, matched to the rows of `X` by position.

        Raises
        ------
        ValueError
            If `X` has no grouping column or `y` has a different length.
        """
        self.cols = [col for col in ['building_id', 'meter'] if col in X.columns]
        if not self.cols:
            raise ValueError("X needs a 'building_id' and/or 'meter' column")

        # Use the target that was passed in rather than a notebook global.
        target = np.asarray(y).ravel()
        if len(target) != len(X):
            raise ValueError(f'X has {len(X)} rows but y has {len(target)} values')

        tmp = X.loc[:, self.cols].copy()
        tmp['meter_reading'] = target
        # {statistic name: {group key: value}}; the group key is a scalar for
        # one grouping column and a tuple for two.
        self.consts = tmp.groupby(self.cols)['meter_reading'].describe().to_dict()

    def predict(self, X, quantity:str='mean'):
        """Return the fitted `quantity` (a `describe()` statistic) for each row of `X`.

        Raises `KeyError` for an unknown `quantity` or for a group that was
        not present during `fit`.
        """
        lookup = self.consts[quantity]
        if len(self.cols) == 1:
            # A single grouping column yields scalar keys, not 1-tuples.
            keys = X[self.cols[0]].tolist()
        else:
            keys = zip(*(X[col].tolist() for col in self.cols))
        return [lookup[key] for key in keys]


In [None]:
se = SimpleEstimator()

In [None]:
# Fit the baseline on the processed training features and targets.
se.fit(to.train.xs, to.train.ys)

In [None]:
%%time
# Per-row baseline prediction (group mean) for the validation set.
y_valid_pred = se.predict(to.valid.xs, quantity='mean')

In [None]:
# Flatten the validation targets and score the baseline with RMSE.
y_true = to.valid.ys.values.ravel()
y_pred = y_valid_pred
# One prediction per validation row.
assert y_true.shape[0] == len(y_pred)
evaluate_torch(torch.from_numpy(y_true), 
               torch.Tensor(y_pred))

In [None]:
# %%time
# y_test_pred = se.predict(test_dl.xs)
# y_test_pred[:5]

In [None]:
# %%time
# _y_train = np.random.choice(y_valid_pred, size=5000)
# _y_test = np.random.choice(y_test_pred, size=5000)

# hist_plot_preds(_y_train, _y_test)

In [None]:
# %%time
# y_test_pred = torch.Tensor(y_test_pred)

### Modelling with sklearn

In [None]:
m = linear_model.LinearRegression()

In [None]:
m = ensemble.RandomForestRegressor(n_estimators=100, max_features=.75, criterion='mse')

In [None]:
%%time
m.fit(to.train.xs, to.train.ys.values.ravel())

In [None]:
y_valid_pred = m.predict(to.valid.xs.values)
y_valid_pred[:5]

In [None]:
y_true = to.valid.ys.values.ravel()
y_pred = y_valid_pred
assert y_true.shape == y_pred.shape
nb_score = evaluate_torch(torch.from_numpy(y_true), 
                          torch.from_numpy(y_pred))
nb_score

In [None]:
%%time
# Predict on the processed test features.
y_test_pred = m.predict(test_dl.xs)
y_test_pred[:5]

In [None]:
%%time
# Compare 5000 randomly drawn predictions from each set.
# Note: `_y_train` is drawn from the *validation* predictions, despite its name.
_y_train = np.random.choice(y_valid_pred, size=5000)
_y_test = np.random.choice(y_test_pred, size=5000)

hist_plot_preds(_y_train, _y_test)

In [None]:
%%time
# Convert to a tensor so the back-transform cell further down can apply torch functions.
y_test_pred = torch.Tensor(y_test_pred)

Finding:
- Test-set predictions reach values around 90, which is way too large, while the values predicted for the validation set are okay. What differs between predicting on the validation set and on the test set?

### Modelling with fastai

In [None]:
# Smallest and largest target value across the training and validation sets;
# passed as `y_range` to the learner below.
y_range = [np.min([to.train.ys.values.min(), to.valid.ys.values.min()]),
           np.max([to.train.ys.values.max(), to.valid.ys.values.max()]),]
y_range

In [None]:
# Alternative kept for reference: range taken from the training targets only.
# y_range = [to.train.ys.values.min(),
#            to.train.ys.values.max()]
# y_range

In [None]:
# Tabular network with two hidden layers (500 and 250 units) and a single
# output, trained directly on the RMSE defined by `evaluate_torch`.
learn = tabular_learner(dls, y_range=y_range, layers=[500,250],
                        n_out=1, loss_func=evaluate_torch)

In [None]:
learn.lr_find()

In [None]:
# Five epochs with the one-cycle schedule at a maximum learning rate of 5e-3.
learn.fit_one_cycle(5, 5e-3)

In [None]:
# Without a `dl` argument, `get_preds` is assumed to use the validation set — TODO confirm.
y_valid_pred, y_valid_true = learn.get_preds()

In [None]:
y_valid_pred, y_valid_true

In [None]:
# The arguments are passed as (pred, true) although the signature is
# (y_true, y_pred); RMSE is symmetric, so the score is unaffected.
nb_score = evaluate_torch(y_valid_pred, y_valid_true)
nb_score

In [None]:
# Predictions for the test dataloader; the second return value is discarded.
y_test_pred, _ = learn.get_preds(dl=test_dl)
y_test_pred[:5]

In [None]:
%%time
# Compare 5000 randomly drawn predictions from each set.
_y_valid = y_valid_pred.clone().numpy().ravel()
_y_valid = np.random.choice(_y_valid, size=5000)

_y_test = y_test_pred.clone().numpy().ravel()
_y_test = np.random.choice(_y_test, size=5000)

hist_plot_preds(_y_valid, _y_test)

## Transforming back and storing in submission format

In [None]:
# Map predictions back to the original scale with exp(x) - 1.
# `torch.expm1` computes the same quantity but stays accurate for predictions
# close to zero, where `exp(x) - 1` loses precision to cancellation.
# NOTE(review): presumably the target was stored as log1p(meter_reading) —
# confirm against the preprocessing step.
y_test_pred_original = torch.expm1(y_test_pred)
y_test_pred_original[:5]

In [None]:
# Single-column frame of back-transformed predictions; the index is named
# `row_id` so it is written as the first CSV column on export.
y_out = pd.DataFrame(y_test_pred_original.clone().numpy(),
                     columns=['meter_reading'])
y_out.index.rename('row_id', inplace=True)
y_out.head()

In [None]:
# NOTE(review): 41697600 is presumably the number of rows the competition
# expects in a submission — confirm, and consider naming the constant.
assert len(y_out) == 41697600

In [None]:
%%time
# NOTE(review): this cell and the next one write the *same* `y_out` object under
# two different file names. Run only the cell that matches the model which
# produced the current `y_test_pred`, otherwise the file name is misleading.
y_out.to_csv('test_submission_randomforest.csv')

In [None]:
%%time
y_out.to_csv('test_submission_tabularlearner.csv')

`kaggle competitions submit -c ashrae-energy-prediction -f submission.csv -m "Message"`

In [None]:
!kaggle competitions submit -c ashrae-energy-prediction -f test_submission_randomforest.csv -m f"50 obs per bid - randomforest with filtered outliers nb score {nb_score}"

In [None]:
!kaggle competitions submit -c ashrae-energy-prediction -f test_submission_tabularlearner.csv -m f"50 obs per bid - tabularlearner with bs=256 and filtered outliers and bid&sid nb score {nb_score}"

**submission scores**

random forest:
- 5 obs per building ID, .75 max_features, 100 estimators: 
    - nb score = 2.37
    - kaggle score = 1.68 / 1.86
    
tabular learner:
- 5 obs per building ID, layers=[500,250], lr = 2e-3: 
    - nb score = 1.55
    - kaggle score = 1.8 / 2.13
- 5 obs per building ID, layers=[500,250], second run with lr = 1e-3: 
    - nb score = 1.57
    - kaggle score = 1.846 / 2.13
- 50 obs per building ID, layers=[500,250], 2 rounds: 
    - nb score = 1.39
    - kaggle score = 1.722 / 2.51
- 50 obs per building ID, layers=[500,250], 2 rounds: 
    - nb score = 1.34
    - kaggle score = 1.641 / 2.266
- 50 obs per building ID, layers=[500,250], 2 rounds, bs=256: 
    - nb score = 1.32
    - kaggle score = 1.643 / 1.926
- 500 obs per building ID, layers=[500,250], 3 rounds: 
    - nb score = 1.19
    - kaggle score = 1.62 / 2.55

Finding:
- nb scores are lower than the kaggle scores
- random forest seems to have public and private score closer to each other than tabular learner

**randomly splitting**
    
Finding (modified target values, all info = info except time):
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100k: 2.3
    - all info incl time @100k: 2.32
    - all info incl time + ids @100k: 2.32
- RandomForest:
    - meter only @100k: 2.2
    - all info minus time @100k: 2.7
    - all info incl time @100k: 2.74
    - all info incl time + ids @100k: 2.82
- tabular_learner:
    - meter only @100k: 2.1
    - all info minus time @100k: 1.56
    - all info incl time @100k: 1.52
    - all info incl time + ids @100k: 0.96
    
**splitting along time**
Finding:
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.2
    - all info incl time @100k: 2.3
    - all info incl time + ids @100k: 2.29
- RandomForest:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.7
    - all info incl time @100k: 2.52
    - all info incl time + ids @100k: 2.62
- tabular_learner:
    - meter only @100k: 2.06
    - all info minus time @100K: 1.62
    - all info incl time @100k: 1.62
    - all info incl time + ids @100k: 1.31