In [None]:
# default_exp modelling

# Modelling meter readings

> API details.

Finding: make sure the feature values in your test set are not out of domain relative to the training set. In this notebook `timestampYear` is used as a training feature, but in the training set it only takes on the value 2016.0, whereas in the test set it is 2017.0 and 2018.0 $\Rightarrow$ the predictions zero out everywhere.

In [None]:
#export
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle

from sklearn import linear_model, tree, model_selection, ensemble

from fastai.tabular.all import *

In [None]:
pd.options.plotting.backend = "plotly"

In [None]:
base_path = Path("../data")

In [None]:
#export
def numpy_evaluate(y_true:np.ndarray, y_pred:np.ndarray):
    """Root mean squared error between `y_true` and `y_pred`.

    Parameters
    ----------
    y_true : array of observed values.
    y_pred : array of predicted values.

    Returns
    -------
    The square root of the mean squared difference, as a numpy scalar.

    Inputs whose shapes differ only by singleton axes, e.g. `(n, 1)` targets
    against `(n,)` predictions, are compared element-wise. Without this guard
    numpy would broadcast the difference to an `(n, n)` matrix and silently
    return a wrong score. Any other shape combination falls through to the
    regular broadcasting rules (scalar baselines still work, incompatible
    shapes still raise).
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    if y_true.shape != y_pred.shape and np.squeeze(y_true).shape == np.squeeze(y_pred).shape:
        y_true, y_pred = np.squeeze(y_true), np.squeeze(y_pred)
    return np.sqrt(np.mean((y_pred  - y_true)**2))

def evaluate_torch(y_true:torch.Tensor, y_pred:torch.Tensor):
    """Root mean squared error between two tensors.

    Parameters
    ----------
    y_true : tensor of observed values.
    y_pred : tensor of predicted values.

    Returns
    -------
    A zero-dimensional tensor holding the square root of the mean squared
    difference. The expression is symmetric in its two arguments.

    Tensors whose shapes differ only by singleton axes, e.g. `(n, 1)` against
    `(n,)`, are compared element-wise. Without this guard the subtraction
    would broadcast to an `(n, n)` matrix and silently return a wrong score.
    Any other shape combination falls through to regular broadcasting.
    """
    if y_true.shape != y_pred.shape and y_true.squeeze().shape == y_pred.squeeze().shape:
        y_true, y_pred = y_true.squeeze(), y_pred.squeeze()
    return torch.sqrt(torch.mean((y_pred - y_true)**2))

## Loading

In [None]:
%%time
with open(base_path/'var_types.pckl', 'rb') as f:
    var_types = pickle.load(f)

In [None]:
continuous, categorical = var_types['cont'], var_types['cat']
continuous, categorical

In [None]:
continuous = continuous[:9] + [continuous[-1]]
continuous

## Loading `X` and `X_test`

In [None]:
%%time
X_all = pd.read_parquet(base_path/'X.parquet')#.sample(100000)

In [None]:
X_all.groupby('building_id').size().describe()

In [None]:
n_sample_per_bid = 5
replace = True

In [None]:
X = (X_all.groupby('building_id')
     .sample(n=n_sample_per_bid, replace=replace))
len(X)

In [None]:
display(len(X), X.head())

In [None]:
%%time
X_test = pd.read_parquet(base_path/'X_test.parquet')

In [None]:
display(len(X_test), X_test.head())

In [None]:
#export
def split_dataset(X:pd.DataFrame, split_kind:str='random',
                  train_frac:float=.8):
    """Split the rows of `X` into training and validation row positions.

    Parameters
    ----------
    X : frame to split. For `split_kind='time'` it must contain the column
        `timestampElapsed`.
    split_kind : `'random'` draws `int(len(X)*train_frac)` rows uniformly at
        random (using the global numpy random state); `'time'` puts every row
        whose `timestampElapsed` is strictly below the value found at the
        `train_frac` quantile position into the training set.
    train_frac : fraction of rows intended for training, within `[0, 1]`.

    Returns
    -------
    `(train_idx, valid_idx)`: two lists of integer row positions (usable with
    `.iloc`), which together cover every row of `X` exactly once.

    Raises
    ------
    ValueError : if `train_frac` lies outside `[0, 1]`.
    AssertionError : if `split_kind` is not a known split.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f'train_frac must be within [0, 1], got {train_frac}')

    n_train = int(len(X)*train_frac)

    def random_split():
        # Draw row *positions*, not index labels: the index of `X` may contain
        # duplicates (e.g. after sampling with replacement), and a label-based
        # `isin` would then pull every duplicate into the training set.
        train_bool = np.zeros(len(X), dtype=bool)
        train_bool[np.random.permutation(len(X))[:n_train]] = True
        return train_bool

    def time_split():
        time_col = 'timestampElapsed'
        # No row is left to act as the threshold -> everything is training.
        if n_train >= len(X):
            return np.ones(len(X), dtype=bool)
        ts = X[time_col].sort_values(ascending=True)
        threshold_t = ts.iloc[n_train]
        # Strict comparison: rows tied with the threshold go to validation.
        return (X[time_col] < threshold_t).values

    split_funs = {
        'random': random_split,
        'time': time_split,
    }

    assert split_kind in split_funs, f'unknown split_kind: {split_kind}'
    train_bool = split_funs[split_kind]()

    train_idx = np.where(train_bool)[0]
    valid_idx = np.where(~train_bool)[0]

    return (list(train_idx), list(valid_idx))

In [None]:
%%time
split_kind = 'random'
#split_kind = 'time'
splits = split_dataset(X, split_kind=split_kind, train_frac=.8)
#splits=None

In [None]:
sorted(X.iloc[splits[0]].loc[:, 'timestampMonth'].unique())

Super simplistic input data

In [None]:
# %%time
# procs = [Categorify, FillMissing, Normalize]
# to = TabularPandas(X, procs, ['meter'],
#                    [], y_names='meter_reading', splits=splits)

All input data

In [None]:
%%time
procs = [Categorify, FillMissing, Normalize]
to = TabularPandas(X, procs, categorical,
                   continuous, 
                   y_names='meter_reading', splits=splits)

dls = to.dataloaders()

In [None]:
%%time
test_dl = dls.test_dl(X_test) # .iloc[:100]

In [None]:
to.train.xs

In [None]:
to.train.ys

## Modelling

### Modelling with sklearn

In [None]:
m = linear_model.LinearRegression()

In [None]:
# m = ensemble.RandomForestRegressor(n_estimators=100, max_features=.75, criterion='mse')

In [None]:
%%time
m.fit(to.train.xs, to.train.ys.values.ravel())

In [None]:
m.predict(to.valid.xs.values)[:5]

In [None]:
# `to.valid.ys.values` is 2-D (it is raveled before fitting as well) while the
# predictions are raveled to 1-D. Flatten the targets too, otherwise the
# subtraction broadcasts (n, 1) against (n,) into an (n, n) matrix and the
# reported RMSE is wrong.
evaluate_torch(torch.from_numpy(to.valid.ys.values.ravel()),
               torch.from_numpy(m.predict(to.valid.xs.values).ravel()))

In [None]:
%%time
y_test_pred = m.predict(test_dl.xs)

In [None]:
%%time
y_test_pred = torch.Tensor(y_test_pred)

In [None]:
y_test_pred[:5]

Finding:
- The predicted values are in the range of 90. This is way too large, whereas the values predicted for the validation set are okay. What is different between the prediction over the validation set and the test set?

### Modelling with fastai

In [None]:
y_range = [np.min([to.train.ys.values.min(), to.valid.ys.values.min()]),
           np.max([to.train.ys.values.max(), to.valid.ys.values.max()]),]
y_range

In [None]:
# y_range = [to.train.ys.values.min(),
#            to.train.ys.values.max()]
# y_range

In [None]:
learn = tabular_learner(dls, y_range=y_range, layers=[500,250],
                        n_out=1, loss_func=evaluate_torch)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(5, 2e-3)

In [None]:
preds, targs = learn.get_preds()

In [None]:
preds, targs

In [None]:
evaluate_torch(targs, preds)

In [None]:
# test = X.iloc[:50].copy()
# test = test.drop('meter_reading', axis=1)
# test

In [None]:
# test = X_test.head(100).copy()
# test.head()

In [None]:
# test_dl = learn.dls.test_dl(test)

In [None]:
# test_dl.xs

In [None]:
y_test_pred, _ = learn.get_preds(dl=test_dl)
y_test_pred[:5]

## Transforming back and storing in submission format

In [None]:
# Undo the target transform: exp(x) - 1, computed with `expm1`, which stays
# accurate for predictions close to zero where `exp(x) - 1` loses precision.
# NOTE(review): presumably the target was stored as log(1 + meter_reading) -
# confirm against the preprocessing notebook.
y_test_pred_original = torch.expm1(y_test_pred)
y_test_pred_original[:5]

In [None]:
# Build the submission frame: a single `meter_reading` column whose index is
# named `row_id`. The tensor is cloned first so the frame does not share
# memory with the prediction tensor.
submission_values = y_test_pred_original.clone().numpy()
y_out = pd.DataFrame(submission_values, columns=['meter_reading'])
y_out.index.name = 'row_id'
y_out.head()

In [None]:
assert len(y_out) == 41697600

In [None]:
# Write the submission file, with the `row_id` index as the first column.
# NOTE(review): when the notebook is run top to bottom, `y_test_pred` is
# reassigned by `learn.get_preds(dl=test_dl)` after the sklearn prediction, so
# this file then holds the tabular_learner predictions despite the "linear" in
# its name - confirm which model is meant to be exported.
y_out.to_csv('test_submission_linear.csv')

**randomly splitting**
    
Finding (modified target values, all info = info except time):
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100k: 2.3
    - all info incl time @100k: 2.32
    - all info incl time + ids @100k: 2.32
- RandomForest:
    - meter only @100k: 2.2
    - all info minus time @100k: 2.7
    - all info incl time @100k: 2.74
    - all info incl time + ids @100k: 2.82
- tabular_learner:
    - meter only @100k: 2.1
    - all info minus time @100k: 1.56
    - all info incl time @100k: 1.52
    - all info incl time + ids @100k: 0.96
    
**splitting along time**
Finding:
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.2
    - all info incl time @100k: 2.3
    - all info incl time + ids @100k: 2.29
- RandomForest:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.7
    - all info incl time @100k: 2.52
    - all info incl time + ids @100k: 2.62
- tabular_learner:
    - meter only @100k: 2.06
    - all info minus time @100K: 1.62
    - all info incl time @100k: 1.62
    - all info incl time + ids @100k: 1.31