# Building one model per meter

> This notebook is a slight modification of `all_meters_one_model.ipynb`

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
from ashrae import loading, preprocessing, feature_testing, modelling

import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle


from sklearn import linear_model, tree, model_selection, ensemble

from fastai.tabular.all import *

import lightgbm as lgb

import ipywidgets as widgets

In [None]:
# Route pandas `.plot` calls through plotly for the rest of the notebook.
pd.set_option("plotting.backend", "plotly")

In [None]:
# Run switches for the optional parts of the notebook.
do_test = True  # build the test dataloaders and predict on the test set
do_submit = False  # build, write and upload the submission; those cells read `y_test_pred`, which is only created when `do_test` is True
data_path = loading.DATA_PATH  # directory the submission csv is written to

In [None]:
# Override the module-level `N_TRAIN` / `N_TEST` attributes of `loading` before the data is loaded.
# NOTE(review): presumably these cap the number of rows read, for a quick run — confirm in `loading`.
# The submission cell asserts 41697600 output rows, so a real submission presumably needs the full test set.
loading.N_TRAIN = 10_000
loading.N_TEST = 10_000

## Loading

In [None]:
%%time
# Load all inputs at once; the result is indexed below with the keys
# 'meter_train', 'meter_test', 'weather_train', 'weather_test' and 'building'.
ashrae_data = loading.load_all()

In [None]:
%%time
# One Processor instance, applied to the training readings here and to the test readings below.
processor = preprocessing.Processor() # t_train=t_train

# Passed to the processor as `tfms_configs`; presumably maps each transform name to
# its keyword arguments (empty dict = no arguments given here) — confirm in `preprocessing`.
tfms_config = dict(
    add_random_noise_features={},
    add_time_features={},
    add_weather_features=dict(
        fix_time_offset=True,
        add_na_indicators=True,
        impute_nas=True,
    ),
    add_building_features={},
)

# `df` is the processed training frame; `var_names` is handed on to
# `align_test` and `get_tabular_object` further down.
df, var_names = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
    df_weather=ashrae_data['weather_train'],
    df_building=ashrae_data['building'],
)

%time
# Run the same processor and transform configuration on the test readings;
# the second return value is discarded.
df_test, _ = processor(ashrae_data['meter_test'], tfms_configs=tfms_config,
                         df_weather=ashrae_data['weather_test'],
                         df_building=ashrae_data['building'])
# Align the processed test frame with the processed training frame and its `var_names`.
df_test = preprocessing.align_test(df, var_names, df_test)

## Sampling `df`

In [None]:
%%time
# Size of `df` before any sampling; used for the percentage below and in the
# submission message. Re-running this cell after sampling was applied would
# record the already-sampled size.
n = len(df)

# Sampling switches. Both are off by default, i.e. every row of `df` is kept.
sample_per_building_meter = False
sample_fraction = False

if sample_per_building_meter: # per building_id and meter sampling
    n_sample_per_bid = 500
    replace = True

    df = (df.groupby(['building_id', 'meter'])
          .sample(n=n_sample_per_bid, replace=replace))

if sample_fraction: # general sampling
    frac_samples = .1
    replace = False

    df = df.sample(frac=frac_samples, replace=replace)

print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

## Split

In [None]:
%%time
# t_train = pd.read_parquet(data_path/'t_train.parquet')
# No precomputed `t_train` is loaded; `None` is passed to `preprocessing.split_dataset` below.
t_train = None

%time
# Split strategy handed to `preprocessing.split_dataset`; the commented lines are
# the other values that have been used here.
#split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
split_kind = 'time_split_day'
# Passed on as `train_frac`; presumably the share of rows used for training — confirm in `split_dataset`.
train_frac = .9

In [None]:
# Per-meter bookkeeping: the split returned for each meter and the size of its first (training) part.
all_splits = {}
meter_train_samples = {}

for meter, _df in df.groupby('meter'):
    # Each meter's rows are split independently, all with the same settings.
    splits = preprocessing.split_dataset(
        _df,
        split_kind=split_kind,
        train_frac=train_frac,
        t_train=t_train,
    )
    all_splits[meter] = splits
    meter_train_samples[meter] = len(splits[0])
    print(f'meter: {meter} ⇒ sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(_df):.4f}, valid {len(splits[1])} = {len(splits[1])/len(_df):.4f}')

In [None]:
%%time

# Preprocessing steps given to every per-meter tabular object (use `procs = []` to disable them).
procs = [FillMissing, Normalize, Categorify]

# One tabular object per meter, built from that meter's rows and the split computed for it above.
tos = {
    meter: feature_testing.get_tabular_object(_df,
                                              var_names,
                                              splits=all_splits[meter],
                                              procs=procs)
    for meter, _df in df.groupby('meter')
}

In [None]:
%%time
# Batch size per meter id. The trailing comments keep alternative, much larger
# values (presumably those for a full-data run — confirm).
train_bs = {
    0: 500, # 100000,
    1: 200, # 40000,
    2: 150, # 20000,
    3: 50, # 10000,
}
# Validation uses the very same dict object as training.
val_bs = train_bs

# One set of dataloaders per meter.
all_dls = {
    meter: tabular_obj.dataloaders(bs=train_bs[meter], val_bs=val_bs[meter])
    for meter, tabular_obj in tos.items()
}

Warning: Takes about 12min with the test set

In [None]:
%%time
test_bs = 500

if do_test:
    # For each meter, build a test dataloader from that meter's rows of `df_test`,
    # derived from the meter's training dataloaders.
    all_test_dls = {
        meter: meter_dls.test_dl(df_test.loc[df_test['meter'] == meter], bs=test_bs)
        for meter, meter_dls in all_dls.items()
    }

## Modelling

In [None]:
def sort_ys(ys:typing.List[tuple]):
    """Concatenate per-meter `(index, values)` pairs into a single `pd.Series`.

    `ys` is a list of 2-tuples: the first entry holds the index labels and the
    second the matching values. The pieces are joined in list order; despite the
    function's name the result is NOT sorted — call `.sort_index()` on it if an
    ordered index is needed.
    """
    index_parts = [ix for ix, _ in ys]
    value_parts = [vals for _, vals in ys]
    return pd.Series(np.concatenate(value_parts),
                     index=np.concatenate(index_parts))

### `sklearn`

In [None]:
%%time
# Estimator class and its keyword arguments. `random_state` is fixed so that the
# forests — and therefore the reported score — are reproducible between runs.
params = {'n_estimators': 20, 'max_features': 'sqrt', 'random_state': 42}
model = ensemble.RandomForestRegressor
# params = {}
# model = linear_model.LinearRegression

# One independent, not yet fitted estimator per meter.
ms = {meter: model(**params) for meter in tos}

In [None]:
%%time
# Fit each meter's estimator on the plain numpy values of that meter's training split.
for meter, estimator in ms.items():
    train_set = tos[meter].train
    estimator.fit(train_set.xs.values, train_set.ys.values.ravel())

In [None]:
%%time
# Per meter: (index of the validation rows, predictions for those rows), in the order of `tos`.
y_valid_preds = [
    (tabular_obj.valid.xs.index, ms[meter].predict(tabular_obj.valid.xs.values))
    for meter, tabular_obj in tos.items()
]

In [None]:
# Join the per-meter validation predictions into one Series (pieces stay in meter order; nothing is sorted).
y_valid_pred = sort_ys(y_valid_preds)
y_valid_pred

In [None]:
%%time
if do_test:
    # Predict on the raw numpy values, exactly like the fit and validation cells do:
    # the estimators were fitted on `.values`, so handing them a DataFrame here would
    # be inconsistent (and makes recent scikit-learn versions warn about feature names).
    y_test_preds = [(all_test_dls[meter].xs.index,
                     ms[meter].predict(all_test_dls[meter].xs.values))
                    for meter in all_test_dls]
    # One Series over all meters, indexed by the test rows' index labels.
    y_test_pred = sort_ys(y_test_preds)

In [None]:
# True validation targets, gathered per meter in the same order as the predictions
# and joined into one Series.
y_valid_true = sort_ys([
    (tabular_obj.valid.ys.index, tabular_obj.valid.ys.values.ravel())
    for tabular_obj in tos.values()
])

In [None]:
# Truth and prediction are compared by position; both Series were assembled by
# iterating `tos` in the same order.
y_true_tensor = torch.from_numpy(y_valid_true.values)
y_pred_tensor = torch.from_numpy(y_valid_pred.values)
nb_score = modelling.evaluate_torch(y_true_tensor, y_pred_tensor).item()
print(f'sklearn loss {nb_score:.4f}')

## Inspecting

### `dep_var` distribution

Validation truth vs. validation predictions (and, when `do_test` is set, validation truth vs. test-set predictions)

In [None]:
#export
def pick_random(x, size=5000):
    """Cap `x` at `size` elements by random sub-sampling.

    Returns `x` itself, untouched, when it has fewer than `size` elements.
    Otherwise returns `size` elements of `x` drawn without replacement via
    `np.random.choice` (a numpy array; the draw depends on numpy's global
    random state).
    """
    if len(x) < size:
        return x
    return np.random.choice(x, size=size, replace=False)

In [None]:
# Truth and prediction are sub-sampled independently of each other, so the
# histograms compare distributions rather than matched rows.
truth_sample = pick_random(y_valid_true)
pred_sample = pick_random(y_valid_pred)
feature_testing.hist_plot_preds(truth_sample, pred_sample,
                                label0='truth', label1='prediction')

In [None]:
if do_test:
    # Validation truth against test-set predictions, each sub-sampled independently.
    valid_truth_sample = pick_random(y_valid_true)
    test_pred_sample = pick_random(y_test_pred)
    fig = feature_testing.hist_plot_preds(valid_truth_sample,
                                          test_pred_sample,
                                          label0='truth (validation)',
                                          label1='prediction (test set)')
    fig.show()

### Boldly wrong predictions

In [None]:
%%time
base_cols = ['building_id', 'meter','timestamp']
# Look at whichever tabular object comes first instead of hard-coding meter 0, so the
# cell also works when meter 0 is absent from the (sampled) data. All tabular objects
# are built with the same `var_names` and `procs`, so presumably they share their columns.
valid_columns = next(iter(tos.values())).valid.xs.columns
# Base columns that are not among the validation features.
miss_cols = [v for v in base_cols if v not in valid_columns]
miss_cols

In [None]:
if miss_cols:
    # Some base columns are not among the features: drop the ones that are and
    # join all base columns back in from `df` via the row index.
    present_base_cols = set(base_cols).difference(miss_cols)
    tmp = pd.concat([
        to.valid.xs.drop(columns=present_base_cols).join(df.loc[:, base_cols])
        for to in tos.values()
    ])
else:
    # Every base column is already a feature; just stack the per-meter validation features.
    tmp = pd.concat([to.valid.xs for to in tos.values()])

In [None]:
# Built from the stacked validation features plus the true and predicted validation targets.
bwt = feature_testing.BoldlyWrongTimeseries(tmp, y_valid_true, y_valid_pred)

In [None]:
# NOTE(review): presumably renders the time series with the largest prediction errors — confirm in `feature_testing`.
bwt.run_boldly()

## Submission to kaggle

In [None]:
if do_test:
    # `y_test_pred` is only created when `do_test` is True, so guard the cell to
    # keep "Run All" working when test prediction is switched off.
    # The per-meter pieces are concatenated meter by meter; order them by index label.
    y_test_pred = y_test_pred.sort_index()
    display(y_test_pred)

In [None]:
%%time
if do_test and do_submit:
    # Needs the test predictions, hence the additional `do_test` guard.
    # exp(y) - 1 maps the predictions back to the original scale (presumably the
    # target was log(1 + x) transformed — confirm in the preprocessing). Computed
    # with numpy so the cell only uses names that are defined in this notebook.
    y_test_pred_original = np.expm1(y_test_pred.to_numpy())

    # The values are attached to `df_test.index` by position.
    y_out = pd.DataFrame(y_test_pred_original,
                         columns=['meter_reading'],
                         index=df_test.index)
    display(y_out.head())

    # Fixed row count the submission must have (presumably the size of the full test set).
    assert len(y_out) == 41697600

In [None]:
%%time
# Same condition as the final submit cell: a submission needs test predictions.
if do_test and do_submit:
    y_out.to_csv(data_path/'my_submission.csv')

`kaggle competitions submit -c ashrae-energy-prediction -f submission.csv -m "Message"`

In [None]:
# Only referenced by the commented-out tabular_learner message below.
act = 'ReLu'

# Alternative model descriptions / earlier messages, kept for reference:
# model_msg = f'tabular_learner (one per meter): act {act}, layers {layers}, ps {ps}, embed_p {embed_p}'
# message = ['baseline (linear regression on dep_var_stats and 1hot meter) ', '500 obs/bid', f'nb score {nb_score:.4f}']
# message = ['random forest', '500 obs/bid', 'all features', f'nb score {nb_score:.4f}']
# message = ['lightgbm', '500 obs/bid', '100 rounds', '42 leaves', 'lr .5', f'nb score {nb_score:.4f}']
# message = ['tabular_learner', '500 obs/bid', 'all features', f'layers {layers}, embed_p .1, ps [.1,.1,.1]', f'nb score {nb_score:.4f}']

# Building blocks of the submission message: model, data volume, split, features, score.
model_msg = 'RandomForest'
samples_msg = f'num samples {len(df)} = {len(df)/n*100:.2f} %'
split_msg = f'split kind "{split_kind}" train_frac {train_frac}'
features_msg = f'train_bs = {train_bs} dep_var_stats and 1hot meter and remove leading empty weeks and us_holidays and fix bid 363'
score_msg = f'nb score {nb_score:.4f}'

message = ' + '.join((model_msg, samples_msg, split_msg, features_msg, score_msg))
message

In [None]:
# Upload the csv written above through the kaggle CLI; only runs when both switches are on.
# `{data_path}` and `{message}` are interpolated into the shell line inside single quotes,
# so a single quote inside `message` would break the command.
if do_test and do_submit:
    print('Submitting...')
    !kaggle competitions submit -c ashrae-energy-prediction -f '{data_path}/my_submission.csv' -m '{message}'