In [None]:
# default_exp preprocessing

# Preprocessing data

> Preprocessing data for the modelling step.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle
import ipywidgets as widgets

from sklearn import linear_model, tree, model_selection, ensemble
from ashrae import loading, inspection
from fastai.tabular.all import *

import tqdm

from sklearn import linear_model, tree, model_selection, ensemble
from sklearn.preprocessing import OneHotEncoder

import itertools
from pandas.tseries.holiday import USFederalHolidayCalendar as us_calendar

import math
from loguru import logger

In [None]:
pd.options.plotting.backend = "plotly"

In [None]:
dep_var = 'meter_reading'

In [None]:
data_path = loading.DATA_PATH

In [None]:
# set to None if the pre-processing is supposed to be done for all samples, otherwise choose a number of samples
n_samples_quick = 100000

## Loading

In [None]:
%%time
ashrae_data = loading.load_all()

## Splitting

In [None]:
#export
def split_dataset(X:pd.DataFrame, split_kind:str='random',
                  train_frac:float=.8, t_train:pd.DataFrame=None):
    """Split the rows of `X` into training and validation row positions.

    Parameters
    ----------
    X : data frame to split; every strategy except `'random'` needs a
        `'timestamp'` column (`'time_split_day'` alternatively uses an
        existing `'timestampDayofyear'` column).
    split_kind : one of
        `'random'`         - uniformly sampled rows,
        `'time'`           - rows before a timestamp threshold are training rows,
        `'fix_time'`       - rows whose timestamp occurs in `t_train` are training rows,
        `'time_split_day'` - whole days of the year, drawn in random order.
    train_frac : fraction of the rows to use for training (ignored by `'fix_time'`).
    t_train : frame with a `'timestamp'` column, required by `'fix_time'`.

    Returns
    -------
    Tuple `(train_idx, valid_idx)` of lists with the integer positions
    (not index labels) of the training and validation rows.

    Raises
    ------
    AssertionError for an unknown `split_kind`, a missing required column or
    an empty training set.
    """

    def random_split():
        # sample positions instead of index labels so that exactly `n_train`
        # rows are selected even if the index of `X` is not unique
        n_train = int(len(X)*train_frac)
        chosen = np.random.choice(len(X), size=n_train, replace=False)
        train_bool = np.zeros(len(X), dtype=bool)
        train_bool[chosen] = True
        return train_bool

    def time_split():
        assert 'timestamp' in X.columns
        time_col = 'timestamp'
        ts = X[time_col].sort_values(ascending=True)
        ix = int(len(X)*train_frac)
        if ix >= len(ts):
            # no timestamp is left for validation: everything is training data
            return pd.Series(True, index=X.index)
        threshold_t = ts.iloc[ix:].values[0]
        return X[time_col] < threshold_t

    def time_split_day():
        time_col = 'timestampDayofyear'

        if time_col not in X.columns:
            t = X['timestamp'].dt.dayofyear
        else:
            t = X[time_col]

        # shuffle the days and keep the leading ones whose cumulative
        # row count stays within the training budget
        days = (t.value_counts()
                .rename('count')
                .sample(frac=1)
                .to_frame()
                .cumsum()
                .pipe(lambda x: x.loc[x['count'] <= (train_frac * len(t))]))

        mask = t.isin(days.index.values)

        assert mask.sum() > 0
        return mask

    def fix_time_split():
        assert t_train is not None
        time_col = 'timestamp'
        assert time_col in X.columns

        mask = X[time_col].isin(t_train[time_col])
        assert mask.sum() > 0
        return mask

    split_funs = {
        'random': random_split,
        'time': time_split,
        'fix_time': fix_time_split,
        'time_split_day': time_split_day,
    }

    assert split_kind in split_funs, f'Unknown split_kind "{split_kind}", choose one of {list(split_funs)}'
    train_bool = split_funs[split_kind]()

    train_idx = np.where(train_bool)[0]
    valid_idx = np.where(~train_bool)[0]

    return (list(train_idx), list(valid_idx))

In [None]:
%%time
#split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
split_kind = 'time_split_day'

t_train = None
train_frac = .8
splits = split_dataset(ashrae_data['meter_train'], split_kind=split_kind, train_frac=train_frac,
                       t_train=t_train)

In [None]:
print(f'sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(ashrae_data["meter_train"]):.4f}, valid {len(splits[1])} = {len(splits[1])/len(ashrae_data["meter_train"]):.4f}')

In [None]:
t_train = ashrae_data['meter_train'].iloc[splits[0]][['timestamp']]
t_train.head()

In [None]:
%%time
t_train.to_parquet(data_path/'t_train.parquet')

Visualizing the split

In [None]:
%%time
tmp = pd.concat((ashrae_data['meter_train'].iloc[splits[0]]
                 .assign(label='train')
                 .assign(meter_reading=lambda x: np.log(x['meter_reading']+1)),
                (ashrae_data['meter_train'].iloc[splits[1]]
                 .assign(label='valid')
                 .assign(meter_reading=lambda x: np.log(x['meter_reading']+1)))), 
                axis=0, ignore_index=True)
tmp.groupby('label').size()

In [None]:
px.scatter(tmp.sample(10000), x='timestamp', y='meter_reading', color='label')

## Wrangling

cleaning: https://www.kaggle.com/purist1024/ashrae-simple-data-cleanup-lb-1-08-no-leaks
* remove all 0s for meter 0
* remove all 0s for meter 2 and 3 if not summer
* potentially remove 0s for meter 1 during winter
* remove "known-bad" electrical readings from the first 141 days of the data for site 0 (i.e. UCF)
* remove most absurdly high readings from building 1099. These are orders of magnitude higher than all data, and have been empirically seen in LB probes to be harmful outliers.
* fix the time zone offset of the weather data
* impute nas for weather data
* convert cyclic features, like hour, to  2d features (sin,cos)

In [None]:
#export
DEP_VAR = 'meter_reading'
TIME_COL = 'timestamp'

class Processor:
    """Configurable preprocessing pipeline for the meter readings.

    Calling an instance adds the log1p-transformed dependent variable, applies
    the transforms named in `tfms_configs` in order (each one is a method of
    this class that is called with its config as keyword arguments and returns
    the updated core frame) and finally casts and shrinks the frame in
    `cleanup`.
    """

    dep_var_stats:dict = None
    # only `True` while `sanity_check_input_for_tfms` dry-runs the transforms;
    # the class-level default keeps transforms callable outside of `__call__`
    test_run:bool = False

    def __init__(self, dep_var:str=None, time_col:str=None,
                 t_train:pd.Series=None):
        """Store the column names and the optional training timestamps.

        dep_var  : name of the dependent variable, defaults to `DEP_VAR`
        time_col : name of the timestamp column, defaults to `TIME_COL`
        t_train  : timestamps belonging to the training set; if given, row
                   removing transforms keep rows with other timestamps
        """
        self.dep_var = DEP_VAR if dep_var is None else dep_var
        self.time_col = TIME_COL if time_col is None else time_col
        self.t_train = t_train
        # `update_dep_var` renames `self.dep_var`; remember the name as passed
        self._raw_dep_var = self.dep_var

    def __call__(self, df_core:pd.DataFrame, df_building:pd.DataFrame=None,
                 df_weather:pd.DataFrame=None, t_train:pd.DataFrame=None,
                 tfms_configs:dict=None) -> (pd.DataFrame, dict):
        """Run the pipeline on `df_core`.

        df_core      : meter readings (training set if it contains the dependent variable)
        df_building  : building meta data, needed by building / weather transforms
        df_weather   : weather data, needed by weather transforms
        t_train      : training timestamps; replaces the ones passed to `__init__` if given
        tfms_configs : mapping of transform method name -> keyword arguments

        Returns the processed frame and the dict produced by `get_var_names`.
        """
        # a previous call renamed `self.dep_var` to the log1p column; restore
        # the raw name if the incoming frame carries it so instances are reusable
        if self._raw_dep_var in df_core.columns:
            self.dep_var = self._raw_dep_var
        if t_train is not None:
            self.t_train = t_train

        # check if `df` is a test set (dep_var is missing)
        self.is_train = self.dep_var in df_core.columns
        self.df_core = df_core.copy()
        self.conts, self.cats, self.cats_order = [], [], {}
        self.cats += ['building_id', 'meter']
        self.n = len(df_core)

        # core pieces of dependent and independent variables
        self.update_dep_var()

        if tfms_configs is None:
            logger.info('Empty transform configs `tfms_configs`. Returning ...')
        else:
            # if `t_train` (timestamps which belong to the training set) are provided perform a check which rows are effected
            if self.t_train is not None:
                self.t_train_set = set(self.t_train.values.ravel())

            self.df_building = df_building.copy() if df_building is not None else None
            self.df_weather = df_weather.copy() if df_weather is not None else None

            # running transformations
            self.sanity_check_input_for_tfms(tfms_configs)
            for fun_name, config in tfms_configs.items():
                self.df_core = getattr(self, fun_name)(**config)

        df_core, var_names = self.cleanup()
        n_removed = self.n - len(df_core)
        # guard against a division by zero for empty input
        pct_removed = n_removed / self.n * 100 if self.n else 0.
        logger.info(f'Reduced samples by {n_removed} rows = {pct_removed:.2f} %')
        return df_core, var_names

    @property
    def t_in_train_set(self):
        "Boolean mask of the rows in `df_core` whose timestamp is in `t_train_set`."
        return self.df_core[self.time_col].isin(self.t_train_set)

    def update_dep_var(self) -> pd.DataFrame:
        """Add the log1p-transformed dependent variable (training data only)
        and point `self.dep_var` to it. Does nothing if `self.dep_var` already
        ends with 'log1p'."""

        if self.dep_var.endswith('log1p'):
            return self.df_core

        dep_var_new = f'{self.dep_var}_log1p'
        if self.is_train:
            self.df_core[dep_var_new] = np.log(self.df_core[self.dep_var].values + 1) # 3s
#             self.df_core[dep_var_new] = self.df_core[self.dep_var].apply(lambda x: math.log(x+1)) # 12s with math.log, 27s with np.log
#             self.df_core[dep_var_new] = self.df_core[self.dep_var].swifter.apply(lambda x: math.log(x+1)) # 15s with math.log + swifter
        self.dep_var = dep_var_new
        return self.df_core

    def sanity_check_input_for_tfms(self, tfms_configs:dict):
        """Dry-run every configured transform (with `test_run=True`) to check
        that the method exists and accepts its config, and assert that the
        frames required by building / weather transforms were passed."""
        # sanity check presence of df_building if df_weather is given
        if self.df_weather is not None:
            assert self.df_building is not None, 'To join the weather info in `df_weather` you need to pass `df_building`.'

        # making sure all required inputs are specified in `tfms_configs`
        self.test_run = True
        try:
            if tfms_configs is not None:
                building_fun_names = ['add_building_features']
                weather_fun_names = ['add_weather_features']
                for fun_name, config in tfms_configs.items():
                    getattr(self, fun_name)(**config)
                    if fun_name in building_fun_names:
                        assert self.df_building is not None, 'You need to pass `df_building` in Processor.__call__.'
                    if fun_name in weather_fun_names:
                        assert self.df_weather is not None, 'You need to pass `df_weather` in Processor.__call__.'
        finally:
            # never leave the instance in dry-run mode, even if a check fails
            self.test_run = False

    def get_var_names(self) -> dict:
        "Names of the continuous, categorical and dependent variables."
        return {'conts': self.conts, 'cats': self.cats, 'dep_var': self.dep_var}

    def cleanup(self) -> (pd.DataFrame, dict):
        """Cast categorical columns, drop helper columns, shrink dtypes and
        assert that every remaining column is registered in `cats` / `conts`."""
        # converting cats to category type
        for col in self.cats:
            if self.df_core[col].dtype == bool: continue
            self.df_core[col] = self.df_core[col].astype('category')
            if col in self.cats_order:
                # `inplace=True` was removed from `set_categories` in pandas 2.0
                self.df_core[col] = self.df_core[col].cat.set_categories(self.cats_order[col],
                                                                         ordered=True)

        # removing features (but never the column serving as dependent variable)
        to_remove_cols = {'meter_reading', 'timestampYear', self._raw_dep_var} - {self.dep_var} # , self.time_col
        self.df_core = self.df_core.drop(columns=[c for c in self.df_core.columns if c in to_remove_cols])

        # shrinking the data frame
        self.df_core = df_shrink(self.df_core, int2uint=True)

        var_names = self.get_var_names()

        if not self.is_train:
            self.df_core = self.df_core.set_index('row_id')

        known_cols = set(self.cats + self.conts + [self.dep_var])
        known_cols |= {'timestampElapsed', self.time_col, 'meter_reading', self._raw_dep_var}
        missing_cols = [col for col in self.df_core.columns.values if col not in known_cols]

        assert len(missing_cols) == 0, f'Missed to assign columns: {missing_cols} to `conts` or `cats`'

        return self.df_core, var_names

Only applying the `dep_var` transform

In [None]:
%%time
processor = Processor()
_df, _vars = processor(ashrae_data['meter_train'])
display(_df.head(), _vars)

In [None]:
px.histogram(_df.groupby('meter').sample(10000), 
             x='meter_reading_log1p', facet_row='meter')

In [None]:
#hide
cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p']
assert len(_df.columns) == len(cols) and cols == list(_df.columns.values), f'Unexpected columns: {_df.columns} != {cols}'
assert len(_vars) == 3 and len(_vars['conts']) == 0 and _vars['cats'] == ['building_id', 'meter'] and _vars['dep_var'] == 'meter_reading_log1p'

The meter readings for `building_id` 363 before July 30th are likely due to a construction phase, since the building's year value is 2017. This method therefore removes the readings recorded during the construction period.

In [None]:
#export
@patch
def fix_bid_363(self:Processor):
    """Drop the meter 0 rows of building 363 with a timestamp before 2016-07-30.

    Only acts on training data; if `t_train` is set, rows whose timestamp is
    not part of the training timestamps are kept regardless.
    """
    if self.test_run:
        return
    if not self.is_train:
        return self.df_core

    core = self.df_core
    assert 'timestamp' in core.columns

    cutoff = pd.to_datetime('2016-07-30')
    under_construction = ((core['building_id'] == 363)
                          & (core['meter'] == 0)
                          & (core['timestamp'] < cutoff))
    ok = ~under_construction
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set

    logger.info(f'Fixing building_id 363: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = core.loc[ok,:]
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'fix_bid_363':{}}
_df, _vars = processor(ashrae_data['meter_train'], tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(ashrae_data['meter_train']) - 5063

In [None]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'fix_bid_363':{}}
_df, _vars = processor(ashrae_data['meter_train'], tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(ashrae_data['meter_train']) - 5063 <= len(_df) < len(ashrae_data['meter_train'])

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'],
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

There seem to be quite a few imputed / filled values in the meter readings, being visible as constant meter readings for more than a week at a time. This method removes those values.

In [None]:
%%time
it = inspection.InspectTimeseries(ashrae_data['meter_train'], building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'])
it.inspect_boldly()

Removing all 0s for meter 0

In [None]:
#export
@patch
def remove_0s_meter0(self:Processor):
    """Drop the meter 0 rows whose `self.dep_var` value equals 0.

    Only acts on training data; if `t_train` is set, rows whose timestamp is
    not part of the training timestamps are kept regardless.
    """
    if self.test_run:
        return
    if not self.is_train:
        return self.df_core

    core = self.df_core
    assert 'timestamp' in core.columns

    zero_on_meter0 = (core['meter'] == 0) & (core[self.dep_var] == 0)
    ok = ~zero_on_meter0
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set

    logger.info(f'Removing 0s for meter 0: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = core.loc[ok,:]
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'remove_0s_meter0':{}}
_df, _vars = processor(ashrae_data['meter_train'], tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(ashrae_data['meter_train']) - 530169
assert 0 in _df.loc[_df['meter']!=0, 'meter_reading_log1p'].values

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'],
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

Removing all 0s for meters 2 and 3 outside of summer

In [None]:
#export
@patch
def remove_not_summer_0s_meter_2_and_3(self:Processor):
    """Drop the rows of meters 2 and 3 whose `self.dep_var` value equals 0
    and whose month is not June, July or August.

    Zeros during the summer months are kept. Only acts on training data; if
    `t_train` is set, rows whose timestamp is not part of the training
    timestamps are kept regardless.
    """
    if self.test_run: return
    if not self.is_train: return self.df_core
    assert 'timestamp' in self.df_core.columns

    # use a local series instead of temporarily adding a column to `df_core`
    if 'timestampMonth' in self.df_core.columns:
        month = self.df_core['timestampMonth']
    else:
        month = self.df_core['timestamp'].dt.month

    rm = (self.df_core['meter'].isin([2,3]))
    rm = rm & (self.df_core[self.dep_var] == 0)
    # the zeros to remove are the ones *outside* of the summer months
    rm = rm & ~month.isin([6,7,8])
    ok = ~rm
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set
    logger.info(f'Removing 0s for meter 2 and 3 outside of summer: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = self.df_core.loc[ok,:]
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'remove_not_summer_0s_meter_2_and_3':{}}
_df, _vars = processor(ashrae_data['meter_train'], tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(ashrae_data['meter_train']) - 253743

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'],
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

Removing "bad" electrical (meter 0) readings from the first 141 days of site 0 (i.e. UCF)

In [None]:
#export
@patch
def remove_bad_meter0_readings_of_first_141days(self:Processor):
    """Drop the meter 0 rows of site 0 whose day of the year is below 141.

    `site_id` is looked up in `df_building` if `df_core` does not contain it.
    Only acts on training data; if `t_train` is set, rows whose timestamp is
    not part of the training timestamps are kept regardless.
    """
    if self.test_run:
        return
    if not self.is_train:
        return self.df_core
    assert 'timestamp' in self.df_core.columns

    # temporary helper columns are added only if missing and removed again at the end
    helper_cols = []
    if 'timestampDayofyear' not in self.df_core.columns:
        self.df_core['timestampDayofyear'] = self.df_core['timestamp'].dt.dayofyear
        helper_cols.append('timestampDayofyear')
    if 'site_id' not in self.df_core.columns:
        assert self.df_building is not None, 'df_building cannot be None for this method.'
        site_lookup = self.df_building.loc[:,['building_id','site_id']]
        self.df_core = self.df_core.merge(site_lookup, on='building_id', how='left')
        assert self.df_core['site_id'].isna().sum() == 0
        helper_cols.append('site_id')

    core = self.df_core
    bad = ((core['meter'] == 0)
           & (core['site_id'] == 0)
           & (core['timestampDayofyear'] < 141))
    ok = ~bad
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set

    logger.info(f'Bad readings for meter 0 for the first 141 days for site 0: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = core.loc[ok,:]
    if helper_cols:
        self.df_core = self.df_core.drop(columns=helper_cols)
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'remove_bad_meter0_readings_of_first_141days':{}}
_df, _vars = processor(ashrae_data['meter_train'], df_building=ashrae_data['building'],
                       tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(ashrae_data['meter_train']) - 346112
assert (_df.loc[(_df['building_id']==0)&(_df['meter']==0), 'timestamp'] < pd.to_datetime('2016-05-20')).sum() == 0, 'Not correctly removed all first 141 days for meter 0 and site_id 0'

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'],
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

Fixing absurdly high meter 2 readings for building 1099

In [None]:
#export
@patch
def fix_bid_1099(self:Processor, threshold:float=10.):
    """Drop the meter 2 rows of building 1099 whose `self.dep_var` value
    exceeds `threshold`.

    Only acts on training data; if `t_train` is set, rows whose timestamp is
    not part of the training timestamps are kept regardless.
    """
    if self.test_run:
        return
    if not self.is_train:
        return self.df_core

    core = self.df_core
    assert 'timestamp' in core.columns

    too_high = ((core['building_id'] == 1099)
                & (core['meter'] == 2)
                & (core[self.dep_var] > threshold))
    ok = ~too_high
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set

    logger.info(f'Fixing building_id 1099: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = core.loc[ok,:]
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'fix_bid_1099':{}}
_df, _vars = processor(ashrae_data['meter_train'], tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(ashrae_data['meter_train']) - 3377
assert (_df.loc[(_df['building_id']==1099) & (_df['meter']==2), 'meter_reading_log1p'] > 10).sum() == 0

In [None]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'fix_bid_1099':{}}
_df, _vars = processor(ashrae_data['meter_train'], tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(ashrae_data['meter_train']) - 3377 <= len(_df) < len(ashrae_data['meter_train'])

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'],
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

In [None]:
#export
@patch
def remove_imputed_weeks(self:Processor, dep_var='meter_reading'):
    """Drop the rows of every (building, meter, week) whose 5% and 95%
    quantiles of `dep_var` are (numerically) equal.

    Only acts on training data; if `t_train` is set, rows whose timestamp is
    not part of the training timestamps are kept regardless.
    """
    if self.test_run:
        return
    if not self.is_train:
        return self.df_core

    # quantiles of `dep_var` per building, meter and weekly (`W-MON`) bin
    keys = ['building_id', 'meter', pd.Grouper(key='timestamp', freq='W-MON')]
    weekly_stats = self.df_core.groupby(keys)[dep_var].describe(percentiles=[.05,.95])

    # weekly bin edges, padded on both sides of the observed dates
    dates = self.df_core['timestamp'].dt.date
    first_edge = dates.min() - pd.Timedelta(7,unit='w')
    last_edge = dates.max() + pd.Timedelta(7,unit='d')
    edges = pd.date_range(first_edge, last_edge, freq='W-MON')

    # label each row with the right edge of its bin to join the weekly quantiles
    self.df_core['week'] = [interval.right for interval in pd.cut(self.df_core['timestamp'], edges)]
    self.df_core = self.df_core.join(weekly_stats.loc[:,['5%','95%']],
                                     on=['building_id','meter','week'])

    ok = ~np.isclose(self.df_core['5%'], self.df_core['95%'])
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set

    logger.info(f'Imputed weeks: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = self.df_core.loc[ok,:].drop(columns=['5%','95%','week'])
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'remove_imputed_weeks':{'dep_var':'meter_reading'}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==0)&(ashrae_data['meter_train']['meter']==0)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(tmp) - 3265

In [None]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'remove_imputed_weeks':{'dep_var':'meter_reading'}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==0)&(ashrae_data['meter_train']['meter']==0)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(tmp) - 3265 <= len(_df) < len(tmp)

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'], 
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

There are outliers! 😨 Let's remove them as well. Example `building_id` 60 and `meter` 1.

In [None]:
%%time
it = inspection.InspectTimeseries(ashrae_data['meter_train'], building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'])
it.inspect_boldly()

In [None]:
#export
@patch
def remove_outliers(self:Processor, f:float=10, dep_var:str='meter_reading'):
    """Drop the rows whose `dep_var` value is not below a per (building, meter)
    threshold of `median + f * (75% quantile - median)`.

    Groups whose 75% quantile equals their median are left untouched: their
    threshold would equal the median and the strict comparison would remove
    at least half of the group (all of it for constant readings).

    Only acts on training data; if `t_train` is set, rows whose timestamp is
    not part of the training timestamps are kept regardless.

    f       : multiple of the spread (75% quantile - median) added to the median
    dep_var : column the thresholds are computed on and compared against
    """
    if self.test_run: return None
    if not self.is_train: return self.df_core
    s = self.df_core.groupby(['building_id','meter'])[dep_var].describe()
    s['spread'] = s['75%'] - s['50%']
    s['threshold'] = s['50%'] + s['spread'] * f
    self.df_core = self.df_core.join(s.loc[:,['threshold','spread']],
                                     on=['building_id', 'meter'])
    ok = self.df_core[dep_var] < self.df_core['threshold']
    # a zero spread yields no usable threshold, keep those groups as they are
    ok = ok | (self.df_core['spread'] <= 0)
    if self.t_train is not None:
        # rows outside of the training timestamps are never removed
        ok = ok | ~self.t_in_train_set

    logger.info(f'Outliers: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = self.df_core.loc[ok,:].drop(columns=['threshold','spread'])
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'remove_outliers':{'f':10,'dep_var':'meter_reading'}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert len(_df) == len(tmp) - 1

In [None]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'remove_outliers':{'f':10,'dep_var':'meter_reading'}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
assert  len(tmp) - 1 <= len(_df) <= len(tmp)

In [None]:
%%time
it = inspection.InspectTimeseries(_df, building=ashrae_data['building'],
                                  weather=ashrae_data['weather_train'], 
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

Adding building information

In [None]:
#export
@patch
def add_building_features(self:Processor):
    """Left-join `df_building` onto `df_core` via `building_id` and register
    the joined columns as categorical / continuous variables."""
    if self.test_run:
        return

    n_before = len(self.df_core)
    self.df_core = self.df_core.merge(self.df_building, on='building_id', how='left')
    # the join must not duplicate rows
    assert n_before == len(self.df_core)

    _cats = ['site_id', 'primary_use']
    _conts = ['square_feet', 'year_built', 'floor_count']
    logger.info(f'Added building features: \n\tcategorical: {_cats}\n\tcontinuous: {_conts}')
    self.cats += _cats
    self.conts += _conts
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'add_building_features':{}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=ashrae_data['building'])
display(_df.head(), _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'site_id', 'primary_use', 'square_feet', 'year_built', 'floor_count']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['square_feet', 'year_built', 'floor_count'])
assert (_vars['cats'] == ['building_id', 'meter', 'site_id', 'primary_use'])

Adding weather information

In [None]:
#export
site_GMT_offsets = [-5, 0, -7, -5, -8, 0, -5, -5, -5, -6, -7, -5, 0, -6, -5, -5]
GMT_offset_map = {site: offset for site, offset in enumerate(site_GMT_offsets)}

@patch
def add_weather_features(self:Processor,
                         fix_time_offset:bool=False,
                         add_na_indicators:bool=False,
                         impute_nas:bool=False):
    """Left-join `df_weather` onto `df_core` via `site_id` and `timestamp`.

    fix_time_offset   : shift the weather timestamps by the per-site hour
                        offsets in `GMT_offset_map` before joining
    add_na_indicators : add a boolean `<col>_na(processor)` column for every
                        weather column with missing values; rows without a
                        matching weather record are flagged as missing too
    impute_nas        : fill missing weather values per site by linear
                        interpolation over time, falling back to the overall
                        column median; the median is also used for rows
                        without a matching weather record

    The weather columns are registered as continuous, the indicator columns
    as categorical variables. Returns the updated core frame.
    """
    if self.test_run: return
    n = len(self.df_core)
    add_site_id = 'site_id' not in self.df_core.columns
    if add_site_id:
        self.df_core = self.df_core.join(self.df_building.set_index('building_id').loc[:,['site_id']],
                                         on='building_id')

    if fix_time_offset:
        # vectorised instead of building one `pd.Timedelta` per row
        dt = pd.to_timedelta(self.df_weather['site_id'].map(GMT_offset_map), unit='h')
        self.df_weather['timestamp'] = self.df_weather['timestamp'] + dt

    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature',
            'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
            'wind_speed']

    # adding na columns
    na_cols = []
    if add_na_indicators:
        for col in cols:
            nas = self.df_weather[col].isna()
            na_col = f'{col}_na(processor)'
            if nas.sum()>0:
                self.df_weather[na_col] = nas
                self.df_weather[na_col] = self.df_weather[na_col].astype(bool)
                na_cols.append(na_col)

    # imputing na columns
    aggs = {}
    if impute_nas:
        new_weather = []
        aggs = {col: self.df_weather[col].median()
                for col in cols}
        for site, grp in self.df_weather.groupby('site_id'):
            grp = grp.sort_values('timestamp')
            for col in cols:
                nas = grp[col].isna()
                if nas.sum() == len(grp):
                    grp[col] = aggs[col]
                elif nas.sum() > 0:
                    grp[col] = grp[col].interpolate(limit_direction='both',
                                                    method='linear')
                # whatever the interpolation could not fill gets the overall median
                nas = grp[col].isna()
                grp.loc[nas, col] = aggs[col]

            new_weather.append(grp)
        n_weather = len(self.df_weather)
        self.df_weather = pd.concat(new_weather)
        assert len(self.df_weather) == n_weather, f'Interpolation step changed rows from {n_weather} to {len(self.df_weather)}'

    self.df_core = pd.merge(self.df_core, self.df_weather,
                            on=['site_id', 'timestamp'],
                            how='left')
    assert n == len(self.df_core), f'Merging lead to an increase from {n} rows to {len(self.df_core)}'

    # rows without a matching weather record come out of the left join with
    # missing values in every weather column, including the indicators
    for na_col in na_cols:
        flag = self.df_core[na_col]
        self.df_core[na_col] = flag.isna() | flag.eq(True)
    if impute_nas:
        fill_values = {col: val for col, val in aggs.items() if pd.notna(val)}
        if fill_values:
            self.df_core = self.df_core.fillna(value=fill_values)

    if add_site_id:
        self.df_core = self.df_core.drop(columns=['site_id'])

    _conts = ['wind_direction', 'air_temperature', 'dew_temperature', 'precip_depth_1_hr',
              'sea_level_pressure', 'wind_speed', 'cloud_coverage']
    self.cats.extend(na_cols)
#     self.cats_order['cloud_coverage'] = sorted([v for v in self.df_core['cloud_coverage'].unique() if np.isfinite(v)])
    self.conts.extend(_conts)
    logger.info(f'Added weather features: \n\tcategorical: {na_cols}\n\tcontinuous: {_conts}')
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'add_weather_features':{'fix_time_offset':False,
                                       'add_na_indicators':False,
                                       'impute_nas':False}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=ashrae_data['building'],
                       df_weather=ashrae_data['weather_train'])
display(_df.head().T, _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'air_temperature', 'cloud_coverage', 'dew_temperature', 
                 'precip_depth_1_hr', 'sea_level_pressure',
                 'wind_direction', 'wind_speed']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['wind_direction', 'air_temperature',
                           'dew_temperature', 'precip_depth_1_hr',
                           'sea_level_pressure',  'wind_speed',
                           'cloud_coverage'])
assert (_vars['cats'] == ['building_id', 'meter'])

In [None]:
%%time
processor = Processor()
tfms_config = {'add_weather_features':{'fix_time_offset':True,
                                       'add_na_indicators':False,
                                       'impute_nas':False}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=ashrae_data['building'],
                       df_weather=ashrae_data['weather_train'])
display(_df.head().T, _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'air_temperature', 'cloud_coverage', 'dew_temperature', 
                 'precip_depth_1_hr', 'sea_level_pressure',
                 'wind_direction', 'wind_speed']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['wind_direction', 'air_temperature',
                           'dew_temperature', 'precip_depth_1_hr',
                           'sea_level_pressure',  'wind_speed',
                           'cloud_coverage'])
assert (_vars['cats'] == ['building_id', 'meter'])

In [None]:
%%time
processor = Processor()
tfms_config = {'add_weather_features':{'fix_time_offset':True,
                                       'add_na_indicators':True,
                                       'impute_nas':False}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=ashrae_data['building'],
                       df_weather=ashrae_data['weather_train'])
display(_df.head().T, _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'air_temperature', 'cloud_coverage', 'dew_temperature', 
                 'precip_depth_1_hr', 'sea_level_pressure',
                 'wind_direction', 'wind_speed',
                 'air_temperature_na(processor)', 'cloud_coverage_na(processor)',
                 'dew_temperature_na(processor)', 'precip_depth_1_hr_na(processor)',
                 'sea_level_pressure_na(processor)', 'wind_direction_na(processor)',
                 'wind_speed_na(processor)']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['wind_direction', 'air_temperature',
                           'dew_temperature', 'precip_depth_1_hr',
                           'sea_level_pressure',  'wind_speed', 'cloud_coverage'])
assert (_vars['cats'] == ['building_id', 'meter',
                          'air_temperature_na(processor)', 'cloud_coverage_na(processor)',
                          'dew_temperature_na(processor)', 'precip_depth_1_hr_na(processor)',
                          'sea_level_pressure_na(processor)', 'wind_direction_na(processor)',
                          'wind_speed_na(processor)'])

In [None]:
%%time
processor = Processor()
tfms_config = {'add_weather_features':{'fix_time_offset':True,
                                       'add_na_indicators':False,
                                       'impute_nas':True}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=ashrae_data['building'],
                       df_weather=ashrae_data['weather_train'])
display(_df.head().T, _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'air_temperature', 'cloud_coverage', 'dew_temperature', 
                 'precip_depth_1_hr', 'sea_level_pressure',
                 'wind_direction', 'wind_speed']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['wind_direction', 'air_temperature',
                           'dew_temperature', 'precip_depth_1_hr',
                           'sea_level_pressure',  'wind_speed',
                           'cloud_coverage'])
assert (_vars['cats'] == ['building_id', 'meter'])
assert _df[expected_cols[:4]].isna().sum().sum() == 0

Add time features

In [None]:
#export
@patch
def add_time_features(self:Processor):
    """Add calendar-based categorical features derived from the time column.

    Runs `add_datepart` on `self.time_col` (keeping the original column), then
    adds the hour of day and a US federal holiday flag. The names listed in
    `_cats` are appended to `self.cats`, and the sorted unique values of the
    ordinal features are stored in `self.cats_order`.

    Returns:
        The updated `self.df_core`, or `None` when `self.test_run` is set.
    """
    if self.test_run: return
    # NOTE: the feature names are hard-coded with the 'timestamp' prefix, i.e.
    # they assume that `self.time_col` is named 'timestamp'.
    _cats = ['timestampMonth', 'timestampDay', 'timestampWeek', 'timestampDayofweek',
                      'timestampDayofyear', 'timestampIs_month_end', 'timestampIs_month_start',
                      'timestampIs_quarter_start', 'timestampIs_quarter_end',
                      'timestampIs_year_start', 'timestampIs_year_end', 'timestampHour',
                      'timestampIs_us_holiday']
    self.cats.extend(_cats)

    self.df_core = add_datepart(self.df_core, self.time_col, drop=False)

    self.df_core['timestampHour'] = self.df_core[self.time_col].dt.hour

    # Fixed window for which the holiday calendar is materialised
    dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
    us_holidays = us_calendar().holidays(start=dates_range.min(), end=dates_range.max())

    # `dt.normalize()` floors every timestamp to midnight while keeping the
    # datetime64 dtype. This replaces the former `.dt.date.astype('datetime64')`
    # round trip, which went through Python `date` objects and relied on a
    # unit-less dtype cast that recent pandas versions reject.
    self.df_core['timestampIs_us_holiday'] = (self.df_core[self.time_col].dt.normalize()
                                              .isin(us_holidays)
                                              .astype(bool))
    logger.info(f'Added categorical time features: {_cats}')
    # Remember the value order of the ordinal time features
    self.cats_order.update({
        c: sorted(self.df_core[c].unique()) for c in ['timestampMonth', 'timestampDay',
                                                      'timestampWeek', 'timestampDayofweek',
                                                      'timestampDayofyear', 'timestampHour']
    })
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'add_time_features':{}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head().T, _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'timestampWeek', 'meter_reading_log1p', 
                 'timestampMonth', 'timestampDay', 'timestampDayofweek',
                 'timestampDayofyear', 'timestampIs_month_end', 'timestampIs_month_start',
                 'timestampIs_quarter_end', 'timestampIs_quarter_start',
                 'timestampIs_year_end', 'timestampIs_year_start', 'timestampElapsed',
                 'timestampHour', 'timestampIs_us_holiday']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == [])
assert (_vars['cats'] == ['building_id', 'meter', 'timestampMonth',
                          'timestampDay', 'timestampWeek', 'timestampDayofweek',
                          'timestampDayofyear', 'timestampIs_month_end', 
                          'timestampIs_month_start', 'timestampIs_quarter_start', 
                          'timestampIs_quarter_end', 'timestampIs_year_start',
                          'timestampIs_year_end', 'timestampHour',
                          'timestampIs_us_holiday'])
assert len(_df) == len(tmp)

Adding `dep_var_stats`

In [None]:
#export
DEFAULT_GRP_COLS = ['building_id', 'timestampHour', 'meter']

@patch
def add_dep_var_stats(self:Processor, grp_cols:typing.List[str]=None):
    """Add summary statistics of the dependent variable as continuous features.

    Four statistics (median, mean, 5th and 95th percentile) of `self.dep_var`
    are added twice: once computed over all rows and once per group defined by
    `grp_cols`. When `self.is_train` is set the statistics are computed from
    `self.df_core` and cached in `self.dep_var_stats`; otherwise the cached
    values are reused. Groups without a cached statistic fall back to the
    corresponding global value.

    Args:
        grp_cols: columns to group by; defaults to `DEFAULT_GRP_COLS`. If
            'timestampHour' is requested but missing, it is derived from the
            'timestamp' column for the duration of the call.

    Returns:
        The updated `self.df_core`, or `None` when `self.test_run` is set.
    """
    if self.test_run: return

    grp_cols = DEFAULT_GRP_COLS if grp_cols is None else grp_cols

    # Outside of training the statistics must have been computed before
    assert self.is_train or self.dep_var_stats is not None
    if self.is_train:
        self.dep_var_stats = dict()

    funs = {
        'median': lambda x: torch.median(tensor(x)).item(),
        'mean': lambda x: torch.mean(tensor(x)).item(),
        '5%': lambda x: np.percentile(x, 5),
        '95%': lambda x: np.percentile(x, 95),
    }
    _conts = []
    # computing stats for self.dep_var on the coarsest possible level
    for name, fun in funs.items():
        name = f'{self.dep_var}_{name}'
        _conts.append(name)
        self.conts.append(name)

        if self.is_train:
            value = fun(self.df_core[self.dep_var].values)
            self.df_core[name] = value
            self.dep_var_stats[name] = value
        else:
            self.df_core[name] = self.dep_var_stats[name]

    # adding stats of self.dep_var on a more granular level
    if grp_cols is not None:
        t_col = 'timestampHour'
        # Temporarily add the hour column if it is needed for grouping only
        do_add_t = t_col in grp_cols and t_col not in self.df_core.columns.values
        if do_add_t:
            self.df_core[t_col] = self.df_core['timestamp'].dt.hour

        assert all([c in self.df_core.columns.values for c in grp_cols])

        for fun_name, fun in funs.items():
            name = f'{self.dep_var}_{"-".join(grp_cols)}_{fun_name}'
            _conts.append(name)
            self.conts.append(name)

            if self.is_train:

                self.dep_var_stats[name] = (self.df_core.groupby(grp_cols)[self.dep_var]
                                            .agg(fun)
                                            .rename(name))
            self.df_core = self.df_core.join(self.dep_var_stats[name], on=grp_cols)
            # Groups that have no cached statistic get the global value. The
            # result is assigned back explicitly: an in-place `fillna` on a
            # column selection is not guaranteed to write through to the frame.
            self.df_core[name] = self.df_core[name].fillna(
                self.dep_var_stats[f'{self.dep_var}_{fun_name}'])

        if do_add_t:
            self.df_core.drop(columns=[t_col], inplace=True)
    logger.info(f'Added continuous target columns: {_conts}')
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'add_dep_var_stats':{}}
mask = (ashrae_data['meter_train']['building_id']==60)&(ashrae_data['meter_train']['meter']==1)

tmp = ashrae_data['meter_train'].loc[mask]
_df, _vars = processor(tmp, tfms_configs=tfms_config)

display(_df.head().T, _vars)

In [None]:
%%time
mask = (ashrae_data['meter_test']['building_id']==60)&(ashrae_data['meter_test']['meter']==1)
tmp_test = ashrae_data['meter_test'].loc[mask]
_df_test, _ = processor(tmp_test, tfms_configs=tfms_config)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'meter_reading_log1p_median', 'meter_reading_log1p_mean',
                 'meter_reading_log1p_5%', 'meter_reading_log1p_95%', 
                 'meter_reading_log1p_building_id-timestampHour-meter_median',
                 'meter_reading_log1p_building_id-timestampHour-meter_mean',
                 'meter_reading_log1p_building_id-timestampHour-meter_5%',
                 'meter_reading_log1p_building_id-timestampHour-meter_95%']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['meter_reading_log1p_median',
                           'meter_reading_log1p_mean',
                           'meter_reading_log1p_5%',
                           'meter_reading_log1p_95%', 
                           'meter_reading_log1p_building_id-timestampHour-meter_median',
                           'meter_reading_log1p_building_id-timestampHour-meter_mean',
                           'meter_reading_log1p_building_id-timestampHour-meter_5%',
                           'meter_reading_log1p_building_id-timestampHour-meter_95%'])
assert (_vars['cats'] == ['building_id', 'meter'])
assert len(_df) == len(tmp)
assert all(inspection.show_nans(_df_test)['# NaNs'] == 0)

In [None]:
#export

DEFAULT_ONEHOT_COLS = ['meter']

@patch
def add_onehot_encoded(self:Processor, onehot_cols:typing.List[str]=None):
    """Add one-hot encoded boolean columns for the combination of `onehot_cols`.

    The values of `onehot_cols` are combined row-wise into a single string id
    which is then one-hot encoded. When `self.is_train` is set the encoder is
    fitted and stored together with the columns used; otherwise the stored
    encoder and columns are reused so that the ids match the fitted categories.
    The new column names are appended to `self.cats`.

    Args:
        onehot_cols: columns whose combination is encoded; defaults to
            `DEFAULT_ONEHOT_COLS`. Only used when `self.is_train` is set. If
            'timestampHour' is requested but missing, it is derived from the
            'timestamp' column for the duration of the call.

    Returns:
        The updated `self.df_core`, or `None` when `self.test_run` is set.
    """
    if self.test_run: return
    onehot_cols = DEFAULT_ONEHOT_COLS if onehot_cols is None else onehot_cols

    if self.is_train:
        self.onehot_cols = onehot_cols
    else:
        # The ids must be built from the same columns the encoder was fitted
        # on, otherwise they cannot match the fitted categories.
        assert getattr(self, 'onehot_tfm', None) is not None, \
            'add_onehot_encoded needs to be run on the training data first'
        onehot_cols = self.onehot_cols

    t_col = 'timestampHour'
    # Temporarily add the hour column if it is needed for the id only
    do_add_t = t_col in onehot_cols and t_col not in self.df_core.columns.values
    if do_add_t:
        self.df_core[t_col] = self.df_core['timestamp'].dt.hour

    # One string id per row, e.g. '(0,)' for a single column with value 0
    self.df_core['id'] = [str(v) for v in zip(*[self.df_core[v] for v in onehot_cols])]

    if self.is_train:
        self.onehot_tfm = OneHotEncoder()
        self.onehot_tfm.fit(self.df_core.loc[:, ['id']])

    names = [f'{"-".join(self.onehot_cols)}_{v}' for v in self.onehot_tfm.categories_[0]]

    self.cats.extend(names)

    df_onehot = pd.DataFrame(self.onehot_tfm.transform(self.df_core.loc[:, ['id']]).toarray(),
                             columns=names, index=self.df_core.index, dtype=bool)
    logger.info(f'Added one hot encoded features: {names}')
    # Remove the helper columns again before attaching the encoded ones
    to_drop = ['id']
    if do_add_t:
        to_drop.append(t_col)
    self.df_core.drop(columns=to_drop, inplace=True)
    self.df_core = pd.concat((self.df_core, df_onehot), axis=1)
    return self.df_core

In [None]:
%%time
processor = Processor()
tfms_config = {'add_onehot_encoded':{}}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']<=60)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

In [None]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'meter_(0,)', 'meter_(1,)']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == [])
assert (_vars['cats'] == ['building_id', 'meter', 'meter_(0,)', 'meter_(1,)'])
assert len(_df) == len(tmp)

Running through part of the train / validation set

In [None]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {
    'fix_bid_363':{},
    'fix_bid_1099':{'threshold': 10.},
    'remove_bad_meter0_readings_of_first_141days': {},
    'remove_not_summer_0s_meter_2_and_3': {},
    'remove_0s_meter0': {},
    'add_dep_var_stats':{},
    'add_time_features':{},
    'add_weather_features':{'fix_time_offset':True,
                            'add_na_indicators':True,
                            'impute_nas':True},
    'add_building_features':{},
    'remove_outliers':{'f':10,'dep_var':'meter_reading'},
    'remove_imputed_weeks':{'dep_var':'meter_reading'},
#     'add_onehot_encoded':{},
}
tmp = ashrae_data['meter_train'].loc[(ashrae_data['meter_train']['building_id']<=60)]
_df_train, _vars = processor(tmp, tfms_configs=tfms_config,
                             df_weather=ashrae_data['weather_train'],
                             df_building=ashrae_data['building'])
display(_df_train.head(), _vars)

Running through part of the test set

In [None]:
%%time
tmp = ashrae_data['meter_test'].loc[(ashrae_data['meter_test']['building_id']<=60)]
_df_test, _ = processor(tmp, tfms_configs=tfms_config,
                        df_weather=ashrae_data['weather_test'],
                        df_building=ashrae_data['building'])

display(_df_test.head().T)

In [None]:
#export
def align_test(df_train:pd.DataFrame, var_names:dict,
               df_test:pd.DataFrame):
    """Select the train feature columns from `df_test`, in train column order.

    Args:
        df_train: processed training frame defining the column order.
        var_names: dict whose 'dep_var' entry names the target column to skip.
        df_test: processed test frame; must contain every selected column.

    Returns:
        `df_test` restricted to all `df_train` columns except the target.
    """
    feature_cols = list(filter(lambda col: col != var_names['dep_var'],
                               df_train.columns))
    return df_test.loc[:, feature_cols]

In [None]:
%%time
_df_test = align_test(_df_train, _vars, _df_test)

In [None]:
#hide
a, b = list(_df_test.columns.values), [v for v in _df_train.columns.values if v != _vars['dep_var']]
assert a == b, f'Columns are mismatching! \n\tIn train but not test: {set(b)-set(a)}\n\tIn test but not train: {set(a)-set(b)}'

Done testing. Let's apply the transforms to the entire data set. Takes about 20min with `remove_imputed_weeks`.

In [None]:
%%time
processor = Processor() # t_train=t_train
tfms_config = {
    'fix_bid_363':{},
    'fix_bid_1099':{'threshold': 10.},
    'remove_bad_meter0_readings_of_first_141days': {},
    'remove_not_summer_0s_meter_2_and_3': {},
    'remove_0s_meter0': {},
    'remove_outliers':{'f':10,'dep_var':'meter_reading'},
    'remove_imputed_weeks':{'dep_var':'meter_reading'},
    'add_dep_var_stats':{},
    'add_time_features':{},
    'add_weather_features':{'fix_time_offset':True,
                            'add_na_indicators':True,
                            'impute_nas':True},
    'add_building_features':{},
#     'add_onehot_encoded':{},
}

tmp = ashrae_data['meter_train'] if n_samples_quick is None else ashrae_data['meter_train'].sample(n=n_samples_quick)

df, var_names = processor(tmp, tfms_configs=tfms_config,
                          df_weather=ashrae_data['weather_train'],
                          df_building=ashrae_data['building'])
display(df.head(), var_names)

In [None]:
var_names

In [None]:
df.head().T

In [None]:
df.info()

Processing the test set (takes ~4 min)

In [None]:
%%time
tmp = ashrae_data['meter_test'] if n_samples_quick is None else ashrae_data['meter_test'].sample(n=n_samples_quick)
df_test_p, _ = processor(tmp, tfms_configs=tfms_config,
                         df_weather=ashrae_data['weather_test'],
                         df_building=ashrae_data['building'])

Ensuring that all columns found with NaN values for the test set also have NaN values in the train set

In [None]:
test_nans = inspection.show_nans(df_test_p)
train_nans = inspection.show_nans(df)

In [None]:
test_nans

Making sure that every column with NaNs in the test set also has NaN values in the train set

In [None]:
# Names of the entries of `test_nans` that report at least one NaN in the test set
# (presumably one entry per column of `df_test_p`; verify against `inspection.show_nans`)
test_nan_cols = [col for col in test_nans.loc[test_nans['# NaNs']>0].index]
# Among those entries none may report zero NaNs for the train set, i.e. every
# column with NaNs in the test set must also contain NaNs in the train set
assert (train_nans.loc[train_nans.index.isin(test_nan_cols),'# NaNs'] == 0).sum() == 0

In [None]:
df_test = align_test(df, var_names, df_test_p)
if n_samples_quick is None:
    assert len(df_test) == len(ashrae_data['meter_test'])

In [None]:
#hide
assert len(df_test.columns) + 1 == len(df.columns)

In [None]:
#export
def test_var_names(var_names:dict):
    assert isinstance(var_names, dict)
    assert 'conts' in var_names and 'cats' in var_names and 'dep_var' in var_names
    assert isinstance(var_names['conts'], list)
    assert isinstance(var_names['cats'], list)
    assert isinstance(var_names['dep_var'], str)

In [None]:
test_var_names(var_names)

In [None]:
#export
def store_var_names(data_path:Path, var_names:dict):
    """Pickle `var_names` to the file 'var_names.pckl' inside `data_path`."""
    target = data_path/'var_names.pckl'
    print(f'Storing var names at: {target}')
    with open(target, mode='wb') as handle:
        handle.write(pickle.dumps(var_names))

In [None]:
%%time
store_var_names(data_path, var_names)

In [None]:
#export
def load_var_names(fname:Path):
    """Load and return the pickled variable names stored at `fname`.

    Note: unpickling executes code embedded in the file, so only load files
    from a trusted source.
    """
    print(f'Reading var names at: {fname}')
    with open(fname, mode='rb') as handle:
        return pickle.load(handle)

In [None]:
%%time
# var_names = load_var_names(data_path/'var_names.pckl')

In [None]:
#hide
test_var_names(var_names)

In [None]:
#export
def store_df(path:Path, df:pd.DataFrame):
    """Write `df` to `path` as a parquet file."""
    df.to_parquet(path)

In [None]:
%%time
store_df(data_path/'X.parquet', df)

In [None]:
%%time
store_df(data_path/'X_test.parquet', df_test_p)

In [None]:
#export
def load_df(path:Path):
    """Read the parquet file at `path` and return it as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame

In [None]:
%%time
# df = load_df(data_path/'X.parquet')

## Testing if certain features improve the score beyond the baseline

Training to get a basic idea if the added features do have any benefit

In [None]:
%%time
df = load_df(data_path/'X.parquet')

In [None]:
%%time
df_test_p = load_df(data_path/'X_test.parquet')

In [None]:
%%time
var_names = load_var_names(data_path/'var_names.pckl')

In [None]:
#export
def get_tabular_object(df:pd.DataFrame, var_names:dict,
                       splits=None, procs:list=[Categorify, FillMissing, Normalize]):
    """Wrap a copy of `df` in a `TabularPandas` object.

    Args:
        df: processed data; it is copied, so the passed frame stays untouched.
        var_names: dict providing the 'cats', 'conts' and 'dep_var' names.
        splits: forwarded to `TabularPandas` as `splits`.
        procs: preprocessing steps forwarded to `TabularPandas`.

    Returns:
        The constructed `TabularPandas` instance.
    """
    frame = df.copy()
    cat_names = var_names['cats']
    cont_names = var_names['conts']
    target = var_names['dep_var']
    return TabularPandas(frame, procs, cat_names, cont_names,
                         y_names=target, splits=splits)

# Default arguments forwarded to `split_dataset` by `train_predict`
SPLIT_PARAMS = dict(
    train_frac = .8,
    split_kind = 'time_split_day',
)


def train_predict(df:pd.DataFrame, var_names:dict,
                  model, params:dict=None, n_rep:int=3,
                  n_samples_train:int=10000,
                  n_samples_valid:int=10000,
                  procs:list=[Categorify, FillMissing, Normalize],
                  split_params:dict=None):
    """Repeatedly fit `model` on a fresh split and collect the validation RMSE.

    For each repetition a new model is instantiated, `df` is split with
    `split_dataset`, the model is fitted on the first `n_samples_train` rows of
    the training part and evaluated on the first `n_samples_valid` rows of the
    remaining rows.

    Args:
        df: processed data containing features and the dependent variable.
        var_names: dict providing the 'cats', 'conts' and 'dep_var' names.
        model: callable returning an estimator with `fit` and `predict`.
        params: keyword arguments passed to `model`; defaults to none.
        n_rep: number of repetitions.
        n_samples_train: maximum number of rows used for fitting.
        n_samples_valid: maximum number of rows used for scoring.
        procs: preprocessing steps passed on to `get_tabular_object`.
        split_params: keyword arguments for `split_dataset`; defaults to
            `SPLIT_PARAMS`.

    Returns:
        A DataFrame with one row per repetition and the columns 'iter' and
        'rmse loss'.
    """

    split_params = SPLIT_PARAMS if split_params is None else split_params
    y_col = var_names['dep_var']
    score_vals = []
    params = {} if params is None else params

    to = get_tabular_object(df, var_names, procs=procs)

    for i in tqdm.tqdm(range(n_rep), total=n_rep, desc='Repetition'):

        m = model(**params)
        splits = split_dataset(df, **split_params)

        # `splits[0]` is taken to be the training part (the same splits are
        # passed to `TabularPandas` elsewhere in this notebook, where the first
        # entry is the training set). The model is therefore fitted on the
        # rows inside `splits[0]` and scored on the remaining rows; previously
        # the two roles were swapped.
        is_train = to.xs.index.isin(splits[0])

        _X = to.xs.loc[is_train, :].iloc[:n_samples_train]
        _y = to.ys.loc[is_train, y_col].iloc[:n_samples_train]
        m.fit(_X.values, _y.values)

        _X = to.xs.loc[~is_train, :].iloc[:n_samples_valid]
        _y = to.ys.loc[~is_train, y_col].iloc[:n_samples_valid]
        pred = m.predict(_X.values)
        s = torch.sqrt(F.mse_loss(tensor(pred), tensor(_y.values))).item()
        score_vals.append({'iter': i, 'rmse loss': s})

    return pd.DataFrame(score_vals)

In [None]:
# Keyword arguments forwarded to `split_dataset`; exactly one `split_kind` is
# active, the commented-out lines are the alternatives
split_params = dict(
    #split_kind = 'random',
    #split_kind = 'time',
    #split_kind = 'fix_time',
    split_kind = 'time_split_day',
    t_train = None,
    train_frac = .8,
)

In [None]:
# Alternative setup: random forest instead of the linear model
# params = {'n_estimators': 20, 'max_features': 'sqrt',
#           'n_jobs': -1}
# model = ensemble.RandomForestRegressor
params = {}  # keyword arguments used to instantiate `model`
model = linear_model.LinearRegression
n_rep = 21  # number of train / predict repetitions
n_samples_train = 10000  # maximum number of rows used for fitting
n_samples_valid = 1000  # maximum number of rows used for scoring

In [None]:
var_names

The following step is not always necessary. For a linear model it is sensible to remove categorical variables that are not one-hot encoded.

In [None]:
# Variable names to drop from `var_names`, per variable kind
to_remove = {'cats':['building_id', 'meter'], 'conts': []}

# Overwrites the entries of `var_names` with the filtered lists
for k in ['cats', 'conts']:
    var_names[k] = [_v for _v in var_names[k] if _v not in to_remove[k]]
var_names

In [None]:
%%time
# No preprocessing steps are applied here; the commented-out list is the
# default used by `train_predict`
procs = [] #[Categorify, FillMissing, Normalize]
# One row per repetition with the achieved RMSE
df_rep = train_predict(df.copy(), var_names, model, params=params, 
                       n_rep=n_rep, n_samples_train=n_samples_train,
                       n_samples_valid=n_samples_valid, procs=procs,
                       split_params=split_params)

In [None]:
df_rep['rmse loss'].describe()

In [None]:
px.box(df_rep, y='rmse loss', range_y=(0, 2.5))

Baseline model = RandomForest with 20 estimators and sqrt features, training over 100k samples and predicting over 1k

<table>
    <tr>
        <th>input</th>
        <th>model</th>
        <th>rmse loss</th>
        <th>time [s/it]</th>
    </tr>
    <tr>
        <td>meter and building id only</td>
        <td>random forest</td>
        <td>1.2 - 1.21</td>
        <td>10.2</td>
    </tr>
    <tr>
        <td>using dep_var stats</td>
        <td>random forest</td>
        <td>1.16 - 1.18</td>
        <td>17.3</td>
    </tr>
    <tr>
        <td>using time stats</td>
        <td>random forest</td>
        <td>1.2 - 1.21</td>
        <td>13.2 - 13.7</td>
    </tr>
    <tr>
        <td>using building info</td>
        <td>random forest</td>
        <td>1.19</td>
        <td>17 - 18</td>
    </tr>
    <tr>
        <td>using weather (+ building) info</td>
        <td>random forest</td>
        <td>1.13 - 1.139</td>
        <td>14.6 - 15</td>
    </tr>
    <tr>
        <td>using all above</td>
        <td>random forest</td>
        <td>1.19 - 1.21</td>
        <td>20 - 26</td>
    </tr>
    <tr>
        <td>removing leading 0s in `dep_var`</td>
        <td>random forest</td>
        <td>.36 - .37</td>
        <td>4</td>
    </tr>
    <tr>
        <td>removing trailing 0s in `dep_var`</td>
        <td>random forest</td>
        <td>1.2</td>
        <td>4</td>
    </tr>
    <tr>
        <td>removing empty weeks before the first full week</td>
        <td>random forest</td>
        <td>1.16</td>
        <td>4</td>
    </tr>
    <tr>
        <td>meter only</td>
        <td>linear model</td>
        <td>2.2</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter + hour</td>
        <td>linear model</td>
        <td>2.1</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter reading stats only (meter, building_id, hour)</td>
        <td>linear model</td>
        <td>1.23 - 1.24 / 1.68 - 1.7</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter + meter reading stats (meter, building_id, hour)</td>
        <td>linear model</td>
        <td>1.51 - 1.52</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter reading stats (meter, building_id, hour)</td>
        <td>random forest</td>
        <td>0.58 - 0.6</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter + meter reading stats (meter, building_id, hour)</td>
        <td>random forest</td>
        <td>1.21 - 1.22</td>
        <td>5</td>
    </tr>
</table>

## Comparing `dep_var` distributions

In [None]:
%%time
splits = split_dataset(df, **split_params)

In [None]:
%%time
to = get_tabular_object(df, var_names, splits=splits)

In [None]:
n_samples_train = 10000
n_samples_valid = 10000

In [None]:
%%time
# params = {'n_estimators': 20, 'max_features': 'sqrt'}
# model = ensemble.RandomForestRegressor
m = model(**params)

In [None]:
%%time
# Draw the row positions once and use them for both features and targets.
# Sampling `xs` and `ys` independently would pair each feature row with the
# target of an unrelated row.
_pos = np.random.choice(len(to.train.xs), size=n_samples_train, replace=False)
_X = to.train.xs.iloc[_pos].values
_y = to.train.ys.iloc[_pos].values.ravel()
m.fit(_X, _y)

In [None]:
%%time
# Draw the row positions once and use them for both features and targets, so
# that `_y` holds the true values belonging to the rows predicted in `pred`.
_pos = np.random.choice(len(to.valid.xs), size=n_samples_valid, replace=False)
_X = to.valid.xs.iloc[_pos].values
_y = to.valid.ys.iloc[_pos].values.ravel()
pred = m.predict(_X)

In [None]:
#hide
assert np.isfinite(_y).all() and np.isfinite(pred).all()
assert _y.shape == pred.shape

In [None]:
#export
def hist_plot_preds(y0:np.ndarray, y1:np.ndarray,
                    label0:str='y0', label1:str='y1'):
    """Plot overlaid, density-normalised histograms of two sets of values.

    Args:
        y0: first set of values.
        y1: second set of values.
        label0: legend label for `y0`.
        label1: legend label for `y1`.

    Returns:
        The plotly figure, with a box plot per set as marginal.
    """
    frames = [
        pd.DataFrame({'y': values, 'set': [label] * len(values)})
        for values, label in ((y0, label0), (y1, label1))
    ]
    res = pd.concat(frames, ignore_index=True)

    return px.histogram(res, x='y', color='set', marginal='box',
                        barmode='overlay', histnorm='probability density')

In [None]:
hist_plot_preds(_y, pred, label0='truth (valid)', label1='prediction (valid)')

## Inspecting confidently wrong predictions

In [None]:
#export
class BoldlyWrongTimeseries:
    """Collects predictions per meter / building to find the worst time series.

    Attributes set on construction:
        df: frame with 'meter', 'building_id' (both ordered categoricals),
            'timestamp', 'y_true' and 'y_pred'.
        miss: RMSE between 'y_pred' and 'y_true' per ('building_id', 'meter')
            pair in the column 'loss', sorted in ascending order.
    """
    def __init__(self, xs, y_true, y_pred, info:pd.DataFrame=None):
        """Build `self.df` from the inputs and compute `self.miss`.

        Args:
            xs: frame aligned with `y_true` / `y_pred`. If `info` is None it
                must contain 'meter', 'building_id' and 'timestamp'.
            y_true: true target values, one per row of `xs`.
            y_pred: predicted target values, one per row of `xs`.
            info: optional frame providing 'meter', 'building_id' and
                'timestamp'; it is joined onto `xs`.
        """
        if info is None:
            self.df = xs.loc[:,['meter', 'building_id', 'timestamp']].copy()
        else:
            assert all([v in info.columns.values for v in ['meter', 'building_id', 'timestamp']])
            self.df = xs.join(info)

        for col in ['meter', 'building_id']:
            self.df[col] = self.df[col].astype('category')
            # Assign the result back: `set_categories` no longer supports
            # `inplace` in recent pandas versions, and an in-place change on a
            # column selection is not guaranteed to reach the frame.
            self.df[col] = self.df[col].cat.set_categories(sorted(self.df[col].unique()),
                                                           ordered=True)

        self.df['y_true'] = y_true
        self.df['y_pred'] = y_pred
        self.compute_misses()

    def compute_misses(self):
        """Compute the RMSE per ('building_id', 'meter') pair into `self.miss`."""
        fun = lambda x: np.sqrt(np.mean(x**2))
        # `observed=True` only builds groups for pairs present in the data;
        # unobserved pairs would yield NaN and be dropped again anyway.
        self.miss = (self.df.assign(difference=lambda x: x['y_pred']-x['y_true'])
                     .groupby(['building_id', 'meter'], observed=True)
                     .agg(loss=pd.NamedAgg(column='difference', aggfunc=fun))
                     .dropna()
                     .sort_values('loss'))

In [None]:
%%time
_X = to.valid.xs 
_y = to.valid.ys.values.ravel() 
pred = m.predict(to.valid.xs.values)

In [None]:
assert _y.shape == pred.shape

In [None]:
%%time
bwt = BoldlyWrongTimeseries(to.valid.xs.join(df.loc[:,['building_id', 'meter','timestamp']]), 
                            _y, pred)

In [None]:
#hide
# NOTE(review): the expected number of (building_id, meter) pairs is hard-coded
# and depends on the processed data and the split; it presumably refers to a
# run on the full data set (`n_samples_quick = None`) - confirm
assert len(bwt.miss) == 2380

Adding plotting capability based on the loss or meter/building id

In [None]:
#export
@patch
def plot_boldly_wrong(self:BoldlyWrongTimeseries,
                      nth_last:int=None,
                      meter:int=None, bid:int=None):
    """Scatter-plot true and predicted values of one meter / building pair.

    The pair is selected either by its position `nth_last` in the loss-sorted
    `self.miss` (e.g. -1 for the largest loss) or explicitly via `meter` and
    `bid`. If `nth_last` is given it takes precedence.

    Returns:
        The figure produced by the pandas plotting backend.
    """
    assert (meter is not None and bid is not None) or (nth_last is not None)

    if nth_last is None:
        # explicit selection: look up the loss of the requested pair
        entry = self.miss.xs((bid,meter))
        loss = entry.values[0]
    else:
        # position-based selection: the index holds (building_id, meter)
        entry = self.miss.iloc[[nth_last],:]
        meter = entry.index[0][1]
        bid = entry.index[0][0]
        loss = entry["loss"].values[0]

    selected = (self.df['meter']==int(meter)) & (self.df['building_id']==int(bid))
    df_plot = self.df.loc[selected]
    # long format: one row per (timestamp, label) with the value in 'y'
    parts = [
        df_plot[['timestamp', col]].rename(columns={col:'y'}).assign(label=label)
        for col, label in (('y_true', 'true'), ('y_pred', 'pred'))
    ]
    df_plot = pd.concat(parts, ignore_index=True)
    return df_plot.plot(kind='scatter', x='timestamp',
                        y='y', color='label', opacity=.4,
                        title=f'pos {nth_last}: meter = {meter}, building_id = {bid}<br>loss = {loss:.3f}')

In [None]:
bwt.plot_boldly_wrong(nth_last=-1)

In [None]:
bwt.plot_boldly_wrong(meter=2, bid=1099)

Adding widgets for interactive exploration

In [None]:
#export
@patch
def init_widgets(self:BoldlyWrongTimeseries):
    """Create the widgets used by `run_boldly` and store them on the instance."""
    # `BoundedIntText` is used because plain `IntText` has no `min` / `max`
    # traits, i.e. the bounds would not be enforced.
    n_miss = len(self.miss)
    # valid positions in `self.miss` are -n_miss, ..., n_miss - 1
    self.int_txt_loss = widgets.BoundedIntText(min=-n_miss, max=max(n_miss - 1, 0),
                                               description='Position', value=-1)
    self.int_txt_meter = widgets.BoundedIntText(min=int(self.df['meter'].min()),
                                                max=int(self.df['meter'].max()),
                                                description='Meter')
    self.int_txt_bid = widgets.BoundedIntText(min=int(self.df['building_id'].min()),
                                              max=int(self.df['building_id'].max()),
                                              description='building id')
    self.run_btn = widgets.Button(description='plot')
    # checked: select by position in the loss ranking, unchecked: by meter / building id
    self.switch_btn = widgets.Checkbox(description='Loss-based', value=True)
    self.run_btn.on_click(self.click_boldly_wrong)
    self.out_wdg = widgets.Output()

@patch
def run_boldly(self:BoldlyWrongTimeseries):
    """Return the widget box for interactive exploration, creating the widgets on first use."""
    if not hasattr(self, 'switch_btn'):
        self.init_widgets()
    return widgets.VBox([self.switch_btn, self.int_txt_loss,
                         self.int_txt_meter, self.int_txt_bid,
                         self.run_btn, self.out_wdg])

@patch
def click_boldly_wrong(self:BoldlyWrongTimeseries, change):
    """Button callback: plot the selected time series into the output widget.

    Raises:
        ValueError: if plotting fails for the selected values; the original
            exception is attached as the cause.
    """
    self.out_wdg.clear_output()
    loss_based = bool(self.switch_btn.value)
    nth_last = self.int_txt_loss.value if loss_based else None
    meter = None if loss_based else self.int_txt_meter.value
    bid = None if loss_based else self.int_txt_bid.value
    with self.out_wdg:
        print(f'nth_last {nth_last} meter {meter} bid {bid}')
        try:
            self.plot_boldly_wrong(nth_last=nth_last, meter=meter, bid=bid).show()
        except Exception as err:
            # keep the original error as the cause instead of discarding it
            raise ValueError(f'nth_last {nth_last} meter {meter} bid {bid} not a valid combination! Likely due to missing meter/bid combination') from err

In [None]:
bwt.run_boldly()