In [None]:
# default_exp preprocessing

# Preprocessing data

> Inspecting any particular irregularities and general preparation of the data for modelling as well as basic model inspection.

In [2]:
%load_ext autoreload
%autoreload 2

In [121]:
#export
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing
import pickle
import ipywidgets as widgets

from sklearn import linear_model, tree, model_selection, ensemble
from ashrae import inspection
from fastai.tabular.all import *

import tqdm

from sklearn import linear_model, tree, model_selection, ensemble
from sklearn.preprocessing import OneHotEncoder

import itertools

from pandas.tseries.holiday import USFederalHolidayCalendar as us_calendar

In [4]:
pd.options.plotting.backend = "plotly"

In [5]:
data_path = Path("../data")

In [6]:
dep_var = 'meter_reading'

In [7]:
csvs = inspection.get_csvs(data_path)
csvs

{'building': Path('../data/building_metadata.csv'),
 'sample_submission': Path('../data/sample_submission.csv'),
 'test': Path('../data/test.csv'),
 'train': Path('../data/train.csv'),
 'weather_test': Path('../data/weather_test.csv'),
 'weather_train': Path('../data/weather_train.csv')}

## Loading csvs

In [30]:
%%time
train = inspection.get_core_Xy(csvs['train'])
display(train.head(), train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    uint16        
 1   meter          uint8         
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
dtypes: datetime64[ns](1), float32(1), uint16(1), uint8(1)
memory usage: 289.2 MB


Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


None

CPU times: user 7.69 s, sys: 2.36 s, total: 10 s
Wall time: 10.1 s


In [297]:
%%time
test = inspection.get_core_Xy(csvs['test'])
display(test.head(), test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   row_id       uint32        
 1   building_id  uint16        
 2   meter        uint8         
 3   timestamp    datetime64[ns]
dtypes: datetime64[ns](1), uint16(1), uint32(1), uint8(1)
memory usage: 596.5 MB


Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01
1,1,1,0,2017-01-01
2,2,2,0,2017-01-01
3,3,3,0,2017-01-01
4,4,4,0,2017-01-01


None

CPU times: user 18.9 s, sys: 35.5 s, total: 54.4 s
Wall time: 56.8 s


In [9]:
%%time
building = inspection.get_building_X(csvs['building'])
display(building.head(), building.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   site_id      1449 non-null   uint8   
 1   building_id  1449 non-null   uint16  
 2   primary_use  1449 non-null   category
 3   square_feet  1449 non-null   uint32  
 4   year_built   675 non-null    float32 
 5   floor_count  355 non-null    float32 
dtypes: category(1), float32(2), uint16(1), uint32(1), uint8(1)
memory usage: 23.5 KB


Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


None

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 29.5 ms


In [10]:
%%time
weather_train = inspection.get_weather_X(csvs['weather_train'])
display(weather_train.head(), weather_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             139773 non-null  int64         
 1   timestamp           139773 non-null  datetime64[ns]
 2   air_temperature     139718 non-null  float64       
 3   cloud_coverage      70600 non-null   float64       
 4   dew_temperature     139660 non-null  float64       
 5   precip_depth_1_hr   89484 non-null   float64       
 6   sea_level_pressure  129155 non-null  float64       
 7   wind_direction      133505 non-null  float64       
 8   wind_speed          139469 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 9.6 MB


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


None

CPU times: user 125 ms, sys: 0 ns, total: 125 ms
Wall time: 133 ms


In [298]:
%%time
weather_test = inspection.get_weather_X(csvs['weather_test'])
display(weather_test.head(), weather_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277243 entries, 0 to 277242
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             277243 non-null  int64         
 1   timestamp           277243 non-null  datetime64[ns]
 2   air_temperature     277139 non-null  float64       
 3   cloud_coverage      136795 non-null  float64       
 4   dew_temperature     276916 non-null  float64       
 5   precip_depth_1_hr   181655 non-null  float64       
 6   sea_level_pressure  255978 non-null  float64       
 7   wind_direction      264873 non-null  float64       
 8   wind_speed          276783 non-null  float64       
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 19.0 MB


Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.8,4.0,11.7,,1021.4,100.0,3.6
1,0,2017-01-01 01:00:00,17.8,2.0,12.8,0.0,1022.0,130.0,3.1
2,0,2017-01-01 02:00:00,16.1,0.0,12.8,0.0,1021.9,140.0,3.1
3,0,2017-01-01 03:00:00,17.2,0.0,13.3,0.0,1022.2,140.0,3.1
4,0,2017-01-01 04:00:00,16.7,2.0,13.3,0.0,1022.3,130.0,2.6


None

CPU times: user 281 ms, sys: 15.6 ms, total: 297 ms
Wall time: 307 ms


## Splitting

In [11]:
#export
def split_dataset(X:pd.DataFrame, split_kind:str='random',
                  train_frac:float=8, t_train:pd.DataFrame=None):
    
    def random_split():
        n_train = int(len(X)*train_frac)
        train_bool = X.index.isin(np.random.choice(X.index.values, size=n_train, replace=False))
        return train_bool
    
    def time_split():
        assert 'timestamp' in X.columns
        time_col = 'timestamp'
        ts = X[time_col].sort_values(ascending=True)
        ix = int(len(X)*train_frac)
        threshold_t = ts.iloc[ix:].values[0]
        return X[time_col] < threshold_t
    
    def time_split_day():
        time_col = 'timestampDayofyear'
        
        if time_col not in X.columns:
            t = X['timestamp'].dt.dayofyear
        else:
            t = X[time_col]
        
        days = (t.value_counts()
                .rename('count')
                .sample(frac=1)
                .to_frame()
                .cumsum()
                .pipe(lambda x: x.loc[x['count'] <= (train_frac * len(t))]))

        num_train_days = len(days)
        mask = t.isin(days.index.values)
        
        assert mask.sum() > 0
        return mask
    
    def fix_time_split():
        assert t_train is not None
        time_col = 'timestamp'
        assert time_col in X.columns
        
        mask = X[time_col].isin(t_train[time_col])
        assert mask.sum() > 0
        return mask
     
    split_funs = {
        'random': random_split,
        'time': time_split,
        'fix_time': fix_time_split,
        'time_split_day': time_split_day,
    }
    
    assert split_kind in split_funs
    train_bool = split_funs[split_kind]()
    
    train_idx = np.where(train_bool)[0]
    valid_idx = np.where(~train_bool)[0]

    return (list(train_idx), list(valid_idx))

In [12]:
%%time
#split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
split_kind = 'time_split_day'

t_train = None
train_frac = .8
splits = split_dataset(train, split_kind=split_kind, train_frac=train_frac,
                       t_train=t_train)

CPU times: user 2.12 s, sys: 1.66 s, total: 3.78 s
Wall time: 3.87 s


In [13]:
print(f'sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(train):.4f}, valid {len(splits[1])} = {len(splits[1])/len(train):.4f}')

sets 2, train 16119856 = 0.7974, valid 4096244 = 0.2026


In [14]:
t_train = train.iloc[splits[0]][['timestamp']]
t_train.head()

Unnamed: 0,timestamp
0,2016-01-01
1,2016-01-01
2,2016-01-01
3,2016-01-01
4,2016-01-01


In [None]:
%%time
t_train.to_parquet(data_path/'t_train.parquet')

Visualizing the split

In [None]:
%%time
tmp = pd.concat((train.iloc[splits[0]]
                 .assign(label='train')
                 .assign(meter_reading=lambda x: np.log(x['meter_reading']+1)),
                (train.iloc[splits[1]]
                 .assign(label='valid')
                 .assign(meter_reading=lambda x: np.log(x['meter_reading']+1)))), 
                axis=0, ignore_index=True)
tmp.groupby('label').size()

In [None]:
px.scatter(tmp.sample(10000), x='timestamp', y='meter_reading', color='label')

## Wrangling

In [285]:
#export
DEP_VAR = 'meter_reading'
TIME_COL = 'timestamp'

class Processor:
    
    dep_var_stats:dict = None
        
    def __init__(self, dep_var:str=None, time_col:str=None,
                 t_train:pd.Series=None):
        self.dep_var = DEP_VAR if dep_var is None else dep_var
        self.time_col = TIME_COL if time_col is None else time_col
        self.t_train = t_train
    
    def __call__(self, df_core:pd.DataFrame, df_building:pd.DataFrame=None,
                 df_weather:pd.DataFrame=None, t_train:pd.DataFrame=None,
                 tfms_configs:dict=None) -> (pd.DataFrame, dict):       
        
        # check if `df` is a test set (dep_var is missing)
        self.is_train = self.dep_var in df_core.columns
        self.df_core = df_core.copy()
        self.conts, self.cats, self.cats_order = [], [], {}
        self.cats += ['building_id', 'meter']

        # core pieces of dependent and independent variables
        self.update_dep_var()  
        
        if tfms_configs is None:
            print('Empty transform configs `tfms_configs`. Returning ...')
        else:
            self.df_building = df_building
            self.df_weather = df_weather
            self.sanity_check_input_for_tfms(tfms_configs)
            for fun_name, config in tfms_configs.items():
                self.df_core = getattr(self, fun_name)(**config)
        
        df_core, var_names = self.cleanup()
        return df_core, var_names
    
    def update_dep_var(self) -> pd.DataFrame:
        
        if self.dep_var.endswith('log1p'):
            return self.df_core
        
        dep_var_new = f'{self.dep_var}_log1p'
        if self.is_train:
            self.df_core[dep_var_new] = np.log(self.df_core[self.dep_var].values + 1)
        self.dep_var = dep_var_new
        return self.df_core
    
    def sanity_check_input_for_tfms(self, tfms_configs:dict):
        # sanity check presence of df_building if df_weather is given
        if self.df_weather is not None:
            assert self.df_building is not None, 'To join the weather info in `df_weather` you need to pass `df_building`.'
        
        # making sure all required inputs are specified in `tfms_configs`
        self.test_run = True
        if tfms_configs is not None:
            building_fun_names = ['add_building_features']
            weather_fun_names = ['add_weather_features']
            for fun_name, config in tfms_configs.items():
                getattr(self, fun_name)(**config)
                if fun_name in building_fun_names:
                    assert self.df_building is not None, 'You need to pass `df_building` in Processor.__call__.'
                if fun_name in weather_fun_names:
                    assert self.df_weather is not None, 'You need to pass `df_weather` in Processor.__call__.'
        self.test_run = False
        
    def get_var_names(self) -> dict:
        return {'conts': self.conts, 'cats': self.cats, 'dep_var': self.dep_var}
    
    def cleanup(self) -> (pd.DataFrame, dict):
        # converting cats to category type
        for col in self.cats:
            if self.df_core[col].dtype == bool: continue
            self.df_core[col] = self.df_core[col].astype('category')
            if col in self.cats_order: 
                self.df_core[col].cat.set_categories(self.cats_order[col],
                                                     ordered=True, inplace=True)

        # removing features 
        to_remove_cols = ['meter_reading', 'timestampYear'] # , self.time_col
        self.df_core.drop(columns=[c for c in self.df_core.columns if c in to_remove_cols],
                          inplace=True)
        
        # shrinking the data frame
        self.df_core = df_shrink(self.df_core, int2uint=True)
        
        var_names = self.get_var_names()
        
        if not self.is_train:
            self.df_core.set_index('row_id', inplace=True)
        
        missing_cols = [col for col in self.df_core.columns.values if col not in self.cats + self.conts + [self.dep_var]
                        and col not in ['timestampElapsed', self.time_col, 'meter_reading']]
        
        assert len(missing_cols) == 0, f'Missed to assign columns: {missing_cols} to `conts` or `cats`'
        
        return self.df_core, var_names    

Only applying the `dep_var` transform

In [220]:
%%time
processor = Processor()
_df, _vars = processor(train)
display(_df.head(), _vars)

Empty transform configs `tfms_configs`. Returning ...


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 797 ms, sys: 1.97 s, total: 2.77 s
Wall time: 2.77 s


In [219]:
#hide
cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p']
assert len(_df.columns) == len(cols) and cols == list(_df.columns.values), f'Unexpected columns: {_df.columns} != {cols}'
assert len(_vars) == 3 and len(_vars['conts']) == 0 and _vars['cats'] == ['building_id', 'meter'] and _vars['dep_var'] == 'meter_reading_log1p'

The meter readings for `building_id` 363 before July 30th are likely due to a construction phase since the bulding's year value is 2017. So this method removes the readings from during the construction time.

In [287]:
#export
@patch
def fix_bid_363(self:Processor):
    if self.test_run: return
    if not self.is_train: return self.df_core
    assert 'timestamp' in self.df_core.columns
    rm = (self.df_core['building_id'] == 363) 
    rm = rm & (self.df_core['meter'] == 0)
    rm = rm & (self.df_core['timestamp'] < pd.to_datetime('2016-07-30'))
    ok = ~rm
    if self.is_train and self.t_train is not None:
        ok = ok | ~self.df_core['timestamp'].isin(self.t_train.values.ravel())
    print(f'Fixing building_id 363: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = self.df_core.loc[ok,:]
    return self.df_core

In [269]:
%%time
processor = Processor()
tfms_config = {'fix_bid_363':{}}
_df, _vars = processor(train, tfms_configs=tfms_config)
display(_df.head(), _vars)

Fixing building_id 363: removing 5063 data points = 0.03 %


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 1.03 s, sys: 2.72 s, total: 3.75 s
Wall time: 3.74 s


In [215]:
#hide
assert len(_df) == len(train) - 5063

In [253]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'fix_bid_363':{}}
_df, _vars = processor(train, tfms_configs=tfms_config)
display(_df.head(), _vars)

fix_bid_363 {}
Fixing building_id 363: removing 4055 data points = 0.02 %


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 2.03 s, sys: 3.62 s, total: 5.66 s
Wall time: 5.72 s


In [237]:
#hide
assert len(train) - 5063 <= len(_df) < len(train)

In [234]:
%%time
it = inspection.InspectTimeseries(_df, building=building,
                                  weather=weather_train,
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

CPU times: user 1.11 s, sys: 2.95 s, total: 4.06 s
Wall time: 4.03 s


VBox(children=(IntText(value=0, description='Meter'), IntText(value=0, description='building id'), Dropdown(de…

There seem to be quite a few imputed / filled values in the meter readings, being visible as constant meter readings for more than a week at a time. This method removes those values.

In [108]:
%%time
it = inspection.InspectTimeseries(train, building=building,
                                  weather=weather_train)
it.inspect_boldly()

CPU times: user 875 ms, sys: 984 ms, total: 1.86 s
Wall time: 1.86 s


VBox(children=(IntText(value=0, description='Meter'), IntText(value=0, description='building id'), Button(desc…

In [288]:
#export
@patch
def remove_imputed_weeks(self:Processor, dep_var='meter_reading'):
    if self.test_run: return
    if not self.is_train: return self.df_core
    grp = ['building_id', 'meter', pd.Grouper(key='timestamp', freq='W-MON')]
    wks = (self.df_core.groupby(grp)[dep_var]).describe(percentiles=[.05,.95])

    min_date = self.df_core['timestamp'].dt.date.min() - pd.Timedelta(7,unit='w')
    max_date = self.df_core['timestamp'].dt.date.max() + pd.Timedelta(7,unit='d')
    w_range = pd.date_range(min_date, max_date, freq='W-MON')

    self.df_core['week'] = [v.right for v in pd.cut(self.df_core['timestamp'], w_range)]

    self.df_core = self.df_core.join(wks.loc[:,['5%','95%']], 
                                     on=['building_id','meter','week'])
    rm = np.isclose(self.df_core['5%'], self.df_core['95%'])
    ok = ~rm
    if self.is_train and self.t_train is not None:
        ok = ok | ~self.df_core['timestamp'].isin(self.t_train.values.ravel())
    print(f'Imputed weeks: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = self.df_core.loc[ok,:].drop(columns=['5%','95%','week'])
    return self.df_core

In [256]:
%%time
processor = Processor()
tfms_config = {'remove_imputed_weeks':{'dep_var':'meter_reading'}}
tmp = train.loc[(train['building_id']==0)&(train['meter']==0)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Imputed weeks: removing 3265 data points = 37.17 %


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
7311560,0,0,2016-05-16 01:00:00,0.0
7313906,0,0,2016-05-16 02:00:00,0.0
7316253,0,0,2016-05-16 03:00:00,0.0
7318597,0,0,2016-05-16 04:00:00,0.0
7320940,0,0,2016-05-16 05:00:00,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 391 ms, sys: 46.9 ms, total: 438 ms
Wall time: 432 ms


In [257]:
#hide
assert len(_df) == len(tmp) - 3265

In [258]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'remove_imputed_weeks':{'dep_var':'meter_reading'}}
tmp = train.loc[(train['building_id']==0)&(train['meter']==0)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Imputed weeks: removing 2616 data points = 29.78 %


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
220604,0,0,2016-01-05 00:00:00,0.0
222903,0,0,2016-01-05 01:00:00,0.0
225202,0,0,2016-01-05 02:00:00,0.0
227503,0,0,2016-01-05 03:00:00,0.0
229803,0,0,2016-01-05 04:00:00,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 422 ms, sys: 31.2 ms, total: 453 ms
Wall time: 425 ms


In [259]:
#hide
assert len(tmp) - 3265 <= len(_df) < len(tmp)

In [260]:
%%time
it = inspection.InspectTimeseries(_df, building=building,
                                  weather=weather_train, 
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 29.9 ms


VBox(children=(IntText(value=0, description='Meter'), IntText(value=0, description='building id'), Dropdown(de…

There are outliers! 😨 Let's remove them as well. Example `building_id` 60 and `meter` 1.

In [145]:
%%time
it = inspection.InspectTimeseries(train, building=building,
                                  weather=weather_train)
it.inspect_boldly()

CPU times: user 750 ms, sys: 844 ms, total: 1.59 s
Wall time: 1.59 s


VBox(children=(IntText(value=0, description='Meter'), IntText(value=0, description='building id'), Dropdown(de…

In [289]:
#export
@patch
def remove_outliers(self:Processor, f:float=10, dep_var:str='meter_reading'):
    if self.test_run: return None
    if not self.is_train: return self.df_core
    s = self.df_core.groupby(['building_id','meter'])[dep_var].describe()
    s['threshold'] = s['50%'] + (s['75%'] - s['50%']) * f
    display(s)
    self.df_core = self.df_core.join(s.loc[:,['threshold']], 
                                     on=['building_id', 'meter'])
    ok = self.df_core[dep_var] < self.df_core['threshold']
    if self.is_train and self.t_train is not None:
        ok = ok | ~self.df_core['timestamp'].isin(self.t_train.values.ravel())

    print(f'Outliers: removing {(~ok).sum()} data points = {(~ok).sum()/len(ok)*100:.2f} %')
    self.df_core = self.df_core.loc[ok,:].drop(columns=['threshold'])
    return self.df_core

In [262]:
%%time
processor = Processor()
tfms_config = {'remove_outliers':{'f':10,'dep_var':'meter_reading'}}
tmp = train.loc[(train['building_id']==60)&(train['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max,threshold
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60,1,7268.0,2167.063232,2728.710693,0.0,464.313995,1561.780029,3925.560059,172611.0,25199.580322


Outliers: removing 1 data points = 0.01 %


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
3172349,60,1,2016-02-29 09:00:00,0.0
3174448,60,1,2016-02-29 10:00:00,0.0
3176551,60,1,2016-02-29 11:00:00,0.0
3178653,60,1,2016-02-29 12:00:00,0.0
3180744,60,1,2016-02-29 13:00:00,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 78.1 ms, sys: 15.6 ms, total: 93.8 ms
Wall time: 83.8 ms


In [263]:
#hide
assert len(_df) == len(tmp) - 1

In [264]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {'remove_outliers':{'f':10,'dep_var':'meter_reading'}}
tmp = train.loc[(train['building_id']==60)&(train['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max,threshold
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60,1,7268.0,2167.063232,2728.710693,0.0,464.313995,1561.780029,3925.560059,172611.0,25199.580322


Outliers: removing 0 data points = 0.00 %


Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p
3172349,60,1,2016-02-29 09:00:00,0.0
3174448,60,1,2016-02-29 10:00:00,0.0
3176551,60,1,2016-02-29 11:00:00,0.0
3178653,60,1,2016-02-29 12:00:00,0.0
3180744,60,1,2016-02-29 13:00:00,0.0


{'conts': [],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 78.1 ms, sys: 31.2 ms, total: 109 ms
Wall time: 116 ms


In [265]:
#hide
assert  len(tmp) - 1 <= len(_df) <= len(tmp)

In [266]:
%%time
it = inspection.InspectTimeseries(_df, building=building,
                                  weather=weather_train, 
                                  dep_var='meter_reading_log1p')
it.inspect_boldly()

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 30.5 ms


VBox(children=(IntText(value=0, description='Meter'), IntText(value=0, description='building id'), Dropdown(de…

Adding building information

In [290]:
#export
@patch
def add_building_features(self:Processor):
    if self.test_run: return
    n = len(self.df_core)
    self.df_core = pd.merge(self.df_core, self.df_building, on='building_id', how='left')
    assert n == len(self.df_core)

    self.cats.extend(['site_id', 'primary_use'])
    self.conts.extend(['square_feet', 'year_built', 'floor_count'])
    return self.df_core

In [170]:
%%time
processor = Processor()
tfms_config = {'add_building_features':{}}
tmp = train.loc[(train['building_id']==60)&(train['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=building)
display(_df.head(), _vars)

Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p,site_id,primary_use,square_feet,year_built,floor_count
0,60,1,2016-02-29 09:00:00,0.0,0,Education,81576,1997.0,
1,60,1,2016-02-29 10:00:00,0.0,0,Education,81576,1997.0,
2,60,1,2016-02-29 11:00:00,0.0,0,Education,81576,1997.0,
3,60,1,2016-02-29 12:00:00,0.0,0,Education,81576,1997.0,
4,60,1,2016-02-29 13:00:00,0.0,0,Education,81576,1997.0,


{'conts': ['square_feet', 'year_built', 'floor_count'],
 'cats': ['building_id', 'meter', 'site_id', 'primary_use'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 42.2 ms


In [174]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'site_id', 'primary_use', 'square_feet', 'year_built', 'floor_count']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['square_feet', 'year_built', 'floor_count'])
assert (_vars['cats'] == ['building_id', 'meter', 'site_id', 'primary_use'])

Adding weather information

In [291]:
#export
@patch
def add_weather_features(self:Processor):
    if self.test_run: return
    n = len(self.df_core)
    add_site_id = 'site_id' not in self.df_core.columns
    if add_site_id:
        self.df_core = self.df_core.join(self.df_building.set_index('building_id').loc[:,['site_id']], 
                                         on='building_id')
    self.df_core = pd.merge(self.df_core, self.df_weather, 
                            on=['site_id', 'timestamp'], 
                            how='left')
    assert n == len(self.df_core)
    
    if add_site_id:
        self.df_core.drop(columns=['site_id'], inplace=True)
        
    self.cats.extend(['cloud_coverage'])
    self.cats_order['cloud_coverage'] = sorted([v for v in self.df_core['cloud_coverage'].unique() if np.isfinite(v)])
    self.conts.extend(['wind_direction', 'air_temperature', 'dew_temperature', 'precip_depth_1_hr',
                  'sea_level_pressure', 'wind_speed'])
    return self.df_core

In [185]:
%%time
processor = Processor()
tfms_config = {'add_weather_features':{}}
tmp = train.loc[(train['building_id']==60)&(train['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_building=building,
                       df_weather=weather_train)
display(_df.head(), _vars)

Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,60,1,2016-02-29 09:00:00,0.0,12.8,,8.9,0.0,1021.900024,0.0,0.0
1,60,1,2016-02-29 10:00:00,0.0,12.2,,9.4,0.0,1021.900024,0.0,0.0
2,60,1,2016-02-29 11:00:00,0.0,12.8,,9.4,0.0,1022.099976,0.0,0.0
3,60,1,2016-02-29 12:00:00,0.0,11.1,6.0,8.9,0.0,1022.700012,0.0,0.0
4,60,1,2016-02-29 13:00:00,0.0,13.3,,11.1,0.0,1023.099976,0.0,0.0


{'conts': ['wind_direction',
  'air_temperature',
  'dew_temperature',
  'precip_depth_1_hr',
  'sea_level_pressure',
  'wind_speed'],
 'cats': ['building_id', 'meter', 'cloud_coverage'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 62.5 ms, sys: 0 ns, total: 62.5 ms
Wall time: 62.5 ms


In [186]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'air_temperature', 'cloud_coverage', 'dew_temperature', 
                 'precip_depth_1_hr', 'sea_level_pressure',
                 'wind_direction', 'wind_speed']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['wind_direction', 'air_temperature',
                           'dew_temperature', 'precip_depth_1_hr',
                           'sea_level_pressure',  'wind_speed'])
assert (_vars['cats'] == ['building_id', 'meter', 'cloud_coverage'])

Add time features

In [292]:
#export
@patch
def add_time_features(self:Processor):
    if self.test_run: return
    self.cats.extend(['timestampMonth', 'timestampDay', 'timestampWeek', 'timestampDayofweek',
                      'timestampDayofyear', 'timestampIs_month_end', 'timestampIs_month_start',
                      'timestampIs_quarter_start', 'timestampIs_quarter_end',
                      'timestampIs_year_start', 'timestampIs_year_end', 'timestampHour',
                      'timestampIs_us_holiday'])

    self.df_core = add_datepart(self.df_core, self.time_col, drop=False)  

    self.df_core['timestampHour'] = self.df_core[self.time_col].dt.hour

    dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
    us_holidays = us_calendar().holidays(start=dates_range.min(), end=dates_range.max())

    self.df_core['timestampIs_us_holiday'] = (self.df_core['timestamp'].dt.date.astype('datetime64')
                                              .isin(us_holidays)
                                              .astype(bool))
    
    self.cats_order.update({
        c: sorted(self.df_core[c].unique()) for c in ['timestampMonth', 'timestampDay', 
                                                      'timestampWeek', 'timestampDayofweek', 
                                                      'timestampDayofyear', 'timestampHour']
    })
    return self.df_core

In [190]:
%%time
processor = Processor()
tfms_config = {'add_time_features':{}}
tmp = train.loc[(train['building_id']==60)&(train['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Unnamed: 0,building_id,meter,timestamp,timestampWeek,meter_reading_log1p,timestampMonth,timestampDay,timestampDayofweek,timestampDayofyear,timestampIs_month_end,timestampIs_month_start,timestampIs_quarter_end,timestampIs_quarter_start,timestampIs_year_end,timestampIs_year_start,timestampElapsed,timestampHour,timestampIs_us_holiday
3172349,60,1,2016-02-29 09:00:00,9,0.0,2,29,0,60,True,False,False,False,False,False,1456736400,9,False
3174448,60,1,2016-02-29 10:00:00,9,0.0,2,29,0,60,True,False,False,False,False,False,1456740000,10,False
3176551,60,1,2016-02-29 11:00:00,9,0.0,2,29,0,60,True,False,False,False,False,False,1456743600,11,False
3178653,60,1,2016-02-29 12:00:00,9,0.0,2,29,0,60,True,False,False,False,False,False,1456747200,12,False
3180744,60,1,2016-02-29 13:00:00,9,0.0,2,29,0,60,True,False,False,False,False,False,1456750800,13,False


{'conts': [],
 'cats': ['building_id',
  'meter',
  'timestampMonth',
  'timestampDay',
  'timestampWeek',
  'timestampDayofweek',
  'timestampDayofyear',
  'timestampIs_month_end',
  'timestampIs_month_start',
  'timestampIs_quarter_start',
  'timestampIs_quarter_end',
  'timestampIs_year_start',
  'timestampIs_year_end',
  'timestampHour',
  'timestampIs_us_holiday'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 78.1 ms, sys: 0 ns, total: 78.1 ms
Wall time: 85.3 ms


In [191]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'timestampWeek', 'meter_reading_log1p', 
                 'timestampMonth', 'timestampDay', 'timestampDayofweek',
                 'timestampDayofyear', 'timestampIs_month_end', 'timestampIs_month_start',
                 'timestampIs_quarter_end', 'timestampIs_quarter_start',
                 'timestampIs_year_end', 'timestampIs_year_start', 'timestampElapsed',
                 'timestampHour', 'timestampIs_us_holiday']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == [])
assert (_vars['cats'] == ['building_id', 'meter', 'timestampMonth',
                          'timestampDay', 'timestampWeek', 'timestampDayofweek',
                          'timestampDayofyear', 'timestampIs_month_end', 
                          'timestampIs_month_start', 'timestampIs_quarter_start', 
                          'timestampIs_quarter_end', 'timestampIs_year_start',
                          'timestampIs_year_end', 'timestampHour',
                          'timestampIs_us_holiday'])
assert len(_df) == len(tmp)

Adding `dep_var_stats`

In [293]:
#export
DEFAULT_GRP_COLS = ['building_id', 'timestampHour', 'meter']

@patch
def add_dep_var_stats(self:Processor, grp_cols:typing.List[str]=None):
    if self.test_run: return
    
    grp_cols = DEFAULT_GRP_COLS if grp_cols is None else grp_cols
    
    assert self.is_train or self.dep_var_stats is not None
    if self.is_train: 
        self.dep_var_stats = dict()
    
    funs = {
        'median': lambda x: torch.median(tensor(x)).item(),
        'mean': lambda x: torch.mean(tensor(x)).item(),
        '5%': lambda x: np.percentile(x, 5),
        '95%': lambda x: np.percentile(x, 95),
    }

    # computing stats for self.dep_var on the coarsest possible level
    for name, fun in funs.items():
        name = f'{self.dep_var}_{name}'
        self.conts.append(name)

        if self.is_train:
            value = fun(self.df_core[self.dep_var].values)
            self.df_core[name] = value
            self.dep_var_stats[name] = value
        else:
            self.df_core[name] = self.dep_var_stats[name]

    # adding stats of self.dep_var on a more granular level            
    if grp_cols is not None:
        t_col = 'timestampHour'
        do_add_t = t_col in grp_cols and t_col not in self.df_core.columns.values
        if do_add_t:
            self.df_core[t_col] = self.df_core['timestamp'].dt.hour

        assert all([c in self.df_core.columns.values for c in grp_cols])

        for fun_name, fun in funs.items():
            name = f'{self.dep_var}_{"-".join(grp_cols)}_{fun_name}'
            self.conts.append(name)

            if self.is_train:

                self.dep_var_stats[name] = (self.df_core.groupby(grp_cols)[self.dep_var]
                                            .agg(fun)
                                            .rename(name))
            self.df_core = self.df_core.join(self.dep_var_stats[name], on=grp_cols)    

        if do_add_t:
            self.df_core.drop(columns=[t_col], inplace=True)
    return self.df_core

In [195]:
%%time
processor = Processor()
tfms_config = {'add_dep_var_stats':{}}
tmp = train.loc[(train['building_id']==60)&(train['meter']==1)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p,meter_reading_log1p_median,meter_reading_log1p_mean,meter_reading_log1p_5%,meter_reading_log1p_95%,meter_reading_log1p_building_id-timestampHour-meter_median,meter_reading_log1p_building_id-timestampHour-meter_mean,meter_reading_log1p_building_id-timestampHour-meter_5%,meter_reading_log1p_building_id-timestampHour-meter_95%
3172349,60,1,2016-02-29 09:00:00,0.0,7.354221,6.032163,0.0,8.530354,8.112414,6.384503,0.0,8.537822
3174448,60,1,2016-02-29 10:00:00,0.0,7.354221,6.032163,0.0,8.530354,8.149674,6.436301,0.0,8.647373
3176551,60,1,2016-02-29 11:00:00,0.0,7.354221,6.032163,0.0,8.530354,8.185596,6.483431,0.0,8.690882
3178653,60,1,2016-02-29 12:00:00,0.0,7.354221,6.032163,0.0,8.530354,8.197289,6.489438,0.0,8.716427
3180744,60,1,2016-02-29 13:00:00,0.0,7.354221,6.032163,0.0,8.530354,8.185596,6.496059,0.0,8.67731


{'conts': ['meter_reading_log1p_median',
  'meter_reading_log1p_mean',
  'meter_reading_log1p_5%',
  'meter_reading_log1p_95%',
  'meter_reading_log1p_building_id-timestampHour-meter_median',
  'meter_reading_log1p_building_id-timestampHour-meter_mean',
  'meter_reading_log1p_building_id-timestampHour-meter_5%',
  'meter_reading_log1p_building_id-timestampHour-meter_95%'],
 'cats': ['building_id', 'meter'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 62.5 ms, sys: 0 ns, total: 62.5 ms
Wall time: 85.7 ms


In [196]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'meter_reading_log1p_median', 'meter_reading_log1p_mean',
                 'meter_reading_log1p_5%', 'meter_reading_log1p_95%', 
                 'meter_reading_log1p_building_id-timestampHour-meter_median',
                 'meter_reading_log1p_building_id-timestampHour-meter_mean',
                 'meter_reading_log1p_building_id-timestampHour-meter_5%',
                 'meter_reading_log1p_building_id-timestampHour-meter_95%']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == ['meter_reading_log1p_median',
                           'meter_reading_log1p_mean',
                           'meter_reading_log1p_5%',
                           'meter_reading_log1p_95%', 
                           'meter_reading_log1p_building_id-timestampHour-meter_median',
                           'meter_reading_log1p_building_id-timestampHour-meter_mean',
                           'meter_reading_log1p_building_id-timestampHour-meter_5%',
                           'meter_reading_log1p_building_id-timestampHour-meter_95%'])
assert (_vars['cats'] == ['building_id', 'meter'])
assert len(_df) == len(tmp)

In [294]:
#export

DEFAULT_ONEHOT_COLS = ['meter']

@patch
def add_onehot_encoded(self:Processor, onehot_cols:typing.List[str]=None):
    if self.test_run: return
    onehot_cols = DEFAULT_ONEHOT_COLS if onehot_cols is None else onehot_cols

    t_col = 'timestampHour'
    do_add_t = t_col in onehot_cols and t_col not in self.df_core.columns.values
    if do_add_t:
        self.df_core[t_col] = self.df_core['timestamp'].dt.hour

    self.df_core['id'] = [str(v) for v in zip(*[self.df_core[v] for v in onehot_cols])]

    if self.is_train:
        self.onehot_cols = onehot_cols
        self.onehot_tfm = OneHotEncoder()
        self.onehot_tfm.fit(self.df_core.loc[:, ['id']])


    names = [f'{"-".join(self.onehot_cols)}_{v}' for v in self.onehot_tfm.categories_[0]]

    self.cats.extend(names)

    df_onehot = pd.DataFrame(self.onehot_tfm.transform(self.df_core.loc[:, ['id']]).toarray(),
                             columns=names, index=self.df_core.index, dtype=bool)

    to_drop = ['id']
    if do_add_t:
        to_drop.append(t_col)
    self.df_core.drop(columns=to_drop, inplace=True)
    self.df_core = pd.concat((self.df_core, df_onehot), axis=1)   
    return self.df_core

In [204]:
%%time
processor = Processor()
tfms_config = {'add_onehot_encoded':{}}
tmp = train.loc[(train['building_id']<=60)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)

Unnamed: 0,building_id,meter,timestamp,meter_reading_log1p,"meter_(0,)","meter_(1,)"
0,0,0,2016-01-01,0.0,True,False
1,1,0,2016-01-01,0.0,True,False
2,2,0,2016-01-01,0.0,True,False
3,3,0,2016-01-01,0.0,True,False
4,4,0,2016-01-01,0.0,True,False


{'conts': [],
 'cats': ['building_id', 'meter', 'meter_(0,)', 'meter_(1,)'],
 'dep_var': 'meter_reading_log1p'}

CPU times: user 469 ms, sys: 78.1 ms, total: 547 ms
Wall time: 530 ms


In [205]:
#hide
expected_cols = ['building_id', 'meter', 'timestamp', 'meter_reading_log1p', 
                 'meter_(0,)', 'meter_(1,)']
assert list(_df.columns.values) == expected_cols, f'columns {_df.columns.values} did not meet the expected columns: {expected_cols}'
assert (_vars['conts'] == [])
assert (_vars['cats'] == ['building_id', 'meter', 'meter_(0,)', 'meter_(1,)'])
assert len(_df) == len(tmp)

stuff to potentially add back in
```python
def remove_leading_zeros(self, df_core:pd.DataFrame, t_train:pd.DataFrame=None):
    'there are time series which start with many 0 values in the dep_var. this method removes those values'

    n = len(df_core)
    assert self.dep_var in df_core.columns
    assert 'timestamp' in df_core.columns
    assert df_core[self.dep_var].min() == 0

    # finding the first timestamps after 0s
    mins = (df_core[df_core[self.dep_var] > 0].groupby(["building_id","meter"])
            .timestamp.min().rename("first_timestamp"))
    df_core = df_core.join(mins,on=["building_id","meter"])

    mask = df_core['first_timestamp'] <= df_core['timestamp']

    if t_train is not None:
        t_mask = df_core['timestamp'].isin(t_train['timestamp'])
        mask = (mask & t_mask) | ~t_mask

    df_core = df_core.loc[mask,:]
    df_core.drop(columns=['first_timestamp'], inplace=True)
    assert len(df_core) < n
    print(f'Removed {(1-len(df_core)/n)*100:.4f} % of rows')

    return df_core


def remove_trailing_zeros(self, df_core:pd.DataFrame, t_train:pd.DataFrame=None):
    'there are time series which end with many 0 values in the dep_var. this method removes those values'

    n = len(df_core)
    assert self.dep_var in df_core.columns
    assert 'timestamp' in df_core.columns
    assert df_core[self.dep_var].min() == 0

    # finding the first timestamps after 0s
    maxs = (df_core[df_core[self.dep_var] > 0].groupby(["building_id","meter"])
            .timestamp.max().rename("last_timestamp"))
    df_core = df_core.join(maxs,on=["building_id","meter"])

    mask = df_core['last_timestamp'] >= df_core['timestamp']

    if t_train is not None:
        t_mask = df_core['timestamp'].isin(t_train['timestamp'])
        mask = (mask & t_mask) | ~t_mask

    df_core = df_core.loc[mask,:]
    df_core.drop(columns=['last_timestamp'], inplace=True)
    assert len(df_core) < n
    print(f'Removed {(1-len(df_core)/n)*100:.4f} % of rows')

    return df_core


def remove_empty_weeks_before_first_full_week(self, df_core:pd.DataFrame,
                                              t_train:pd.DataFrame):
    'there are some timeseries with weeks in the beginning which are basically empty'
    # TODO: something is likely buggy, losing combinations of building_id and meter

    n = len(df_core)
    n_comb = len(df_core.loc[:,['building_id', 'meter']].drop_duplicates())

    def get_combs(df):
        return set([tuple([row['building_id'], row['meter']])
                            for _, row in (df.loc[:,['building_id', 'meter']]
                                           .drop_duplicates()
                                           .iterrows())])

    combs = get_combs(df_core)

    add_t = 'timestampWeek' not in df_core.columns
    if add_t:
        df_core['timestampWeek'] = df_core[self.time_col].dt.isocalendar().week

    counts = (df_core[df_core[self.dep_var] > 0]
              .groupby(["building_id","meter","timestampWeek"])
              .timestamp.count()
              .rename("num_weekly_measurements").reset_index())

    expected_num = 24*7 # hours per week with a measurement
    expected_num *= 0.9

    first_full_week = (counts[counts['num_weekly_measurements'] > expected_num]
                       .groupby(["building_id","meter"])
                       .timestampWeek.min()
                       .rename("first_full_week")
                       .to_frame())

    df_core = df_core.join(first_full_week, on=["building_id","meter"])

    mask = df_core['timestampWeek'] >= df_core['first_full_week']

    na_series = df_core.loc[df_core['first_full_week'].isna(), ['building_id', 'meter']].drop_duplicates()

    print('number of na combinations', len(na_series))
    display('na bids & meters', len(na_series), na_series)

    mask = mask & df_core['first_full_week'].notna() # some time series have in each week less than the required number of observations

    if t_train is not None:
        t_mask = df_core['timestamp'].isin(t_train['timestamp'])
        mask = (mask & t_mask) | ~t_mask

    df_core = df_core.loc[mask,:]

    to_drop = ['first_full_week']
    if add_t:
        to_drop.append('timestampWeek')
    df_core.drop(columns=to_drop, inplace=True)

    na_combs = get_combs(df_core)
    miss_combs = [v for v in na_combs if v not in combs]
    print('combs diff', miss_combs, len(miss_combs))

    assert len(df_core) < n
    assert len(na_series) == 38
    print(f'Removed {(1-len(df_core)/n)*100:.4f} % of rows')
    print(f'{len(na_series)} of building_id/meter combinations count as empty = {100 * len(na_series) / n_comb:.4f} % of all combinations')
    return df_core
```

Running through part of the train / validation set

In [300]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {
    'fix_bid_363':{},
    'add_dep_var_stats':{},
    'add_time_features':{},
    'add_weather_features':{},
    'add_building_features':{},
    'remove_outliers':{'f':10,'dep_var':'meter_reading'},
    'remove_imputed_weeks':{'dep_var':'meter_reading'},
    'add_onehot_encoded':{},
}
tmp = train.loc[(train['building_id']<=60)]
_df_train, _vars = processor(tmp, tfms_configs=tfms_config,
                       df_weather=weather_train,
                       df_building=building)
display(_df_train.head(), _vars)

Fixing building_id 363: removing 0 data points = 0.00 %


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max,threshold
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,8784.0,146.454971,121.897171,0.0,0.000000,189.751999,246.404007,448.000000,756.272079
1,0,8784.0,74.865395,61.765388,0.0,0.000000,95.012398,133.781998,255.000000,482.708397
2,0,8784.0,14.551384,16.063791,0.0,0.000000,9.419300,24.981701,67.983002,165.043308
3,0,8784.0,235.549957,205.985855,0.0,0.000000,315.683990,395.629257,937.000000,1115.136658
4,0,8784.0,976.556824,779.694092,0.0,0.000000,1450.439941,1620.400024,3592.000000,3150.040771
...,...,...,...,...,...,...,...,...,...,...
57,0,8784.0,223.646942,187.575333,0.0,0.000000,281.214996,388.717987,698.000000,1356.244904
58,0,8784.0,235.929382,203.737656,0.0,0.000000,279.850006,414.399246,731.705017,1625.342407
59,0,8784.0,107.720985,176.513763,0.0,0.000000,6.962100,210.501999,875.724976,2042.361089
60,0,8784.0,511.896179,500.745300,0.0,0.000000,442.128510,993.807495,1736.430054,5958.918365


Outliers: removing 1637 data points = 0.27 %
Imputed weeks: removing 153078 data points = 25.04 %


Unnamed: 0,building_id,meter,timestamp,timestampWeek,meter_reading_log1p,meter_reading_log1p_median,meter_reading_log1p_mean,meter_reading_log1p_5%,meter_reading_log1p_95%,meter_reading_log1p_building_id-timestampHour-meter_median,...,sea_level_pressure,wind_direction,wind_speed,site_id,primary_use,square_feet,year_built,floor_count,"meter_(0,)","meter_(1,)"
20,20,0,2016-01-01 00:00:00,53,0.0,4.98624,3.824506,0.0,7.901918,6.005698,...,1019.700012,0.0,0.0,0,Education,110272,1977.0,,True,False
45,46,0,2016-01-01 00:00:00,53,3.993413,4.98624,3.824506,0.0,7.901918,2.271342,...,1019.700012,0.0,0.0,0,Retail,9045,2016.0,,True,False
52,54,0,2016-01-01 00:00:00,53,0.0,4.98624,3.824506,0.0,7.901918,6.13999,...,1019.700012,0.0,0.0,0,Education,7867,2011.0,,True,False
79,20,0,2016-01-01 01:00:00,53,4.532477,4.98624,3.824506,0.0,7.901918,5.998244,...,1020.200012,70.0,1.5,0,Education,110272,1977.0,,True,False
104,46,0,2016-01-01 01:00:00,53,4.000935,4.98624,3.824506,0.0,7.901918,2.303565,...,1020.200012,70.0,1.5,0,Retail,9045,2016.0,,True,False


{'conts': ['meter_reading_log1p_median',
  'meter_reading_log1p_mean',
  'meter_reading_log1p_5%',
  'meter_reading_log1p_95%',
  'meter_reading_log1p_building_id-timestampHour-meter_median',
  'meter_reading_log1p_building_id-timestampHour-meter_mean',
  'meter_reading_log1p_building_id-timestampHour-meter_5%',
  'meter_reading_log1p_building_id-timestampHour-meter_95%',
  'wind_direction',
  'air_temperature',
  'dew_temperature',
  'precip_depth_1_hr',
  'sea_level_pressure',
  'wind_speed',
  'square_feet',
  'year_built',
  'floor_count'],
 'cats': ['building_id',
  'meter',
  'timestampMonth',
  'timestampDay',
  'timestampWeek',
  'timestampDayofweek',
  'timestampDayofyear',
  'timestampIs_month_end',
  'timestampIs_month_start',
  'timestampIs_quarter_start',
  'timestampIs_quarter_end',
  'timestampIs_year_start',
  'timestampIs_year_end',
  'timestampHour',
  'timestampIs_us_holiday',
  'cloud_coverage',
  'site_id',
  'primary_use',
  'meter_(0,)',
  'meter_(1,)'],
 'dep_va

CPU times: user 28.9 s, sys: 1.66 s, total: 30.6 s
Wall time: 30.6 s


Running through part of the test set

In [301]:
%%time
tmp = test.loc[(test['building_id']<=60)]
_df_test, _ = processor(tmp, tfms_configs=tfms_config,
                       df_weather=weather_test,
                       df_building=building)

display(_df_test.head())

Unnamed: 0_level_0,building_id,meter,timestampWeek,timestamp,meter_reading_log1p_median,meter_reading_log1p_mean,meter_reading_log1p_5%,meter_reading_log1p_95%,meter_reading_log1p_building_id-timestampHour-meter_median,meter_reading_log1p_building_id-timestampHour-meter_mean,...,sea_level_pressure,wind_direction,wind_speed,site_id,primary_use,square_feet,year_built,floor_count,"meter_(0,)","meter_(1,)"
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,52,2017-01-01,4.98624,3.824506,0.0,7.901918,5.330063,3.35362,...,1021.400024,100.0,3.6,0,Education,7432,2008.0,,True,False
1,1,0,52,2017-01-01,4.98624,3.824506,0.0,7.901918,4.550158,2.99751,...,1021.400024,100.0,3.6,0,Education,2720,2004.0,,True,False
2,2,0,52,2017-01-01,4.98624,3.824506,0.0,7.901918,1.966133,1.537966,...,1021.400024,100.0,3.6,0,Education,5376,1991.0,,True,False
3,3,0,52,2017-01-01,4.98624,3.824506,0.0,7.901918,5.790767,3.605763,...,1021.400024,100.0,3.6,0,Education,23685,2002.0,,True,False
4,4,0,52,2017-01-01,4.98624,3.824506,0.0,7.901918,7.257961,4.506267,...,1021.400024,100.0,3.6,0,Education,116607,1975.0,,True,False


CPU times: user 3.69 s, sys: 2.34 s, total: 6.03 s
Wall time: 6.09 s


In [312]:
#export
def align_test(train:pd.DataFrame, var_names:dict, test:pd.DataFrame):
    return test.loc[:,[v for v in train.columns if v != var_names['dep_var']]]

In [322]:
%%time
_df_test = align_test(_df_train, _vars, _df_test)

CPU times: user 62.5 ms, sys: 46.9 ms, total: 109 ms
Wall time: 109 ms


In [323]:
#hide
a, b = list(_df_test.columns.values), [v for v in _df_train.columns.values if v != _vars['dep_var']]
assert a == b, f'Columns are mismatching! \n\tIn train but not test: {set(b)-set(a)}\n\tIn test but not train: {set(a)-set(b)}'

Done testing. Let's apply the transforms to the entire data set

In [279]:
%%time
processor = Processor(t_train=t_train)
tfms_config = {
    'fix_bid_363':{},
    'add_dep_var_stats':{},
    'add_time_features':{},
    'add_weather_features':{},
    'add_building_features':{},
    'remove_outliers':{'f':10,'dep_var':'meter_reading'},
    'remove_imputed_weeks':{'dep_var':'meter_reading'},
    'add_onehot_encoded':{},
}
df, var_names = processor(train, tfms_configs=tfms_config,
                       df_weather=weather_train,
                       df_building=building)
display(df.head(), var_names)

Fixing building_id 363: removing 4055 data points = 0.02 %


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max,threshold
building_id,meter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,8784.0,146.454971,121.897171,0.000000,0.000000,189.751999,246.404007,448.000000,756.272079
1,0,8784.0,74.865395,61.765388,0.000000,0.000000,95.012398,133.781998,255.000000,482.708397
2,0,8784.0,14.551384,16.063791,0.000000,0.000000,9.419300,24.981701,67.983002,165.043308
3,0,8784.0,235.549957,205.985855,0.000000,0.000000,315.683990,395.629257,937.000000,1115.136658
4,0,8784.0,976.556824,779.694092,0.000000,0.000000,1450.439941,1620.400024,3592.000000,3150.040771
...,...,...,...,...,...,...,...,...,...,...
1444,0,7445.0,7.482127,4.386412,0.000000,4.825000,6.675000,9.325000,35.825001,33.174996
1445,0,7449.0,4.732815,1.317669,2.400000,3.425000,4.825000,5.575000,10.850000,12.325000
1446,0,7472.0,3.592162,4.531993,0.000000,0.000000,0.000000,9.475000,15.300000,94.750004
1447,0,7471.0,187.117996,24.197422,132.675003,168.412498,186.649994,202.925003,259.024994,349.400085


Outliers: removing 356582 data points = 1.76 %


KeyboardInterrupt: 

In [None]:
var_names

In [None]:
df.head().T

In [None]:
df.info()

In [None]:
%%time
df_test, _ = processor(tmp, tfms_configs=tfms_config,
                       df_weather=weather_test,
                       df_building=building)
df_test = align_test(df, var_names, df_test)
assert len(df_test) == len(test)

In [None]:
#hide
assert len(df_test.columns) + 1 == len(df.columns)

In [None]:
#export
def test_var_names(var_names:dict):
    assert isinstance(var_names, dict)
    assert 'conts' in var_names and 'cats' in var_names and 'dep_var' in var_names
    assert isinstance(var_names['conts'], list) 
    assert isinstance(var_names['cats'], list) 
    assert isinstance(var_names['dep_var'], str)

In [None]:
test_var_names(var_names)

In [None]:
#export
def store_var_names(data_path:Path, var_names:dict):
    fname = data_path/'var_names.pckl'
    print(f'Storing var names at: {fname}')
    with open(fname, 'wb') as f:
        pickle.dump(var_names, f)

In [None]:
%%time
store_var_names(data_path, var_names)

In [None]:
#export
def load_var_names(fname:Path):
    print(f'Reading var names at: {fname}')
    with open(fname, 'rb') as f:
        var_names = pickle.load(f)
    return var_names

In [None]:
%%time
# var_names = load_var_names(data_path/'var_names.pckl')

In [None]:
#hide
test_var_names(var_names)

In [None]:
#export
def store_df(path:Path, df:pd.DataFrame): df.to_parquet(path)

In [None]:
%%time
store_df(data_path/'X.parquet', df)

In [None]:
%%time
store_df(data_path/'X_test.parquet', df_test)

In [None]:
#export
def load_df(path:Path): return pd.read_parquet(path)

In [None]:
%%time
# df = load_df(data_path/'X.parquet')

## Testing if certain features improve the score beyond the baseline

Training to get a basic idea if the added features do have any benefit

In [None]:
%%time
df = load_df(data_path/'X.parquet')

In [None]:
%%time
df_test = load_df(data_path/'X_test.parquet')

In [None]:
%%time
var_names = load_var_names(data_path/'var_names.pckl')

In [None]:
#export
def get_tabular_object(df:pd.DataFrame, var_names:dict,
                       splits=None, procs:list=[Categorify, FillMissing, Normalize]):
    return TabularPandas(df.copy(), procs, 
                         var_names['cats'], var_names['conts'], 
                         y_names=var_names['dep_var'],
                         splits=splits)

SPLIT_PARAMS = dict(
    train_frac = .8,
    split_kind = 'time_split_day',
)


def train_predict(df:pd.DataFrame, var_names:dict, 
                  model, params:dict=None, n_rep:int=3,
                  n_samples_train:int=10000, 
                  n_samples_valid:int=10000,
                  procs:list=[Categorify, FillMissing, Normalize],
                  split_params:dict=None):
    
    split_params = SPLIT_PARAMS if split_params is None else split_params
    y_col = var_names['dep_var']
    score_vals = []
    params = {} if params is None else params
    
    to = get_tabular_object(df, var_names, procs=procs)
    
    for i in tqdm.tqdm(range(n_rep), total=n_rep, desc='Repetition'):
        
        m = model(**params)
        splits = split_dataset(df, **split_params)
        
        mask = to.xs.index.isin(splits[0])
        
        _X = to.xs.loc[~mask, :].iloc[:n_samples_train]
        _y = to.ys.loc[~mask, y_col].iloc[:n_samples_train]
        m.fit(_X.values, _y.values)
        
        _X = to.xs.loc[mask, :].iloc[:n_samples_valid]
        _y = to.ys.loc[mask, y_col].iloc[:n_samples_valid]
        pred = m.predict(_X.values)
        s = torch.sqrt(F.mse_loss(tensor(pred), tensor(_y.values))).item()
        score_vals.append({'iter': i, 'rmse loss': s})
    
    return pd.DataFrame(score_vals)

In [None]:
split_params = dict(
    #split_kind = 'random',
    #split_kind = 'time',
    #split_kind = 'fix_time',
    split_kind = 'time_split_day',
    t_train = None,
    train_frac = .8,
)

In [None]:
# params = {'n_estimators': 20, 'max_features': 'sqrt',
#           'n_jobs': -1}
# model = ensemble.RandomForestRegressor
params = {}
model = linear_model.LinearRegression
n_rep = 21
n_samples_train = 10000
n_samples_valid = 1000

In [None]:
var_names

The following is not always necessary. Sensible in the case of a linear model to remove categorical values which are not onehot encoded

In [None]:
to_remove = {'cats':['building_id', 'meter'], 'conts': []}

for k in ['cats', 'conts']:
    var_names[k] = [_v for _v in var_names[k] if _v not in to_remove[k]]
var_names

In [None]:
%%time
procs = [] #[Categorify, FillMissing, Normalize]
df_rep = train_predict(df.copy(), var_names, model, params=params, 
                       n_rep=n_rep, n_samples_train=n_samples_train,
                       n_samples_valid=n_samples_valid, procs=procs,
                       split_params=split_params)

In [None]:
df_rep['rmse loss'].describe()

In [None]:
px.box(df_rep, y='rmse loss', range_y=(0, 2.5))

Baseline model = RandomForest with 20 estimators and sqrt features, training over 100k samples and predicting over 1k

<table>
    <tr>
        <th>input</th>
        <th>model</th>
        <th>rmse loss</th>
        <th>time [s/it]</th>
    </tr>
    <tr>
        <td>meter and building id only</td>
        <td>random forest</td>
        <td>1.2 - 1.21</td>
        <td>10.2</td>
    </tr>
    <tr>
        <td>using dep_var stats</td>
        <td>random forest</td>
        <td>1.16 - 1.18</td>
        <td>17.3</td>
    </tr>
    <tr>
        <td>using time stats</td>
        <td>random forest</td>
        <td>1.2 - 1.21</td>
        <td>13.2 - 13.7</td>
    </tr>
    <tr>
        <td>using building info</td>
        <td>random forest</td>
        <td>1.19</td>
        <td>17 - 18</td>
    </tr>
    <tr>
        <td>using weather (+ building) info</td>
        <td>random forest</td>
        <td>1.13 - 1.139</td>
        <td>14.6 - 15</td>
    </tr>
    <tr>
        <td>using all above</td>
        <td>random forest</td>
        <td>1.19 - 1.21</td>
        <td>20 - 26</td>
    </tr>
    <tr>
        <td>removing leading 0s in `dep_var`</td>
        <td>random forest</td>
        <td>.36 - .37</td>
        <td>4</td>
    </tr>
    <tr>
        <td>removing trailing 0s in `dep_var`</td>
        <td>random forest</td>
        <td>1.2</td>
        <td>4</td>
    </tr>
    <tr>
        <td>removing empty weeks before the first full week</td>
        <td>random forest</td>
        <td>1.16</td>
        <td>4</td>
    </tr>
    <tr>
        <td>meter only</td>
        <td>linear model</td>
        <td>2.2</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter + hour</td>
        <td>linear model</td>
        <td>2.1</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter reading stats only (meter, building_id, hour)</td>
        <td>linear model</td>
        <td>1.23 - 1.24 / 1.68 - 1.7</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter + meter reading stats (meter, building_id, hour)</td>
        <td>linear model</td>
        <td>1.51 - 1.52</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter reading stats (meter, building_id, hour)</td>
        <td>random forest</td>
        <td>0.58 - 0.6</td>
        <td>5</td>
    </tr>
    <tr>
        <td>meter + meter reading stats (meter, building_id, hour)</td>
        <td>random forest</td>
        <td>1.21 - 1.22</td>
        <td>5</td>
    </tr>
</table>

## Comparing `dep_var` distributions

In [None]:
%%time
splits = split_dataset(df, **split_params)

In [None]:
%%time
to = get_tabular_object(df, var_names, splits=splits)

In [None]:
n_samples_train = 10000
n_samples_valid = 10000

In [None]:
%%time
# params = {'n_estimators': 20, 'max_features': 'sqrt'}
# model = ensemble.RandomForestRegressor
m = model(**params)

In [None]:
%%time
_X = to.train.xs.sample(n_samples_train).values
_y = to.train.ys.sample(n_samples_train).values.ravel()
m.fit(_X, _y)

In [None]:
%%time
_X = to.valid.xs.sample(n_samples_valid).values
_y = to.valid.ys.sample(n_samples_valid).values.ravel()
pred = m.predict(_X)

In [None]:
#hide
assert np.isfinite(_y).all() and np.isfinite(pred).all()
assert _y.shape == pred.shape

In [None]:
#export
def hist_plot_preds(y0:np.ndarray, y1:np.ndarray, 
                    label0:str='y0', label1:str='y1'):
    res = pd.concat(
        (
            pd.DataFrame({
                'y': y0, 
                'set': [label0] * len(y0)
            }),
            pd.DataFrame({
                'y':y1, 
                'set': [label1] * len(y1)
            })
        ),
        ignore_index=True
    )

    return px.histogram(res, x='y', color='set', marginal='box',
                        barmode='overlay', histnorm='probability density')

In [None]:
hist_plot_preds(_y, pred, label0='truth (valid)', label1='prediction (valid)')

## Inspecting confidently wrong predictions

In [None]:
#export
class BoldlyWrongTimeseries:
    def __init__(self, xs, y_true, y_pred, info:pd.DataFrame=None):
        if info is None:
            self.df = xs.loc[:,['meter', 'building_id', 'timestamp']].copy()
        else:
            assert all([v in info.columns.values for v in ['meter', 'building_id', 'timestamp']])
            self.df = xs.join(info)
            
        for col in ['meter', 'building_id']:
            self.df[col] = self.df[col].astype('category')
            self.df[col].cat.set_categories(sorted(self.df[col].unique()), 
                                            ordered=True, inplace=True)
        
        self.df['y_true'] = y_true
        self.df['y_pred'] = y_pred
        self.compute_misses()

    def compute_misses(self):
        fun = lambda x: np.sqrt(np.mean(x**2))
        self.miss = (self.df.assign(difference=lambda x: x['y_pred']-x['y_true'])
                     .groupby(['building_id', 'meter'])
                     .agg(loss=pd.NamedAgg(column='difference', aggfunc=fun))
                     .dropna()
                     .sort_values('loss'))

In [None]:
%%time
_X = to.valid.xs 
_y = to.valid.ys.values.ravel() 
pred = m.predict(to.valid.xs.values)

In [None]:
assert _y.shape == pred.shape

In [None]:
%%time
bwt = BoldlyWrongTimeseries(to.valid.xs.join(df.loc[:,['building_id', 'meter','timestamp']]), 
                            _y, pred)

In [None]:
#hide
assert len(bwt.miss) == 2380

Adding plotting capability based on the loss or meter/building id

In [None]:
#export
@patch
def plot_boldly_wrong(self:BoldlyWrongTimeseries, 
                      nth_last:int=None,
                      meter:int=None, bid:int=None):
    
    assert (meter is not None and bid is not None) or (nth_last is not None)
    
    if nth_last is not None:
        ix = self.miss.iloc[[nth_last],:]
        meter = ix.index[0][1]
        bid = ix.index[0][0]
        loss = ix["loss"].values[0]
    else:
        ix = self.miss.xs((bid,meter))
        loss = ix.values[0]
        
    
    df_plot = self.df.loc[(self.df['meter']==int(meter)) & (self.df['building_id']==int(bid))]
    df_plot = pd.concat((
        df_plot[['timestamp', 'y_true']].rename(columns={'y_true':'y'}).assign(label='true'),
        df_plot[['timestamp', 'y_pred']].rename(columns={'y_pred':'y'}).assign(label='pred')),
        ignore_index=True
    )
    return df_plot.plot(kind='scatter', x='timestamp', 
                        y='y', color='label', opacity=.4,
                        title=f'pos {nth_last}: meter = {meter}, building_id = {bid}<br>loss = {loss:.3f}')
    


In [None]:
bwt.plot_boldly_wrong(nth_last=-1)

In [None]:
bwt.plot_boldly_wrong(meter=2, bid=1099)

Adding widgets for interactive exploration

In [None]:
#export
@patch
def init_widgets(self:BoldlyWrongTimeseries):
    self.int_txt_loss = widgets.IntText(min=-len(self.miss), max=len(self.miss),
                                        description='Position', value=-1)
    self.int_txt_meter = widgets.IntText(min=self.df['meter'].min(), max=self.df['meter'].max(),
                                         description='Meter')
    self.int_txt_bid = widgets.IntText(min=self.df['building_id'].min(), max=self.df['building_id'].max(),
                                       description='building id')
    self.run_btn = widgets.Button(description='plot')
    self.switch_btn = widgets.Checkbox(description='Loss-based', value=True)
    self.run_btn.on_click(self.click_boldly_wrong)
    self.out_wdg = widgets.Output()
    
@patch
def run_boldly(self:BoldlyWrongTimeseries):
    if not hasattr(self, 'switch_btn'):
        self.init_widgets()
    return widgets.VBox([self.switch_btn, self.int_txt_loss, 
                         self.int_txt_meter, self.int_txt_bid, 
                         self.run_btn, self.out_wdg])

@patch
def click_boldly_wrong(self:BoldlyWrongTimeseries, change):
    self.out_wdg.clear_output()
    nth_last = None if self.switch_btn.value == False else self.int_txt_loss.value
    meter = None if self.switch_btn.value == True else self.int_txt_meter.value
    bid = None if self.switch_btn.value == True else self.int_txt_bid.value
    with self.out_wdg:
        print(f'nth_last {nth_last} meter {meter} bid {bid}')
        try:
            self.plot_boldly_wrong(nth_last=nth_last, meter=meter, bid=bid).show()
        except:
            raise ValueError(f'nth_last {nth_last} meter {meter} bid {bid} not a valid combination! Likely due to missing meter/bid combination')

In [None]:
bwt.run_boldly()