In [None]:
# default_exp inspection

# Inspection of the data

> Basic inspection of the dependent variable, `dep_var`, in train and test set and preparation of the preprocessing step in `02_preprocessing.ipynb`.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
from pathlib import Path
from fastcore.utils import *
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing

from fastai.tabular.all import * 

import matplotlib.pyplot as plt

import ipywidgets as widgets

In [None]:
pd.options.plotting.backend = "plotly"

In [None]:
# Location of the raw competition `.csv` files, relative to this notebook.
data_path = Path("../data")
# Name of the dependent variable (target column); the inspection cells below
# index `train[dep_var]`, so it has to exist before they run on a fresh kernel.
dep_var = 'meter_reading'

## Collecting the `.csv` files

In [None]:
#export
# File-name prefixes of the csv files this project expects to find in the data directory.
CSV_NAMES = ['building', 'sample_submission', 'test', 'train', 'weather_test', 'weather_train']

def get_csvs(data_path:Path, csv_names:typing.List[str]=None) -> typing.Dict[str, Path]:
    """Map each requested name to the first `.csv` file in `data_path` whose file name starts with it.

    Parameters
    ----------
    data_path : directory that is searched (non-recursively) for `.csv` files.
    csv_names : file-name prefixes to look up; defaults to `CSV_NAMES`.

    Returns
    -------
    Dict `{name: path}` in the order of `csv_names`. When several files share
    a prefix, the alphabetically first one is used.

    Raises
    ------
    FileNotFoundError : if `data_path` does not exist or no `.csv` file matches
        one of the requested names.
    """
    csv_names = CSV_NAMES if csv_names is None else csv_names
    data_path = Path(data_path)
    # Plain `iterdir` keeps this independent of fastcore's patched `Path.ls`.
    csvs = sorted(p for p in data_path.iterdir() if p.name.endswith('.csv'))

    found = {}
    for name in csv_names:
        matches = [p for p in csvs if p.name.startswith(name)]
        if not matches:
            # Name the missing file instead of failing with a bare IndexError.
            raise FileNotFoundError(f"No .csv file starting with '{name}' found in '{data_path}'")
        found[name] = matches[0]
    return found

In [None]:
csvs = get_csvs(data_path)
csvs

In [None]:
#hide
assert len(csvs) == len(CSV_NAMES)

## Inspecting basic $X$ and $y$ in train and test set

Things to inspect:
- NaNs
- $y$ distribution
- compressibility

### Get  compressed $X$ and $y$

In [None]:
#export
def get_core_Xy(path:Path, nrows:int=None) -> pd.DataFrame:
    """Load a core train/test csv and return it after `df_shrink`.

    The `timestamp` column is parsed as dates while reading. `nrows` limits how
    many rows are read (`None` reads the whole file). The frame is then passed
    through `df_shrink(..., int2uint=True)` (fastai helper; presumably
    downcasts column dtypes to reduce memory -- confirm against fastai docs).
    """
    raw = pd.read_csv(path, nrows=nrows, parse_dates=['timestamp'])
    shrunk = df_shrink(raw, int2uint=True)
    return shrunk

Get the core of the train dataset

In [None]:
%%time
train = get_core_Xy(csvs['train'])
# `DataFrame.info()` prints its report and returns None, so it is called on its
# own instead of being handed to `display` (which would render a stray `None`).
train.info()
display(train.head())

In [None]:
%%time
test = get_core_Xy(csvs['test'])
# `DataFrame.info()` prints its report and returns None, so it is called on its
# own instead of being handed to `display` (which would render a stray `None`).
test.info()
display(test.head())

In [None]:
#hide
assert set(train['meter'].unique()) == set(test['meter'].unique())
assert set(train['building_id'].unique()) == set(test['building_id'].unique())

### NaNs

In [None]:
#export
def show_nans(df:pd.DataFrame):
    """Summarise the missing values of every column of `df`.

    Returns a DataFrame with one row per column of `df` and the columns
    `'nans count'` (number of missing values), `'col'` (column name) and
    `'nans %'` (missing values as a percentage of `len(df)`), sorted by
    `'nans count'` in descending order. A frame without columns yields an
    empty summary instead of raising.
    """
    # One vectorised pass over the frame instead of two scans per column.
    counts = df.isna().sum()
    report = pd.DataFrame({
        'nans count': counts.to_numpy(),
        'col': list(counts.index),
        'nans %': counts.to_numpy() / len(df) * 100,
    })
    return report.sort_values('nans count', ascending=False)

In [None]:
%%time
train_nans = show_nans(train)
train_nans

In [None]:
#hide
assert np.allclose(train_nans['nans count'].values, 0)

In [None]:
%%time
test_nans = show_nans(test)
test_nans

In [None]:
#hide
assert np.allclose(test_nans['nans count'].values, 0)

### Distribution of $y$

In [None]:
train[dep_var].describe(percentiles=[.05,.10,.25,.50,.75,.95])

Finding: ~10% of all `dep_var` values are in the vicinity of 0 and there is an outlier at 2.19e7

In [None]:
(train[dep_var] > 1e6).sum() / len(train) * 100

In [None]:
train['logp1'] = np.log10(train[dep_var] + 1)

In [None]:
%%time
px.histogram(train.sample(10000), x='logp1')

In [None]:
%%time
px.histogram(train.sample(10000), x='logp1', facet_row='meter')

Finding:
- abnormally many 0s

In [None]:
train.groupby('meter')[dep_var].describe(percentiles=[.05,.10,.25,.50,.75,.95]).T

Finding:
- 10% of meter 1 & 2 are near 0
- 25% of meter 3 are near 0

seasonality of 0s

In [None]:
mask = np.isclose(train[dep_var], 0)

In [None]:
train.loc[mask, 'timestamp'].sample(10000).dt.hour.plot(kind='hist')

In [None]:
# Histogram of zero readings per calendar month, one facet per meter.
# The helper column is named after what it actually holds (`dt.month`), so the
# x-axis is no longer mislabelled as "hour".
px.histogram(train.loc[mask, :]
             .sample(10000)
             .assign(month=lambda x: x['timestamp'].dt.month),
             x='month',
             facet_row='meter')

Finding:
- hour, day of week, day of month: no clear trend in the frequency of zeros, neither globally nor per meter
- month: more zeros during summer for meters 2 & 3, fewer zeros for meter 0 from month 6 onwards, fewer zeros for meter 1 between months 6 and 9

Site, building and meter frequency

In [None]:
train['meter'].value_counts(normalize=True)

In [None]:
test['meter'].value_counts(normalize=True)

In [None]:
train['building_id'].value_counts(normalize=True)

In [None]:
test['building_id'].value_counts(normalize=True)

## Inspecting the building info

In [None]:
#export
def get_building_X(path:Path):
    """Load the building metadata csv and return it after `df_shrink(..., int2uint=True)`."""
    # TODO: year_built and floor_count actually are discrete values but contain nans
    # test if 'Int' dtype would work or if it breaks the things downstream
    raw_building = pd.read_csv(path)
    shrunk_building = df_shrink(raw_building, int2uint=True)
    return shrunk_building

In [None]:
%%time
building = get_building_X(csvs['building'])

In [None]:
#hide
assert building['building_id'].nunique() == len(building)
assert set(train['building_id'].unique()) == set(building['building_id'].unique())

In [None]:
building.info()

In [None]:
building_nans = show_nans(building)
building_nans

In [None]:
#hide
assert np.allclose(building_nans['nans count'].values, [1094, 774, 0, 0, 0, 0])

## Inspecting weather info

In [None]:
#export
def get_weather_X(path:Path):
    """Load a weather csv with its `timestamp` column parsed as dates.

    The frame is returned exactly as read by pandas.
    """
    # TODO: cloud_coverage, wind_direction could be Int
    return pd.read_csv(path, parse_dates=['timestamp'])

In [None]:
%%time
weather_train = get_weather_X(csvs['weather_train'])

In [None]:
#hide
assert set(weather_train['site_id'].unique()) == set(building['site_id'].unique())

In [None]:
weather_train.info()

In [None]:
weather_train_nans = show_nans(weather_train)
weather_train_nans

In [None]:
#hide
assert np.allclose(weather_train_nans.iloc[-2:]['nans count'], 0)

In [None]:
weather_train.describe()

In [None]:
%%time
weather_test = get_weather_X(csvs['weather_test'])

In [None]:
#hide
assert set(weather_test['site_id'].unique()) == set(building['site_id'].unique())

In [None]:
weather_test.info()

In [None]:
weather_test_nans = show_nans(weather_test)
weather_test_nans

In [None]:
#hide
assert np.allclose(weather_test_nans.iloc[-2:]['nans count'], 0)

In [None]:
weather_test.describe()

In [None]:
%%time
len(train.join(building.set_index('building_id'), on='building_id',
           how='left').join(weather_train.set_index(['site_id', 'timestamp']), on=['site_id', 'timestamp'], how='left'))

In [None]:
%%time
len(pd.merge(pd.merge(train, building, on='building_id', how='left'),
    weather_train, on=['site_id', 'timestamp'], how='left'))

## Loading

In [None]:
%%time
# Reload the raw train csv; `train_csv` was never defined, the path comes from `csvs`.
train = pd.read_csv(csvs['train'], parse_dates=['timestamp'])
train.head()

In [None]:
%%time
# Reload the raw test csv; `test_csv` was never defined, the path comes from `csvs`.
test = pd.read_csv(csvs['test'], parse_dates=['timestamp'])
test.head()

In [None]:
len(train), len(test), len(test)/len(train)

Finding:
- the test set has ~2x as many samples as the train set
- the train set has ~20mio samples

In [None]:
%%time
# Reload the raw train weather csv; `train_weather_csv` was never defined, the
# path comes from `csvs`. Timestamps are parsed so the time-based grouping and
# plots further down operate on datetimes rather than strings.
weather_train = pd.read_csv(csvs['weather_train'], parse_dates=['timestamp'])
weather_train.head()

In [None]:
%%time
# Reload the raw test weather csv; `test_weather_csv` was never defined, the
# path comes from `csvs`. Timestamps are parsed as in `get_weather_X`.
weather_test = pd.read_csv(csvs['weather_test'], parse_dates=['timestamp'])
weather_test.head()

In [None]:
%%time
# Reload the raw building metadata; `meta_csv` was never defined, the path comes from `csvs`.
building = pd.read_csv(csvs['building'])
building.head()

In [None]:
len(train), len(weather_train)

## How many data points each building, meter, site, building type

In [None]:
train['building_id'].nunique()

In [None]:
%%time
px.box(train.groupby('building_id').size())

In [None]:
%%time
train.groupby('meter').size()

In [None]:
%%time
# Number of weather rows per site (`weather` was never defined; the train weather frame is `weather_train`).
px.box(weather_train.groupby('site_id').size())

In [None]:
%%time
# Count train rows per `primary_use`. The lookup is indexed by `building_id` so
# the join matches on the id values; joining against the un-indexed frame would
# match `building_id` to row positions instead.
(train.join(building.set_index('building_id')['primary_use'], on='building_id')
 .groupby('primary_use')
 .size()
 .sort_values())

Finding: 
- the number of data points varies significantly between buildings
- the number of data points barely varies between sites
- the number of data points varies by ~10x between the meter with the fewest and the meter with the most data points
- `primary_use`: Religious worship 32k, Education 8.1mio

## Anomalies: meter reading, weather, building properties, gaps in the time series

### meter readings (output)

#### counting number of meter readings

In [None]:
# Number of meter readings per meter type.
train.groupby('meter').size()

In [None]:
train['timestamp'].unique()[:10]

In [None]:
# Gap-free hourly grid spanning the training period. '60min' is the
# non-deprecated spelling of the '60T' frequency alias.
ideal_ts = pd.date_range(train['timestamp'].min(), train['timestamp'].max(), freq='60min')
ideal_ts[:10]

In [None]:
len(ideal_ts), train['timestamp'].nunique()

Finding: 
- the number of timestamps in the training set matches the expected number of timestamps of 1hour intervals

In [None]:
train_counts = train.groupby('timestamp').size()
test_counts = test.groupby('timestamp').size()

fig = go.Figure(data=[
    go.Scatter(x=train_counts.index, y=train_counts.values, name='train'),
    go.Scatter(x=test_counts.index, y=test_counts.values, name='test'),
], layout=go.Layout(title='Data point count v time: train v test'))

fig.show()

Finding: 
- meter counts inconsistent vs time for training

#### meter value trends

In [None]:
train.head()

In [None]:
train['meter'].isna().sum()

No NaNs in the output

In [None]:
%%time
train_counts = (train.dropna(subset=['meter']).groupby(['timestamp', 'meter'])
                .agg(**{
                    'mean': pd.NamedAgg('meter_reading', np.mean),
                    'median': pd.NamedAgg('meter_reading', np.median),
                    '5%': pd.NamedAgg('meter_reading', lambda x: np.percentile(x, 5)),
                    '95%': pd.NamedAgg('meter_reading', lambda x: np.percentile(x, 95)),
                }).unstack(level=-1))
train_counts.head()

In [None]:
%%time
for meter in [0,1,2,3]:
    tmp = train_counts.loc[:,pd.IndexSlice[:,meter]]
    tmp.columns = tmp.columns.droplevel(level=1)
    tmp = tmp.reset_index()
    
    fig = go.Figure(data=[
        go.Scatter(x=tmp['timestamp'], y=tmp['5%'], mode='lines', name='5%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['95%'], mode='lines', fill='tonexty', name='5% - 95%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['mean'], mode='lines', name='mean'),
        go.Scatter(x=tmp['timestamp'], y=tmp['median'], mode='lines', name='median'),
    ], layout=go.Layout(title=f'meter: {meter}'))
    fig.show()
    

meter map: `{0: electricity, 1: chilledwater, 2: steam, 3: hotwater}`

Finding:
- the 4 meter types have quite different time behaviors
- meters 0 & 1 show seasonal behavior based on the weekday (much stronger for meter 0 than for meter 1)
- meter 2 has significant anomalous time periods (the median exceeds the 95% value)
- meter 2 & 3 have seasonal effect based on the time of year it seems 

In [None]:
%%time
for meter in [0,1,2,3]:
    tmp = train.loc[train['meter']==meter, :].sort_values('meter_reading', ascending=False)
    print('meter', meter)
    display(tmp.head())
    

### building properties

In [None]:
building.head()

In [None]:
building['in_train'] = building['building_id'].isin(train['building_id'])
building['in_test'] = building['building_id'].isin(test['building_id'])
building.head()

In [None]:
building.groupby(['in_train', 'in_test']).size()

In [None]:
px.box(building['square_feet'])

In [None]:
px.box(building['year_built'])

In [None]:
px.box(building['floor_count'])

Finding:
- `floor_count`, `year_built` and `square_feet` seem reasonable overall
- all buildings in train and test

### weather properties

In [None]:
weather_train.head()

In [None]:
cols = weather_train.columns.values[2:]
weather_train.loc[:,cols].isna().sum() / len(weather_train)

Finding:
- `cloud_coverage` 50% NaNs
- `precip_depth_1_hr` 35.9% NaNs 

In [None]:
%%time
for col in cols:
    tmp = (weather_train.dropna(subset=[col]).groupby(['timestamp'])
              .agg(**{
                  'mean': pd.NamedAgg(col, np.mean,),
                  'median': pd.NamedAgg(col, np.median),
                  '5%': pd.NamedAgg(col, lambda x: np.percentile(x, 5)),
                  '95%': pd.NamedAgg(col, lambda x: np.percentile(x, 95)),
              }).reset_index())
    display(tmp.head())
    
    fig = go.Figure(data=[
        go.Scatter(x=tmp['timestamp'], y=tmp['5%'], mode='lines', name='5%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['95%'], mode='lines', fill='tonexty', name='5% - 95%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['mean'], mode='lines', name='mean'),
        go.Scatter(x=tmp['timestamp'], y=tmp['median'], mode='lines', name='median'),
    ], layout=go.Layout(title=f'column: {col}'))
    fig.show()


Finding:
- temperatures $\Rightarrow$ buildings predominantly in the northern hemisphere

## Looking at timeseries

In [None]:
#export
class InspectTimeseries:
    """Interactive inspection of single (building, meter) time series.

    Stores the train frame as `df`, the optional `building` and `weather`
    frames, the target column name `dep_var` ('meter_reading') and `combos`,
    the list of unique `(building_id, meter)` pairs present in `train`.
    """
    def __init__(self, train:pd.DataFrame, building:pd.DataFrame=None,
                 weather:pd.DataFrame=None):
        self.dep_var = 'meter_reading'
        self.df = train
        self.building = building
        self.weather = weather
        # one tuple per distinct (building_id, meter) row, in order of first appearance
        unique_pairs = train.loc[:, ['building_id', 'meter']].drop_duplicates()
        self.combos = [tuple(pair) for pair in unique_pairs.values]
        

In [None]:
%%time
it = InspectTimeseries(train, building=building,
                       weather=weather_train)

In [None]:
#export
@patch
def init_widgets(self:InspectTimeseries):
    """Create the input, button and output widgets used by `inspect_boldly`.

    Sets `int_txt_meter`, `int_txt_bid`, `run_btn`, `selection_mode` and
    `out_wdg` on the instance and wires the button to `click_boldly`.
    """
    meter_min, meter_max = int(self.df['meter'].min()), int(self.df['meter'].max())
    bid_min, bid_max = int(self.df['building_id'].min()), int(self.df['building_id'].max())

    # `IntText` has no `min`/`max` traits, so the bounds were silently ignored;
    # `BoundedIntText` actually enforces them.
    self.int_txt_meter = widgets.BoundedIntText(value=meter_min,
                                                min=meter_min,
                                                max=meter_max,
                                                description='Meter')
    self.int_txt_bid = widgets.BoundedIntText(value=bid_min,
                                              min=bid_min,
                                              max=bid_max,
                                              description='building id')

    self.run_btn = widgets.Button(description='plot')
    self.run_btn.on_click(self.click_boldly)
    # each option maps to a `select_boldly_<option>` method
    self.selection_mode = widgets.Dropdown(description='selection',
                                           value='all',
                                           options=['all', 'random', 'filled_weeks', 'outlying'])
    self.out_wdg = widgets.Output()
 

@patch
def inspect_boldly(self:InspectTimeseries):
    """Return the widget box (inputs, selection mode, plot button, output area).

    The widgets are created on the first call only and reused afterwards.
    """
    # `run_btn` is set by `init_widgets`; the previously checked `switch_btn`
    # never exists, which rebuilt (and reset) all widgets on every call.
    if not hasattr(self, 'run_btn'):
        self.init_widgets()
    return widgets.VBox([self.int_txt_meter,
                         self.int_txt_bid,
                         self.selection_mode,
                         self.run_btn, self.out_wdg])

@patch
def click_boldly(self:InspectTimeseries, change):
    """Button callback: clear the output area and plot the selected (building, meter) series.

    `change` is the argument passed by the button click and is not used. If
    the selected pair is not in `self.combos`, a message is printed instead.
    """
    output = self.out_wdg
    output.clear_output()
    meter, bid = self.int_txt_meter.value, self.int_txt_bid.value

    with output:
        print(f'Selected: meter {meter} bid {bid}')
        if (bid, meter) in self.combos:
            self.plot_boldly(meter=meter, bid=bid).show()
            return
        print('Combination not in the training set 🥴')

In [None]:
#export
@patch
def select_boldly_all(self:InspectTimeseries, df_plot:pd.DataFrame):
    """Return a copy of `df_plot` in which every row carries the label 'all'."""
    labelled = df_plot.assign(label='all')
    return labelled

@patch
def select_boldly_random(self:InspectTimeseries, df_plot:pd.DataFrame):
    """Randomly split the rows of `df_plot` into the labels 'one' and 'two'.

    Returns the `timestamp` and target columns plus `label`, with the 'one'
    rows first and a fresh integer index.
    """
    in_second = np.random.choice([True,False], size=len(df_plot))
    keep = ['timestamp', self.dep_var]
    first_part = df_plot.loc[~in_second, keep].assign(label='one')
    second_part = df_plot.loc[in_second, keep].assign(label='two')
    return pd.concat((first_part, second_part), ignore_index=True)
        

@patch
def select_boldly_filled_weeks(self:InspectTimeseries, df_plot:pd.DataFrame):
    """Label each reading by whether its week looks constant ("filled").

    Rows are grouped into `W-MON` weeks; a week counts as constant when the
    5% and 95% quantiles of the target within that week are (numerically)
    equal. Returns the `timestamp` and target columns plus `label`
    ('constant' / 'not constant'), constant rows first, with a fresh integer
    index. `df_plot` itself is not modified.
    """
    # Broadcast the weekly quantiles back onto the rows with the *same* grouper
    # that defines the weeks. The former `pd.cut` on Monday-00:00 edges
    # disagreed with the grouper's bins, which run through the end of Monday,
    # so Monday readings were compared against the following week's quantiles.
    weekly = df_plot.groupby(pd.Grouper(key='timestamp', freq='W-MON'))[self.dep_var]
    low = weekly.transform(lambda s: s.quantile(.05))
    high = weekly.transform(lambda s: s.quantile(.95))

    mask_drop = np.isclose(low, high)
    cols = ['timestamp', self.dep_var]
    return pd.concat((
        df_plot.loc[mask_drop, cols].assign(label='constant'),
        df_plot.loc[~mask_drop, cols].assign(label='not constant'),
    ),ignore_index=True)

@patch
def select_boldly_outlying(self:InspectTimeseries, df_plot:pd.DataFrame):
    """Split the rows of `df_plot` into 'normal' and outlying readings.

    A reading is an outlier when it exceeds
    `median + 10 * (75% quantile - median)` of the target column. Returns the
    `timestamp` and target columns plus `label`, normal rows first, with a
    fresh integer index; the outlier label also carries the outlier count.
    """
    stats = df_plot[self.dep_var].describe()
    median, upper_quartile = stats['50%'], stats['75%']
    threshold = median + (upper_quartile - median) * 10

    is_outlier = df_plot[self.dep_var] > threshold
    keep = ['timestamp', self.dep_var]
    normal_part = df_plot.loc[~is_outlier, keep].assign(label='normal')
    outlier_part = df_plot.loc[is_outlier, keep].assign(label=f'outlier {is_outlier.sum()}')
    return pd.concat((normal_part, outlier_part), ignore_index=True)
    
@patch
def plot_boldly(self:InspectTimeseries,
                meter:int=None, bid:int=None):
    """Scatter-plot the target over time for one (building, meter) pair.

    The rows matching `meter` and `bid` are passed through the
    `select_boldly_<mode>` method chosen in the selection dropdown, which adds
    the `label` column used for colouring. If the widgets have not been
    created yet, the mode defaults to 'all'. Returns the plotly figure.
    """
    assert (meter is not None and bid is not None), 'both `meter` and `bid` are required'

    mask = (self.df['meter']==int(meter)) & (self.df['building_id']==int(bid))

    # Work on a copy so no selector can write into (a view of) `self.df`.
    df_plot = self.df.loc[mask, ['timestamp', self.dep_var]].copy()

    # `selection_mode` only exists after `init_widgets`; fall back to 'all' so
    # the method is also usable without the widget UI.
    mode = self.selection_mode.value if hasattr(self, 'selection_mode') else 'all'
    df_plot = getattr(self, f'select_boldly_{mode}')(df_plot)

    fig = px.scatter(df_plot, x='timestamp',
                     y=self.dep_var, color='label',
                     title=f'meter = {meter}, building_id = {bid}')
    # small markers keep the hourly points of a whole series readable
    fig.update_traces(marker=dict(size=1.5))
    return fig


In [None]:
it.inspect_boldly()