In [None]:
# default_exp inspection

# Inspection of the data

> API details.

Qs:
- how many data points for each building / meter?
- all train/test timestamps covered with weather data?
- weather trends?
- trends by meter?
- what does a simple model need?
- building type trends?
- **anomalies?**

In [None]:
# Load IPython's autoreload extension in mode 2 so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

In [None]:
# Route pandas' built-in `.plot()` calls through plotly.
pd.options.plotting.backend = "plotly"

In [None]:
# Folder holding the input CSV files, given relative to the notebook's working directory.
base_path = Path("../data")

In [None]:
# Collect every CSV file in the data folder as a full path, in sorted order.
csvs = sorted(
    base_path / file_name
    for file_name in os.listdir(base_path)
    if file_name.endswith('.csv')
)
csvs

In [None]:
# Pick the individual files by their position in the sorted listing.
# NOTE(review): these positions depend on exactly which CSVs sit in the data
# folder; adding or removing a file shifts them - confirm against the list shown above.
meta_csv, test_csv, train_csv = csvs[0], csvs[2], csvs[3]
test_weather_csv, train_weather_csv = csvs[-2], csvs[-1]

train_csv, train_weather_csv, test_csv, test_weather_csv, meta_csv

## Loading

In [None]:
%%time
# Load the training readings; the timestamp column is parsed into datetimes on load.
train = pd.read_csv(train_csv, parse_dates=['timestamp'])
train.head()

In [None]:
%%time
# Load the test rows; the timestamp column is parsed into datetimes on load.
test = pd.read_csv(test_csv, parse_dates=['timestamp'])
test.head()

In [None]:
# Row counts of train and test, plus the test/train size ratio.
len(train), len(test), len(test)/len(train)

Finding:
- test samples ~2x train samples
- train samples ~20mio (the `primary_use` breakdown below already counts 8.1mio rows for Education alone)

In [None]:
%%time
weather_train = pd.read_csv(train_weather_csv)
weather_train.head()

In [None]:
%%time
weather_test = pd.read_csv(test_weather_csv)
weather_test.head()

In [None]:
%%time
# Load the building metadata table.
building = pd.read_csv(meta_csv)
building.head()

In [None]:
# Row counts of the meter readings versus the training weather table.
len(train), len(weather_train)

## How many data points each building, meter, site, building type

In [None]:
# Number of distinct buildings present in the training data.
train['building_id'].nunique()

In [None]:
%%time
px.box(train.groupby('building_id').size())

In [None]:
%%time
# Number of training rows per meter type.
train.groupby('meter').size()

In [None]:
%%time
px.box(weather.groupby('site_id').size())

In [None]:
%%time
train.join(building.loc[:,['building_id', 'primary_use']], on='building_id', rsuffix='_building').groupby('primary_use').size().sort_values()

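Finding: 
- the number of data points varies significantly between buildings
- the number of weather data points barely varies between sites
- the number of data points differs by ~10x between the meter with the fewest and the meter with the most data points
- `primary_use`: ranges from Religious worship (32k data points) to Education (8.1mio data points)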

## Anomalies: meter reading, weather, building properties, gaps in the time series

### meter readings (output)

#### counting number of meter readings

In [None]:
# Number of meter readings (rows) recorded per meter type.
# The original expression ended in `.()`, which is a syntax error; `.size()` completes it.
train.groupby('meter').size()

In [None]:
# Peek at the first distinct timestamps to see the sampling interval.
train['timestamp'].unique()[:10]

In [None]:
# Ideal, gap-free hourly grid spanning the training period.
# '60min' is the same step as the older '60T' alias, which recent pandas versions deprecate.
ideal_ts = pd.date_range(train['timestamp'].min(), train['timestamp'].max(), freq='60min')
ideal_ts[:10]

In [None]:
# Size of the ideal hourly grid versus the number of distinct timestamps seen in train.
len(ideal_ts), train['timestamp'].nunique()

Finding: 
- the number of timestamps in the training set matches the expected number of timestamps of 1hour intervals

In [None]:
# Rows per timestamp for both data sets.
train_counts = train.groupby('timestamp').size()
test_counts = test.groupby('timestamp').size()

# One line per data set, added to the figure in order: train first, then test.
fig = go.Figure(layout=go.Layout(title='Data point count v time: train v test'))
for label, counts in (('train', train_counts), ('test', test_counts)):
    fig.add_trace(go.Scatter(x=counts.index, y=counts.values, name=label))

fig.show()

Finding: 
- the number of meter readings per timestamp is not constant over time in the training set

#### meter value trends

In [None]:
# Preview the training rows again before looking at the reading values.
train.head()

In [None]:
# Count missing values in the `meter` column.
train['meter'].isna().sum()

No NaNs in the `meter` column

In [None]:
%%time
train_counts = (train.dropna(subset=['meter']).groupby(['timestamp', 'meter'])
                .agg(**{
                    'mean': pd.NamedAgg('meter_reading', np.mean),
                    'median': pd.NamedAgg('meter_reading', np.median),
                    '5%': pd.NamedAgg('meter_reading', lambda x: np.percentile(x, 5)),
                    '95%': pd.NamedAgg('meter_reading', lambda x: np.percentile(x, 95)),
                }).unstack(level=-1))
train_counts.head()

In [None]:
%%time
for meter in [0,1,2,3]:
    tmp = train_counts.loc[:,pd.IndexSlice[:,meter]]
    tmp.columns = tmp.columns.droplevel(level=1)
    tmp = tmp.reset_index()
    
    fig = go.Figure(data=[
        go.Scatter(x=tmp['timestamp'], y=tmp['5%'], mode='lines', name='5%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['95%'], mode='lines', fill='tonexty', name='5% - 95%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['mean'], mode='lines', name='mean'),
        go.Scatter(x=tmp['timestamp'], y=tmp['median'], mode='lines', name='median'),
    ], layout=go.Layout(title=f'meter: {meter}'))
    fig.show()
    

meter map: `{0: electricity, 1: chilledwater, 2: steam, 3: hotwater}`

Finding:
- the 4 meter types have quite different time behaviors
- meters 0 & 1 show seasonal behavior based on the weekday (much stronger for meter 0 than for meter 1)
- meter 2 has significant anomalous time periods (the median exceeds the 95% values)
- meters 2 & 3 seem to have a seasonal effect based on the time of year

In [None]:
%%time
for meter in [0,1,2,3]:
    tmp = train.loc[train['meter']==meter, :].sort_values('meter_reading', ascending=False)
    print('meter', meter)
    display(tmp.head())
    

### building properties

In [None]:
# Preview the building metadata.
building.head()

In [None]:
# Flag, per building, whether its id occurs in the train and in the test readings.
for flag, readings in (('in_train', train), ('in_test', test)):
    building[flag] = building['building_id'].isin(readings['building_id'])
building.head()

In [None]:
# Count buildings for each combination of the in_train / in_test flags.
building.groupby(['in_train', 'in_test']).size()

In [None]:
# Box plot of building floor area to spot implausible values.
px.box(building['square_feet'])

In [None]:
# Box plot of construction year to spot implausible values.
px.box(building['year_built'])

In [None]:
# Box plot of the number of floors to spot implausible values.
px.box(building['floor_count'])

Finding:
- `floor_count`, `year_built` and `square_feet` seem reasonable overall
- all buildings appear in both train and test

### weather properties

In [None]:
# Preview the training weather table.
weather_train.head()

In [None]:
# Weather measurement columns: everything after the first two columns.
# NOTE(review): assumes the first two columns are keys rather than measurements - confirm against the preview.
cols = weather_train.columns.to_numpy()[2:]
# Fraction of missing values per column (mean of the boolean NaN mask).
weather_train.loc[:, cols].isna().mean()

Finding:
- `cloud_coverage` 50% NaNs
- `precip_depth_1_hr` 35.9% NaNs 

In [None]:
%%time
for col in cols:
    tmp = (weather_train.dropna(subset=[col]).groupby(['timestamp'])
              .agg(**{
                  'mean': pd.NamedAgg(col, np.mean,),
                  'median': pd.NamedAgg(col, np.median),
                  '5%': pd.NamedAgg(col, lambda x: np.percentile(x, 5)),
                  '95%': pd.NamedAgg(col, lambda x: np.percentile(x, 95)),
              }).reset_index())
    display(tmp.head())
    
    fig = go.Figure(data=[
        go.Scatter(x=tmp['timestamp'], y=tmp['5%'], mode='lines', name='5%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['95%'], mode='lines', fill='tonexty', name='5% - 95%'),
        go.Scatter(x=tmp['timestamp'], y=tmp['mean'], mode='lines', name='mean'),
        go.Scatter(x=tmp['timestamp'], y=tmp['median'], mode='lines', name='median'),
    ], layout=go.Layout(title=f'column: {col}'))
    fig.show()


Finding:
- temperatures $\Rightarrow$ buildings predominantly in the northern hemisphere