In [None]:
# default_exp modelling

# Modelling meter readings

> Baseline models for predicting building meter readings: scikit-learn regressors and a fastai tabular learner.

In [None]:
#export
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import typing

from sklearn import linear_model, tree, model_selection, ensemble

from fastai.tabular.all import *

In [None]:
pd.options.plotting.backend = "plotly"

In [None]:
base_path = Path("../data")

In [None]:
csvs = sorted([base_path/v for v in os.listdir(base_path) if v.endswith('.csv')])
csvs

In [None]:
# NOTE(review): these positional indices only select the intended files if the
# sorted listing of `base_path` contains exactly the expected CSVs in the expected
# order — confirm against the `csvs` output of the previous cell. Selecting by
# file name would be more robust.
train_csv = csvs[3]
train_weather_csv = csvs[-1]
test_csv = csvs[2]
test_weather_csv = csvs[-2]
meta_csv = csvs[0]

train_csv, train_weather_csv, test_csv, test_weather_csv, meta_csv

In [None]:
#export
def numpy_evaluate(y_true:np.ndarray, y_pred:np.ndarray):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Both inputs may be any array-like. They are converted to float arrays
    before subtracting so that unsigned-integer inputs cannot wrap around.
    When the inputs hold the same number of elements but differ in shape
    (e.g. `(n, 1)` vs `(n,)`), both are flattened so the error is computed
    element-wise instead of over an accidental `(n, n)` broadcast.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()
    return np.sqrt(np.mean((y_pred - y_true)**2))

def evaluate_torch(y_true:torch.Tensor, y_pred:torch.Tensor):
    """Return the root-mean-squared error between two tensors as a scalar tensor.

    When the tensors hold the same number of elements but differ in shape
    (e.g. `(n, 1)` targets vs `(n,)` predictions), both are flattened first.
    Without this, the subtraction would broadcast to `(n, n)` and the result
    would silently be the error over all pairs rather than matching elements.
    """
    if y_true.shape != y_pred.shape and y_true.numel() == y_pred.numel():
        y_true, y_pred = y_true.reshape(-1), y_pred.reshape(-1)
    return torch.sqrt(torch.mean((y_pred - y_true)**2))

## Loading

In [None]:
%%time
train = pd.read_csv(train_csv, parse_dates=['timestamp'])
train.head()

In [None]:
%%time
test = pd.read_csv(test_csv, parse_dates=['timestamp'])
test.head()

In [None]:
%%time
weather_train = pd.read_csv(train_weather_csv, parse_dates=['timestamp'])
weather_train.head()

In [None]:
%%time
weather_test = pd.read_csv(test_weather_csv, parse_dates=['timestamp'])
weather_test.head()

In [None]:
%%time
building = pd.read_csv(meta_csv)
building.head()

## sklearn pipeline

### Preparing the model input and output

In [None]:
metering_input_cols = ['meter'] # 'building_id', 'timestamp']
output_col = 'meter_reading'

In [None]:
#export
def get_Xy(metering:pd.DataFrame,
           metering_input_cols:typing.List[str],
           output_col:str='meter_reading',
           is_train:bool=True):
    """Split a metering frame into model inputs `X` and log-transformed target `y`.

    Parameters
    ----------
    metering : frame holding the input columns and, when training, `output_col`.
    metering_input_cols : names of the columns to use as model inputs.
    output_col : name of the target column; only read when `is_train` is true.
    is_train : whether a target should be returned.

    Returns
    -------
    `(X, y)` where `X` is `metering` restricted to `metering_input_cols` and
    `y` is a 1-D array of `log(1 + target)`, or `None` when `is_train` is false.
    """
    X = metering.loc[:, metering_input_cols]
    if not is_train:
        return X, None

    # log1p keeps precision for readings close to zero, where log(1 + x) rounds to 0
    y = np.log1p(metering[output_col].values.ravel())
    return X, y

In [None]:
%%time
X, y = get_Xy(train, metering_input_cols=metering_input_cols,
              is_train=True)

In [None]:
# Fixed seed so the random 80-20 split, and the scores derived from it, are reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
X_train.values.shape, y_train.shape

In [None]:
X_train[:5], y_train[:5], X_test[:5], y_test[:5]

### Training

In [None]:
m = linear_model.LinearRegression()

In [None]:
m.fit(X_train, y_train)

In [None]:
y_pred = m.predict(X_train)

### Evaluation

In [None]:
%%time
numpy_evaluate(y_train, y_pred)

In [None]:
%%time
numpy_evaluate(y_test, m.predict(X_test))

Finding:
- Linear model + `meter` as the only input + random 80-20 split $\Rightarrow$ RMSE of 2.14 on the `log(1 + meter_reading)` target

## fastai overkill

### Radically merging all the data

In [None]:
#export
def radical_merging(df:pd.DataFrame, building:pd.DataFrame,
                    weather:pd.DataFrame, n_sample:typing.Optional[int]=None,
                    training:bool=True, random_state=None):
    """Join meter readings with building metadata and weather, then add date features.

    Parameters
    ----------
    df : meter readings with `building_id`, `meter`, `timestamp` and, when
        `training` is true, `meter_reading`. It is not modified.
    building : building metadata, left-joined on `building_id`.
    weather : weather observations, left-joined on `site_id` and `timestamp`.
    n_sample : if given, number of merged rows to sample at random.
    training : whether `meter_reading` is present; if so it is replaced by
        `log(1 + meter_reading)`.
    random_state : forwarded to `DataFrame.sample`; `None` keeps the previous
        unseeded behaviour.

    Returns
    -------
    `(X, continuous, categorical)`: the merged frame plus the names of its
    continuous (cast to float) and categorical (cast to `category`) columns.

    Raises
    ------
    AssertionError if any expected continuous/categorical column is missing
    after merging and date expansion.
    """
    bid_col = 'building_id'
    sid_col = 'site_id'
    time_col = 'timestamp'
    target_col = 'meter_reading'

    categorical = ['meter', 'primary_use', 'cloud_coverage'] # bid_col, sid_col
    continuous = ['square_feet', 'year_built', 'floor_count',
                  'air_temperature', 'dew_temperature',
                  'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
                  'wind_speed']

    x_cols = [bid_col, 'meter', target_col, time_col] if training \
            else [bid_col, 'meter', time_col]

    # `pd.merge` builds a new frame, so `df` is never mutated and no defensive
    # deep copy of the (potentially very large) input is required.
    X = pd.merge(df.loc[:, x_cols], building, on=bid_col, how='left')
    X = pd.merge(X, weather, on=[sid_col, time_col], how='left')

    if n_sample is not None:
        X = X.sample(n_sample, random_state=random_state)

    if training:
        X[target_col] = np.log1p(X[target_col])

    # `add_datepart` is expected to add the `timestamp*` columns listed below;
    # the assertions further down verify that it did.
    X = add_datepart(X, time_col)

    categorical.extend(['timestampMonth', 'timestampWeek', 'timestampDay',
                        'timestampDayofweek', 'timestampDayofyear', 'timestampIs_month_end',
                        'timestampIs_month_start', 'timestampIs_quarter_end',
                        'timestampIs_quarter_start', 'timestampIs_year_end',
                        'timestampIs_year_start'])

    continuous.extend(['timestampYear', 'timestampElapsed'])

    # Drop the raw timestamp column if it is still present
    X = X.drop(columns=[time_col], errors='ignore')

    missing_cont = [col for col in continuous if col not in X.columns]
    missing_cat = [col for col in categorical if col not in X.columns]
    assert len(missing_cat) == 0, f'{missing_cat} not in X!'
    assert len(missing_cont) == 0, f'{missing_cont} not in X!'

    # Use `astype` with a mapping instead of `X.loc[:, cols] = ...`: the `.loc`
    # form writes values into the existing columns on recent pandas and keeps
    # their old dtype, so the float/category conversion would be lost.
    dtypes = {col: float for col in continuous}
    dtypes.update({col: 'category' for col in categorical})
    X = X.astype(dtypes)

    return X, continuous, categorical

In [None]:
%%time
n_sample = 10000
# No defensive `.copy()`: `radical_merging` does not modify the frame it is given,
# so copying the full training set first only costs memory.
X, continuous, categorical = radical_merging(train, building, weather_train,
                    n_sample=n_sample)

In [None]:
test.head()

In [None]:
%%time
# No defensive `.copy()`: `radical_merging` does not modify the frame it is given,
# and duplicating the full test set only adds to the peak memory of this call.
X_test, _, _ = radical_merging(test, building, weather_test,
                    n_sample=None, training=False)

**TODO:** fix the `MemoryError` raised by `pd.merge` (line 24 of the `radical_merging` cell) in the cell above:

`MemoryError: Unable to allocate 2.17 GiB for an array with shape (7, 41697600) and data type float64`


In [None]:
X.head().T

In [None]:
X.info()

In [None]:
continuous, categorical

In [None]:
#export
def split_dataset(X:pd.DataFrame, split_kind:str='random',
                  train_frac:float=.8, random_state=None):
    """Return positional train/validation indices for `X`.

    Parameters
    ----------
    X : frame to split; the `'time'` split reads its `timestampElapsed` column.
    split_kind : `'random'` draws `int(len(X) * train_frac)` rows at random;
        `'time'` puts every row strictly earlier than the timestamp found at
        that position of the sorted `timestampElapsed` values into training.
    train_frac : fraction of rows used for training, within `[0, 1]`.
    random_state : optional seed for the random split; `None` uses numpy's
        global random state.

    Returns
    -------
    `(train_idx, valid_idx)`: two lists of row positions (usable with `iloc`).

    Raises
    ------
    ValueError if `train_frac` is outside `[0, 1]`.
    AssertionError if `split_kind` is unknown.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f'train_frac must be within [0, 1], got {train_frac}')

    n_rows = len(X)
    n_train = int(n_rows*train_frac)

    def random_split():
        # Draw row positions rather than index labels so that a non-unique
        # index cannot select more than `n_train` rows.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        train_bool = np.zeros(n_rows, dtype=bool)
        train_bool[rng.permutation(n_rows)[:n_train]] = True
        return train_bool

    def time_split():
        time_col = 'timestampElapsed'
        if n_train >= n_rows:
            # No row is left to act as threshold: everything is training data
            return np.ones(n_rows, dtype=bool)
        threshold_t = np.sort(X[time_col].values)[n_train]
        return (X[time_col] < threshold_t).values

    split_funs = {
        'random': random_split,
        'time': time_split,
    }

    assert split_kind in split_funs, f'split_kind must be one of {list(split_funs)}, got {split_kind!r}'
    train_bool = split_funs[split_kind]()

    train_idx = np.where(train_bool)[0]
    valid_idx = np.where(~train_bool)[0]

    return (list(train_idx), list(valid_idx))

In [None]:
%%time
split_kind = 'random'
#split_kind = 'time'
splits = split_dataset(X, split_kind=split_kind, train_frac=.8)
#splits=None

In [None]:
sorted(X.iloc[splits[0]].loc[:, 'timestampMonth'].unique())

In [None]:
X.info()

Super simplistic input data

In [None]:
%%time
procs = [Categorify, FillMissing, Normalize]
to = TabularPandas(X, procs, ['meter'],
                   [], y_names='meter_reading', splits=splits)

All input data

In [None]:
%%time
procs = [Categorify, FillMissing, Normalize]
to = TabularPandas(X.copy(), procs, [], #categorical,
                   continuous, 
                   y_names='meter_reading', splits=splits)

In [None]:
to.train.xs

In [None]:
to.train.ys

### Modelling with sklearn

In [None]:
m = linear_model.LinearRegression()

In [None]:
# `criterion` is left at its default (mean squared error): the explicit 'mse'
# spelling was renamed to 'squared_error' and is rejected by newer scikit-learn.
m = ensemble.RandomForestRegressor(n_estimators=100, max_features=.75)

In [None]:
%%time
m.fit(to.train.xs, to.train.ys.values.ravel())

In [None]:
# Flatten the targets as well as the predictions: `to.valid.ys.values` is
# presumably 2-D `(n, 1)` (the fit cell ravels the training targets for that reason),
# and subtracting 1-D predictions from it would broadcast to `(n, n)`.
evaluate_torch(torch.from_numpy(to.valid.ys.values.ravel()),
               torch.from_numpy(m.predict(to.valid.xs.values).ravel()))

### Modelling with fastai

In [None]:
dls = to.dataloaders()

In [None]:
y_range = [np.min([to.train.ys.values.min(), to.valid.ys.values.min()]),
           np.max([to.train.ys.values.max(), to.valid.ys.values.max()]),]
y_range

In [None]:
# Rebinds `y_range` using the training targets only; the value computed in the
# preceding cell (training and validation targets) is discarded when both cells run.
y_range = [to.train.ys.values.min(),
           to.train.ys.values.max()]
y_range

In [None]:
learn = tabular_learner(dls, y_range=y_range, layers=[500,250],
                        n_out=1, loss_func=evaluate_torch)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(5, 1e-2)

In [None]:
preds, targs = learn.get_preds()

In [None]:
preds, targs

In [None]:
evaluate_torch(targs, preds)

In [None]:
test = X.iloc[:50].copy()
test = test.drop('meter_reading', axis=1)
test

In [None]:
# NOTE(review): rebinding `test` discards the 50-row sample built in the previous
# cell and shadows the raw test frame read from `test_csv`; re-running the
# `radical_merging(test.copy(), ...)` cell afterwards would receive this frame instead.
test = X_test.head(100).copy()
test.head()

In [None]:
test_dl = learn.dls.test_dl(test)

In [None]:
test_dl.xs

In [None]:
learn.get_preds(dl=test_dl)

**randomly splitting**
    
Finding (modified target values, all info = info except time):
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100k: 2.3
    - all info incl time @100k: 2.32
    - all info incl time + ids @100k: 2.32
- RandomForest:
    - meter only @100k: 2.2
    - all info minus time @100k: 2.7
    - all info incl time @100k: 2.74
    - all info incl time + ids @100k: 2.82
- tabular_learner:
    - meter only @100k: 2.1
    - all info minus time @100k: 1.56
    - all info incl time @100k: 1.52
    - all info incl time + ids @100k: 0.96
    
**splitting along time**

Finding:
- Linear:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.2
    - all info incl time @100k: 2.3
    - all info incl time + ids @100k: 2.29
- RandomForest:
    - meter only @100k: 2.1
    - all info minus time @100K: 2.7
    - all info incl time @100k: 2.52
    - all info incl time + ids @100k: 2.62
- tabular_learner:
    - meter only @100k: 2.06
    - all info minus time @100K: 1.62
    - all info incl time @100k: 1.62
    - all info incl time + ids @100k: 1.31