In [27]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import re
import nbimporter
import sys
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, confusion_matrix 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from operator import itemgetter

mods = ['exploratory_analysis', 'prep_and_split_data']
[sys.modules.pop(mod) for mod in mods if mod in sys.modules]

from exploratory_analysis import save_obj, load_obj, returnDataOnDate
from prep_and_split_data import prepareForModel, returnXandY

In [3]:
train_and_cv = pd.read_csv('data/training_and_cv_data.csv')

In [4]:
train_and_cv = prepareForModel(train_and_cv)

In [6]:
train_XY, train_X, train_Y = returnXandY(train_and_cv, '2021-06-02', '2021-06-26')
cv_XY, cv_X, cv_Y = returnXandY(train_and_cv, '2021-06-27', '2021-06-30')

In [7]:
print(f'train shape: {train_X.shape}')
print(f'cv shape: {cv_X.shape}')

train shape: (4709, 16)
cv shape: (524, 16)


## Neural Network

In [46]:
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
cv_X_scaled = scaler.fit_transform(cv_X)

Here is a StackExchange answer that provides a starting point for deciding on the number of hidden units to include: https://stats.stackexchange.com/a/136542

Below is a computation to obtain the absolute maximum number of hidden units:

In [10]:
Ns = train_X_scaled.shape[0]  # training examples
No = 1                           # output neurons
Ni = train_X_scaled.shape[1]  # input neurons
alpha = 2                        # scale factor

Nh = Ns / (alpha*(Ni + No))      # maximum hidden neurons

print(f'An upper bound for number of hidden units: {int(Nh)}')

An upper bound for number of hidden units: 138


This seems like *quite* a lot; we certainly don't need this many hidden units! 

However, we can remove some of the guesswork by using Keras Tuner. With this tool, we can search the parameter space and also determine an optimal number of hidden units!

We have 16 inputs, so let's opt for two hidden layers, one Dense and one Dropout.

In [11]:
from tensorflow.python.ops import math_ops, numpy_ops
numpy_ops.np_config.enable_numpy_behavior()

def asymmetric_loss(wgt):
    '''This is our custom objective loss function that favors either underestimates (wgt > 1)
    or overestimates (0 < wgt < 1).'''
    def custom_loss(y_true, y_pred):
        diff = wgt/2*math_ops.squared_difference(y_pred, y_true)*(y_true < y_pred).astype(float) + \
                1/2*math_ops.squared_difference(y_pred, y_true)*(y_true >= y_pred).astype(float)
        
        loss = tf.reduce_mean(diff, axis=-1)

        return loss
    return custom_loss

In [84]:
import keras_tuner as kt

#UNDERESTIMATE_BIAS = 2.
tf.random.set_seed(40)

def model_builder(numFeatures):
    def builder(tuner):
        numUnits = tuner.Int('units', min_value=4, max_value=32, step=4)
        learningRate = tuner.Choice('learningRate', values=[1e-2, 1e-3, 1e-4])
        UNDERESTIMATE_BIAS = tuner.Choice('UNDERESTIMATE_BIAS', values=[1, 2, 5, 10, 50])
        
        model = Sequential(
            [               
                Input(shape=(numFeatures,)),#train_X_scaled.shape[1]
                Dense(units=numUnits, activation='relu', name='dense_1'),
                Dropout(0.2),
                Dense(units=1, activation='linear', name='dense_2')
            ], name = 'nn_model' 
        )

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learningRate),
            loss=asymmetric_loss(UNDERESTIMATE_BIAS),
            metrics=[tf.keras.metrics.MeanSquaredError()]
        )

        return model
    return builder

In [85]:
tuner = kt.Hyperband(
    model_builder(train_X_scaled.shape[1]),
    objective='mean_squared_error',
    max_epochs=30,
    overwrite=True,
    directory='tuner logs',
    project_name='asymmetric_MSE'
)

stopEarly = tf.keras.callbacks.EarlyStopping(patience=3)

tuner.search(
    train_X_scaled, train_Y, 
    epochs=30,
    validation_data=(cv_X_scaled, cv_Y),
    callbacks=[stopEarly]
)

best_hparams = tuner.get_best_hyperparameters()[0]

Trial 90 Complete [00h 00m 08s]
mean_squared_error: 664.5034790039062

Best mean_squared_error So Far: 553.7682495117188
Total elapsed time: 00h 03m 30s
INFO:tensorflow:Oracle triggered exit


In [103]:
print(f'These are the best hyperparameter values: \n {best_hparams.values}')
nn_model = tuner.hypermodel.build(best_hparams)

These are the best hyperparameter values: 
 {'units': 32, 'learningRate': 0.01, 'UNDERESTIMATE_BIAS': 1, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 1, 'tuner/round': 0}


In [102]:
train_Y_preds = nn_model.predict(train_X_scaled)
cv_Y_preds = nn_model.predict(cv_X_scaled)

print(f'\ncv MSE: {mean_squared_error(cv_Y, cv_Y_preds)}\n')

dfcv = pd.DataFrame(data={'cvPreds': np.reshape(cv_Y_preds, (cv_Y_preds.size,)), 'cvVals': cv_Y})


cv MSE: 784.2179197510907



In [105]:
n = 5
nlargest = dfcv.nlargest(n, ['cvPreds'])
nsmallest = dfcv.nsmallest(n, ['cvPreds'])
print(f'The {n} largest predictions and their values: \n{nlargest}\n')
print(f'The {n} smallest predictions and their values: \n{nsmallest}')

The 5 largest predictions and their values: 
        cvPreds      cvVals
5153  10.747974   10.305895
5156  10.728843   10.305895
5157  10.154403   10.305895
5245   1.661836  225.657188
5246   1.661831  225.657188

The 5 smallest predictions and their values: 
       cvPreds     cvVals
5320 -2.920911   6.650825
5101 -2.514010  63.048783
5283 -2.264927  -2.651003
5177 -2.098195  15.254239
5435 -2.057703  28.778254
