# LSTM Model

Use prepared data (Data_scaled) to train LSTM model.

## Imports and Setup

In [143]:
import pandas as pd
import numpy as np
import hvplot.pandas

from sklearn.metrics import mean_absolute_error, mean_squared_error # Packages for measuring model performance / errors
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential # Deep learning library, used for neural networks
from keras.layers import LSTM, Dense, Dropout # Deep learning classes for recurrent and regular densely-connected layers
from keras.callbacks import EarlyStopping # EarlyStopping during model training
from keras.models import load_model

import joblib

from pathlib import Path

import matplotlib.pyplot as plt

In [2]:
np.set_printoptions(edgeitems=20, linewidth=100000, formatter=dict(float=lambda x: "%.3f" %x))

#root_dir = Path('c:/projects/nrel-presentation/')
root_dir = Path('c:/Users/david/OneDrive/projects/portfolio/traffic/')

train_test_split = 0.8
n_obs = 6      # look back 6*10=60 minutes
n_forecast = 2 # predict 2*10=20 minutes

## Functions

In [3]:
def prep_ts_data(Data, back, fore, train_split_fraction=0.8, verbose=False):
    """
    Prepare data for LSTM
    Data: dataframe with the data we want to predict in column 0
    back: how many timesteps backward to look
    fore: how many timesteps to predict
    train_split_fraction: what fraction (bet 0.01 and 0.99) to take for training data
    
    Returns:
        x_train: training X
        y_train: training y
        x_test: testing X
        y_test: testing y
    """
    nrows = Data.shape[0]
    dimensions = Data.shape[1]
    batch_size = nrows - back - fore + 1

    # instantiate big matrix M
    M = np.empty((batch_size, back+fore, dimensions))
    print('starting')
    for i in range(0, batch_size):
        if verbose:
            print('batch: {} of {}'.format(i+1, batch_size))
        M[i, :, :] = Data.iloc[i: (i+back+fore)]
        
    # slice M into X and Y
    X = M[:, 0:back, :]
    Y = M[:, back:(back+fore), 0]
    
    # slice X and Y into training and testing
    num_train = int(np.ceil(batch_size * train_split_fraction))
    x_train = X[:num_train, :, :]
    y_train = Y[:num_train, :]
    x_test = X[num_train:, :, :]
    y_test = Y[num_train:, :]
    print('x_train : {}'.format(x_train.shape))
    print('y_train : {}'.format(y_train.shape))
    print('x_test : {}'.format(x_test.shape))
    print('y_test : {}'.format(y_test.shape))
    print('done')

    return(x_train, y_train, x_test, y_test)

## Load Data

In [3]:
Data = pd.read_parquet(root_dir / 'data' / 'processed' / 'Data_unscaled_with_features.pq')

In [4]:
Data.head()

Unnamed: 0_level_0,lane_vehicle_speed,lane_vehicle_count,occupancy,ds_lane_vehicle_speed,ds_lane_vehicle_count,ds_occupancy,us_lane_vehicle_speed,us_lane_vehicle_count,us_occupancy,hod,sin_hod,cos_hod,is_dow_1,is_dow_2,is_dow_3,is_dow_4,is_dow_5,is_dow_6,sin_doy,cos_doy
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-10-26 20:10:00,85.125,2.888889,0.333333,83.375,2.555556,0.333333,81.444444,3.0,0.555556,20,-0.707107,0.707107,1,0,0,0,0,0,-0.899631,0.436651
2021-10-26 20:20:00,81.4,2.5,0.2,80.555556,2.7,0.5,78.444444,2.6,0.5,20,-0.707107,0.707107,1,0,0,0,0,0,-0.899631,0.436651
2021-10-26 20:30:00,79.375,2.9,0.4,82.25,2.8,0.5,79.333333,2.7,0.6,20,-0.707107,0.707107,1,0,0,0,0,0,-0.899631,0.436651
2021-10-26 20:40:00,84.444444,2.9,0.5,84.2,2.5,0.3,83.888889,1.9,0.3,20,-0.707107,0.707107,1,0,0,0,0,0,-0.899631,0.436651
2021-10-26 20:50:00,81.555556,2.666667,0.222222,83.625,2.222222,0.333333,81.25,3.111111,0.777778,20,-0.707107,0.707107,1,0,0,0,0,0,-0.899631,0.436651


In [5]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2886 entries, 2021-10-26 20:10:00 to 2021-11-15 21:00:00
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   lane_vehicle_speed     2886 non-null   float64
 1   lane_vehicle_count     2886 non-null   float64
 2   occupancy              2886 non-null   float64
 3   ds_lane_vehicle_speed  2886 non-null   float64
 4   ds_lane_vehicle_count  2886 non-null   float64
 5   ds_occupancy           2886 non-null   float64
 6   us_lane_vehicle_speed  2886 non-null   float64
 7   us_lane_vehicle_count  2886 non-null   float64
 8   us_occupancy           2886 non-null   float64
 9   hod                    2886 non-null   int64  
 10  sin_hod                2886 non-null   float64
 11  cos_hod                2886 non-null   float64
 12  is_dow_1               2886 non-null   uint8  
 13  is_dow_2               2886 non-null   uint8  
 14  is_dow_3            

In [6]:
# save column names
column_names = Data.columns.values
column_names

array(['lane_vehicle_speed', 'lane_vehicle_count', 'occupancy', 'ds_lane_vehicle_speed', 'ds_lane_vehicle_count', 'ds_occupancy', 'us_lane_vehicle_speed', 'us_lane_vehicle_count', 'us_occupancy', 'hod', 'sin_hod', 'cos_hod', 'is_dow_1', 'is_dow_2', 'is_dow_3', 'is_dow_4', 'is_dow_5', 'is_dow_6', 'sin_doy', 'cos_doy'], dtype=object)

## Train/Test

In [7]:
# take data out of dataframe and into array

# save the timestamps
all_timestamps = Data.index.values

Data_array = Data.reset_index(drop=True).values
Data_array

array([[85.125, 2.889, 0.333, 83.375, 2.556, 0.333, 81.444, 3.000, 0.556, 20.000, -0.707, 0.707, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, -0.900, 0.437],
       [81.400, 2.500, 0.200, 80.556, 2.700, 0.500, 78.444, 2.600, 0.500, 20.000, -0.707, 0.707, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, -0.900, 0.437],
       [79.375, 2.900, 0.400, 82.250, 2.800, 0.500, 79.333, 2.700, 0.600, 20.000, -0.707, 0.707, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, -0.900, 0.437],
       [84.444, 2.900, 0.500, 84.200, 2.500, 0.300, 83.889, 1.900, 0.300, 20.000, -0.707, 0.707, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, -0.900, 0.437],
       [81.556, 2.667, 0.222, 83.625, 2.222, 0.333, 81.250, 3.111, 0.778, 20.000, -0.707, 0.707, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, -0.900, 0.437],
       [79.889, 2.900, 0.400, 78.900, 3.200, 0.600, 80.556, 2.700, 0.600, 21.000, -0.500, 0.866, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, -0.900, 0.437],
       [80.500, 3.600, 0.600, 78.222, 3.100, 0.500, 78.600, 3.600, 0.9

In [8]:
print(Data_array.shape)

(2886, 20)


In [9]:
# split data to train and test
# y data is in column 0

num_train = int(Data_array.shape[0] * train_test_split)
X_train = Data_array[:num_train, :]
X_test  = Data_array[num_train:, :]
y_train = Data_array[:num_train, 0].reshape((-1, 1))
y_test  = Data_array[num_train:, 0].reshape((-1, 1))
print(Data_array.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2886, 20) (2308, 20) (578, 20) (2308, 1) (578, 1)


## Min/Max Scaling

In [10]:
# fit a min/max scaler to the training data X
# fit a separate min/max scaler to y
# y is in the first column
X_scaler = MinMaxScaler(feature_range=(-1, 1))
y_scaler = MinMaxScaler(feature_range=(-1, 1))

# fit the scalers to training data
X_scaler.fit(X_train)
y_scaler.fit(y_train)

# transform (scale) both the training and the test data
X_train_scaled = X_scaler.transform(X_train)
y_train_scaled = y_scaler.transform(y_train)

X_test_scaled = X_scaler.transform(X_test)
y_test_scaled = y_scaler.transform(y_test)

In [11]:
# check min/max of each scaled dataset
print('X_train min', X_train_scaled.min(axis=0))
print('X_train max', X_train_scaled.max(axis=0))
print('X_test min', X_test_scaled.min(axis=0))
print('X_test max', X_test_scaled.max(axis=0))
print('y_train_min', y_train_scaled.min(axis=0))
print('y_train max', y_train_scaled.max(axis=0))
print('y_test min', y_test_scaled.min(axis=0))
print('y_test max', y_test_scaled.max(axis=0))


X_train min [-1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000]
X_train max [1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000]
X_test min [-0.874 -0.986 -1.000 -0.373 -0.981 -1.000 -0.838 -0.995 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 1.000 1.000]
X_test max [0.671 0.954 0.886 0.459 0.986 0.989 0.315 0.896 0.978 1.000 1.000 1.000 -1.000 -1.000 1.000 1.000 1.000 1.000 1.622 1.437]
y_train_min [-1.000]
y_train max [1.000]
y_test min [-0.874]
y_test max [0.671]


In [13]:
# save the scalers to use later
X_scaler_filename = root_dir / 'models/X_scaler.save'
joblib.dump(X_scaler, X_scaler_filename)
y_scaler_filename = root_dir / 'models/y_scaler.save'
joblib.dump(y_scaler, y_scaler_filename)

['c:\\Users\\david\\OneDrive\\projects\\portfolio\\traffic\\models\\y_scaler.save']

In [12]:
# plot train
num_train = int(Data_array.shape[0] * train_test_split)

X_train_df = pd.DataFrame(X_train_scaled, columns=column_names[:], index=all_timestamps[:num_train])
y_train_df = pd.DataFrame(y_train_scaled, columns=[column_names[0]], index=all_timestamps[:num_train])
X_train_plots = X_train_df.hvplot(subplots=True, width=400, height=150, shared_axes=True).cols(3)
y_train_plot = y_train_df.hvplot(subplots=True, width=400, height=150, shared_axes=True, title=y_train_df.columns.values[0], ylabel='')
layout = (y_train_plot + X_train_plots).opts(title='<h2>TRAIN PLOTS</h2>').cols(1)
layout

In [13]:
# plot test
num_train = int(Data_array.shape[0] * train_test_split)

X_test_df = pd.DataFrame(X_test_scaled, columns=column_names[:], index=all_timestamps[num_train:])
y_test_df = pd.DataFrame(y_test_scaled, columns=[column_names[0]], index=all_timestamps[num_train:])
X_test_plots = X_test_df.hvplot(subplots=True, width=400, height=150, shared_axes=True).cols(3)
y_test_plot = y_test_df.hvplot(subplots=True, width=400, height=150, shared_axes=True, title=y_test_df.columns.values[0], ylabel='')
layout = (y_test_plot + X_test_plots).opts(title='<h2>TEST PLOTS</h2>').cols(1)
layout

## Combined Scaled Data

In [14]:
Data_array.shape

(2886, 20)

In [15]:
Data_array_scaled = np.concatenate([X_train_scaled, X_test_scaled], axis=0)

In [16]:
Data_array_scaled.shape

(2886, 20)

## Reshape Data

In [17]:
# reshape data into format for tensorflow
#  shape for X: (n_batches, n_obs, n_dimensions)
#  shape for y: (n_batches, n_forecast)

# from above: n_obs = 6 , n_forecast = 2

In [18]:
# X_train and y_train

#   note, this will re-define X_train/test y_train/test; these variable names were used above for scaling.
#   use Data_array_scaled to reshape the data

train_data_length = X_train_scaled.shape[0]

# instantiate 3D arrays
n_batches = train_data_length - n_obs + 1
n_dimensions = X_train_scaled.shape[1]
X_train = np.empty((n_batches, n_obs, n_dimensions))
y_train = np.empty((n_batches, n_forecast))

# populate array
for i in range(n_obs, train_data_length+1):
    X_train[i-n_obs, :, :] = Data_array_scaled[(i - n_obs):i, :]
    y_train[i-n_obs, :] = Data_array_scaled[i:(i+n_forecast), 0]
    
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))

X_train shape: (2303, 6, 20)
y_train shape: (2303, 2)


In [19]:
# X_test and y_test

#   note, this will re-define X_train/test y_train/test; these variable names were used above for scaling.
#   use Data_array_scaled to reshape the data

test_data_length = Data_array_scaled.shape[0] - train_data_length

# instantiate 3D arrays
n_batches = test_data_length - n_obs -n_forecast + 1
n_dimensions = X_test_scaled.shape[1]
X_test = np.empty((n_batches, n_obs, n_dimensions))
y_test = np.empty((n_batches, n_forecast))

# populate array
st = train_data_length + n_obs
en = train_data_length + test_data_length - n_forecast + 1
for row, i in enumerate(range(st, en)):
    X_test[row, :, :] = Data_array_scaled[(i - n_obs):i, :]
    y_test[row, :] = Data_array_scaled[i:(i+n_forecast), 0]
    
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

X_test shape: (571, 6, 20)
y_test shape: (571, 2)


## Scratch

In [41]:
dimensions = X_train_scaled.shape[1]

In [80]:
# reshape X_train_scaled
n_batches = X_train_scaled.shape[0] - n_obs - n_forecast + 1

X_train_scaled_3D = np.empty((n_batches, n_obs, dimensions))

for batch in range(n_batches):
    X_train_scaled_3D[batch, :, :] = X_train_scaled[batch:(batch+n_obs), :]

print('X_train shape: {}'.format(X_train_scaled_3D.shape))

X_train shape: (2301, 6, 19)


In [81]:
# reshape X_test_scaled
n_batches = X_test_scaled.shape[0] - n_obs - n_forecast + 1

X_test_scaled_3D = np.empty((n_batches, n_obs, dimensions))
for batch in range(n_batches):
    X_test_scaled_3D[batch, :, :] = X_test_scaled[batch:(batch+n_obs), :]

print('X_test shape: {}'.format(X_test_scaled_3D.shape))

X_test shape: (571, 6, 19)


In [84]:
# reshape y_train_scaled
n_batches = y_train_scaled.shape[0] - n_obs - n_forecast + 1

y_train_scaled_2D = np.empty((n_batches, n_forecast))

for batch in range(n_batches):
    y_train_scaled_2D[batch, :] = y_train_scaled[batch:(batch+n_forecast), 0]

print('y_train shape: {}'.format(y_train_scaled_2D.shape))

y_train shape: (2301, 2)


In [85]:
# reshape y_test_scaled
n_batches = y_test_scaled.shape[0] - n_obs - n_forecast + 1

y_test_scaled_2D = np.empty((n_batches, n_forecast))

for batch in range(n_batches):
    y_test_scaled_2D[batch, :] = y_test_scaled[batch:(batch+n_forecast), 0]

print('y_test shape: {}'.format(y_test_scaled_2D.shape))

y_test shape: (571, 2)


In [86]:
# Rename datasets
X_train = X_train_scaled_3D
X_test = X_test_scaled_3D
y_train = y_train_scaled_2D
y_test = y_test_scaled_2D

In [105]:
X_train_scaled.shape

(2308, 19)

## Build Model

In [109]:
# Configure the neural network model
LSTMmodel = Sequential()

# Model with n_neurons = inputshape Timestamps, each with x_train.shape[2] variables
n_neurons = X_train.shape[1] * X_train.shape[2]
dense_neurons = y_train.shape[1]

print(n_neurons, X_train.shape[1], X_train.shape[2])
LSTMmodel.add(LSTM(n_neurons, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
LSTMmodel.add(Dropout(0.4))
LSTMmodel.add(LSTM(n_neurons, return_sequences=True))
LSTMmodel.add(Dropout(0.4))
#LSTMmodel.add(LSTM(n_neurons, return_sequences=True))
#LSTMmodel.add(Dropout(0.4))
LSTMmodel.add(LSTM(n_neurons, return_sequences=False))
LSTMmodel.add(Dropout(0.4))
LSTMmodel.add(Dense(dense_neurons))

# Compile the model
LSTMmodel.compile(optimizer='adam', loss='mse')

120 6 20


In [110]:
LSTMmodel.layers

[<keras.layers.recurrent_v2.LSTM at 0x20cce21fb80>,
 <keras.layers.core.Dropout at 0x20cc4342220>,
 <keras.layers.recurrent_v2.LSTM at 0x20cc4342640>,
 <keras.layers.core.Dropout at 0x20cc430e520>,
 <keras.layers.recurrent_v2.LSTM at 0x20cad3f1bb0>,
 <keras.layers.core.Dropout at 0x20cad3f1460>,
 <keras.layers.core.Dense at 0x20cb96ec640>]

# Train Model

In [111]:
# Training the model
epochs = 100
batch_size = 16
early_stop = EarlyStopping(monitor='loss', patience=5, verbose=1)
history = LSTMmodel.fit(X_train, y_train, 
                    batch_size=batch_size, 
                    epochs=epochs,
                    validation_data=(X_test, y_test),
                    verbose=2
                   )

Epoch 1/100
144/144 - 8s - loss: 0.0263 - val_loss: 0.0202
Epoch 2/100
144/144 - 2s - loss: 0.0202 - val_loss: 0.0200
Epoch 3/100
144/144 - 2s - loss: 0.0189 - val_loss: 0.0197
Epoch 4/100
144/144 - 2s - loss: 0.0184 - val_loss: 0.0215
Epoch 5/100
144/144 - 2s - loss: 0.0180 - val_loss: 0.0255
Epoch 6/100
144/144 - 2s - loss: 0.0176 - val_loss: 0.0191
Epoch 7/100
144/144 - 2s - loss: 0.0173 - val_loss: 0.0218
Epoch 8/100
144/144 - 2s - loss: 0.0170 - val_loss: 0.0212
Epoch 9/100
144/144 - 2s - loss: 0.0167 - val_loss: 0.0229
Epoch 10/100
144/144 - 2s - loss: 0.0170 - val_loss: 0.0213
Epoch 11/100
144/144 - 2s - loss: 0.0168 - val_loss: 0.0194
Epoch 12/100
144/144 - 2s - loss: 0.0163 - val_loss: 0.0260
Epoch 13/100
144/144 - 2s - loss: 0.0163 - val_loss: 0.0201
Epoch 14/100
144/144 - 2s - loss: 0.0161 - val_loss: 0.0284
Epoch 15/100
144/144 - 2s - loss: 0.0161 - val_loss: 0.0305
Epoch 16/100
144/144 - 2s - loss: 0.0159 - val_loss: 0.0222
Epoch 17/100
144/144 - 2s - loss: 0.0157 - val_lo

## Save model

In [62]:
#joblib.dump(LSTMmodel, root_dir / 'models' / 'LSTMmodel')

In [142]:
# save in hdf5 format because joblib complained about a weak reference and wouldn't save or load properly
LSTMmodel.save(root_dir / 'models' / 'LSTMmodel.h5')

## Load model
if necessary

In [64]:
#LSTMmodel = joblib.load(root_dir / 'models' / 'LSTMmodel')

In [66]:
LSTMmodel = load_model(root_dir / 'models' / 'LSTMmodel.h5')

# Predictions

In [113]:
y_test_predicted = LSTMmodel.predict(X_test)

In [114]:
# add training predictions to see how much overfitting we might have
y_train_predicted = LSTMmodel.predict(X_train)

In [115]:
print(X_test.shape, y_test.shape, y_test_predicted.shape, y_train_predicted.shape)

(571, 6, 20) (571, 2) (571, 2) (2303, 2)


## Compile Results

In [116]:
# assemble all results to one dataframe
Results = pd.DataFrame({'y_train_predicted_0': np.nan,
                        'y_train_predicted_1': np.nan,
                        'y_test_predicted_0': np.nan,
                        'y_test_predicted_1': np.nan}, index=Data.index)

In [117]:
Results

Unnamed: 0_level_0,y_train_predicted_0,y_train_predicted_1,y_test_predicted_0,y_test_predicted_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-10-26 20:10:00,,,,
2021-10-26 20:20:00,,,,
2021-10-26 20:30:00,,,,
2021-10-26 20:40:00,,,,
2021-10-26 20:50:00,,,,
...,...,...,...,...
2021-11-15 20:20:00,,,,
2021-11-15 20:30:00,,,,
2021-11-15 20:40:00,,,,
2021-11-15 20:50:00,,,,


In [118]:
tmp = Results.iloc[(num_train + n_obs):-1][['y_test_predicted_0']].copy()
tmp['y_test_predicted_0'] = y_test_predicted[:, 0]
Results.update(tmp)

tmp = Results.iloc[(num_train + n_obs + 1):][['y_test_predicted_1']].copy()
tmp['y_test_predicted_1'] = y_test_predicted[:, 1]
Results.update(tmp)

tmp = Results.iloc[n_obs:(num_train + 1)][['y_train_predicted_0']].copy()
tmp['y_train_predicted_0'] = y_train_predicted[:, 0]
Results.update(tmp)

tmp = Results.iloc[(n_obs + 1):(num_train + 2)][['y_train_predicted_1']].copy()
tmp['y_train_predicted_1'] = y_train_predicted[:, 1]
Results.update(tmp)


In [119]:
Results.hvplot.scatter().opts(width=900, height=500)

In [120]:
Results.head(n=15)

Unnamed: 0_level_0,y_train_predicted_0,y_train_predicted_1,y_test_predicted_0,y_test_predicted_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-10-26 20:10:00,,,,
2021-10-26 20:20:00,,,,
2021-10-26 20:30:00,,,,
2021-10-26 20:40:00,,,,
2021-10-26 20:50:00,,,,
2021-10-26 21:00:00,,,,
2021-10-26 21:10:00,-0.442194,,,
2021-10-26 21:20:00,-0.441421,-0.453068,,
2021-10-26 21:30:00,-0.43531,-0.4458,,
2021-10-26 21:40:00,-0.429284,-0.444062,,


## Unscale

In [121]:
y_scaler = joblib.load(root_dir / 'models' / 'y_scaler.save')

In [122]:
Results['y_train_predicted_0_unscaled'] = y_scaler.inverse_transform(Results['y_train_predicted_0'].values.reshape(-1, 1))
Results['y_train_predicted_1_unscaled'] = y_scaler.inverse_transform(Results['y_train_predicted_1'].values.reshape(-1, 1))
Results['y_test_predicted_0_unscaled'] = y_scaler.inverse_transform(Results['y_test_predicted_0'].values.reshape(-1, 1))
Results['y_test_predicted_1_unscaled'] = y_scaler.inverse_transform(Results['y_test_predicted_1'].values.reshape(-1, 1))

In [123]:
# add observed
Results = Results.join(Data[['lane_vehicle_speed']], how='left')

In [124]:
Results

Unnamed: 0_level_0,y_train_predicted_0,y_train_predicted_1,y_test_predicted_0,y_test_predicted_1,y_train_predicted_0_unscaled,y_train_predicted_1_unscaled,y_test_predicted_0_unscaled,y_test_predicted_1_unscaled,lane_vehicle_speed
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-10-26 20:10:00,,,,,,,,,85.125000
2021-10-26 20:20:00,,,,,,,,,81.400000
2021-10-26 20:30:00,,,,,,,,,79.375000
2021-10-26 20:40:00,,,,,,,,,84.444444
2021-10-26 20:50:00,,,,,,,,,81.555556
...,...,...,...,...,...,...,...,...,...
2021-11-15 20:20:00,,,-0.459429,-0.459319,,,80.365419,80.367435,85.875000
2021-11-15 20:30:00,,,-0.452225,-0.459061,,,80.496896,80.372144,80.444444
2021-11-15 20:40:00,,,-0.447207,-0.449485,,,80.588471,80.546906,79.750000
2021-11-15 20:50:00,,,-0.451743,-0.445812,,,80.505694,80.613928,82.375000


## Plot

In [125]:
Results.columns

Index(['y_train_predicted_0', 'y_train_predicted_1', 'y_test_predicted_0',
       'y_test_predicted_1', 'y_train_predicted_0_unscaled',
       'y_train_predicted_1_unscaled', 'y_test_predicted_0_unscaled',
       'y_test_predicted_1_unscaled', 'lane_vehicle_speed'],
      dtype='object')

In [141]:
line_plot = Results.hvplot.line(y='lane_vehicle_speed', label='observed')
scatter_plot0 = Results.hvplot.scatter(y=['y_train_predicted_0_unscaled', 'y_test_predicted_0_unscaled'], label='predicted t+10min', color='black', marker='o')
scatter_plot1 = Results.hvplot.scatter(y=['y_train_predicted_1_unscaled', 'y_test_predicted_1_unscaled'], label='predicted t+20min', color='black', marker='o', alpha=0.5)
(line_plot * scatter_plot0 * scatter_plot1).opts(title='Train and Test', width=1200, height=500, ylabel='Lane Vehicle Speed (mph)')

In [140]:
test_Results = Results.iloc[(num_train + n_obs):][['lane_vehicle_speed', 'y_test_predicted_0_unscaled', 'y_test_predicted_1_unscaled']].copy()

line_plot = test_Results.hvplot.line(y='lane_vehicle_speed', label='observed')
scatter_plot0 = test_Results.hvplot.scatter(y='y_test_predicted_0_unscaled', label='predicted t+10min', color='black', marker='o')
scatter_plot1 = test_Results.hvplot.scatter(y='y_test_predicted_1_unscaled', label='predicted t+20min', color='black', marker='o', alpha=0.5)
(line_plot * scatter_plot0 * scatter_plot1).opts(title='Test', width=1200, height=500, ylabel='Lane Vehicle Speed (mph)')

# Error

In [127]:
errors_0 = np.abs(Results['lane_vehicle_speed'] - Results['y_test_predicted_0_unscaled']).values
errors_1 = np.abs(Results['lane_vehicle_speed'] - Results['y_test_predicted_1_unscaled']).values

In [128]:
mae = np.nanmean(np.concatenate((errors_0, errors_1)))

In [129]:
print('MAE: {:.2f} mph'.format(mae))

MAE: 2.17 mph


In [130]:
LSTMmodel.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 6, 120)            67680     
_________________________________________________________________
dropout_5 (Dropout)          (None, 6, 120)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 6, 120)            115680    
_________________________________________________________________
dropout_6 (Dropout)          (None, 6, 120)            0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 120)               115680    
_________________________________________________________________
dropout_7 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                

# More Scratch

In [164]:
from keras.utils.all_utils import plot_model

In [165]:
plot_model(LSTMmodel)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [169]:
115680/120

964.0

In [26]:
train_data_length = int(train_test_split * Data_array_scaled.shape[0])
test_data_length = Data_array_scaled.shape[0] - train_data_length
print(train_data_length, test_data_length, Data_array_scaled.shape[0], train_data_length + test_data_length)

2308 578 2886 2886


In [38]:
# X_train and y_train

#   note, this will re-define X_train/test y_train/test; these variable names were used above for scaling.
#   use Data_array_scaled to reshape the data

train_data_length = X_train_scaled.shape[0]

# instantiate 3D arrays
n_batches = train_data_length - n_obs + 1
n_dimensions = X_train_scaled.shape[1]
X_train = np.empty((n_batches, n_obs, n_dimensions))
y_train = np.empty((n_batches, n_forecast))

# populate array
for i in range(n_obs, train_data_length+1):
    print(i, end=', ')
    X_train[i-n_obs, :, :] = Data_array_scaled[(i - n_obs):i, :]
    y_train[i-n_obs, :] = Data_array_scaled[i:(i+n_forecast), 0]

6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225

In [56]:
# X_test and y_test

#   note, this will re-define X_train/test y_train/test; these variable names were used above for scaling.
#   use Data_array_scaled to reshape the data

test_data_length = Data_array_scaled.shape[0] - train_data_length

# instantiate 3D arrays
n_batches = test_data_length - n_obs -n_forecast + 1
n_dimensions = X_test_scaled.shape[1]
X_test = np.empty((n_batches, n_obs, n_dimensions))
y_test = np.empty((n_batches, n_forecast))

# populate array
st = train_data_length + n_obs
en = train_data_length + test_data_length - n_forecast + 1
for row, i in enumerate(range(st, en)):
    print(row, i, i-n_obs, end=', ')
    X_test[row, :, :] = Data_array_scaled[(i - n_obs):i, :]
    y_test[row, :] = Data_array_scaled[i:(i+n_forecast), 0]

0 2314 2308, 1 2315 2309, 2 2316 2310, 3 2317 2311, 4 2318 2312, 5 2319 2313, 6 2320 2314, 7 2321 2315, 8 2322 2316, 9 2323 2317, 10 2324 2318, 11 2325 2319, 12 2326 2320, 13 2327 2321, 14 2328 2322, 15 2329 2323, 16 2330 2324, 17 2331 2325, 18 2332 2326, 19 2333 2327, 20 2334 2328, 21 2335 2329, 22 2336 2330, 23 2337 2331, 24 2338 2332, 25 2339 2333, 26 2340 2334, 27 2341 2335, 28 2342 2336, 29 2343 2337, 30 2344 2338, 31 2345 2339, 32 2346 2340, 33 2347 2341, 34 2348 2342, 35 2349 2343, 36 2350 2344, 37 2351 2345, 38 2352 2346, 39 2353 2347, 40 2354 2348, 41 2355 2349, 42 2356 2350, 43 2357 2351, 44 2358 2352, 45 2359 2353, 46 2360 2354, 47 2361 2355, 48 2362 2356, 49 2363 2357, 50 2364 2358, 51 2365 2359, 52 2366 2360, 53 2367 2361, 54 2368 2362, 55 2369 2363, 56 2370 2364, 57 2371 2365, 58 2372 2366, 59 2373 2367, 60 2374 2368, 61 2375 2369, 62 2376 2370, 63 2377 2371, 64 2378 2372, 65 2379 2373, 66 2380 2374, 67 2381 2375, 68 2382 2376, 69 2383 2377, 70 2384 2378, 71 2385 2379, 72

In [39]:
print(train_data_length)
print(n_batches)
print(len(list(range(n_obs, train_data_length))))

2308
2303
2302


In [40]:
X_train[0, :, :]

array([[-0.199, -0.757, -0.905, 0.029, -0.791, -0.910, -0.273, -0.767, -0.877, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.403, -0.789, -0.943, -0.064, -0.779, -0.864, -0.435, -0.799, -0.889, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.514, -0.756, -0.886, -0.008, -0.770, -0.864, -0.387, -0.791, -0.867, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.236, -0.756, -0.857, 0.056, -0.796, -0.919, -0.141, -0.855, -0.933, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.394, -0.775, -0.937, 0.037, -0.819, -0.910, -0.284, -0.758, -0.827, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.486, -0.756, -0.886, -0.118, -0.736, -0.837, -0.321, -0.791, -0.867, 0.826, -0.500, 0.866, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000]])

In [41]:
X_train[-1, :, :]

array([[-0.858, 0.794, 0.886, -0.285, 0.795, 0.925, -0.795, 0.647, 0.889, 0.652, -0.866, 0.500, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.781, 0.204, 0.114, -0.177, 0.324, 0.275, -0.719, 0.245, 0.289, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.715, 0.381, 0.343, -0.151, 0.427, 0.356, -0.670, 0.325, 0.444, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.819, 0.642, 0.629, -0.213, 0.778, 0.844, -0.676, 0.462, 0.489, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.764, 0.465, 0.371, -0.167, 0.504, 0.492, -0.649, 0.454, 0.533, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.814, 0.651, 0.657, -0.180, 0.581, 0.546, -0.724, 0.510, 0.622, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000]])

In [50]:
for i, n in enumerate(range(10, 16)):
    print(i, n)

0 10
1 11
2 12
3 13
4 14
5 15


In [34]:
tmp = pd.DataFrame(Data_array_scaled, columns=column_names, index=pd.DatetimeIndex(all_timestamps))
tmp['row'] = list(range(tmp.shape[0]))
tmp.to_excel(root_dir / 'working' / 'Data_array_scaled.xlsx')

In [75]:
Data_array_scaled

array([[-0.199, -0.757, -0.905, 0.029, -0.791, -0.910, -0.273, -0.767, -0.877, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.403, -0.789, -0.943, -0.064, -0.779, -0.864, -0.435, -0.799, -0.889, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.514, -0.756, -0.886, -0.008, -0.770, -0.864, -0.387, -0.791, -0.867, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.236, -0.756, -0.857, 0.056, -0.796, -0.919, -0.141, -0.855, -0.933, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.394, -0.775, -0.937, 0.037, -0.819, -0.910, -0.284, -0.758, -0.827, 0.739, -0.707, 0.707, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       [-0.486, -0.756, -0.886, -0.118, -0.736, -0.837, -0.321, -0.791, -0.867, 0.826, -0.500, 0.866, 1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000],
       

In [60]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2303, 6, 20) (2303, 2) (571, 6, 20) (571, 2)


In [32]:
all_columns

NameError: name 'all_columns' is not defined

In [33]:
column_names

array(['lane_vehicle_speed', 'lane_vehicle_count', 'occupancy', 'ds_lane_vehicle_speed', 'ds_lane_vehicle_count', 'ds_occupancy', 'us_lane_vehicle_speed', 'us_lane_vehicle_count', 'us_occupancy', 'hod', 'sin_hod', 'cos_hod', 'is_dow_1', 'is_dow_2', 'is_dow_3', 'is_dow_4', 'is_dow_5', 'is_dow_6', 'sin_doy', 'cos_doy'], dtype=object)

In [38]:
X_train[-1, :, :]

array([[-0.858, 0.794, 0.886, -0.285, 0.795, 0.925, -0.795, 0.647, 0.889, 0.652, -0.866, 0.500, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.781, 0.204, 0.114, -0.177, 0.324, 0.275, -0.719, 0.245, 0.289, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.715, 0.381, 0.343, -0.151, 0.427, 0.356, -0.670, 0.325, 0.444, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.819, 0.642, 0.629, -0.213, 0.778, 0.844, -0.676, 0.462, 0.489, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.764, 0.465, 0.371, -0.167, 0.504, 0.492, -0.649, 0.454, 0.533, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000],
       [-0.814, 0.651, 0.657, -0.180, 0.581, 0.546, -0.724, 0.510, 0.622, 0.739, -0.707, 0.707, -1.000, -1.000, 1.000, -1.000, -1.000, -1.000, 1.000, 1.000]])

In [39]:
y_train[-1, :]

array([-0.792, -0.545])

In [43]:
X_test[-1, :, :]

array([[-0.490, -0.714, -0.829, 0.085, -0.727, -0.837, -0.438, -0.799, -0.889, 0.652, -0.866, 0.500, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, 1.622, 1.437],
       [-0.512, -0.663, -0.829, -0.020, -0.702, -0.783, -0.335, -0.679, -0.733, 0.739, -0.707, 0.707, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, 1.622, 1.437],
       [-0.459, -0.785, -0.905, 0.038, -0.762, -0.849, -0.318, -0.794, -0.877, 0.739, -0.707, 0.707, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, 1.622, 1.437],
       [-0.158, -0.663, -0.821, 0.077, -0.753, -0.864, -0.182, -0.797, -0.889, 0.739, -0.707, 0.707, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, 1.622, 1.437],
       [-0.455, -0.644, -0.810, -0.056, -0.667, -0.789, -0.508, -0.714, -0.802, 0.739, -0.707, 0.707, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, 1.622, 1.437],
       [-0.493, -0.688, -0.829, -0.059, -0.667, -0.783, -0.417, -0.687, -0.756, 0.739, -0.707, 0.707, -1.000, -1.000, -1.000, -1.000, -1.000, -1.000, 1.622, 1.437]])

In [45]:
y_test[0, :]

array([-0.408, -0.408])

In [49]:
all_timestamps[num_train + n_obs + 571]

numpy.datetime64('2021-11-15T21:00:00.000000000')

In [50]:
num_train + n_obs + 571

2885