In [87]:
# Authors: Jin Hong Kuan and Gabe Mersy
import os
import warnings  
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore",category=FutureWarning)
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, LSTM, Dropout
    from tensorflow.keras.optimizers import Adam, RMSprop
from pandas import read_csv, DataFrame, concat
from datetime import datetime, timedelta
import matplotlib.pyplot as pyplot
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import numpy as np 
import pandas as pd 
from keras.utils import plot_model
import pickle
import math

# Numerically encoding categorical features

months = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 
          'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10,
         'November': 11, 'December': 12}

def encode(l):
    encoded = []
    for m in l:
        for key, value in months.items():
            if key == m:
                encoded.append(value)
    return encoded

# Parsing date

def parse(x):
    return datetime.strptime(x, '%m/%d/%Y')

# Setting intial values revelant to forecast
forecast_length = 200

# Setting seed for stochastic reproducibility
seed = 0
tf.set_random_seed(seed)
name = 'prices_may2500_' + str(seed)
shift = 8
# Extending calendar to fit forecast_length
add_weeks = math.ceil((forecast_length+shift)/5)

# Loading data and conducting preprocessing
data_set = pd.read_csv('may2020clean.csv', sep =',', date_parser = parse)
data_set = data_set.iloc[:data_set['totalSoybeanMealSupply'].last_valid_index(),:] # Truncate dataset  
target_cols = ['closePrice','canolaOilPrice', 'soybeanOilPrice', 'soybeanMealPrice', 'cottonseedmealPrice', 'sunflowerseedMealPrice', 'Month']
data_set['Month'] = encode(data_set['Month'])
dates = data_set['date'].tolist()
data_set = data_set.loc[:,target_cols]

for i in range(len(dates)):
    dates[i] = parse(dates[i])
    
last_week = dates[-5:]

for i in range(add_weeks):
    dates += [x + timedelta(days = 7) for x in last_week]
    last_week = dates[-5:]

In [88]:
# Feature engineering: mix/max scaling
values = data_set.values
values = values.astype('float32')
scaler = MinMaxScaler()
scaled = scaler.fit_transform(values)

scaler2 = MinMaxScaler()
transformed_dates = scaler2.fit_transform(np.asarray([x.month for x in dates]).reshape(len(dates),-1))



In [89]:
# Converting time series to supervised learning format
def series_to_supervised(data, seq_length, y_col):
    X = [] 
    Y = []
    for time in range(len(data)-seq_length):
        X += [data[time:time+seq_length]]
        Y += [data[time+seq_length][y_col]]
    
    return X,Y

data_x, data_y = scaled[:-shift], scaled[shift:]
separation = int(len(data_x)*0.8)
train_x, train_y = data_x[:separation], data_y[:separation]
test_x, test_y = data_x[separation:], data_y[separation:]
train_x = train_x.reshape(1, train_x.shape[0], train_x.shape[1])
train_y = train_y.reshape(1, train_y.shape[0], train_y.shape[1])
test_x = test_x.reshape(1, test_x.shape[0], test_x.shape[1])
test_y = test_y.reshape(1, test_y.shape[0], test_y.shape[1])
data_x = data_x.reshape(1, data_x.shape[0], data_x.shape[1])
data_y = data_y.reshape(1, data_y.shape[0], data_y.shape[1])
print(train_x.shape)
print(train_y.shape)

(1, 319, 7)
(1, 319, 7)


In [None]:
# Mean Squared Error function
def loss_mse_warmup(y_true, y_pred):
    warmup_steps = 50
    y_true_slice = y_true[:, warmup_steps:, :]
    y_pred_slice = y_pred[:, warmup_steps:, :]
    loss = tf.losses.mean_squared_error(labels=y_true_slice,
                                        predictions=y_pred_slice)
    loss_mean = tf.reduce_mean(loss)
    return loss_mean

# Building LTSM network with stateful = False (non-prediction model)
regressor = Sequential()
regressor.add(LSTM(units = 50, return_sequences = True, stateful = False, input_shape = (None, train_x.shape[2])))
regressor.add(LSTM(units = train_x.shape[2], return_sequences = True, stateful = False))
regressor.compile(optimizer = RMSprop(lr=1e-3), loss = loss_mse_warmup)

print('Fitting model...')
history = regressor.fit(train_x, train_y, epochs = 2500, verbose=0)

# Training loss plot
pyplot.plot(history.history['loss'], label='train')
pyplot.legend()
pyplot.show()

Fitting model...


In [None]:
values = data_set.values
values = values.astype('float32')
scaler = MinMaxScaler()
print(values[:5,:])
scaled = scaler.fit_transform(values)
print(scaled[:5,:])

In [None]:
# Building LTSM network with stateful = True (prediction model)
newModel = Sequential()
newModel.add(LSTM(units = 50, return_sequences = True, stateful = True, batch_input_shape = (1, None, train_x.shape[2])))
newModel.add(LSTM(units = train_x.shape[2], return_sequences = False, stateful = True))

newModel.set_weights(regressor.get_weights())

forecastFromSelf = np.empty((1, train_x.shape[1] + forecast_length, train_x.shape[2]))
forecastData = np.empty((1, train_x.shape[1] + forecast_length, train_x.shape[2]))
forecastData[:,:train_x.shape[1], :] = train_x[:,:,:]


for i in range(train_x.shape[1]+forecast_length):
    input_data = forecastData[:,i:i+1,:]
    forecastFromSelf[:,i:i+1,:] = newModel.predict(forecastData[:,i:i+1,:])
    forecastFromSelf[:,i,-1] = transformed_dates[i+shift,0] # Replace prediction with what's already known
    output_data = forecastFromSelf[:,i:i+1,:]
    if i + shift >= train_x.shape[1] and i + shift < forecastData.shape[1]:
        forecastData[:,i+shift:i+shift+1,:] = forecastFromSelf[:,i:i+1,:]

y_hat = scaler.inverse_transform(forecastFromSelf.reshape((forecastFromSelf.shape[1],forecastFromSelf.shape[2])))


In [None]:
# Prediction plot
a = pyplot.figure()
a.set_size_inches(10,30)
for i in range(y_hat.shape[1]):
    ax = a.add_subplot(y_hat.shape[1],1,i+1)
    b = data_y
    b = b.reshape(b.shape[1], b.shape[2])
    b = scaler.inverse_transform(b)
    ax.plot(b[:,i])
    ax.plot(y_hat[:,i])
    ax.title.set_text(target_cols[i])
a.savefig(fname='./data/{}.png'.format(name))
a.show()

In [None]:
# Exporting data 
regressor.save(open('pickles/{}.p'.format(name), 'wb'))
    
dates = dates[shift:y_hat.shape[0]+shift]
output_dict = {'Date':dates}
for i, col in enumerate(target_cols):
    output_dict[col] = y_hat[:,i]

output_df = DataFrame(output_dict, columns=['Date']+target_cols)
output_df.to_csv('./data/{}.csv'.format(name))