In [101]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import math
import itertools
import regex as re

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, normalize
from sklearn.model_selection import train_test_split, GridSearchCV

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Activation
from keras.metrics import RootMeanSquaredError
%matplotlib inline

### Loading Preprocessed Dataset

In [3]:
df = pd.read_csv('tmp/mse_raw.csv', parse_dates=['date', 'start_date'], index_col='Unnamed: 0')

In [4]:
df.head(5)

Unnamed: 0,stock_id,date,open,high,low,average,close,volume,quantity,change %,volume total,start_date
0,ALK,1997-01-09,1070.0,1070.0,1070.0,1070.0,1070.0,279270.0,261.0,0.0,279270.0,1997-01-09
22,ALK,1997-01-10,1070.0,1070.0,1070.0,1070.0,1070.0,0.0,0.0,0.0,0.0,1997-01-09
44,ALK,1997-01-11,1070.0,1070.0,1070.0,1070.0,1070.0,0.0,0.0,0.0,0.0,1997-01-09
66,ALK,1997-01-12,1070.0,1070.0,1070.0,1070.0,1070.0,0.0,0.0,0.0,0.0,1997-01-09
88,ALK,1997-01-13,1070.0,1070.0,1070.0,1070.0,1070.0,0.0,0.0,0.0,0.0,1997-01-09


### setting RNN ALK dataset for training, validation and test

In [6]:
alk_df = df[df.stock_id == 'ALK']

In [8]:
alk_df.columns

Index(['stock_id', 'date', 'open', 'high', 'low', 'average', 'close', 'volume',
       'quantity', 'change %', 'volume total', 'start_date'],
      dtype='object')

### split dataset

In [13]:
#Create a new dataframe with only closing price column
close_price = alk_df['close']

#Convert the dataframe into numpy array
close_price = close_price.values

In [18]:
close_price.shape

(8630,)

In [16]:
scaler = MinMaxScaler(feature_range=(0, 1))

In [23]:
scaled_df = scaler.fit_transform(close_price.reshape(len(close_price), 1))

In [24]:
scaled_df

array([[0.04175824],
       [0.04175824],
       [0.04175824],
       ...,
       [0.86373626],
       [0.86373626],
       [0.85714286]])

### univariate sequence

In [28]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [30]:
# choose a number of time steps
n_steps = 3
# split into samples
X, y = split_sequence(scaled_df, n_steps)
# summarize the data
for i in range(3):
    print(X[i], y[i])

[[0.04175824]
 [0.04175824]
 [0.04175824]] [0.04175824]
[[0.04175824]
 [0.04175824]
 [0.04175824]] [0.04175824]
[[0.04175824]
 [0.04175824]
 [0.04175824]] [0.04029304]


### Vanilla LSTM, one day prediciton

In [83]:
X_train, X_valid, X_test = X[:int(len(X)-60),], X[int(len(X)-60):int(len(X)-1),], X[int(len(X)-1):,]
y_train, y_valid, y_test = y[:int(len(X)-60),], y[int(len(X)-60):int(len(X)-1),], y[int(len(X)-1):,]

In [84]:
X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((8567, 3, 1), (59, 3, 1), (1, 3, 1), (8567, 1), (59, 1), (1, 1))

In [85]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))
X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], n_features))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], n_features))

In [103]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))
    
rmse = RootMeanSquaredError()

In [111]:
# define model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

In [112]:
my_callback1 = ModelCheckpoint('vanila_best.pt', verbose=1, save_best_only=True, mode='min', monitor='val_loss')
my_callback2 = EarlyStopping(patience=10)

my_callbacks = [my_callback1, my_callback2]

In [113]:
# fit model
model.fit(X_train, y_train, epochs=50, callbacks=my_callbacks, validation_data=[X_valid, y_valid])

Train on 8567 samples, validate on 59 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.00016, saving model to vanila_best.pt
Epoch 2/50

Epoch 00002: val_loss improved from 0.00016 to 0.00006, saving model to vanila_best.pt
Epoch 3/50

Epoch 00003: val_loss improved from 0.00006 to 0.00006, saving model to vanila_best.pt
Epoch 4/50

Epoch 00004: val_loss did not improve from 0.00006
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.00006
Epoch 6/50

Epoch 00006: val_loss did not improve from 0.00006
Epoch 7/50

Epoch 00007: val_loss did not improve from 0.00006
Epoch 8/50

Epoch 00008: val_loss did not improve from 0.00006
Epoch 9/50

Epoch 00009: val_loss did not improve from 0.00006
Epoch 10/50

Epoch 00010: val_loss did not improve from 0.00006
Epoch 11/50

Epoch 00011: val_loss improved from 0.00006 to 0.00006, saving model to vanila_best.pt
Epoch 12/50

Epoch 00012: val_loss did not improve from 0.00006
Epoch 13/50

Epoch 00013: val_loss did not improve from

<keras.callbacks.callbacks.History at 0x20131698780>

In [114]:
yhat = model.predict(X_test, verbose=0)

In [115]:
y_actual = scaler.inverse_transform(y_test)
y_pred = scaler.inverse_transform(yhat)

In [118]:
np.sqrt(mean_squared_error(y_actual, y_pred))

36.423828125

In [146]:
valid_size = 30
train_size = 30

# split into train and test sets
train, valid, test = scaled[0:(len(scaled)-valid_size-train_size),:], scaled[(len(scaled)-valid_size-train_size):(len(scaled)-valid_size),:], scaled[(len(scaled)-valid_size):,:]    
print(len(train), len(valid), len(test))
print(train.shape, valid.shape, test.shape)

8540 30 30
(8540, 37) (30, 37) (30, 37)


In [154]:
train[0][1:,]

array([0.        , 0.09090909, 0.09615385, 0.23333333, 0.83333333,
       0.10410959, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.0433068 , 0.04011987, 0.00755519, 0.00729735, 0.03733373,
       0.03419948, 0.        , 0.01430729, 0.03663004, 0.02022472,
       0.03846154, 0.04032258, 0.03663004, 0.28362573, 0.5       ,
       0.8111995 , 0.72334858, 0.03663004, 0.28362573, 0.03663004,
       0.28362573])

In [168]:
train[0,0]

0.03663003663003663

In [169]:
valid[1,0]

0.8315018315018315

In [178]:
for i in range(len(valid)):
    print(valid[i,0])

0.8315018315018315
0.8315018315018315
0.8283516483516483
0.8424175824175825
0.8424175824175825
0.8388278388278388
0.8424908424908425
0.8424908424908425
0.8424908424908425
0.8351648351648352
0.8388278388278388
0.8351648351648352
0.8351648351648352
0.8351648351648352
0.8351648351648352
0.8351648351648352
0.8355311355311356
0.8355311355311356
0.8355311355311356
0.8419047619047619
0.8416849816849817
0.8416849816849817
0.8416849816849817
0.8416117216117216
0.8461538461538461
0.8572161172161172
0.8644688644688645
0.8791208791208791
0.8791208791208791
0.8791208791208791


In [198]:
# write or reuse a function to segment the dataset in the appropriate format 
# Tip: function_name (dataset, n_steps_in)
# convert an array of values into a dataset matrix
def create_dataset(df_arrays):
    data_x, data_y = [], []
    for i in range(len(df_arrays)):  
        a = df_arrays[i, 1:len(df_7_days)]
        data_x.append(a)
        data_y.append(df_arrays[i, 0])
    return np.array(data_x), np.array(data_y)

In [246]:
### Use the function on the dataset and print the newly segmented data
# use n_steps_in = 5
train_X, train_y = create_dataset(train)
valid_X, valid_y = create_dataset(valid)
test_X, test_y = create_dataset(test)
for i in range(2):
    print(train_X[i], train_y[i])

[0.         0.09090909 0.09615385 0.23333333 0.83333333 0.10410959
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.0433068  0.04011987 0.00755519
 0.00729735 0.03733373 0.03419948 0.         0.01430729 0.03663004
 0.02022472 0.03846154 0.04032258 0.03663004 0.28362573 0.5
 0.8111995  0.72334858 0.03663004 0.28362573 0.03663004 0.28362573] 0.03663003663003663
[0.00000000e+00 9.09090909e-02 9.61538462e-02 2.66666667e-01
 1.00000000e+00 1.06849315e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.16292592e-04 4.33067964e-02
 4.01198670e-02 7.55519425e-03 7.29735302e-03 3.73337316e-02
 3.40192115e-02 0.00000000e+00 1.35872117e-02 3.66300366e-02
 2.02247191e-02 3.84615385e-02 4.03225806e-02 3.66300366e-02
 2.83625731e-01 5.00000000e-01 8.11199502e-01 7.23348584e-01
 3.66300366e-02 2.83625731e-01 3.66300366e-02 2.83625731e-01] 0.03663003663003663


In [247]:
train_X.shape, valid_X.shape, test_X.shape

((8540, 36), (30, 36), (30, 36))

In [248]:
# reshape the data to be appropriate for trianing the RNN model
train_X = train_X.reshape(train_X.shape[0], 1, train_X.shape[1])
valid_X = valid_X.reshape(valid_X.shape[0], 1, valid_X.shape[1])
test_X = test_X.reshape (test_X.shape[0], 1, test_X.shape[1])

In [249]:
train_X.shape, valid_X.shape, test_X.shape

((8540, 1, 36), (30, 1, 36), (30, 1, 36))

### Building RNN model

In [250]:
# create and fit the RNN network. 
# Use LSTM with 60 neurons, RELU activation function, MAE as a loss, and SGD as optimizer.
model = Sequential()
model.add(LSTM(60, activation='relu', input_shape=(1, 36)))
model.add(Dense(1))
model.compile(loss='mae', optimizer='SGD')

In [251]:
callback1 = ModelCheckpoint('best_RNN.pt', monitor='loss', save_best_only=True, mode='min', verbose=1)
callback2 = EarlyStopping(patience=15, monitor='loss', mode='min')
callback_list = [callback1, callback2]

In [252]:
model.fit(train_X, train_y, epochs=50,callbacks=callback_list, verbose=0, validation_data=(valid_X, valid_y))


Epoch 00001: loss improved from inf to 0.11514, saving model to best_RNN.pt

Epoch 00002: loss improved from 0.11514 to 0.03469, saving model to best_RNN.pt

Epoch 00003: loss improved from 0.03469 to 0.01198, saving model to best_RNN.pt

Epoch 00004: loss improved from 0.01198 to 0.01019, saving model to best_RNN.pt

Epoch 00005: loss improved from 0.01019 to 0.00947, saving model to best_RNN.pt

Epoch 00006: loss improved from 0.00947 to 0.00906, saving model to best_RNN.pt

Epoch 00007: loss improved from 0.00906 to 0.00878, saving model to best_RNN.pt

Epoch 00008: loss did not improve from 0.00878

Epoch 00009: loss did not improve from 0.00878

Epoch 00010: loss did not improve from 0.00878

Epoch 00011: loss did not improve from 0.00878

Epoch 00012: loss did not improve from 0.00878

Epoch 00013: loss did not improve from 0.00878

Epoch 00014: loss did not improve from 0.00878

Epoch 00015: loss did not improve from 0.00878

Epoch 00016: loss did not improve from 0.00878

Epoc

<keras.callbacks.callbacks.History at 0x2298846dcf8>

In [263]:
test_X.shape

(30, 1, 36)

In [253]:
# make predictions
test_predict = model.predict(test_X)
test_predict[:5]

array([[0.8867388 ],
       [0.896636  ],
       [0.88847417],
       [0.9030396 ],
       [0.90821034]], dtype=float32)

In [262]:
test_predict.shape, test_y.shape

((30, 1), (30,))

In [259]:
# invert predictions
test_predict_ = scaler.inverse_transform(test_predict)

ValueError: non-broadcastable output operand with shape (30,1) doesn't match the broadcast shape (30,37)