# Lab 6 Report:
## Stock Prediction AI with Encoder-Decoder RNN

### Name:

In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'sklearn'

In [None]:
from IPython.display import Image # For displaying images in colab jupyter cell

In [None]:
# Show the lab exercise figure; kept as the cell's last expression so the notebook renders it.
Image('lab6_exercise.png', width = 1000)

In [None]:
# Seaborn plot styling used for the figures in this notebook:
# the 'white' style with font_scale set to 2.
sns.set(style = 'white', font_scale = 2)

## Prepare Data

In [None]:
# Read the three available stock datasets (only one of them is needed to train the model).
# The 'closing price' column is the series used for training and testing.

# x = features/inputs, y = targets/outputs

# DataFrames, read in the order TSLA -> GOOGL -> DJI
tesla, google, dji = (pd.read_csv(csv_path)
                      for csv_path in ('TSLA.csv', 'GOOGL.csv', 'DJI.csv'))

# NumPy versions of the same tables, used later for positional slicing
tesla_np, google_np, dji_np = (frame.to_numpy() for frame in (tesla, google, dji))

#### Building sequence function for next steps

In [None]:
def generate_input_output_seqs(y, encoder_inputseq_len, decoder_outputseq_len, stride = 1, num_features = 1):
    """Slice a time series into sliding (encoder input, decoder target) windows.

    Window ``i`` starts at time step ``stride * i``. Its first
    ``encoder_inputseq_len`` steps become the encoder input and the
    ``decoder_outputseq_len`` steps that immediately follow become the
    decoder target.

    Parameters
    ----------
    y : array-like
        Series of shape ``(L, F)`` with ``F >= num_features``, or a 1-D
        series of shape ``(L,)`` (treated as a single feature). Anything
        ``np.asarray`` can convert is accepted.
    encoder_inputseq_len : int
        Number of time steps in each encoder input window (>= 1).
    decoder_outputseq_len : int
        Number of time steps in each decoder target window (>= 1).
    stride : int, optional
        Offset between the starts of consecutive windows (>= 1).
    num_features : int, optional
        Number of leading feature columns of ``y`` to copy.

    Returns
    -------
    train_input_seqs : np.ndarray
        Shape ``(num_samples, encoder_inputseq_len, num_features)``.
    train_output_seqs : np.ndarray
        Shape ``(num_samples, decoder_outputseq_len, num_features)``.
        ``num_samples`` is 0 when the series is too short for one window.

    Raises
    ------
    ValueError
        If ``stride`` or a sequence length is smaller than 1, or if ``y``
        is not 1-D/2-D or has fewer than ``num_features`` columns.
    """
    if stride < 1:
        raise ValueError("stride must be a positive integer, got {}".format(stride))
    if encoder_inputseq_len < 1 or decoder_outputseq_len < 1:
        raise ValueError("encoder_inputseq_len and decoder_outputseq_len must be >= 1")

    series = np.asarray(y)
    if series.ndim == 1:
        # Treat a flat series as a single-feature column
        series = series[:, np.newaxis]
    if series.ndim != 2:
        raise ValueError("y must be 1-D or 2-D, got {} dimensions".format(series.ndim))
    if series.shape[1] < num_features:
        raise ValueError("y has {} feature column(s) but num_features = {}".format(
            series.shape[1], num_features))

    L = series.shape[0] # Length of y

    # Number of complete input/target windows that fit into the series.
    # Clamped at 0 so a too-short series yields empty arrays instead of a
    # "negative dimensions" error from np.zeros.
    num_samples = max(0, (L - encoder_inputseq_len - decoder_outputseq_len) // stride + 1)

    # Numpy zeros arrays to contain the input/target sequences
    # Note that they are in (num_samples, seq_len, num_features/time step) format
    train_input_seqs = np.zeros([num_samples, encoder_inputseq_len, num_features])
    train_output_seqs = np.zeros([num_samples, decoder_outputseq_len, num_features])

    # Fill every window; all requested feature columns are copied at once
    for ii in range(num_samples):

        start_x = stride * ii
        end_x = start_x + encoder_inputseq_len
        train_input_seqs[ii] = series[start_x:end_x, :num_features]

        # The target window begins right where the input window ends
        end_y = end_x + decoder_outputseq_len
        train_output_seqs[ii] = series[end_x:end_y, :num_features]

    return train_input_seqs, train_output_seqs

#### Applying functions & preparing data

In [None]:
# Select the training split (every day except the last 100) and the
# held-out test split (the last 100 days).
# NOTE(review): the exercise asks for normalized data, but the closing prices
# are used below without any scaling - confirm this is intended.

# ------ Selecting Google Stocks Dataset ------
# The model is fit on ggl_train and then asked to forecast the final
# 100 days, which can be compared against the real values in ggl_test.

# Closing price (column 4) as float32, shaped (num_days, 1) and placed on the GPU
ggl_train = torch.as_tensor(google_np[:-100, 4].astype('float32')).cuda().unsqueeze(1)
ggl_test = torch.as_tensor(google_np[-100:, 4].astype('float32')).cuda().unsqueeze(1)


# Encoder/decoder window lengths and the length of the test sequence
encoder_inputseq_len = 15                   # set to 5 in example
decoder_outputseq_len = 10                  # set to 2 in example
testing_sequence_len = ggl_test.shape[0]    # number of held-out days (the last 100)

google_np.shape, ggl_train.shape, ggl_test.shape

In [None]:
# Build the sliding-window training pairs (encoder inputs / decoder targets)
# from the training series, using the sequence lengths chosen above.
# The series is brought back to host memory as a NumPy array because the
# window builder fills NumPy arrays.
train_input_seqs, train_output_seqs = generate_input_output_seqs(
    y = ggl_train.cpu().numpy(),
    encoder_inputseq_len = encoder_inputseq_len,
    decoder_outputseq_len = decoder_outputseq_len,
    stride = 1,
    num_features = 1,
)

In [None]:
# Sanity check: both arrays should be laid out as
# (sample size, sequence length, # of features / timestep).
# The sample-size dimension depends on the stride given to generate_input_output_seqs.
for shape_label, seq_array in (("Encoder Training Inputs Shape: ", train_input_seqs),
                               ("Decoder Training Outputs Shape: ", train_output_seqs)):
    print(shape_label, seq_array.shape)

## Define Model Architecture

In [None]:
# building the encoder class
class Encoder(torch.nn.Module):
    """LSTM encoder: runs an input sequence through an LSTM and returns its states.

    Parameters
    ----------
    input_size : int
        Number of features per time step of the input sequence.
    hidden_size : int
        Size of the LSTM hidden state.
    num_layers : int
        Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):

        super(Encoder, self).__init__()

        # Using LSTM (long short term memory) with batch_first = True,
        # i.e. inputs are (batch, seq_len, input_size)
        self.lstm = torch.nn.LSTM(input_size = input_size, hidden_size = hidden_size,
                                  num_layers = num_layers,
                                  batch_first = True)

        # No need for FC layer since encoder only passes hidden states to Decoder

    def forward(self, input_seq, hidden_state = None):
        """Encode ``input_seq``.

        Parameters
        ----------
        input_seq : torch.Tensor
            Batch of sequences, shape (batch, seq_len, input_size).
        hidden_state : tuple of torch.Tensor or None, optional
            Initial ``(h_0, c_0)`` for the LSTM. ``None`` (the default) lets
            the LSTM start from zero states, so callers no longer have to
            pass ``None`` explicitly.

        Returns
        -------
        out : torch.Tensor
            LSTM output for every time step.
        hidden : tuple of torch.Tensor
            Final ``(h_n, c_n)`` states, to be handed to the decoder.
        """
        # forward propagate to LSTM
        out, hidden = self.lstm(input_seq, hidden_state)
        return out, hidden

# building the decoder class
class Decoder(torch.nn.Module):
    """LSTM decoder followed by a linear read-out layer.

    The LSTM is seeded with the hidden states it is given, and every LSTM
    output step is projected from ``hidden_size`` to ``output_size`` values.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):

        super().__init__()

        # Recurrent part of the decoder (batch_first = True)
        self.lstm = torch.nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
        )

        # Read-out layer mapping each hidden vector to the predicted value(s)
        self.fc_decoder = torch.nn.Linear(in_features = hidden_size, out_features = output_size)

    def forward(self, input_seq, encoder_hidden_states):
        """Run the decoder LSTM from the given states and project its outputs.

        Returns the projected predictions together with the updated LSTM states.
        """
        # Advance the LSTM starting from the supplied hidden states
        lstm_out, next_hidden = self.lstm(input_seq, encoder_hidden_states)

        # Turn the LSTM output into a prediction with the FC layer
        return self.fc_decoder(lstm_out), next_hidden

# Combine Encoder and Decoder classes into one class (model)
class Encoder_Decoder(torch.nn.Module):
    """Sequence-to-sequence model pairing the LSTM Encoder with the LSTM Decoder.

    The encoder reads the input window; its final LSTM states seed the
    decoder, which is then stepped one time step at a time.

    Parameters
    ----------
    input_size : int
        Number of features per time step fed to the encoder and the decoder.
    hidden_size : int
        Hidden state size shared by encoder and decoder.
    decoder_output_size : int
        Number of features per time step produced by the decoder. It has to
        equal ``input_size`` whenever ``forward`` feeds predictions back in.
    num_layers : int
        Number of stacked recurrent layers in encoder and decoder.
    """

    def __init__(self, input_size, hidden_size, decoder_output_size, num_layers):

        super(Encoder_Decoder, self).__init__()

        # encoder-decoder layers
        self.Encoder = Encoder(input_size = input_size, hidden_size = hidden_size,
                               num_layers = num_layers)
        self.Decoder = Decoder(input_size = input_size, hidden_size = hidden_size,
                               output_size = decoder_output_size, num_layers = num_layers)
        # RNN cell layer. forward() does not use it (it expects a single hidden
        # tensor, not the LSTM's (h, c) pair); it stays registered so the
        # module's attributes and parameter list are unchanged.
        self.rnn = torch.nn.RNN(input_size=input_size, hidden_size=hidden_size,
                                num_layers = num_layers,
                                nonlinearity = 'relu')

    def forward(self, x, target_len = None, targets = None):
        """Encode ``x`` and decode ``target_len`` future time steps.

        Parameters
        ----------
        x : torch.Tensor
            Encoder input of shape (batch, seq_len, input_size).
        target_len : int or None, optional
            Number of time steps to decode. Defaults to ``targets.shape[1]``
            when ``targets`` is given and to 1 otherwise.
        targets : torch.Tensor or None, optional
            Ground-truth sequence of shape (batch, >= target_len, input_size).
            When given, it is used for teacher forcing (the true value of
            step t is the decoder input for step t + 1); otherwise the
            decoder's own prediction is fed back.

        Returns
        -------
        torch.Tensor
            Predictions of shape (batch, target_len, decoder_output_size).

        Raises
        ------
        ValueError
            If ``target_len`` is smaller than 1 or exceeds the length of ``targets``.
        """
        if target_len is None:
            target_len = 1 if targets is None else targets.shape[1]
        if target_len < 1:
            raise ValueError("target_len must be >= 1, got {}".format(target_len))
        if targets is not None and targets.shape[1] < target_len:
            raise ValueError("targets holds {} step(s) but target_len = {}".format(
                targets.shape[1], target_len))

        # Encode the input sequence; only the final LSTM states are needed
        _, hidden = self.Encoder(x, None)

        # First decoder input is the last encoder time step, kept 3-D: (batch, 1, input_size)
        decoder_input = x[:, -1:, :]

        step_outputs = []
        for t in range(target_len):
            step_output, hidden = self.Decoder(decoder_input, hidden)
            step_outputs.append(step_output)

            if targets is not None:
                # Teacher forcing: continue from the ground truth
                decoder_input = targets[:, t:t + 1, :]
            else:
                # Autoregressive: continue from the model's own prediction
                decoder_input = step_output

        # No dropout is applied to the result: these are regression outputs
        return torch.cat(step_outputs, dim = 1)

## Define Hyperparameters

In [None]:
# Fix the seed before the model is built so the initial weights are reproducible
torch.manual_seed(1720)

# Using input_size = 1 (# of features to be fed to RNN per timestep)
# Using decoder_output_size = 1 (# of features to be output by Decoder RNN per timestep)
Encoder_Decoder_RNN = Encoder_Decoder(input_size = 1, hidden_size = 15,
                                      decoder_output_size = 1, num_layers = 1)

# Move the model to the GPU BEFORE the optimizer is created, as the
# torch.optim documentation recommends
Encoder_Decoder_RNN.cuda()

# Define learning rate + epochs
# NOTE(review): 10 is a very large step size for Adam; presumably it compensates
# for the closing prices being fed in unscaled - confirm.
learning_rate = 10
epochs = 90

# Define batch size and num_features/timestep (this is simply the last dimension of train_output_seqs)
batchsize = 800
num_features = train_output_seqs.shape[2]

# Define loss function/optimizer
loss_func = torch.nn.MSELoss()
optimizer = torch.optim.Adam(Encoder_Decoder_RNN.parameters(), lr=learning_rate)

# Display the model summary as the cell output
Encoder_Decoder_RNN

## Identify Tracked Values & Train Model

In [None]:
# Empty Python list to keep track of training loss
train_loss_list = []


# Convert training dataset into float32 torch tensors on the GPU.
# torch.as_tensor accepts NumPy arrays as well as tensors, so this cell can be
# re-run without failing on the already-converted variables.
train_input_seqs = torch.as_tensor(train_input_seqs, dtype = torch.float32).cuda()
train_output_seqs = torch.as_tensor(train_output_seqs, dtype = torch.float32).cuda()

# Split the training data into mini-batches and keep only the full-size ones.
# Only a final batch that is actually smaller than batchsize gets dropped; a
# full-size last batch is kept.
train_batches_features = tuple(batch for batch in torch.split(train_input_seqs, batchsize)
                               if batch.shape[0] == batchsize)
train_batches_targets = tuple(batch for batch in torch.split(train_output_seqs, batchsize)
                              if batch.shape[0] == batchsize)

# Compute total number of mini-batches in training data
batch_split_num = len(train_batches_features)

# Fail loudly instead of letting the training loop silently do nothing
if batch_split_num == 0:
    raise ValueError("No full mini-batch available: {} training samples < batchsize {}".format(
        train_input_seqs.shape[0], batchsize))

In [None]:
import tqdm # Use "for epoch in tqdm.trange(epochs):" to see the progress bar

# Train with teacher forcing: at every decoder step the ground-truth target of
# the previous step is the next decoder input.
for epoch in tqdm.trange(epochs): # For each epoch

    for k in range(batch_split_num): # For each mini_batch

        batch_inputs = train_batches_features[k]   # (batch, encoder_inputseq_len, num_features)
        batch_targets = train_batches_targets[k]   # (batch, decoder_outputseq_len, num_features)

        # initialize hidden states to Encoder
        hidden_state = None

        # Buffer for the decoder output sequence; same shape, dtype and device as the targets
        decoder_output_seq = torch.zeros_like(batch_targets)

        # empty gradient buffer
        optimizer.zero_grad()

        # Feed k-th mini-batch for encoder input sequences to encoder with hidden state
        encoder_output, encoder_hidden = Encoder_Decoder_RNN.Encoder(batch_inputs, hidden_state)

        # Re-define the resulting encoder hidden states as input hidden states to decoder
        decoder_hidden = encoder_hidden

        # Initial input to decoder is the last timestep of the encoder input sequence.
        # Slicing with -1: keeps the sequence axis, giving (batch, 1, num_features)
        # for any number of features.
        decoder_input = batch_inputs[:, -1:, :]

        # Populating the decoder output sequence
        for t in range(decoder_outputseq_len): # for each timestep in output sequence

            # Feed in the decoder_input and decoder_hidden to Decoder, get new output and hidden states
            decoder_output, decoder_hidden = Encoder_Decoder_RNN.Decoder(decoder_input, decoder_hidden)

            # Populate the corresponding timestep; index 0 removes the length-1 sequence axis
            decoder_output_seq[:, t, :] = decoder_output[:, 0, :]

            # Using teacher forcing so using the groundtruth training target as the next input,
            # again kept as (batch, 1, num_features)
            decoder_input = batch_targets[:, t:t + 1, :]

        # Compare the predicted decoder output sequence against the target sequence to compute the MSE loss.
        # Both tensors already share the same shape, so no squeeze is needed.
        loss = loss_func(decoder_output_seq, batch_targets)

        # Save the loss
        train_loss_list.append(loss.item())

        # Backprop
        loss.backward()

        # Update the RNN
        optimizer.step()

    #print("Averaged Training Loss for Epoch ", epoch,": ", np.mean(train_loss_list[-batch_split_num:]))

## Visualize & Evaluate Model

In [None]:
plt.figure(figsize = (9, 5))

# Rolling mean over (up to) 100 iterations. The window is clamped to the number
# of recorded losses: with a longer window, np.convolve(..., 'valid') swaps its
# operands and the curve would no longer be a rolling average.
rolling_window = max(1, min(100, len(train_loss_list)))

plt.plot(np.convolve(train_loss_list, np.ones(rolling_window), 'valid') / rolling_window,
         linewidth = 3, label = 'Rolling Averaged Training Loss')
plt.ylabel("training loss")
plt.xlabel("Iterations")
plt.legend()
sns.despine()

### Generate signal predictions for testing sequence with trained Encoder-Decoder

In [None]:
# Forecast the test sequence with the trained Encoder-Decoder.
# Decoding here is autoregressive: each decoder output is fed back as the next
# decoder input, and later encoder windows are read from decoder_output_seq,
# i.e. from earlier predictions. No ground truth beyond the first
# encoder_inputseq_len points is used.
# Each input to the decoder at timestep t has the shape (1, 1, num_features)
# i.e., num_samples = 1, sequence_len = 1

# The test sequence (ggl_test) is already a tensor, so no conversion is needed.

# initialize empty torch tensor array to store decoder output sequence
# This has the same size as the test sequence
decoder_output_seq = torch.zeros(testing_sequence_len, num_features)

# First n-datapoints in decoder output sequence = First n-datapoints in ground truth test sequence
# n = encoder_inputseq_len
decoder_output_seq[:encoder_inputseq_len] = ggl_test[:encoder_inputseq_len]

# Initialize index for prediction
pred_start_ind = 0

# Activate no_grad() since we aren't performing backprop
with torch.no_grad():

    # Keep going while at least one point after the encoder window is still unpredicted,
    # so every remaining test point gets a prediction (no trailing zeros).
    while pred_start_ind + encoder_inputseq_len < testing_sequence_len:

        # initialize hidden state for encoder
        hidden_state = None

        # Define the input to encoder
        input_test_seq = decoder_output_seq[pred_start_ind:pred_start_ind + encoder_inputseq_len]
        # Add dimension to first dimension to keep the input (sample_size, seq_len, # of features/timestep)
        input_test_seq = torch.unsqueeze(input_test_seq, 0)

        # Feed the input to encoder and set resulting hidden states as input hidden states to decoder
        encoder_output, encoder_hidden = Encoder_Decoder_RNN.Encoder(input_test_seq.cuda(), hidden_state)
        decoder_hidden = encoder_hidden

        # Initial input to decoder is last timestep feature from the encoder input sequence.
        # Slicing with -1: keeps the sequence axis: (1, 1, num_features)
        decoder_input = input_test_seq[:, -1:, :]

        # Index of the first point predicted from this window, and how many steps
        # still fit into the test sequence (the final window may be shorter)
        first_pred_ind = pred_start_ind + encoder_inputseq_len
        steps_to_predict = min(decoder_outputseq_len, testing_sequence_len - first_pred_ind)

        # Populate decoder output sequence
        for t in range(steps_to_predict):

            # Generate new output for timestep t
            decoder_output, decoder_hidden = Encoder_Decoder_RNN.Decoder(decoder_input.cuda(), decoder_hidden)
            # Populate the corresponding timestep in decoder output sequence (kept on the CPU)
            decoder_output_seq[first_pred_ind + t] = decoder_output[0, 0, :].cpu()
            # Use the output of the decoder as new input for the next timestep
            decoder_input = decoder_output

        # Update pred_start_ind
        pred_start_ind += decoder_outputseq_len

In [None]:
# Visualize predicted stock sequence vs the ground truth (aka the test sequence of last 100 datapoints)

plt.figure(figsize = (10, 5))

plt.plot(ggl_test.cpu(), linewidth = 3, label = 'GroundTruth')
plt.plot(decoder_output_seq, linewidth = 3, label = 'RNN Predicted')
plt.title('RNN Predicted vs GroundTruth')
# Axis labels so the figure can be read on its own
plt.xlabel('Day in test sequence')
plt.ylabel('Closing price')
plt.legend()
sns.despine()


# Compute the MSE between the ground-truth test sequence (ggl_test) and
# decoder_output_seq and print the value as Test MSE Error.
# NOTE(review): the first encoder_inputseq_len points of decoder_output_seq are
# copied from the ground truth, so they contribute zero error to this average.

mse_error = mean_squared_error(ggl_test.cpu(), decoder_output_seq)
print(f'Test MSE Error: {mse_error}')