## Assignment Description
Here we will train an LSTM model using the `Adam` and `LBFGS` optimizers.

It is common practice to seed every random number generator used by the imported modules; this ensures reproducibility across environments.

In [None]:
import numpy as np
import torch
import random

# Seed the PyTorch, NumPy and Python `random` generators with the same fixed
# value so that repeated runs of the notebook draw the same random numbers.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

In [None]:
def gen_sine_data(n_samples=100, n_len=1000, width=20):
    '''
    Generate randomly shifted sine waves.

    n_samples : number of waves (rows) to generate
    n_len : number of consecutive points in each wave
    width : divisor applied to the sample positions before taking the sine

    Returns (x, y), two float32 arrays of shape (n_samples, n_len):
    x holds the sample positions and y = sin(x / width).
    '''
    # One random integer offset per wave, drawn from [-4*width, 4*width).
    shifts = np.random.randint(-4 * width, 4 * width, n_samples)
    # Broadcasting (n_len,) + (n_samples, 1) gives every row its own start.
    positions = (np.arange(n_len) + shifts.reshape(n_samples, 1)).astype(np.float32)
    values = np.sin(positions / width).astype(np.float32)

    return positions, values

def gen_mixture_sine_data(n_samples=100, n_len=1000, width=20):
    '''
    Generate randomly shifted mixtures of four sine/cosine components.

    n_samples : number of waves (rows) to generate
    n_len : number of consecutive points in each wave
    width : divisor applied to the sample positions inside every component

    Returns (x, y), two float32 arrays of shape (n_samples, n_len):
    x holds the sample positions and y the weighted sum of the components.
    '''
    # One random integer offset per wave, drawn from [-4*width, 4*width).
    shifts = np.random.randint(-4 * width, 4 * width, n_samples)
    positions = (np.arange(n_len) + shifts.reshape(n_samples, 1)).astype(np.float32)

    # (amplitude, function, extra divisor); terms are accumulated in this order.
    components = (
        (0.3, np.sin, 1.0),
        (0.5, np.sin, 2.0),
        (0.7, np.cos, 2.0),
        (1.7, np.cos, 7.0),
    )
    mixture = None
    for amplitude, wave, divisor in components:
        term = amplitude * wave(positions / divisor / width).astype(np.float32)
        mixture = term if mixture is None else mixture + term

    return positions, mixture

We generate some sequential data here to train the LSTM

In [None]:
import matplotlib.pyplot as plt

def plot_data(x, y):
    """Show a single line plot of ``y`` against ``x`` on a gridded axis."""
    _, axes = plt.subplots()
    axes.plot(x, y)

    axes.set_xlabel('time (s)')
    axes.set_ylabel('voltage (mV)')
    axes.set_title('About as simple as it gets, folks')
    axes.grid()

    plt.show()

# Build both datasets with the generators' default arguments.
# The single-sine set is generated first, so it consumes the seeded NumPy RNG first.
X, Y = gen_sine_data() #100 sample sequences each of length 1000
X1, Y1 = gen_mixture_sine_data() #100 sample sequences each of length 1000
# Print the array shapes as a quick sanity check.
print(X.shape, Y.shape)
print(X1.shape, Y1.shape)


## Visualize the datasets

In [None]:
# Two example sequences from the single-sine dataset.
plot_data(X[0], Y[0])
plot_data(X[10], Y[10])

# Two example sequences from the mixture dataset.
plot_data(X1[10], Y1[10])
plot_data(X1[30], Y1[30])

## Model definition

In [None]:
import torch
from torch import nn

class LSTM(nn.Module):
    """Two stacked LSTM cells followed by a linear read-out, unrolled in Python.

    The network consumes a batch of scalar sequences one time step at a time
    and emits one scalar prediction per step. It can optionally keep running
    on its own predictions to extrapolate beyond the given sequence.
    """

    def __init__(self, hidden_layers=64):
        """Build the layers.

        hidden_layers : size of the hidden/cell state of each LSTM cell
                        (despite the name, this is a width, not a layer count).
        """
        super().__init__()
        self.hidden_layers = hidden_layers
        # lstm1 reads one scalar per step; lstm2 reads lstm1's hidden state;
        # linear maps lstm2's hidden state back to one scalar.
        self.lstm1 = nn.LSTMCell(1, self.hidden_layers)
        self.lstm2 = nn.LSTMCell(self.hidden_layers, self.hidden_layers)
        self.linear = nn.Linear(self.hidden_layers, 1)

    def forward(self, y, future_preds=0):
        """Predict one value per input step, plus ``future_preds`` extra steps.

        y : tensor of shape (n_samples, seq_len) holding the input sequences
        future_preds : number of additional steps to generate by feeding the
                       model's last prediction back in as the next input

        Returns a tensor of shape (n_samples, seq_len + future_preds).
        """
        n_samples = y.size(0)
        # new_zeros inherits dtype and device from `y`, so the initial states
        # always match the input (e.g. after model.to(device) / model.double()).
        h_t = y.new_zeros(n_samples, self.hidden_layers)
        c_t = y.new_zeros(n_samples, self.hidden_layers)
        h_t2 = y.new_zeros(n_samples, self.hidden_layers)
        c_t2 = y.new_zeros(n_samples, self.hidden_layers)

        outputs = []
        for input_t in y.split(1, dim=1):
            # input_t has shape (n_samples, 1): one time step for every sample
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        for _ in range(future_preds):
            # Same step as above, but driven by the previous prediction.
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        # Each entry is (n_samples, 1); concatenate along the time axis.
        return torch.cat(outputs, dim=1)



## Train, validation and test datasets

We split the dataset into train, validation and test sets. For simplicity, we do not use a dataloader in this assignment.

In [None]:
# Fraction of all sequences held out for testing.
TRAIN_TEST_SPLIT = 0.3
# Fraction of the remaining (non-test) sequences used for validation.
TRAIN_VALID_SPLIT = 0.1

DATA_SZ = Y.shape[0]
# Adding 0.5 before int() rounds the sizes to the nearest integer.
TEST_SZ = int(DATA_SZ*TRAIN_TEST_SPLIT + 0.5)
VALID_SZ = int((Y.shape[0] - TEST_SZ)*TRAIN_VALID_SPLIT + 0.5)

# Rows are assigned in order: test first, then validation, then training.
# Each input drops the last time step and each target drops the first one,
# so the target at position t is the input value at position t+1.
test_input = torch.from_numpy(Y[:TEST_SZ, :-1]) # (TEST_SZ, 999)
test_target = torch.from_numpy(Y[:TEST_SZ, 1:]) # (TEST_SZ, 999)

valid_input = torch.from_numpy(Y[TEST_SZ:TEST_SZ+VALID_SZ, :-1]) # (VALID_SZ, 999)
valid_target = torch.from_numpy(Y[TEST_SZ:TEST_SZ+VALID_SZ, 1:]) # (VALID_SZ, 999)

train_input = torch.from_numpy(Y[TEST_SZ+VALID_SZ:, :-1]) # (rest of data, 999)
train_target = torch.from_numpy(Y[TEST_SZ+VALID_SZ:, 1:]) # (rest of data, 999)

In [None]:

def run_epoch(model, optimiser, loss_fn, input, target, future=100, inference=False):
    """Run one full-batch pass: an optimisation step or a no-grad evaluation.

    model : callable taking ``(input)`` or ``(input, future_preds=...)``
    optimiser : optimiser whose ``step`` accepts a closure; in inference mode
                it is only used to clear gradients and may be ``None``
    loss_fn : callable ``(prediction, target) -> scalar tensor``
    input, target : tensors of shape (n_samples, seq_len)
    future : number of extra steps to predict past the input (inference only)
    inference : evaluate without gradients instead of training

    Returns ``(loss, y)``. In training mode ``loss`` is the value from the
    first closure evaluation of the step (i.e. before the parameter update)
    and ``y`` is ``None``. In inference mode ``loss`` is computed on the part
    of the prediction that overlaps ``target`` and ``y`` is the complete
    prediction, including the ``future`` steps, as a NumPy array.
    """
    if inference:
        if optimiser is not None:
            optimiser.zero_grad()
        with torch.no_grad():
            pred = model(input, future_preds=future)
            # Score only the steps that have a target. An explicit end index
            # is used because `pred[:, :-future]` is empty when future == 0.
            n_known = pred.size(1) - future
            loss = loss_fn(pred[:, :n_known], target)
        # .cpu() keeps the conversion working when the model runs off-CPU.
        return loss.item(), pred.detach().cpu().numpy()

    losses = []

    def closure():
        # Closure-based optimisers such as LBFGS may call this several times
        # within a single step; every evaluation is recorded.
        optimiser.zero_grad()
        out = model(input)
        loss = loss_fn(out, target)
        losses.append(loss.item())
        loss.backward()
        return loss

    optimiser.step(closure)
    return losses[0], None
    

def plot_validation(log_path, idx, train_input, y, future):
    """Save a figure of up to three predicted sequences.

    log_path : directory in which the image is written
    idx : step index; used in the title (as idx+1) and in the file name
    train_input : input the predictions were made from; only its second
                  dimension (the number of observed steps) is used
    y : predictions, indexable by sequence, each of length
        train_input.shape[1] + future
    future : number of extrapolated steps at the end of each prediction

    The first train_input.shape[1] steps are drawn as a solid line and the
    remaining steps as a dotted line of the same colour. The file is saved
    as "predict<idx>.png" inside ``log_path``.
    """
    plt.figure(figsize=(12,6))
    plt.title(f"Step {idx+1}")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)

    n = train_input.shape[1] # number of observed steps
    observed_steps = np.arange(n)
    future_steps = np.arange(n, n+future)
    # zip stops at the shorter argument, so fewer than three sequences are
    # plotted without raising an IndexError.
    for yi, colour in zip(y, ('r', 'b', 'g')):
        plt.plot(observed_steps, yi[:n], colour, linewidth=2.0)
        plt.plot(future_steps, yi[n:], colour+":", linewidth=2.0)

    plt.savefig(os.path.join(log_path,"predict%d.png"%idx), dpi=200)
    plt.close()


In [None]:
from copy import deepcopy
import os 

def training_loop(n_train_epoch, validation_interval, model, optimiser, criterion,
                  train_input, train_target, valid_input, valid_target,
                  log_path=None):
    """Train ``model`` and periodically validate, plot and checkpoint it.

    n_train_epoch : number of training epochs (one optimiser step each)
    validation_interval : validate on every epoch index divisible by this
    model, optimiser, criterion : forwarded to ``run_epoch``
    train_input, train_target : training tensors
    valid_input, valid_target : validation tensors
    log_path : root directory for outputs; when ``None`` the module-level
               ``LOG_PATH`` is used

    Side effects: creates ``<log_path>/model`` and ``<log_path>/plots``,
    writes one prediction plot per validation round, and overwrites
    ``<log_path>/model/best_lstm_model.pth`` whenever the validation loss
    improves on the best value seen so far.

    Returns ``(training_losses, valid_losses)``: one entry per epoch and one
    entry per validation round respectively.
    """
    future = 1000  # number of steps extrapolated in each validation round
    best_model_name = 'best_lstm_model.pth'

    log_root = LOG_PATH if log_path is None else log_path
    model_dir = os.path.join(log_root, 'model')
    plots_dir = os.path.join(log_root, 'plots')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    training_losses, valid_losses = [], []
    best_valid_loss = np.inf

    for i in range(n_train_epoch):
        loss, _ = run_epoch(model, optimiser, criterion, train_input, train_target)
        training_losses.append(loss)

        if i % validation_interval != 0:
            continue

        valid_loss, y = run_epoch(model, optimiser, criterion, valid_input, valid_target,
                                  future=future, inference=True)
        valid_losses.append(valid_loss)
        plot_validation(plots_dir, i, valid_input, y, future)

        if valid_loss < best_valid_loss:
            print('----- Saving model after validation on epoch {:d} loss {:.6f} < {:.6f} -----'.format(i+1, valid_loss, best_valid_loss))
            # torch.save serialises the tensors immediately, so no copy of the
            # state dict is needed first.
            torch.save(model.state_dict(), os.path.join(model_dir, best_model_name))
            best_valid_loss = valid_loss

    return training_losses, valid_losses


def plot_losses(train_losses, valid_losses, validation_interval, optim='adam'):
    """Save the log10 training and validation loss curves to a PNG file.

    train_losses : one loss value per training epoch
    valid_losses : one loss value per validation round
    validation_interval : spacing, in epochs, between validation rounds;
                          used to place the validation points on the x axis
    optim : label inserted into the output file name

    Training loss is drawn in blue and validation loss in green. The figure
    is written to "train_valid_losses_<optim>.png" in the working directory.
    """
    plt.figure(figsize=(12, 6))
    plt.title("Train and Validation losses")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)

    train_epochs = np.arange(len(train_losses))
    valid_epochs = np.arange(len(valid_losses)) * validation_interval
    curves = (
        (train_epochs, train_losses, 'b'),
        (valid_epochs, valid_losses, 'g'),
    )
    for epochs, losses, colour in curves:
        plt.plot(epochs, np.log10(losses), colour, linewidth=2.0)

    plt.savefig("train_valid_losses_{}.png".format(optim), dpi=200)
    plt.close()

## Instantiate the LSTM model with Adam optimizer

In [None]:
from torch import optim

# Fresh model with the default hidden size, a mean-squared-error loss, and Adam.
model1 = LSTM()
criterion = nn.MSELoss()
optimiser1 = optim.Adam(model1.parameters(), lr=0.0001)
# NOTE(review): `_dh` is presumably IPython's directory history, whose first
# entry is the directory the notebook was started in - confirm. The lookup
# raises KeyError when `_dh` is not defined (e.g. outside IPython/Jupyter).
current_folder = globals()['_dh'][0]
# training_loop reads this module-level name to decide where to write output.
LOG_PATH = os.path.join(current_folder,'logging_adam')

## Train the model with Adam

In [None]:
# training_loop validates when the epoch index is divisible by VALID_INTERVAL,
# so with these values validation runs at epoch indices 0 and 50.
N_EPOCHS = 100
VALID_INTERVAL = 50
training_losses, valid_losses = training_loop(N_EPOCHS, VALID_INTERVAL, model1, optimiser1, criterion, 
                train_input, train_target, valid_input, valid_target)
# Writes train_valid_losses_adam.png to the working directory.
plot_losses(training_losses, valid_losses, VALID_INTERVAL, optim='adam')

## Instantiate and train the model with LBFGS optimizer

In [None]:
# Second, independently initialised model trained with LBFGS instead of Adam.
model2 = LSTM()
optimiser2 = optim.LBFGS(model2.parameters(), lr=0.08)
# Re-point the module-level LOG_PATH so this run writes to its own directory.
LOG_PATH = os.path.join(current_folder,'logging_lbfgs')

# With these values validation runs at epoch indices 0, 5 and 10.
N_EPOCHS = 15
VALID_INTERVAL = 5
training_losses, valid_losses = training_loop(N_EPOCHS, VALID_INTERVAL, model2, optimiser2, criterion, 
                  train_input, train_target, valid_input, valid_target)
# Writes train_valid_losses_lbfgs.png to the working directory.
plot_losses(training_losses, valid_losses, VALID_INTERVAL, optim='lbfgs')

`Task 01`: 
1. What did you observe about the computation time for a single epoch?
2. What about the convergence rate of the two optimizers?
3. LBFGS is a quasi-Newton optimizer: instead of computing the Hessian, it builds an approximation of it from recent gradients. A full analysis of Hessian-based approaches is beyond the scope of this course, but it is an interesting topic, so read a bit about Hessian-based optimization. Then reason about its computational complexity and convergence rate.

In [None]:
#-------- Run inference ------------
inference_model = LSTM()
#Load adam optimized model
inference_model.load_state_dict(torch.load(os.path.join(os.path.join(current_folder,'logging_adam'),'model', f'best_lstm_model.pth'), weights_only=True))
inference_model.eval()
FUTURE = 1000
loss, y = run_epoch(inference_model, optimiser2, criterion, test_input, test_target, 
                                future=FUTURE, inference=True)
plot_validation('./', 1, test_input, y, FUTURE)

In [None]:
# Evaluate the best LBFGS checkpoint on the held-out test sequences.
inference_model = LSTM()
#Load lbfgs optimized model
inference_model.load_state_dict(torch.load(os.path.join(os.path.join(current_folder,'logging_lbfgs'),'model', f'best_lstm_model.pth'), weights_only=True))
inference_model.eval()
FUTURE = 1000
# NOTE(review): optimiser2 is bound to model2's parameters, not to
# inference_model. In inference mode run_epoch only calls zero_grad() on it.
loss, y = run_epoch(inference_model, optimiser2, criterion, test_input, test_target, 
                                future=FUTURE, inference=True)
# Saves ./predict2.png
plot_validation('./', 2, test_input, y, FUTURE)

`Task 02`: Comment on the performance of the two models during training and inference.

Note: You will find training plots per epoch in the `logging_adam/plots` and `logging_lbfgs/plots` directories