This is, in theory, a super simple example of how Long Short-Term Memory Neural Networks work. We'll start by implementing a single "memory cell" that we'll duplicate (reusing all the weights and biases) for each element in the input.

First, import the modules...

In [3]:
import torch # torch will allow us to create tensors.
import torch.nn as nn # torch.nn allows us to create a neural network.
import torch.nn.functional as F # nn.functional give us access to the activation and loss functions.
# from torch.optim import SGD # optim contains many optimizers. Here, we're using SGD, stochastic gradient descent.
from torch.optim import Adam # optim contains many optimizers. Here, we're using Adam

import lightning as L # lightning has tons of cool tools that make neural networks easier
from torch.utils.data import TensorDataset, DataLoader # these are needed for the training data

from torchmetrics import Accuracy

import matplotlib.pyplot as plt ## matplotlib allows us to draw graphs.
import seaborn as sns ## seaborn makes it easier to draw nice-looking graphs.

## Set the seed so that, hopefully, everyone will get the same results as me.
from pytorch_lightning.utilities.seed import seed_everything


In [None]:
## Scratch check: draw one random value from a normal distribution with mean 0.0 and
## standard deviation 1.0 (both passed as 0-dimensional tensors).
torch.normal(mean=torch.tensor(0.0), std=torch.tensor(1.0))

In [None]:
class BasicLightningTrain(L.LightningModule):
    """A hand-built, single-unit LSTM that is unrolled over an input sequence.

    The unit has twelve scalar parameters: two weights and one bias for each of
    the four stages (forget, remember, potential memory, output). The same
    parameters are reused for every element of the input sequence.
    """

    def __init__(self): # __init__() is the class constructor function, and we use it to initialize the weights and biases.
        """Create the weights (random, standard normal) and the biases (zero)."""

        super().__init__() # initialize an instance of the parent class, LightningModule.

        ## Seed before drawing the random weights so that, hopefully, everyone gets the
        ## same starting values. NOTE(review): this runs on every instantiation - confirm
        ## that re-seeding each time a model is created is intended.
        seed_everything(seed=42)

        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)

        ## NOTE: the parameters are created in a fixed order so that, for a given seed,
        ## each weight always receives the same random draw.

        ## forget stage: what percentage of the long-term memory (cell state) to keep
        self.wf1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wf2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bf1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        ## remember stage: what percentage of the potential memory to add
        self.wr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.br1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        ## potential memory stage: the candidate value for the long-term memory
        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        ## output stage: what percentage of the long-term memory becomes the new short-term memory
        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        ## The learning rate handed to the optimizer. 0.001 is Adam's own default, so
        ## training is unchanged unless this attribute is overwritten (for example with
        ## the value suggested by a learning rate finder).
        self.learning_rate = 0.001

    def _lstm_unit(self, input_value, cell_state, hidden_state):
        """Run one LSTM step and return the updated (cell_state, hidden_state).

        Args:
            input_value: the current element of the input sequence.
            cell_state: the long-term memory coming out of the previous step.
            hidden_state: the short-term memory coming out of the previous step.
        """
        ## All four stages look at the *previous* hidden state and the current input.
        forget_percent = torch.sigmoid((hidden_state * self.wf1) + (input_value * self.wf2) + self.bf1)
        remember_percent = torch.sigmoid((hidden_state * self.wr1) + (input_value * self.wr2) + self.br1)
        potential_memory = torch.tanh((hidden_state * self.wp1) + (input_value * self.wp2) + self.bp1)
        output_percent = torch.sigmoid((hidden_state * self.wo1) + (input_value * self.wo2) + self.bo1)

        ## Keep part of the old long-term memory and add part of the new candidate...
        cell_state = (cell_state * forget_percent) + (remember_percent * potential_memory)
        ## ...then derive the new short-term memory from the updated long-term memory.
        hidden_state = torch.tanh(cell_state) * output_percent

        return cell_state, hidden_state

    def forward(self, input):
        """Unroll the LSTM unit over `input` and return the final hidden state.

        Args:
            input: a one-dimensional sequence of values (one per day). Every element
                is processed in order, so sequences of any length are accepted; for a
                four-element sequence the result is the same as running the unit on
                input[0], input[1], input[2] and input[3] in turn.

        Returns:
            The hidden state (short-term memory) after the last element. For an empty
            sequence the loop never runs and the initial hidden state, 0, is returned.
        """
        ## Both memories start out empty.
        cell_state = 0
        hidden_state = 0

        for day_value in input:
            cell_state, hidden_state = self._lstm_unit(day_value, cell_state, hidden_state)

        ##### Now return the hidden_state as the output
        return hidden_state

    def configure_optimizers(self): # this configures the optimizer we want to use for backpropagation.
        """Return an Adam optimizer over all parameters, using self.learning_rate."""
        return Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, batch, batch_idx): # take a step during gradient descent.
        """Compute, log and return the squared residual for one batch.

        Args:
            batch: an (inputs, labels) pair. Only the first sequence, inputs[0], is run
                through the network, so this assumes one sequence per batch.
            batch_idx: index of the batch within the epoch; only used to pick the log name.

        Returns:
            The squared difference between the prediction and the label.
        """
        ## NOTE: When training_step() is called it calculates the loss with the code below...
        input_i, label_i = batch # collect input
        output_i = self.forward(input_i[0]) # run input through the neural network
        loss = (output_i - label_i)**2 ## loss = squared residual

        self.log("train_loss", loss)

        ## Log the prediction under a name that alternates with the batch: even batches go
        ## to "out_0" and odd batches to "out_1". Using batch_idx keeps this stateless.
        ## (Assumes two one-sequence batches per epoch, served in order - verify against the dataloader.)
        if batch_idx % 2 == 0:
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        ##...before calling (internally and behind the scenes)...
        ## optimizer.zero_grad() # to clear gradients
        ## loss.backward() # to do the backpropagation
        ## optimizer.step() # to update the parameters
        return loss

In [None]:
## Training data for the neural network: one row per sequence, one column per day.
inputs = torch.tensor([
    [0., 0.5, 0.25, 1.],  # the sequence whose label is 0
    [1., 0.5, 0.25, 1.],  # the sequence whose label is 1
])
labels = torch.tensor([0., 1.])  # one target value per row of `inputs`

## Pair each sequence with its label, then wrap the pairs in a DataLoader
## (left on its default settings) so the trainer can iterate over them.
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [None]:
## Build a fresh, untrained model and show what it looks like before training.
model = BasicLightningTrain()

print("Before...")
## list every parameter by name together with its current value
print("Parameters...")
for param_name, param_value in model.named_parameters():
    print(param_name, param_value.data)

## Run two example sequences through the model; they differ only in the first day.
## NOTE(review): the last day is 0.75 here but 1. in the training data - confirm that is intended.
print("\nOutput Values...")
for first_day in (0., 1.):
    print(model(torch.tensor([first_day, 0.5, 0.25, 0.75])).detach())


In [None]:
## Train the model on the dataloader for at most 4000 epochs.
trainer = L.Trainer(max_epochs=4000)
trainer.fit(model, train_dataloaders=dataloader)

print("\nAfter...")
## list every parameter by name together with its trained value
print("Parameters...")
for param_name, param_value in model.named_parameters():
    print(param_name, param_value.data)

## Run the same two example sequences (differing only in the first day) through the trained model.
print("\nOutput Values...")
for first_day in (0., 1.):
    print(model(torch.tensor([first_day, 0.5, 0.25, 0.75])).detach())

In [None]:
## So we are close. How do I add an additional 1000 epochs without starting over?
## 3 Features:
## 1) Picking up where we left off
## 2) Logging and visualizing the logs (perhaps with comet or tensorboard).
## 3) LSTM module?
## - ask the team?
## Ask the previous trainer's checkpoint callback where its best checkpoint was saved.
## By default, "best" = "most recent"
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path
print(f"The new trainer will start where the last left off, and the check point data is here: {path_to_best_checkpoint}\n")

## A new trainer with a higher epoch limit, started from that checkpoint
## instead of from scratch.
trainer = L.Trainer(max_epochs=5000)
trainer.fit(model, train_dataloaders=dataloader, ckpt_path=path_to_best_checkpoint)

print("\nAfter...")
## list every parameter by name together with its current value
print("Parameters...")
for param_name, param_value in model.named_parameters():
    print(param_name, param_value.data)

## Run the two example sequences (differing only in the first day) through the model again.
print("\nOutput Values...")
for first_day in (0., 1.):
    print(model(torch.tensor([first_day, 0.5, 0.25, 0.75])).detach())

In [None]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/

In [22]:
## Scratch cell for getting a feel for how the dataloader hands out data.

## The same training data as before: one row per sequence, one column per day.
inputs = torch.tensor([
    [0., 0.5, 0.25, 1.],
    [1., 0.5, 0.25, 1.],
])
labels = torch.tensor([0., 1.])  # one target value per row of `inputs`

dataset = TensorDataset(inputs, labels)

## One sequence per batch; change batch_size to 2 to receive both sequences in a single batch.
dataloader = DataLoader(dataset, batch_size=1)

In [21]:
## torch.transpose() has to be told which two dimensions to swap - calling it with only
## the tensor raises a TypeError. Swapping dimensions 0 and 1 of this 2-D tensor turns
## the one-row-per-sequence layout into one row per day.
print(torch.transpose(inputs, 0, 1))


In [23]:
## Walk through every batch the dataloader produces and print what is inside.
for i, batch in enumerate(dataloader):
    ## Each batch is a plain list holding two tensors, [inputs, labels]. A list has no
    ## .shape attribute, so unpack it and ask the tensors themselves.
    batch_inputs, batch_labels = batch
    print("index: " + str(i))
    print("batch: " + str(batch))
    print("\tbatch shape: " + str(batch_inputs.shape))
    print("input: " + str(batch_inputs))
    print()
    ## list the daily values of the first sequence in this batch
    for j, value in enumerate(batch_inputs[0]):
        print("\tday: " + str(j) + " " + str(value))
    print("\nlabel: " + str(batch_labels) + "\n")

In [7]:
## Pull only the first (inputs, labels) pair out of the dataloader, then show the
## inputs followed by the shape of each half of the pair.
first_pair = next(iter(dataloader))
batch, y = first_pair
for shown in (batch, batch.shape, y.shape):
    print(shown)

tensor([[0.0000, 0.5000, 0.2500, 1.0000]])
torch.Size([1, 4])
torch.Size([1])


In [16]:
## Same inspection of the first batch, this time with a caption in front of each value.
## (print() converts each argument to a string itself, so no explicit str() is needed.)
batch, y = next(iter(dataloader))
print("batch:", batch)
print("batch.size():", batch.size(), "\n")
print("y:", y)
print("y.size():", y.size())

batch: tensor([[0.0000, 0.5000, 0.2500, 1.0000]])
batch.size(): torch.Size([1, 4]) 

y: tensor([0.])
y.size(): torch.Size([1])


In [None]:
## Start over with a fresh model and let the trainer suggest a learning rate for it.
model = BasicLightningTrain() # First, make model from the class

## Now create a Trainer - we can use the trainer to...
##  1) Find the optimal learning rate
##  2) Train (optimize) the weights and biases in the model
## By default, the trainer will run on your system's CPU
trainer = L.Trainer(max_epochs=34) 
## However, if we wanted to automatically take advantage of any available GPUs,
## we would set accelerator="auto" to automatically use available GPUs
## and we would set devices="auto" to automatically select as many GPUs as we have.
#
# trainer = L.Trainer(max_epochs=34, accelerator="auto", devices="auto")

## Now let's find the optimal learning rate
## NOTE(review): the learning rate finder presumably needs the model to expose a learning rate
## attribute that configure_optimizers() actually uses - verify against BasicLightningTrain.
lr_find_results = trainer.tuner.lr_find(model,
                                        train_dataloaders=dataloader, # the training data
                                        min_lr=0.001, # minimum learning rate
                                        max_lr=1.0,   # maximum learning rate
                                        early_stop_threshold=None) # setting this to "None" tests all 100 candidate rates
new_lr = lr_find_results.suggestion() ## suggestion() returns the best guess for the optimal learning rate

## now print out the learning rate
print(f"lr_find() suggests {new_lr:.5f} for the learning rate.")

# now set the model's learning rate to the new value
model.learning_rate = new_lr

## NOTE: we can also plot the loss for each learning rate tested.
## When you have a lot of data, this graph can be useful
## (see https://pytorch-lightning.readthedocs.io/en/1.4.5/advanced/lr_finder.html to learn how to interpret)
## but when you only have a couple of data points, like our example, this plot is pretty hard to interpret so I did
## not cover it in the video.
# fig = lr_find_results.plot(suggest=True)
# fig.show()

In [None]:
## Now that we have an improved learning rate, we can train the model
## (optimize all of its weights and biases).
trainer.fit(model, train_dataloaders=dataloader)

## BasicLightningTrain has no `final_bias` attribute, so asking for it raises an
## AttributeError; print every parameter the model does have instead.
for name, param in model.named_parameters():
    print(name, param.data)

In [None]:
## Plot the model's output for a range of inputs and label the axes.
## NOTE(review): `input_doses` is not defined anywhere in the visible part of this notebook,
## and this model's forward() treats its argument as one sequence of daily values rather than
## a list of independent doses - this cell looks carried over from a different example;
## confirm before running.

## run the different doses through the neural network
output_values = model(input_doses)

## set the style for seaborn so that the graph looks cool.
sns.set(style="whitegrid")

## create the graph (you might not see it at this point, but you will after we save it as a PDF).
sns.lineplot(x=input_doses, 
             y=output_values.detach(), ## NOTE: we call detach() so the plotted values are no longer tracked for gradients
             color='green', 
             linewidth=2.5)

## now label the y- and x-axes.
plt.ylabel('Effectiveness')
plt.xlabel('Dose')

## lastly, save the graph as a PDF.
# plt.savefig('BasicLightningTrain_optimized.pdf')