# Lab 3b: LSTM
In this second part of the lab, you will use a similar language class and LSTM as in part (a) to make a text generator.

In [None]:

# import pytorch
import torch
import torch.nn as nn
from torch import Tensor
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import lightning as L

# import basic functionality
import random
import numpy as np
import pandas as pd
import itertools
import re
import string
import time


In [None]:

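# download a collection of short stories
# (note: this is the github.com "blob" URL rather than a raw-file URL; the next cell
#  cuts the story text out of the downloaded page using the 'rawLines":[' marker)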
!mkdir -p ~/short_stories
%cd ~/

!wget -P ~/short_stories/ https://github.com/evocellnet/bc_deep_learning_in_biology/blob/main/lab3/Lab3b_LSTM_short_stories.txt


In [None]:

# put your favorite text here
with open('short_stories/Lab3b_LSTM_short_stories.txt') as f:
    lines = f.readlines()

# take the first line of the download and keep what lies between the
# 'rawLines":[' marker and the 'TEXT TAKEN FROM' footer
raw_payload = lines[0].split('rawLines":[')[1]
lines = raw_payload.split('TEXT TAKEN FROM')[0]
cur_text = lines.lower()

# every character that is not a lowercase letter, a dot or a blank becomes a
# blank; runs of blanks are then collapsed into a single one
letters = string.ascii_lowercase+'. '
filtered_chars = "".join(ch if ch in letters else ' ' for ch in cur_text)
cur_text_trimmed = re.sub(' +', ' ', filtered_chars).strip()

# keep only the first `use_nr_sentences` sentences and close with a final dot
use_nr_sentences = 50
all_sentences = cur_text_trimmed.split('. ')
cur_text = ". ".join(all_sentences[:use_nr_sentences])+'.'
cur_text

In [None]:
# as before we define a language
class Lang:
    """Vocabulary that maps words to indices (or one-hot vectors) and back.

    Typical use: add all words with addSentence()/addWord(), then call
    to_encoding() once the vocabulary is complete.
    """

    def __init__(self, name):
        self.name = name
        # word -> integer index; after to_encoding(): word -> one-hot vector
        self.word2index = {}
        # integer index -> word (never modified by to_encoding())
        self.index2word = {}
        # number of distinct words seen so far (starts at 0, no tokens reserved)
        self.n_words = 0

    def addSentence(self, sentence, codon_length=None):
        """Add every element of `sentence` to the vocabulary.

        `sentence` is iterated element by element, so pass a list of words
        (iterating a plain string would add its single characters).
        `codon_length` is not used; it is only accepted so that existing
        calls that pass it keep working.
        """
        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        """Register `word` under the next free index if it is not known yet."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

    def to_encoding(self):
        """Replace the integer indices in word2index by one-hot vectors.

        The vectors are derived from index2word, which always holds the
        integer indices, so calling this method more than once yields the
        same result instead of corrupting the encoding.
        """
        size = len(self.word2index)
        for index, word in self.index2word.items():
            one_hot = np.zeros(size, dtype=np.int32)
            one_hot[index] = 1
            self.word2index[word] = one_hot


# function for encoding a single input word
def encode_input(language, sentence):
    """Look up `sentence` (one vocabulary entry) in `language.word2index`.

    Returns a one-element list holding a fresh numpy array of the stored value.
    """
    encoded = np.array(language.word2index[sentence])
    return [encoded]
    
# function for encoding a target sentence
def encode_target(language, sentence):
    """Return a numpy array with one class index per element of `sentence`.

    Each index is the argmax of the element's entry in `language.word2index`.
    """
    return np.array([np.argmax(language.word2index[word]) for word in sentence])


# Step 1: Create a language for your text
We need to encode the text that we use for training as before. We'll use the same language for encoding/decoding input and outputs of the model.

In [None]:

# create a language for encoding the words in your text
gen_lang = Lang("text_gen")

########################
### create the language
########################
# (exercise) register the words of cur_text in gen_lang here.
# NOTE: encode_target() applies np.argmax to the entries of gen_lang.word2index,
# so they presumably have to be one-hot vectors by then (see Lang.to_encoding).

# number of words in language
gen_lang.n_words


In [None]:

# create input-target word pairs: every word is the input for the word that follows it
cur_sent = cur_text
words = cur_sent.split()
use_sentences_inp = words[:-1]   # all words but the last
use_sentences_tar = words[1:]    # the same words shifted by one position

# look at input-target pairs
use_sentences_inp[:5], use_sentences_tar[:5]


In [None]:

# encode sentences
# inputs: one single-element list per word, looked up in gen_lang.word2index
input_encoded = [encode_input(gen_lang, inp_word) for inp_word in use_sentences_inp]
# targets: one class index per word
target_encoded = encode_target(gen_lang, use_sentences_tar)

# look at input-target encodings
input_encoded[:3], target_encoded[:3]


In [None]:

# create input/target tensors for model
input_array = np.array(input_encoded)
target_array = np.array(target_encoded)

# inputs are cast to double; targets keep the dtype of the encoded indices
inp = torch.from_numpy(input_array).double()
tar = torch.from_numpy(target_array)

# look at input-target tensors
inp, tar


# Step 2: Define model
We define and train our model class as before. Here, we use a simple, manual training loop without cross-validation to speed up the training process and to give some insight into the details.

In [None]:

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:

########################
### create the model architecture
########################
# Exercise template: the two lines marked "pseudocode" below are placeholders
# and are not runnable as written -- replace them with a real nn.LSTM
# definition and call.
class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyLSTM,self).__init__()
        
        # save input parameters
        # define model layers (LSTM) (pseudocode:)
        # batch_first = True: the LSTM takes its input as (batch, seq, feature)
        self.LSTM = nn.LSTM(input_1, ..., input_k, batch_first = True)
        
    def forward(self,inp):
        # move the input to the globally selected device; the model weights
        # (and any initial states created here) have to live on that device too
        inp = inp.to(device)
        
        # define initial hidden and cell states of LSTM
        # run LSTM (pseudocode:)
        output_1, ..., output_n = self.LSTM(input_1, ..., input_m) 
        
        # NOTE: `output` has to be assigned above; the training loop reshapes the
        # returned tensor with .view(len(inp), gen_lang.n_words)
        return output


In [None]:

# initialize model and move it to the device that forward() sends its input to
# (without this, the weights stay on the CPU and training fails on a GPU machine)
model = MyLSTM(gen_lang.n_words, gen_lang.n_words, gen_lang.n_words).double().to(device)

########################
### define the loss function and optimizer
########################
optimizer = ... # optimizer
criterion = ... # loss function

# the targets have to be on the same device as the model output
tar_on_device = tar.to(device)

#Number of iterations
epochs = 150
start = time.time()
for itr in range(epochs):

    # zero the previous gradients
    model.zero_grad()
    optimizer.zero_grad()

    #Find the output
    output = model(inp)

    #Reshape the output to 2 dimensions. This is done, so that we can compare with target and get loss
    output = output.view(len(inp), gen_lang.n_words)

    # compute loss of model output for target text
    loss = criterion(output, tar_on_device)

    # print loss for every x-th iteration
    # (.item() extracts the plain Python number instead of printing the tensor repr)
    if itr%20==0:
        print('Iteration : '+str(itr)+' Loss : '+str(loss.item()) )

    #Back propagate the loss
    loss.backward()

    #Perform weight update
    optimizer.step()

print('Time taken to train : '+str(time.time()-start)+" seconds")


# Step 3: Test the model
Finally, we test the model by letting it continue a seed sequence of our choice.

In [None]:

# function predicts the next word given the sequence
def predict_word(s):
    """Return the word the model considers most likely to follow `s`.

    `s` is a whitespace-separated string; every word in it must be a key of
    gen_lang.word2index (encode_input raises KeyError otherwise).
    """
    # get the input tensor
    input_encoded = [encode_input(gen_lang, word) for word in s.split()]
    input_tensor = torch.from_numpy(np.array(input_encoded)).double()

    # get the output from your model; no gradients are needed for prediction
    with torch.no_grad():
        out = model(input_tensor)

    # map output to most likely word in your language
    # (.item() copies the index to the host, so this also works when the
    #  model output lives on a GPU, where .numpy() would raise)
    best_index = out[-1][0].topk(1)[1].item()
    return gen_lang.index2word[best_index]
         

# This function repeatedly extends the sequence with the word predicted by the trained model
def generate_sentence(cur_sent):
    """Print the growing sentence, one line per newly generated word.

    Generation stops once the text ends with a dot or is longer than
    100 characters. Nothing is returned.
    """
    sentence = cur_sent
    # keep generating until we find a dot (or the text gets too long)
    while sentence[-1] != '.' and len(sentence) <= 100:
        # predict from the text so far and append the predicted word
        sentence = sentence + ' ' + predict_word(sentence)
        print(sentence)
    

In [None]:

# each printed line extends the sentence by one newly generated word;
# the seed has to consist of words that are keys of gen_lang.word2index,
# because encode_input looks every word up there
generate_sentence('boat')


In [None]:

# view words in your dictionary for input (iterating the dict yields its keys)
set(gen_lang.word2index)


# Step 4: Questions
1) Define an LSTM, an optimizer and a loss function.
2) How does the training depend on the size of the hidden layer?
3) Train the model on a different / longer text.
4) Can you modify the lab to encode letters instead of words (i.e., a letter-by-letter text generator instead of a word generator)? Is this beneficial?
