# Recurrent Neural Networks and Language Models

You guys probably very excited about ChatGPT.  In today class, we will be implementing a very simple language model, which is basically what ChatGPT is, but with a simple LSTM.  You will be surprised that it is not so difficult at all.

Paper that we base on is *Regularizing and Optimizing LSTM Language Models*, https://arxiv.org/abs/1708.02182

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [2]:
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda:2


## 1. Load data - Wiki Text

We will be using wikitext which contains a large corpus of text, perfect for language modeling task.  This time, we will use the `datasets` library from HuggingFace to load.

In [3]:
import os
os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
# https://huggingface.co/datasets/codeparrot/github-jupyter-code-to-text/tree/main dataset
#there are raw and preprocessed version; we used the raw one and preprocessed ourselves for fun
import datasets 
train_jupyter = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="train")
test_jupyter = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="test")
print(train_jupyter, test_jupyter)

Using custom data configuration codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1
Found cached dataset parquet (/root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Using custom data configuration codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1
Found cached dataset parquet (/root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 47452
}) Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 11864
})


In [4]:
#Split New line Sentence
train_split = [split for text in train_jupyter['content'] for split in text.split('\n') if split != ""]
test_split = [split for text in test_jupyter['content'] for split in text.split('\n') if split != ""]

In [5]:
len(train_split),len(test_split)

(11367363, 2875424)

## 2. Preprocessing

### Tokenizing

Simply tokenize the given text to tokens.

In [6]:
from torchtext.data.utils import get_tokenizer
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

tokenized_dataset_train = yield_tokens(train_split[:int(len(train_split)/100)])
tokenized_dataset_test = yield_tokens(test_split[:int(len(test_split)/100)])

In [7]:
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
import re

nlp = spacy.load('en_core_web_md')

def preprocessing(sentence):
    
    # Clear the html tag by using regular expression.
    sentence = re.sub("<[^>]*>", "", sentence)
    sentence = re.sub("[^\x00-\x7F]+", "", sentence) #extract non-english out
    #It matches any character which is not contained in the ASCII character set (0-127, i.e. 0x0 to 0x7F)
    stopwords = list(STOP_WORDS)
    doc = nlp(sentence)
    cleaned_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and \
            token.pos_ != 'SYM' and token.pos_!= 'X':
                cleaned_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(cleaned_tokens)

### Numericalizing

We will tell torchtext to add any word that has occurred at least three times in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add `unk` and `eos`.

In [8]:
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(preprocessing(text))

vocab = build_vocab_from_iterator(yield_tokens(train_split[:int(len(test_split)/100)]), min_freq=5) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print('Vocab Size',len(vocab))                         
print(vocab.get_itos()[:10])       

Vocab Size 2769
['<unk>', '<eos>', '=', 'explanation', '#', 'end', "'", 'import', 'datum', 'property']


In [9]:
with open('vocab.txt', 'w') as file:
    for item in vocab.get_itos():
        # write each item on a new line
        file.write("%s\n" % item)
    print('Done')

v = [line.rstrip() for line in open('vocab.txt', mode = 'r')]
print('Vocab Size check', len(v)) #not work

Done
Vocab Size check 2769


In [45]:
import pickle
# Store data (serialize)
with open('vocab.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load data (deserialize)
with open('vocab.pickle', 'rb') as handle:
    check_vocab = pickle.load(handle)
check_vocab #Good

Vocab()

## 3. Prepare the batch loader

### Prepare data

Given "Chaky loves eating at AIT", and "I really love deep learning", and given batch size = 3, we will get three batches of data "Chaky loves eating at", "AIT `<eos>` I really", "love deep learning `<eos>`".  

In [10]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:       
        #appends eos so we know it ends....so model learn how to end...                             
        tokens = example.append('<eos>') #end of sentence
        #numericalize          
        tokens = [vocab[token] for token in example] 
        data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)        
    return data

In [11]:
batch_size = 128
train_data = get_data(tokenized_dataset_train, vocab, batch_size)
valid_data = get_data(tokenized_dataset_test, vocab, batch_size)
# test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

In [12]:
train_data.shape #[batch_size, all the next length]

torch.Size([128, 8296])

## 4. Modeling 

In [13]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size,emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                                        dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)
        #when you do LM, you look forward, so it does not make sense to do bidirectional
        self.fc = nn.Linear(hid_dim,vocab_size)

    def init_hidden(self, batch_size, device):
        #this function gonna be run in the beginning of the epoch
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)

        return hidden, cell #return as tuple

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #removing this hidden from gradients graph
        cell =  cell.detach() #removing this hidden from gradients graph
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch_size, seq_len]

        #embed 
        embedded = self.embedding(src)
        #embed : [batch_size, seq_len, emb_dim]

        #send this to the lstm
        #we want to put hidden here... because we want to reset hidden .....
        output, hidden = self.lstm(embedded, hidden)
        #output : [batch_size, seq_len, hid_dim] ==> all hidden states
        #hidden : [batch_size, seq_len, hid_dim] ==> last hidden states from each layer

        output = self.dropout(output)
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab_size]
        return prediction, hidden

## 5. Training 

Follows very basic procedure.  One note is that some of the sequences that will be fed to the model may involve parts from different sequences in the original dataset or be a subset of one (depending on the decoding length). For this reason we will reset the hidden state every epoch, this is like assuming that the next batch of sequences is probably always a follow up on the previous in the original dataset.

In [14]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [15]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 22,467,281 trainable parameters


In [16]:
def get_batch(data, seq_len, idx):
    #this data is from get_data()
    #train_data.shape # [batch_size, number of batches....]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [17]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) #prevent gradient explosion - clip is basically 
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [18]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [19]:
n_epochs = 50
seq_len  = 50
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/best-val-auto.pt')
    print(f'\tepoch: {epoch+1}')
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 17.034
	Valid Perplexity: 11.250


                                                           

	Train Perplexity: 9.233
	Valid Perplexity: 9.612


                                                           

	Train Perplexity: 8.099
	Valid Perplexity: 9.060


                                                           

	Train Perplexity: 7.492
	Valid Perplexity: 8.703


                                                           

	Train Perplexity: 7.063
	Valid Perplexity: 8.460


                                                           

	Train Perplexity: 6.718
	Valid Perplexity: 8.290


                                                           

	Train Perplexity: 6.426
	Valid Perplexity: 8.205


                                                           

	Train Perplexity: 6.180
	Valid Perplexity: 8.134


                                                           

	Train Perplexity: 5.958
	Valid Perplexity: 8.025


                                                           

	Train Perplexity: 5.762
	Valid Perplexity: 7.985


                                                           

	Train Perplexity: 5.589
	Valid Perplexity: 8.000


                                                           

	Train Perplexity: 5.305
	Valid Perplexity: 7.905


                                                           

	Train Perplexity: 5.129
	Valid Perplexity: 7.929


                                                           

	Train Perplexity: 4.954
	Valid Perplexity: 7.942


                                                           

	Train Perplexity: 4.841
	Valid Perplexity: 7.947


                                                           

	Train Perplexity: 4.768
	Valid Perplexity: 7.957


                                                           

	Train Perplexity: 4.726
	Valid Perplexity: 7.968


                                                           

	Train Perplexity: 4.713
	Valid Perplexity: 7.955


                                                           

	Train Perplexity: 4.701
	Valid Perplexity: 7.947


                                                           

	Train Perplexity: 4.703
	Valid Perplexity: 7.943


                                                           

	Train Perplexity: 4.692
	Valid Perplexity: 7.940


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.698
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.697
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.693
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.693
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.695
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.695
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.691
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.695
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.690
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.696
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.691
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.695
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.691
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.691
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.697
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.686
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.690
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.693
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.700
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.692
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.693
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.694
	Valid Perplexity: 7.939


                                                           

	Train Perplexity: 4.693
	Valid Perplexity: 7.939


## 6. Testing

In [20]:
# model.load_state_dict(torch.load('best-val-auto.pt',  map_location=device))
# test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
# print(f'Test Perplexity: {math.exp(test_loss):.3f}')

## 7. Real-world inference

Here we take the prompt, tokenize, encode and feed it into the model to get the predictions.  We then apply softmax while specifying that we want the output due to the last word in the sequence which represents the prediction for the next word.  We divide the logits by a temperature value to alter the model’s confidence by adjusting the softmax probability distribution.

Once we have the Softmax distribution, we randomly sample it to make our prediction on the next word. If we get <unk> then we give that another try.  Once we get <eos> we stop predicting.
    
We decode the prediction back to strings last lines.

In [21]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [37]:
prompt = 'import matplotlib.pyplot'
max_seq_len = 30
seed = 0
            #superdiverse   more diverse
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0] 
#sample from this distribution higher probability will get more change
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
import matplotlib.pyplot as plt

0.7
import matplotlib.pyplot as plt

0.75
import matplotlib.pyplot as plt

0.8
import matplotlib.pyplot as plt

1.0
import matplotlib.pyplot as plt

