# Assignment 3 Top-Level Code/Notebook
### Training a language model based on Karpathy's minGPT codebase


In [1]:
# The code below is needed when using Google Colab, so uncomment it if that is what you're using
""" 
import nltk
nltk.download('punkt')
"""

" \nimport nltk\nnltk.download('punkt')\n"

In [2]:
# The code below is also needed when using Google Colab.
# BEFORE executing this, you must place the mingpt folder supplied in the assignment
# in your Google Drive, within the folder "Colab Notebooks".
#
# It mounts your Google Drive and changes into the folder that contains mingpt.
# So uncomment it once you've uploaded mingpt to your Google Drive, into the "Colab Notebooks" folder.
"""
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks/
""" 

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n%cd /content/drive/MyDrive/Colab\\ Notebooks/\n"

In [3]:
import torch 
import numpy as np

from nltk.tokenize import sent_tokenize 

from pathlib import Path 
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.bpe import BPETokenizer 
from mingpt.utils import set_seed 
set_seed(1234)

In [4]:
"""
Prepare the dataset to train the Language Model (LM)
This implementation splits the sentences and so doesn't create training 
examples that cross sentences.

This code is set so that it uses one of two possible datasets, which were also used in Assignment 1: 
SmallSimpleCorpus.txt or LargerCorpus.txt

Arguments:
            ds_choice: str. "small" or "large". (i.e. selects which of the two datasets)
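            split: str. "train" selects the first 80% of the sentences; any other value
                   (e.g. "test" or "validation") selects the remaining 20%.
            truncation: int. If negative (default -1): no truncation on sentences. Otherwise: truncate each
                   tokenized sentence to at most this many tokens.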
""" 

class LanguageModelingDataset(Dataset):
    """Sentence-level next-token-prediction dataset over one of two text corpora.

    The corpus file is read from the working directory, split into sentences
    with ``sent_tokenize``, divided 80/20 in document order (no shuffling) and
    each sentence of the selected part is BPE-tokenized into a 1-d tensor.
    Examples therefore never cross a sentence boundary.

    Arguments:
        ds_choice: str. "small" (SmallSimpleCorpus.txt) or "large" (LargerCorpus.txt).
            Any other value raises KeyError.
        split: str. "train" selects the first 80% of the sentences; any other
            value (e.g. "test" or "validation") selects the remaining 20%.
        truncation: int. If negative: sentences are kept whole. Otherwise each
            tokenized sentence is cut to at most this many tokens.
    """

    def __init__(self, ds_choice="small", split="train", truncation=-1):

        base_path = "./"
        fn = {"small": "SmallSimpleCorpus.txt", "large": "LargerCorpus.txt"}
        self.ds_choice = ds_choice
        self.truncation = truncation  # int. If negative, sentences are not truncated
        # Decode explicitly as UTF-8 instead of the platform's locale default, so
        # the corpus (and hence its tokenization) is the same on every machine.
        text = Path(base_path, fn[ds_choice]).read_text(encoding="utf-8")
        if ds_choice == "large":
            # Remove the newline char in the middle of sentences
            # The "paragraph splitting" newlines appear to be \n\n -- remove the duplications there
            text = text.replace("\n\n", "$$^^$$").replace("\n", " ").replace("$$^^$$", "\n")
        sentences = sent_tokenize(text)

        # Train / test split: ordered, first 80% for training, last 20% held out
        train, val = train_test_split(sentences, test_size=0.2, shuffle=False)
        raw_data = train if split == "train" else val

        # Tokenize
        self.tokenizer = BPETokenizer()
        self.data = []  # List of 1-d pytorch tensor
        for sent in raw_data:
            tokenized = self.tokenizer(sent).view(-1)  # pytorch tensor
            if truncation >= 0:
                tokenized = tokenized[:truncation]
            self.data.append(tokenized)

        # Longest sentence in tokens (after truncation), kept as a plain Python int
        self.max_sentence_length = max((len(d) for d in self.data), default=0)

    def __len__(self):
        """Number of sentences in the selected split."""
        return len(self.data)

    def get_vocab_size(self):
        """
        We have to set this to the max vocab size (i.e., that decided by the BPE tokenizer),
        but actually, only a small number of vocab is used, especially for the small text.
        """
        return 50257

    def __getitem__(self, idx):
        """
        Return the tuple (x, y) of pytorch tensors for sentence `idx`:
        x is the sentence without its last token and y is the sentence without
        its first token, i.e. y[i] is the token that follows x[i].
        Please refer to the `run()` method in the mingpt/trainer.py script for
        how the x and y are going to be used.
        """
        tokens = self.data[idx]
        x = tokens[:-1]
        y = tokens[1:]
        return (x, y)

    def get_block_size(self):
        """
        block_size is the token length of the longest sentence in this dataset
        (after truncation). It is returned as a built-in int so that no numpy
        scalar ends up in the model configuration.
        """
        return int(self.max_sentence_length)
    
# Choose the corpus ONCE so the training and validation datasets can never disagree:
#   "small" -> SmallSimpleCorpus.txt, sentences kept whole
#   "large" -> LargerCorpus.txt, sentences truncated to 512 tokens
DS_CHOICE = "small"
TRUNCATION = 512 if DS_CHOICE == "large" else -1

# Instantiate the Training Dataset
train_dataset = LanguageModelingDataset(ds_choice=DS_CHOICE, split="train", truncation=TRUNCATION)

# Instantiate a Validation Dataset (this is only really needed for the fine-tune task, not the LM task)
# Any split value other than "train" selects the held-out 20% of the sentences.
val_dataset = LanguageModelingDataset(ds_choice=DS_CHOICE, split="validation", truncation=TRUNCATION)

In [5]:
def lm_collate_fn(batch, device, pad_token_id=1):
    """Pad a batch of variable-length (x, y) pairs into two rectangular long tensors.

    Arguments:
        batch: list (len B) of (x, y) tuples of 1-d integer tensors.
        device: device the two stacked tensors are moved to.
        pad_token_id: int. Token id appended to sequences shorter than the
            longest x in the batch. Defaults to 1, the value this function
            has always padded with.

    Returns:
        (padded_x, padded_y): two long tensors of shape (B, maxlen) on `device`,
        where maxlen is the length of the longest x in the batch.
    """
    x = [item[0] for item in batch]  # List (len B) of varying lengths
    y = [item[1] for item in batch]  # List (len B) of the same lengths as x
    maxlen = max(len(s) for s in x)

    def pad(seq):
        # new_full builds the padding in the sequence's own integer dtype, so the
        # token ids never take a lossy round trip through float32.
        return torch.cat([seq, seq.new_full((maxlen - len(seq),), pad_token_id)])

    padded_x = torch.stack([pad(s) for s in x])
    padded_y = torch.stack([pad(s) for s in y])
    return padded_x.long().to(device), padded_y.long().to(device)


In [6]:
# Print one (x, y) training pair, first as raw token ids and then as decoded text.
# y is the same sentence shifted by one token (x drops the last token, y drops the first).
# The pair is processed further (padded per batch) once it reaches lm_collate_fn (above).
x,y = train_dataset[5]
print(x, y)
print("X: ",train_dataset.tokenizer.decode(x))
print("Y: ",train_dataset.tokenizer.decode(y))

tensor([  40, 6437,  262, 3290]) tensor([6437,  262, 3290,   13])
X:  I rub the dog
Y:   rub the dog.


In [7]:
from mingpt.model import GPT

# Start from the GPT class's default configuration and size the model for this dataset.
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
# The dataset reports the fixed BPE vocabulary size (50257), not the number of tokens actually seen.
model_config.vocab_size = train_dataset.get_vocab_size()
# block_size is the token length of the longest (possibly truncated) training sentence.
# NOTE(review): val_dataset is not consulted here -- confirm no validation sentence needs a larger block.
model_config.block_size = train_dataset.get_block_size()
# NOTE(review): presumably read by the assignment's GPT for the fine-tune (classification) task -- confirm in mingpt/model.py.
model_config.n_classification_class = 2
model = GPT(model_config)

number of parameters: 2.50M


In [8]:
# Create a Trainer object and set the core hyper-parameters
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 3000  # For small corpus: 3000 iterations is plenty. For large corpus: 100000 iterations are needed
train_config.num_workers = 0
train_config.batch_size = 4    # For small corpus, batch size of 4 is fine.  For large corpus use 16
# lm_collate_fn takes (batch, device) and pads every batch to a common length.
# NOTE(review): the Trainer is presumably what supplies the device argument -- confirm in mingpt/trainer.py.
trainer = Trainer(train_config, model, train_dataset, val_dataset, collate_fn=lm_collate_fn)

running on device cpu


In [9]:
# This function is called at the end of every batch in training.
# On every 100th iteration it prints the trainer's iter_dt (in milliseconds) and the training loss at that point

def batch_end_callback(trainer):
    """Print a one-line progress report on every 100th iteration; otherwise do nothing.

    The report shows trainer.iter_dt converted to milliseconds, the iteration
    number and the current training loss.
    """
    # Stay silent on every iteration that is not a multiple of 100.
    if trainer.iter_num % 100 != 0:
        return
    report = f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}"
    print(report)
# Register the progress report under the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Train!
trainer.run()

iter_dt 0.00ms; iter 0: train loss 10.82099
iter_dt 38.56ms; iter 100: train loss 5.97739
iter_dt 37.61ms; iter 200: train loss 2.52467
iter_dt 36.24ms; iter 300: train loss 1.45734
iter_dt 37.96ms; iter 400: train loss 0.82555
iter_dt 37.67ms; iter 500: train loss 0.81646
iter_dt 38.19ms; iter 600: train loss 0.79090
iter_dt 37.52ms; iter 700: train loss 0.67038
iter_dt 37.53ms; iter 800: train loss 0.66822
iter_dt 37.75ms; iter 900: train loss 0.56715
iter_dt 38.29ms; iter 1000: train loss 0.59438
iter_dt 37.78ms; iter 1100: train loss 0.76046
iter_dt 34.85ms; iter 1200: train loss 0.58739
iter_dt 37.26ms; iter 1300: train loss 0.59170
iter_dt 37.45ms; iter 1400: train loss 0.62839
iter_dt 37.51ms; iter 1500: train loss 0.66044
iter_dt 37.31ms; iter 1600: train loss 0.70982
iter_dt 37.72ms; iter 1700: train loss 0.75451
iter_dt 37.38ms; iter 1800: train loss 0.59662
iter_dt 38.26ms; iter 1900: train loss 0.59755
iter_dt 36.50ms; iter 2000: train loss 0.58447
iter_dt 37.34ms; iter 210

In [10]:
# Make sure the model is on the trainer's device before saving.
model.to(trainer.device)
# Store the trained weights (the model's state_dict) in a file so they can be re-used later.
modelsavename= "model_filename.pt"  # change the name here to save in a specific file (and restore below)
with open(modelsavename, "wb") as f:
    torch.save(trainer.model.state_dict(), f)

In [12]:
# Use the trained language model to predict a sequence of words following a few words.
# The prompt is BPE-tokenized and moved to the trainer's device; generate() is asked for
# up to 10 new tokens at temperature 0.8, and the first returned sequence is decoded to text.
encoded_prompt = train_dataset.tokenizer("He and I").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.8, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'He and I can hold a dog. cat. cat and dog'

In [13]:
# Another example: a different prompt and a lower temperature (0.6), again asking for up to 10 new tokens.
encoded_prompt = train_dataset.tokenizer("She rubs").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'She rubs a dog and cat. cat. cat. cat'

In [14]:
# The code below shows how to reload the model from the saved file; this is useful for
# models that take a long time to train.
# map_location remaps the stored tensors onto the trainer's current device, so a checkpoint
# written on a GPU runtime can still be restored on a CPU-only machine (and vice versa).
# NOTE: torch.load unpickles the file -- only load checkpoints you created yourself.
model.load_state_dict(torch.load(modelsavename, map_location=trainer.device))

<All keys matched successfully>

In [15]:
# Example showing that generation still works after the weights were reloaded (same prompt and
# settings as the earlier "She rubs" example).
# NOTE(review): generation goes through trainer.model while the weights were loaded into `model` --
# presumably the same object; confirm in mingpt/trainer.py.
encoded_prompt = train_dataset.tokenizer("She rubs").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'She rubs a cat and dog. dog. cat. cat'