# Assignment 3 Top-Level Code/Notebook
### Training a language model based on Karpathy's minGPT codebase


In [123]:
# The code below is needed for using Google Colab, so uncomment this if that is what you're using
""" 
import nltk
nltk.download('punkt')
"""

" \nimport nltk\nnltk.download('punkt')\n"

In [124]:
# The code below is also needed for using Google Colab
# BEFORE executing this, you must place the mingpt folder supplied in the assignment
# in your Google Drive, within the folder "Colab Notebooks"
#
# It mounts Google Drive and changes into the folder that contains mingpt, which you must upload to Google Drive
# So uncomment it if you've uploaded mingpt to your Google Drive, into the "Colab Notebooks" folder
"""
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks/
""" 

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n%cd /content/drive/MyDrive/Colab\\ Notebooks/\n"

In [125]:
import torch 
import numpy as np

from nltk.tokenize import sent_tokenize 

from pathlib import Path 
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.bpe import BPETokenizer 
from mingpt.utils import set_seed 
# Seed once, right after the imports, with a fixed value
# (presumably seeds the Python / NumPy / PyTorch RNGs so runs are repeatable -- confirm in mingpt/utils.py)
set_seed(1234)

In [126]:
# Exploratory preview of the preprocessing steps: read the chosen corpus, split it
# into sentences, keep the training portion, and print how the first few sentences
# are tokenized.
base_path = "./"
fn = {"small": "SmallSimpleCorpus.txt", "large": "LargerCorpus.txt", "got": "./data/Jon_1.txt"}
ds_choice = "got"
truncation = -1  # int. If -1, sentences are not truncated (this preview cell never applies it)
# Read as UTF-8 explicitly: the corpus contains non-ASCII punctuation (e.g. "—" and "’"),
# which would be mis-decoded on platforms whose default encoding is not UTF-8.
text = Path(base_path, fn[ds_choice]).read_text(encoding="utf-8")
if ds_choice == "got":
    # Remove the newline char in the middle of sentences
    # The "paragraph splitting" newlines appear to be \n\n -- remove the duplications there
    # ("$$^^$$" is a temporary placeholder protecting paragraph breaks while single newlines become spaces)
    text = text.replace("\n\n", "$$^^$$").replace("\n", " ").replace("$$^^$$", "\n")
sentences = sent_tokenize(text)

# Train / test split (ordered, not shuffled; the last 20% of the sentences are held out)
train, val = train_test_split(sentences, test_size=0.2, shuffle=False)
raw_data = train


# Tokenize
tokenizer = BPETokenizer()
data = []  # List of 1-d pytorch tensor (left empty here; this cell only prints)
for sent in raw_data[:10]:
    tokenized = tokenizer(sent).view(-1)  # pytorch tensor, flattened to 1-d
    print(tokenized)
    print(sent)

tensor([ 1858,   547,  1661,  ..., 42939,   757,    30])
There were times—not many <SEP> but a few—when Jon Snow was glad he was a bastard <SEP> As he filled his wine cup once more from a passing flagon <SEP> it struck him that this might be one of them <SEP> He settled back in his place on the bench among the younger squires and drank <SEP> The sweet <SEP> fruity taste of summerwine filled his mouth and brought a smile to his lips <SEP> The Great Hall of Winterfell was hazy with smoke and heavy with the smell of roasted meat and fresh-baked bread <SEP> Its grey stone walls were draped with banners <SEP> White <SEP> gold <SEP> crimson: the direwolf of Stark <SEP> Baratheon ’ s crowned stag <SEP> the lion of Lannister <SEP> A singer was playing the high harp and reciting a ballad <SEP> but down at this end of the hall his voice could scarcely be heard above the roar of the fire <SEP> the clangor of pewter plates and cups <SEP> and the low mutter of a hundred drunken conversations <SEP> 

In [127]:
"""
Prepare the dataset to train the Language Model (LM)
This implementation splits the sentences and so doesn't create training 
examples that cross sentences.

This code can use one of three datasets: SmallSimpleCorpus.txt or LargerCorpus.txt
(the two that were also used in Assignment 1), or ./data/Jon_1.txt.

Arguments:
            ds_choice: str. "small", "large" or "got". (i.e. selects which of the three datasets)
            split: str. "train" selects the training portion; any other value (e.g. "validation" or "test") selects the held-out portion.
            truncation: int. If negative (e.g. -1): no truncation on sentences. Otherwise: truncate each sentence to at most this many tokens.
""" 

class LanguageModelingDataset(Dataset):
    """
    Sentence-level dataset for training a language model.

    The chosen corpus is read from disk, split into sentences with `sent_tokenize`,
    divided into an ordered (unshuffled) split with the last 20% held out, and each
    sentence of the selected portion is encoded with the BPE tokenizer. Every item is
    one sentence, so training examples never cross sentence boundaries.

    Arguments:
        ds_choice: str. "small", "large" or "got" (key into the corpus table in __init__).
        split: str. "train" selects the training portion; any other value
            (e.g. "validation" or "test") selects the held-out portion.
        truncation: int. If negative (e.g. -1): no truncation. Otherwise each
            tokenized sentence is cut to at most this many tokens.
    """

    def __init__(self, ds_choice="large", split="train", truncation=-1):

        base_path = "./"
        fn = {"small": "SmallSimpleCorpus.txt", "large": "LargerCorpus.txt", "got": "./data/Jon_1.txt"}
        self.ds_choice = ds_choice
        self.truncation = truncation  # int. Negative means "do not truncate"
        # Read as UTF-8 explicitly so non-ASCII punctuation in the corpus is decoded
        # the same way on every platform, whatever the locale's default encoding is.
        text = Path(base_path, fn[ds_choice]).read_text(encoding="utf-8")
        if ds_choice == "got":
            # Remove the newline char in the middle of sentences
            # The "paragraph splitting" newlines appear to be \n\n -- remove the duplications there
            # ("$$^^$$" is a temporary placeholder protecting paragraph breaks)
            text = text.replace("\n\n", "$$^^$$").replace("\n", " ").replace("$$^^$$", "\n")
        sentences = sent_tokenize(text)

        # Train / test split (ordered; anything other than "train" gets the held-out part)
        train, val = train_test_split(sentences, test_size=0.2, shuffle=False)
        if split == "train":
            raw_data = train
        else:
            raw_data = val

        # Tokenize
        self.tokenizer = BPETokenizer()
        self.data = []  # List of 1-d pytorch tensor
        for sent in raw_data:
            tokenized = self.tokenizer(sent).view(-1)  # pytorch tensor, flattened to 1-d
            if truncation >= 0:
                self.data.append(tokenized[:truncation])
            else:
                self.data.append(tokenized)

        # Count some items
        # Built-in max keeps this a plain Python int rather than a NumPy scalar,
        # since the value is handed on as the model's block size.
        self.max_sentence_length = max(len(d) for d in self.data)

    def __len__(self):
        """Number of sentences in the selected split."""
        return len(self.data)

    def get_vocab_size(self):
        """
        We have to set this to the max vocab size (i.e., that decided by the BPE tokenizer),
        but actually, only a small number of vocab is used, especially for the small text.
        """
        return 50257

    def __getitem__(self, idx):
        """
        Return the tuple (x, y) for sentence `idx`, both as 1-d pytorch tensors:
        x is every token but the last, y is every token but the first, so y[i]
        is the token that follows x[i].
        Please refer to the `run()` method in the mingpt/trainer.py script for
        how the x and y are going to be used.
        """
        x = self.data[idx][:-1]
        y = self.data[idx][1:]
        return (x, y)

    def get_block_size(self):
        """
        block_size is the length (in tokens) of the longest stored sentence, i.e.
        after any truncation has been applied.
        """
        return self.max_sentence_length
    
# Instantiate the Training Dataset
#train_dataset = LanguageModelingDataset(ds_choice="small", split="train")  # use this for the short corpus
train_dataset = LanguageModelingDataset(ds_choice="got", split="train", truncation=512) # "got" corpus, each sentence cut to at most 512 tokens

# Instantiate a Validation Dataset (this is only really needed for the fine-tune task, not the LM task)
# Any split value other than "train" selects the held-out portion, so "validation" works here.
#val_dataset = LanguageModelingDataset(ds_choice="small", split="validation")
val_dataset = LanguageModelingDataset(ds_choice="got", split="validation", truncation=512)

In [128]:
# Show one (x, y) training pair; y is x shifted by one token (see __getitem__)
print(train_dataset[1])

(tensor([  258,  1965,  1279,  5188,    47,    29,  1318,   373,   991,  2063,
          257, 12498,   276,  9015,   287,   262,  3641,   286,   262,  3084,
         1279,  5188,    47,    29,  5966,  4251,   503,   284, 11626,   572,
          257,  1232,  1279,  5188,    47,    29,   788,   550,   257,  1365,
         2126,  1279,  5188,    47,    29,   679,   638,   361,   276,   262,
         6512,  2187,   290,  1309,   262, 36756,   562, 10649,   284,   262,
         4314,  1022,   465,  7405,  1279,  5188,    47,    29,  9897, 19551,
          656,   340,   287, 27303,  9550,  1279,  5188,    47,    29,  2399,
         9397,   290, 15153,   550,   407,   587, 10431,   284,  2222,   511,
        23214,   284,   262, 47600,  1279,  5188,    47,    29,   475,   612,
          547,   517, 13882,   621,  5966,   714,   954,   379,   428,   886,
          286,   262,  6899,  1279,  5188,    47,    29,   290,   645,   530,
          550,   531,   257,  1573,   546,   465, 15552,  1279,

In [129]:
# Vocabulary size (fixed constant returned by the dataset) and block size
# (length of the longest tokenized sentence after truncation)
print(train_dataset.get_vocab_size())
print(train_dataset.get_block_size())

50257
512


In [130]:
def lm_collate_fn(batch, device, pad_token_id=1):
    """
    Collate a list of (x, y) token-id pairs into two padded batch tensors.

    Every sequence is right-padded with `pad_token_id` up to the length of the
    longest x in the batch, then the rows are stacked.

    Arguments:
        batch: list of (x, y) tuples of 1-d tensors, as returned by the dataset's
            __getitem__. No y may be longer than the longest x.
        device: where the two result tensors are moved (anything `Tensor.to` accepts).
        pad_token_id: int. Id written into the padded positions. Defaults to 1,
            the value this function has always padded with.

    Returns:
        Tuple (padded_x, padded_y) of int64 tensors, each of shape (len(batch), maxlen).

    Raises:
        ValueError: if `batch` is empty.
    """
    if len(batch) == 0:
        raise ValueError("lm_collate_fn received an empty batch")

    inputs = [item[0] for item in batch]   # List (len B) of varying lengths
    targets = [item[1] for item in batch]  # List (len B) of the same lengths as inputs
    maxlen = max(len(seq) for seq in inputs)

    def _pad(seq):
        # Pad in integer dtype on the sequence's own device. Building the filler
        # with torch.ones would create a float32 CPU tensor, and routing token ids
        # through float32 cannot represent every integer above 2**24.
        seq = seq.long()
        filler = seq.new_full((maxlen - len(seq),), pad_token_id)
        return torch.cat([seq, filler])

    padded_x = torch.stack([_pad(seq) for seq in inputs])
    padded_y = torch.stack([_pad(seq) for seq in targets])
    return padded_x.to(device), padded_y.to(device)


In [131]:
# Quick check of torch.stack: two equal-length 1-d tensors become the rows of one 2-d tensor
torch.stack([torch.tensor([1,2,3,4]),torch.tensor([1,2,3,4])])

tensor([[1, 2, 3, 4],
        [1, 2, 3, 4]])

In [132]:
# Print out an example of the data - this is processed more once it reaches lm_collate_fn (above)
# x holds every token but the last and y every token but the first, so y[i] is the token that follows x[i]
x,y = train_dataset[6]
print(x, y)
# Decode both back to text to make the one-token shift visible
print("X: ",train_dataset.tokenizer.decode(x))
print("Y: ",train_dataset.tokenizer.decode(y))

tensor([18219, 32758,  1279,  5188,    47,    29,   383,  8237,   373,  1642,
          683, 10758,  1279,  5188,    47,    29,   679,  3088,   284,  1650,
          845,  3892,  1279,  5188,    47,    29,   284,   787,  2241,  1283,
        25242,  1279,  5188,    47,    29,  1279,    33,  2640,    29,   314,
         6044,  2147,  1279,  5188,    47,    29,   314,   765,   284,  4691,
          287,   262,  5265,   564,   247,   264,  6305,  1279,  5188,    47,
           29, 23169,  1279,  5188,    47,    29,  1279,    36,  2640,    29,
          679,   550,  1807,   319,   340,   890,   290,  1327,  1279,  5188,
           47,    29,  9105,   450,   276,   379,  1755,   981,   465,  9397,
        21256,  1088,   683,  1279,  5188,    47,    29, 31384,   561, 25580,
        16955, 10633, 23299,  1279,  5188,    47,    29,   561,  3141,  1049,
        18837,   355,   262, 38498,   286,   262,  2258,  1279,  5188,    47,
           29, 33628,   290,  8759,   261,   561,   307, 31384, 

In [133]:
# Build the GPT model, starting from minGPT's default configuration
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-micro'
# Vocabulary size and block size are taken from the training dataset
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
# NOTE(review): looks like this sizes a classification head for the fine-tune task -- confirm in mingpt/model.py
model_config.n_classification_class = 2
model = GPT(model_config)

number of parameters: 7.29M


In [134]:
# Create a Trainer object and set the core hyper-parameters
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 3e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 1000  # For small corpus: 3000 iterations is plenty. For large corpus: 100000 iterations is needed
train_config.num_workers = 0
train_config.batch_size = 8    # For small corpus, batch size of 4 is fine.  For large corpus use 16
# lm_collate_fn (defined above) is handed to the trainer as the batch collate function
trainer = Trainer(train_config, model, train_dataset, val_dataset, collate_fn=lm_collate_fn)

running on device cpu


In [135]:
# This function is called at the end of every batch in training
# and is used to report the amount of time per 100 batches, and the loss at that point

def batch_end_callback(trainer):
    """Print the per-iteration time (in ms) and the training loss on every 100th iteration."""
    if trainer.iter_num % 100 != 0:
        # Not a reporting iteration: stay silent.
        return
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# Register the reporting function under the trainer's 'on_batch_end' event
trainer.set_callback('on_batch_end', batch_end_callback)

# Train!
trainer.run()

iter_dt 0.00ms; iter 0: train loss 11.07419
iter_dt 3362.16ms; iter 100: train loss 3.29252


KeyboardInterrupt: 

In [107]:
model.to(trainer.device)
# store the saved model in a file, so can re-use later
modelsavename= "model_small.pt"  # change the name here to save in a specific file (and restore below)
# Only the state_dict is written, not the model object itself; the reload cell
# feeds it back into an already-constructed model via load_state_dict.
with open(modelsavename, "wb") as f:
    torch.save(trainer.model.state_dict(), f)

In [109]:
# Use the trained language model to predict a sequence of words following a few words
encoded_prompt = train_dataset.tokenizer("Jon said").to(trainer.device)
# NOTE(review): other cells in this notebook unpack two values from generate()
# (generated_sequence, probs_seq); here the whole return value is bound to one
# name -- confirm which signature the current mingpt/model.py provides.
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=1.5, max_new_tokens=20)
train_dataset.tokenizer.decode(generated_sequence[0])
# print([round(prob.item(),4) for prob in probs_seq])

'Jon said.” he said.” “I said.” “I”'

In [41]:
# Tabulate candidate tokens and their probabilities: one column per entry of probs_seq,
# one row per candidate index i.
# NOTE(review): probs_seq is not assigned in this cell; it has to come from an earlier
# generate() call that returns it, so this cell fails on a fresh kernel unless such a cell ran first.
from tabulate import tabulate

rows = []
for i in range(6):  # first 6 candidates of each entry
    row = []
    for probs in probs_seq:
        # presumably probs[0] holds probabilities and probs[1] the matching token ids -- confirm against generate()
        row.append((train_dataset.tokenizer.decode(torch.tensor([probs[1][i]])),round(probs[0][i].item(),4)))
    rows.append(row)
print(tabulate(rows))

-----------------  ------------------  -----------------  ----------------  ----------------  ----------------  ----------------  -----------------  ----------------  ------------------
(' can', 0.5617)   (' hold', 0.6249)   (' a', 0.5445)     (' dog', 0.5756)  ('.', 0.9982)     (' cat', 0.607)   ('.', 0.9972)     (' cat', 0.7194)   (' and', 0.5736)  (' dog', 0.9899)
(' hold', 0.2944)  (' rub', 0.3688)    (' the', 0.452)    (' cat', 0.424)   (' and', 0.0011)  (' dog', 0.3889)  (' .', 0.0017)    (' dog', 0.2796)   ('.', 0.4235)     (' cat', 0.0077)
(' rub', 0.1411)   (' can', 0.0059)    (' and', 0.0032)   (' the', 0.0001)  (' .', 0.0006)    (' a', 0.0013)    (' and', 0.001)   (' the', 0.0003)   (' a', 0.0012)    (' I', 0.0012)
(' holds', 0.002)  (' cat', 0.0001)    (' dog', 0.0001)   (' a', 0.0001)    (' rub', 0.0)     (' the', 0.0012)  (' rub', 0.0001)  (' a', 0.0003)     (' the', 0.0011)  (' rub', 0.0007)
(' and', 0.0004)   (' holds', 0.0001)  (' cat', 0.0001)   (' holds', 0.0)   (' c

In [15]:
# Another example: generate 10 new tokens after the prompt "She rubs"
encoded_prompt = train_dataset.tokenizer("She rubs").to(trainer.device)
generated_sequence, probs_seq = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
print(train_dataset.tokenizer.decode(generated_sequence[0]))
# Each entry of probs_seq rounded to 4 decimals (presumably the probability of each generated token -- confirm)
print([round(prob.item(),4) for prob in probs_seq])

She rubs a dog and cat. cat. cat. dog
[0.4242, 0.5053, 0.7471, 0.9993, 0.9997, 0.6742, 0.8804, 0.7203, 0.9986, 0.6166]


In [18]:
# A further example with a different prompt: generate 10 new tokens after "She holds"
encoded_prompt = train_dataset.tokenizer("She holds").to(trainer.device)
generated_sequence, probs_seq = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
print(train_dataset.tokenizer.decode(generated_sequence[0]))
# Each entry of probs_seq rounded to 4 decimals (presumably the probability of each generated token -- confirm)
print([round(prob.item(),4) for prob in probs_seq])

She holds a cat and dog.. dog. cat.
[0.4604, 0.5518, 0.6688, 0.9997, 0.9997, 0.934, 0.541, 0.9994, 0.7498, 0.983]


In [None]:
# Inspect a single (x, y) pair from the training dataset
print(train_dataset[9])

(tensor([  40, 1745,  290, 6437,  262, 3290]), tensor([1745,  290, 6437,  262, 3290,   13]))


In [18]:
# The code below shows how to reload the model from a saved file; this is useful for models that take long to train.
# NOTE: torch.load unpickles the file, so only load checkpoints that come from a trusted source.
model_large = "model_large100K.pt"
# map_location="cpu" lets a checkpoint that was saved on a GPU machine be read on a
# CPU-only one; load_state_dict then copies the weights into `model`'s own parameters.
model.load_state_dict(torch.load(model_large, map_location="cpu"))

<All keys matched successfully>

In [56]:
# Example showing how the reloaded model still works
set_seed(99)  # fixed seed before sampling (presumably so the continuation is repeatable -- confirm in mingpt/utils.py)
encoded_prompt = train_dataset.tokenizer("He is")
# The model is called directly here (not through trainer) with the device given as "cpu"
generated_sequence, probs_seq = model.generate(encoded_prompt, device="cpu", temperature=0.8, max_new_tokens=10)
print(train_dataset.tokenizer.decode(generated_sequence[0]))

He is a native
of the New Jersey and the son


In [59]:
# Same seed and sampling settings as the previous example, with a different prompt
set_seed(99)
encoded_prompt = train_dataset.tokenizer("Ontario lake")
generated_sequence, probs_seq = model.generate(encoded_prompt, device="cpu", temperature=0.8, max_new_tokens=10)
print(train_dataset.tokenizer.decode(generated_sequence[0]))

Ontario lake are in 1873. and shield as an were


In [23]:
# Load the "sst2" configuration of the "glue" dataset
# NOTE(review): the output recorded under this cell (probability tensors and an AssertionError)
# does not look like it was produced by this code -- re-run the cell to refresh it.
import datasets 
datasets.load_dataset("glue", "sst2")

tensor([[9.5537e-10, 1.2851e-02, 1.2568e-09,  ..., 1.8367e-09, 1.3405e-09,
         9.3107e-10]])
tensor([[8.9296e-10, 5.7133e-03, 7.9823e-10,  ..., 1.0679e-09, 1.1928e-09,
         8.4614e-10]])
tensor([[9.4527e-11, 7.1178e-03, 1.0264e-10,  ..., 1.4708e-10, 1.1174e-10,
         9.1374e-11]])
tensor([[3.2113e-10, 3.2172e-02, 2.7883e-10,  ..., 4.4908e-10, 3.9936e-10,
         3.0540e-10]])
tensor([[2.9912e-10, 1.3020e-02, 2.6233e-10,  ..., 4.1762e-10, 2.9399e-10,
         2.9330e-10]])
tensor([[8.2578e-10, 1.2202e-02, 6.8875e-10,  ..., 9.8177e-10, 1.1165e-09,
         7.2898e-10]])
tensor([[3.3163e-10, 7.3499e-03, 2.6987e-10,  ..., 4.4147e-10, 3.5781e-10,
         3.0072e-10]])
tensor([[1.0116e-09, 4.2700e-02, 8.3285e-10,  ..., 1.2209e-09, 1.3048e-09,
         1.0034e-09]])
tensor([[9.4714e-10, 5.7228e-03, 6.5951e-10,  ..., 1.1590e-09, 9.6619e-10,
         8.0748e-10]])
tensor([[1.0553e-09, 8.7323e-03, 8.3025e-10,  ..., 1.1847e-09, 1.3750e-09,
         9.1425e-10]])


AssertionError: 