In [1]:
!pip install einops
!pip install torchtyping
!pip install transformers
!pip install datasets
!pip install GPUtil
!pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtyping
  Downloading torchtyping-0.1.4-py3-none-any.whl (17 kB)
Collecting typeguard>=2.11.1
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, torchtyping
  Attempting uninstall: typeguard
    Found existing installation: typeguard 2.7.1
    Uninstalling typeguard-2.7.1:
      Successfully uninstalled typeguard-2.7.1
Successfully installed torchtyping-0.1.4 typeguard-2.13.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |█████████████████████████

In [2]:
import os
# Make CUDA kernel launches synchronous so that a device-side error is reported
# at the call that triggered it rather than at some later, unrelated call.
# NOTE(review): this is a debugging aid that slows GPU execution, and it only
# takes effect when set before CUDA is initialised - confirm it is still wanted
# for the multi-hour training run in this notebook.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
from gpt2 import GPT2
import torch
from torch import nn
from torch import optim
import transformers
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
import time
import random
from torch.nn import functional as F
import math

In [4]:
seed = 42
# Feed the same seed to Python's RNG, torch's default generator and every CUDA
# device so that data shuffling, initialisation and sampling are repeatable.
for seed_rng in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

In [5]:
class CharDataset(Dataset):
    """Character-level next-token dataset built from a single string.

    Every item is a pair ``(x, y)`` of ``torch.long`` tensors of length
    ``block_size`` where ``y`` is ``x`` shifted one character to the right, so
    ``y[i]`` is the character that follows ``x[i]`` in the text.

    Attributes:
        stoi: maps each distinct character to its integer id (ids follow the
            sorted order of the characters).
        itos: inverse of ``stoi``.
        block_size: number of characters in each input sequence.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text, kept unencoded.
    """

    def __init__(self, data, block_size):
        """Build the vocabulary from ``data`` and print its size statistics.

        Args:
            data: the full training text.
            block_size: length of the input sequence returned for each item.
        """
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = dict(enumerate(chars))
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of start offsets that yield a full chunk.

        Clamped at zero: a negative value would make ``len()`` raise
        ``ValueError`` when the text is shorter than ``block_size``.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors starting at offset ``idx``."""
        # grab a chunk of (block_size + 1) characters: the extra character is
        # the target for the last input position
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [6]:
# Context length in characters: the length of every input sequence produced by
# the dataset and the value passed to the model as max_position_embeddings.
block_size = 128

In [7]:
# Read the whole corpus into memory; the context manager closes the file handle
# deterministically and the explicit encoding avoids platform-dependent decoding.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
# One line of the poem is roughly 50 characters, so a block spans a few lines.
train_dataset = CharDataset(text, block_size)

data has 1115394 characters, 65 unique.


In [8]:
batch_size = 128
# Shuffled mini-batches over every start offset of the corpus; pin_memory=True
# asks the loader to hand back batches in page-locked host memory.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)
print("train loader:", train_loader)

train loader: <torch.utils.data.dataloader.DataLoader object at 0x7fad248ffd10>


In [9]:
# model, loss function and optimizer setup
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device:", device)

model = GPT2(
    vocab_size=train_dataset.vocab_size,
    max_position_embeddings=train_dataset.block_size,  # corresponds to block_size
    hidden_size=512,  # corresponds to n_embd
    num_layers=8,
    num_heads=8,
    dropout=0.1,
    layer_norm_epsilon=1e-5,
)
# move the parameters to the selected device, then switch to training mode
model = model.to(device)
model = model.train()

learning_rate = 6e-4
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

device: cuda
number of parameters: 25318912


In [10]:
max_epochs = 2

# counters used for the learning-rate schedule
tokens = 0  # target tokens processed so far (plain Python int)
warmup_tokens = 512 * 20
# Total number of tokens seen over the whole run. Derived from max_epochs so
# the cosine decay stays aligned with the loop when the epoch count changes.
final_tokens = max_epochs * len(train_dataset) * block_size

start_time = time.time()
for epoch in range(max_epochs):
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for it, (x, y) in pbar:
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(x).logits
        # flatten the leading dimensions so every position is one sample
        loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()
        optimizer.step()

        # Number of tokens processed this step: labels >= 0, so a negative
        # ignore label such as -100 would not be counted. `.item()` keeps
        # `tokens` a Python int instead of accumulating a device tensor.
        tokens += int((y >= 0).sum().item())
        if tokens < warmup_tokens:
            # linear warmup
            lr_mult = float(tokens) / float(max(1, warmup_tokens))
        else:
            # cosine learning rate decay, floored at 10% of the base rate
            progress = float(tokens - warmup_tokens) / float(max(1, final_tokens - warmup_tokens))
            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
        # NOTE(review): the rate is updated after optimizer.step(), so it
        # applies from the next step on and the very first step runs at the
        # base learning_rate. With batch_size * block_size = 16384 tokens per
        # step and warmup_tokens = 10240, the warmup branch is never taken.
        lr = learning_rate * lr_mult
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}")

end_time = time.time()
print("\n training time:", end_time - start_time)

epoch 1 iter 8713: train loss 0.71790: 100%|██████████| 8714/8714 [2:06:38<00:00,  1.15it/s]
epoch 2 iter 8713: train loss 0.31412: 100%|██████████| 8714/8714 [2:07:11<00:00,  1.14it/s]


 training time: 15230.299560308456





In [11]:
def top_k_logits(logits, k):
    """Keep the ``k`` largest logits of every row and set the rest to -inf.

    Args:
        logits: 2-D float tensor; the top-k selection runs over the last
            dimension of each row.
        k: number of entries to keep per row. Values larger than the row
            length are clamped, in which case the logits come back unchanged.

    Returns:
        A new tensor of the same shape as ``logits``; the input is not
        modified. Entries tied with the k-th largest value are kept.
    """
    # torch.topk raises if k exceeds the size of the selected dimension
    k = min(k, logits.size(-1))
    # smallest value that still belongs to the top k, one per row: (rows, 1)
    kth_largest = torch.topk(logits, k).values[:, [-1]]
    return logits.masked_fill(logits < kth_largest, -float('Inf'))

@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """
    Autoregressively extend the index sequence ``x`` (shape (b, t)) by ``steps`` tokens.

    Each step feeds the most recent context to ``model``, reads the logits at
    the final position, and appends one new index per row. The model is put in
    eval mode and left there. ``model`` must provide ``get_block_size()`` and
    return an object with a ``.logits`` attribute when called.

    temperature divides the final-step logits; top_k (if given) restricts the
    candidates via top_k_logits; sample=True draws from the resulting
    distribution, otherwise the most likely index is taken.
    """
    context_limit = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # the model only sees the last `context_limit` positions
        window = x
        if window.size(1) > context_limit:
            window = window[:, -context_limit:]
        # logits of the final position, scaled by temperature
        last_logits = model(window).logits[:, -1, :] / temperature
        if top_k is not None:
            # discard everything outside the top k candidates
            last_logits = top_k_logits(last_logits, top_k)
        probs = F.softmax(last_logits, dim=-1)
        if sample:
            next_ix = torch.multinomial(probs, num_samples=1)
        else:
            next_ix = probs.topk(1, dim=-1).indices
        # grow the running sequence by the chosen index
        x = torch.cat((x, next_ix), dim=1)

    return x

In [12]:
context = "O God, O God!"
# encode the prompt character by character and add a leading batch dimension
x = torch.tensor([train_dataset.stoi[ch] for ch in context], dtype=torch.long)
x = x.unsqueeze(0).to(device)
# draw 2000 further characters from the 10 most likely candidates at each step
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]
# decode the ids (prompt included) back into text
completion = ''.join(train_dataset.itos[token_id] for token_id in y.tolist())
print(completion)

O God, O God! that every fair!

HORTENSIO:
Farewell, and shall go wash the canopy.

KATHARINA:
Alas, I warrant you: I will not do't, and work.

PETRUCHIO:
You were more at least twinning some supping things
My weak needly finger stars in a dreams,
On all best all the flatterments of me,
That I, being govern'd by the watery moon,
May send forth plenteous tears to drown the world!
Oh for my husband, for my dear lord Edward!

Children:
Oh for our father, for our dear lord Clarence!

DUCHESS OF YORK:
Alas for both, both mine, Edward and Clarence!

QUEEN ELIZABETH:
What stay had I but Edward? and he's gone.

Children:
What stay had we but Clarence? and he's gone.

DUCHESS OF YORK:
What stays had I but they? and they are gone.

QUEEN ELIZABETH:
Was never will before with thee?

BAPTISTA:
No, my good lord, Lord Northumberland.

KING RICHARD III:
Then call them to our praise them.

BRUTUS:
I never saw you hear have.

SICINIUS:
We know you well?

MENENIUS:
Yet you are general, by this good will