## This notebook is used to Debug functionalities

In [8]:
import torch
import torch.nn as nn
import tiktoken
from model import *
from dataset import *
from torch.utils.data import DataLoader

In [9]:
# Get gpt-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")
vocab_size = tokenizer.n_vocab
print("Vocab size:", vocab_size)

Vocab size: 50257


## Read data from a test file

In [10]:
with open("../data/the-verdict.txt", "r") as file:
    raw_text = file.read()

print(len(raw_text), "characters")

20479 characters


## Create Dataset object

In [11]:
dataset = GPTDataset(raw_text, tokenizer, 126, 1) # input phrases of 126 tokens

inputs, labels = dataset[0]
print("Inputs shape:", inputs.shape)
print("Labels shape:", labels.shape)

Inputs shape: torch.Size([126])
Labels shape: torch.Size([126])


## Create DataLoader

In [12]:
dataloader = DataLoader(
    dataset = dataset,
    batch_size = 32,
    shuffle = True
)

dataiter = iter(dataloader)
inputs_batch, labels_batch = next(dataiter)

print("Input batch shape:", inputs_batch.shape)
print("Labels batch shape:", labels_batch.shape)

Input batch shape: torch.Size([32, 126])
Labels batch shape: torch.Size([32, 126])


In [13]:
args = ModelArgs(
    emb_dim = 256,
    num_heads = 2,
    context_length=256,
    vocab_size=vocab_size,
    num_blocks = 2
)

## Test attention model

In [7]:
# apply embedding
emb_dim = 768
embedding = nn.Embedding(vocab_size, emb_dim)
vectors = embedding(inputs_batch)

attention = MultiHeadAttention(args)
z = attention(vectors)
print("Context vector shape:", z.shape)

Context vector shape: torch.Size([32, 126, 768])


## Test FF block

In [8]:
ff = FeedForward(args)
out = ff(z)
print(out.shape)

torch.Size([32, 126, 768])


## Test LayerNorm

In [9]:
norm = LayerNorm(args)
out = norm(out)
print(out.shape)

torch.Size([32, 126, 768])


## Test Transformer block

In [10]:
transformer = TransformerBlock(args)
out = transformer(vectors)
print(out.shape)

torch.Size([32, 126, 768])


## Number of parameters of a Transformer Block

In [11]:
print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]))

7085568


## Test GPT Block

In [12]:
model = GPTModel(args)
out = model(inputs_batch)
print(out.shape)

torch.Size([32, 126, 50257])


## Number of parameters in GPT Model

In [13]:
# We subtract the number of parameters in the final (output) layer
print(sum([p.numel() for p in model.parameters() if p.requires_grad]) - sum([p.numel() for p in model.output.parameters()]))

53556480


## Generate text

In [14]:
input = "Of course"
ids = torch.tensor(tokenizer.encode(input)).unsqueeze(0)
num_tokens = 10

for i in range(num_tokens):
    with torch.no_grad():
        logits = model(ids)
    
    # Get last vector
    logits = logits[:,-1,:]
    # APply softmax
    probs = torch.softmax(logits, dim=-1)
    # Get token with higher prob
    idx_next = torch.argmax(probs, dim=-1, keepdim=True)
    ids = torch.cat((ids, idx_next), dim=1)

# Finally, decode and join into a new string
tokens = tokenizer.decode(ids.squeeze(dim=0).tolist())
print(tokens)

Of course FANTASY IncreasingAppData taxutonium NAACP momentum NAACP Increasing Fey


## Test train loop

In [14]:
torch.manual_seed(123)
model = GPTModel(args)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)
criterion = torch.nn.CrossEntropyLoss()
dataiter = iter(dataloader) # Reset iterator

EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    epoch_accuracy = 0.0

    for batch_idx, (inputs, labels) in enumerate(dataiter):
        # Reset gradients
        optimizer.zero_grad()

        # Predict logits
        logits = model(inputs)

        # Compute loss
        loss = criterion(logits.flatten(0,1), labels.flatten())

        # Backward pass
        loss.backward()

        # Update params
        optimizer.step()

        epoch_loss += loss.item()
    
    epoch_loss /= len(dataiter)
    print(f"Epoch: {epoch+1}, loss: {epoch_loss}")

Epoch: 1, loss: 7.221414383809278
Epoch: 2, loss: 0.0
Epoch: 3, loss: 0.0


: 