## This notebook is used to Debug functionalities

In [2]:
import torch
import torch.nn as nn
import tiktoken
from model import *
from dataset import *
from torch.utils.data import DataLoader

In [3]:
# Get gpt-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")
vocab_size = tokenizer.n_vocab
print("Vocab size:", vocab_size)

Vocab size: 50257


## Read data from a test file

In [4]:
with open("../data/the-verdict.txt", "r") as file:
    raw_text = file.read()

print(len(raw_text), "characters")

20479 characters


## Create Dataset object

In [5]:
dataset = GPTDataset(raw_text, tokenizer, 126, 1) # input phrases of 126 tokens

inputs, labels = dataset[0]
print("Inputs shape:", inputs.shape)
print("Labels shape:", labels.shape)

Inputs shape: torch.Size([126])
Labels shape: torch.Size([126])


## Create DataLoader

In [6]:
dataloader = DataLoader(
    dataset = dataset,
    batch_size = 32,
    shuffle = True
)

dataiter = iter(dataloader)
inputs_batch, labels_batch = next(dataiter)

print("Input batch shape:", inputs_batch.shape)
print("Labels batch shape:", labels_batch.shape)

Input batch shape: torch.Size([32, 126])
Labels batch shape: torch.Size([32, 126])


In [7]:
args = ModelArgs(
    emb_dim = 768,
    num_heads = 2,
    context_length=1024,
    vocab_size=vocab_size,
    num_blocks = 2
)

## Test attention model

In [8]:
# apply embedding
emb_dim = 768
embedding = nn.Embedding(vocab_size, emb_dim)
vectors = embedding(inputs_batch)

attention = MultiHeadAttention(args)
z = attention(vectors)
print("Context vector shape:", z.shape)

torch.Size([32, 2, 126, 126])
Context vector shape: torch.Size([32, 126, 768])


## Test FF block

In [9]:
ff = FeedForward(args)
out = ff(z)
print(out.shape)

torch.Size([32, 126, 768])


## Test LayerNorm

In [10]:
norm = LayerNorm(args)
out = norm(out)
print(out.shape)

torch.Size([32, 126, 768])


## Test Transformer block

In [11]:
transformer = TransformerBlock(args)
out = transformer(vectors)
print(out.shape)

torch.Size([32, 2, 126, 126])
torch.Size([32, 126, 768])


## Number of parameters of a Transformer Block

In [12]:
print(sum([p.numel() for p in transformer.parameters() if p.requires_grad]))

7085568


## Test GPT Block

In [13]:
model = GPTModel(args)
out = model(inputs_batch)
print(out.shape)

torch.Size([32, 2, 126, 126])
torch.Size([32, 2, 126, 126])
torch.Size([32, 126, 50257])


## Number of parameters in GPT Model

In [14]:
print(sum([p.numel() for p in model.parameters() if p.requires_grad]))

92204113


In [15]:
model(inputs_batch)

torch.Size([32, 2, 126, 126])
torch.Size([32, 2, 126, 126])


tensor([[[-0.9073,  0.0335,  1.4934,  ..., -0.5020, -0.5059, -1.0931],
         [-1.3550,  0.8765,  0.8421,  ..., -0.9532,  0.0283, -0.2321],
         [-0.3883,  0.3822,  1.3812,  ..., -0.7576,  1.0357, -0.6536],
         ...,
         [-1.4300,  0.2335,  0.4352,  ..., -0.5809,  0.3527, -2.0563],
         [-0.9426,  0.5583,  0.3104,  ..., -0.9327,  0.2543, -0.7374],
         [-1.3073,  0.0846,  0.5767,  ..., -0.9223,  0.5519, -0.5645]],

        [[-1.1226,  0.1392,  1.9035,  ..., -0.4972, -0.3713, -0.5790],
         [-1.1835,  0.5731,  0.9717,  ..., -0.6964,  0.7273, -0.6818],
         [-0.7272,  0.7066,  1.3161,  ..., -1.0345,  0.4620, -0.4453],
         ...,
         [-0.3837,  1.7918, -0.0408,  ..., -0.8322, -0.7472, -1.0639],
         [-0.4096,  0.1195,  0.5933,  ..., -1.5029, -0.8600, -1.1141],
         [-0.1876, -0.5669,  0.9252,  ..., -0.4962,  0.9002, -0.2192]],

        [[-0.3534,  0.9287,  1.0123,  ..., -0.7236, -0.2595, -0.3089],
         [-1.2247,  1.2695,  0.4970,  ..., -0