## This notebook is used to debug the dataset and model components

In [1]:
import torch
import torch.nn as nn
import tiktoken
from model import *
from dataset import *
from torch.utils.data import DataLoader

In [2]:
# Load the "gpt2" encoding from tiktoken. Its vocabulary size is reused further
# down for the embedding table and for ModelArgs.
encoding_name = "gpt2"
tokenizer = tiktoken.get_encoding(encoding_name)
vocab_size = tokenizer.n_vocab
print("Vocab size:", vocab_size)

Vocab size: 50257


## Read data from a test file

In [3]:
# Read the whole sample text into memory.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, so the decoded text (and the character count
# printed below) could differ between machines.
with open("../data/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

print(len(raw_text), "characters")

20479 characters


## Create Dataset object

In [4]:
# Wrap the raw text in the project's dataset class.
# Third argument (126): input phrases of 126 tokens.
# Fourth argument (1): presumably the step between consecutive phrases --
# TODO confirm against GPTDataset in dataset.py.
dataset = GPTDataset(raw_text, tokenizer, 126, 1)

# Indexing the dataset yields an (inputs, labels) pair; look at the first one.
first_sample = dataset[0]
inputs, labels = first_sample
print("Inputs shape:", inputs.size())
print("Labels shape:", labels.size())

Inputs shape: torch.Size([126])
Labels shape: torch.Size([126])


## Create DataLoader

In [5]:
# Shuffle with a dedicated, seeded generator so that the batch inspected below
# (and fed to every later cell) is the same on each Restart & Run All.
# Re-running this cell rebuilds the generator, so it always yields the same
# first batch.
shuffle_generator = torch.Generator().manual_seed(0)

dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    generator=shuffle_generator,
)

# Pull a single batch to use as the debugging input for the cells below.
dataiter = iter(dataloader)
inputs_batch, labels_batch = next(dataiter)

print("Input batch shape:", inputs_batch.shape)
print("Labels batch shape:", labels_batch.shape)

Input batch shape: torch.Size([32, 126])
Labels batch shape: torch.Size([32, 126])


In [6]:
# Shared hyperparameters handed to every model component tested below.
# NOTE(review): these values look like the GPT-2 small (124M) configuration --
# confirm if that is the intended target.
args = ModelArgs(
    vocab_size=vocab_size,
    context_length=1024,
    emb_dim=768,
    num_heads=12,
    num_blocks=12,
)

## Test attention model

In [7]:
# Turn the token-id batch into dense vectors with a stand-alone embedding
# table. emb_dim is kept equal to the emb_dim given to ModelArgs above so the
# vectors have the width the attention module was configured with.
emb_dim = 768
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
vectors = embedding(inputs_batch)

# Run the embedded batch through multi-head attention and check the shape.
attention = MultiHeadAttention(args)
z = attention(vectors)
print("Context vector shape:", z.size())

Context vector shape: torch.Size([32, 126, 768])


## Test FF block

In [8]:
# Pass the attention output through the feed-forward block and report the
# resulting shape.
ff = FeedForward(args)
out = ff(z)
print(out.size())

torch.Size([32, 126, 768])


## Test LayerNorm

In [9]:
# Normalise the previous cell's output and report the resulting shape.
# The result is bound to a new name instead of overwriting `out`: with
# `out = norm(out)` every re-run of this cell would feed the cell's own output
# back into the layer, so its result depended on how often it had been run.
norm = LayerNorm(args)
norm_out = norm(out)
print(norm_out.shape)

torch.Size([32, 126, 768])


## Test Transformer block

In [10]:
# Run the embedded batch through one full transformer block and report the
# resulting shape.
transformer = TransformerBlock(args)
out = transformer(vectors)
print(out.size())

torch.Size([32, 126, 768])


## Number of parameters of a Transformer Block

In [11]:
# Count the trainable parameters of the single transformer block. A generator
# is summed directly, so no temporary list is built.
block_param_count = sum(
    param.numel() for param in transformer.parameters() if param.requires_grad
)
print(block_param_count)

7085568


## Test GPT Model

In [12]:
# Build the full model and run the token-id batch through it. The printed
# shape's last dimension equals the vocabulary size.
model = GPTModel(args)
out = model(inputs_batch)
print(out.size())

torch.Size([32, 126, 50257])


## Number of parameters in GPT Model

In [13]:
# Count the model's trainable parameters, leaving out the final (output) layer.
# Both sums apply the same requires_grad filter: previously only the total was
# filtered, so a frozen output layer would have been subtracted without ever
# having been counted.
total_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
output_trainable = sum(
    p.numel() for p in model.output.parameters() if p.requires_grad
)
print(total_trainable - output_trainable)

124412160


In [14]:
# Display the raw model output for the batch. This is inspection only, so the
# forward pass runs under no_grad() to avoid recording an autograd graph for
# the whole model.
with torch.no_grad():
    logits = model(inputs_batch)
logits

tensor([[[-1.1667e-01, -8.2985e-01, -5.8249e-01,  ...,  6.0607e-01,
           6.8824e-01,  1.6421e+00],
         [-5.0194e-01, -5.9211e-01, -1.8949e-01,  ..., -4.1374e-01,
          -3.8185e-01,  1.2588e+00],
         [ 6.7507e-01, -1.0917e+00, -8.8019e-02,  ...,  6.8964e-02,
           3.9118e-01,  9.0287e-01],
         ...,
         [ 6.1311e-01, -8.3764e-01,  1.0579e+00,  ...,  3.6020e-01,
          -4.7086e-01,  1.6606e+00],
         [ 1.2487e+00,  2.0049e-01,  4.1201e-01,  ..., -2.7011e-01,
          -2.1878e-01,  7.4676e-01],
         [ 2.7146e-01, -4.1402e-01,  4.7407e-01,  ..., -3.9146e-02,
          -1.1699e-01,  6.5736e-01]],

        [[-3.5757e-02,  4.6312e-01,  2.6027e-01,  ...,  8.6704e-01,
           3.5686e-01,  8.9648e-01],
         [ 1.7573e-01, -1.4281e+00,  4.5357e-01,  ...,  3.3828e-01,
           3.8132e-02,  8.9718e-01],
         [ 5.5927e-01, -8.9882e-01,  2.7039e-02,  ..., -2.2215e-01,
          -3.1467e-01,  3.0857e-01],
         ...,
         [-4.2699e-01, -4