In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import torch
import tiktoken

from torch.nn import functional as F
from torchviz import make_dot

from gpt.model import Gpt, detect_device
from gpt.loader import DataLoaderLite
from gpt.config import GPTConfig

In [19]:
device = detect_device()
enc = tiktoken.get_encoding('gpt2')

In [20]:
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

In [21]:
train_loader = DataLoaderLite(B=4, T=32)

loaded 338024 tokens
1 epoch = 2640 batches


In [22]:
config = GPTConfig()

In [23]:
config.n_layer = 1

In [24]:
model = Gpt(config)
model.to(device)

Gpt(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0): Block(
        (att_norm): LayerNorm()
        (attention): Attention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (layer_norm): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Debugging

In [26]:
x, y = train_loader.next_batch()
x, y = x.to(device), y.to(device)

In [27]:
logits, loss = model(x, y)

In [28]:
dot = make_dot(loss, params=dict(model.named_parameters()))
dot.render('computational_graph', format='png')

'computational_graph.png'

In [29]:
loss.backward(retain_graph=True)  # retain_graph=True to allow multiple backwards
dot = make_dot(loss, 
               params=dict(list(model.named_parameters()) + [('input', x)]),
               show_attrs=True,  # Show tensor sizes and other attributes
               show_saved=True)  # Show saved tensors for backward
dot.render('computational_graph_with_grads', format='png')

'computational_graph_with_grads.png'

## Train loop

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
for i in range(50):
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)
    
    optimizer.zero_grad()
    
    logits, loss = model(x, y)
    
    loss.backward()
    optimizer.step()
    
    print(f"step {i}, loss: {loss.item()}")
    break

step 0, loss: 10.880291938781738


## Model eval

In [None]:
model.eval()
num_return_sequences = 5
max_length = 30
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long) # (8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
x = tokens.to(device)

# generate! right now x is (B, T) where B = 5, T = 8
# set the seed to 42
torch.manual_seed(42)
torch.cuda.manual_seed(42)
while x.size(1) < max_length:
    # forward the model to get the logits
    with torch.no_grad():
        logits, loss = model(x) # (B, T, vocab_size)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

# print the generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)