## 5.1 Evaluating generative models
### 5.1.1 Using GPT to generate text

In [2]:
import torch

from build_a_large_language_model_from_scratch.lib.GPTModel import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256, # intentionally shortening it to reduce computational demands of training
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12, 
    "drop_rate": 0.1, # it's possible and common to set dropout to 0
    "qkv_bias": False
}

cfg = GPT_CONFIG_124M

model = GPTModel(
    context_length=cfg["context_length"],
    drop_rate=cfg["drop_rate"],
    emb_dim=cfg["emb_dim"],
    n_heads=cfg["n_heads"],
    n_layers=cfg["n_layers"],
    vocab_size=cfg["vocab_size"],
    qkv_bias=cfg["qkv_bias"]
)

torch.manual_seed(123)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (layers): ModuleList(
        (0): Sequential(
          (0): LayerNorm()
          (1): MultiHeadAttention(
            (W_query): Linear(in_features=768, out_features=768, bias=False)
            (W_key): Linear(in_features=768, out_features=768, bias=False)
            (W_value): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): Dropout(p=0.1, inplace=False)
        )
        (1): Sequential(
          (0): LayerNorm()
          (1): FeedForward(
            (layers): Sequential(
              (0): Linear(in_features=768, out_features=3072, bias=True)
              (1): GELU()
              (2): Linear(in_features=3072, out_features=768, bias=Tr

In [3]:
import tiktoken

from build_a_large_language_model_from_scratch.lib.generate import generate_text_simple

def text_to_token_ids(text: str, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # `unsqueeze(0)` adds batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=cfg["context_length"]
)
print(f"Output text:\n{token_ids_to_text(token_ids, tokenizer)}")

Output text:
Every effort moves youElsObviously engulfed Fried guestwavelier rein obtain 345


### 5.1.2 Calculating the text generation loss

In [12]:
inputs = torch.tensor([
    [16833, 3626, 6100], # "every effort moves"
      [40, 1107, 58]])   # "I really like"

targets = torch.tensor([
    [3626, 6100, 345], # " effort moves you"
    [1107, 588, 11311] # " really like chocolate"
])

with torch.no_grad():
    logits = model(inputs)

probas = torch.softmax(logits, dim=-1)
print(f"{probas.shape=}")

token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print(f"{token_ids=}")
print(f"{token_ids.shape=}")

print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print(f"Text 1: {target_probas_1}")

text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print(f"Text 2: {target_probas_2}")

probas.shape=torch.Size([2, 3, 50257])
token_ids=tensor([[16332, 30944, 37325],
        [25353, 28987, 37325]])
token_ids.shape=torch.Size([2, 3])
Targets batch 1:  effort moves you
Outputs batch 1:  Langivistahar
Text 1: tensor([1.1212e-05, 2.3273e-05, 2.4384e-05])
Text 2: tensor([1.0821e-05, 1.1233e-05, 1.5693e-05])


In [13]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

tensor([-11.3985, -10.6682, -10.6216, -11.4340, -11.3967, -11.0623])


In [16]:
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-11.0969)


In [18]:
neg_avg_log_probas = avg_log_probas * -1
print(neg_avg_log_probas)

tensor(11.0969)


In [19]:
print(f"{logits.shape=}")
print(f"{targets.shape=}")

logits.shape=torch.Size([2, 3, 50257])
targets.shape=torch.Size([2, 3])


In [20]:
# Need to flatten for `cross_entropy` to work, so we combien them over the batch dimension:
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()
print(f"{logits_flat.shape}")
print(f"{targets_flat.shape}")

torch.Size([6, 50257])
torch.Size([6])


In [21]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

tensor(11.0969)


In [22]:
# Perplexity
perplexity = torch.exp(loss)
print(perplexity)

tensor(65964.3359)


### 5.1.3 Calculating the training and validation set losses

In [24]:
file_path = "./data/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

In [25]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print(f"{total_characters=}")
print(f"{total_tokens=}")

total_characters=20479
total_tokens=5145


In [26]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [27]:
from build_a_large_language_model_from_scratch.lib.dataloader import create_dataloader_v1
torch.manual_seed(123)

# We use a small batch size to reduce computational resource demand because we're working with a small dataset.
# Using batch sizes of 1024 or larger is not uncommon.

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=cfg["context_length"],
    stride=cfg["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=cfg["context_length"],
    stride=cfg["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

In [30]:
# Check data loaders:
print("Train loader")
for x, y in train_loader:
    print(x.shape, y.shape)


print("\nValidation loader")
for x, y in val_loader:
    print(x.shape, y.shape)

Train loader
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader
torch.Size([2, 256]) torch.Size([2, 256])


In [31]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), target_batch.flatten()
    )
    return loss

In [32]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    
    return total_loss / num_batches

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.983904944525825
Validation loss: 11.021097183227539
