In [2]:
import os
import sys
sys.path.append(os.path.abspath("../ch04"))
from ipynb.fs.defs.gpt import GPTModel
import torch
import torch.nn as nn
folder_a_path = os.path.abspath('../ch04')  # Adjust the relative path
sys.path.append(folder_a_path)

GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 256, #A
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate": 0.1, #B
"qkv_bias": False
}
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (layernorm1): LayerNorm()
      (layernorm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (drop): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (layernorm1): LayerNorm()
      (layernorm2): LayerNorm()
    

Reimplementing tokenization and inverse operations

In [3]:
import tiktoken
from ipynb.fs.defs.gpt import generate_text_simple

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    input_tokens=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves youInstant surrounds receptor Importminus ay symbols audiences cheaply preventing


In [4]:
input1 = text_to_token_ids("every effort moves", tokenizer)
input2 = text_to_token_ids("I really like", tokenizer)

inputs = torch.stack([input1, input2], dim=1).squeeze(0)
print(inputs)

target1 = text_to_token_ids(" effort moves you", tokenizer)
target2 = text_to_token_ids(" really like chocolate", tokenizer)
print(target1.shape)
print(target2.shape)
targets = torch.stack([target1, target2], dim=1).squeeze(0)
print(targets)

tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
torch.Size([1, 3])
torch.Size([1, 3])
tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


Compute loss    

In [5]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

print(inputs.shape)
print(targets.shape)


torch.Size([2, 3])
torch.Size([2, 3])


In [6]:
logits = model(inputs)
print(logits.shape)

torch.Size([2, 3, 50257])


In [9]:
logits_flattened = logits.flatten(0, 1)
targets_flattened = targets.flatten()
print(logits_flattened.shape)
print(targets_flattened.shape)


loss = nn.functional.cross_entropy(logits_flattened, targets_flattened)
perplexity = torch.exp(loss)
print("cross-entropy loss: ", loss.item())
print("perplexity: ", perplexity.item())

torch.Size([6, 50257])
torch.Size([6])
cross-entropy loss:  10.873671531677246
perplexity:  52768.59765625


Construct Dataset

In [13]:
file_path = "../ch02/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

print("total characters: ", len(text_data))
print("total tokens: ", len(tokenizer.encode(text_data)))

total characters:  8927
total tokens:  2199


In [31]:
split_idx = int(0.8 * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [33]:
folder_a_path = os.path.abspath('../ch02')  # Adjust the relative path
sys.path.append(folder_a_path)
from ipynb.fs.defs.data_preparation import GPTDatasetV1
from torch.utils.data import Dataset, DataLoader


def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader



In [34]:
train_loader = create_dataloader_v1(train_data,
batch_size=2,
max_length=GPT_CONFIG_124M["context_length"],
stride=GPT_CONFIG_124M["context_length"],
drop_last=True,
shuffle=True)

val_loader = create_dataloader_v1(val_data,
batch_size=2,
max_length=GPT_CONFIG_124M["context_length"],
stride=GPT_CONFIG_124M["context_length"],
drop_last=False,
shuffle=False)

Calculate loss of a batch

In [35]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits=model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

In [36]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item() 
        else:
            break
        return total_loss / num_batches 

In [45]:
torch.manual_seed(12345)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
model.to(device)
train_loss = calc_loss_loader(train_loader, model, device) 
val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 3.663067181905111
Validation loss: 10.997014045715332


Training function

In [52]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()
