# GPT-2 Model and Training Review

## Preliminaries

Recall our GPT-2 model:

In [68]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.d_in = config["n_embd"]
        self.d_out = config["n_embd"]
        self.num_heads = config["n_heads"]
        self.d_head = self.d_out // self.num_heads # Dimension of each head
        self.context_length = config["context_length"]
        self.W_query = nn.Linear(self.d_in, self.d_out, bias=config["qkv_bias"])
        self.W_key = nn.Linear(self.d_in, self.d_out, bias=config["qkv_bias"])
        self.W_value = nn.Linear(self.d_in, self.d_out, bias=config["qkv_bias"])
        self.dropout = nn.Dropout(config["dropout_rate"])
        causal_mask = torch.tril(torch.ones(self.context_length, self.context_length))
        self.projection = nn.Linear(self.d_out, self.d_out)

        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        B, N, D = x.shape
        Q = self.W_query(x)
        K = self.W_key(x)
        V = self.W_value(x)

        Q = Q.view(B, N, self.num_heads, self.d_head).transpose(1, 2)
        K = K.view(B, N, self.num_heads, self.d_head).transpose(1, 2)
        V = V.view(B, N, self.num_heads, self.d_head).transpose(1, 2)

        QKT = Q @ K.transpose(2, 3)
        masked_QKT = QKT.masked_fill(self.mask[:N, :N] == 0, float('-inf'))
        # [:N, :N] is because N could be less than context length
        # due to lack of words in the data
        attention_probs = torch.softmax(masked_QKT / (self.d_head ** 0.5), dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_vector = attention_probs @ V
        context_vector = context_vector.transpose(1, 2).contiguous().view(B, N, self.d_out)
        return self.projection(context_vector)

class FeedForward(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(nn.Linear(config["n_embd"], 4 * config["n_embd"]),
                                    nn.GELU(),
                                    nn.Linear(4 * config["n_embd"], config["n_embd"]))

    def forward(self, x):
        return self.layers(x)

class LayerNorm(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(config["n_embd"]))
        self.beta = nn.Parameter(torch.zeros(config["n_embd"]))
        self.eps = 1e-5

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        x = (x - mean) / (std + self.eps) # Normalize
        x = self.gamma * x + self.beta # Apply linear function
        return x

class TransformerBlock(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config)
        self.attn = MultiHeadAttention(config)
        self.dropout = nn.Dropout(config["dropout_rate"])
        self.ff = FeedForward(config)
        self.ln2 = LayerNorm(config)

    def forward(self, x):
        # x -> Layer norm 1 -> attention -> dropout -> residual connection
        saved_x = x
        x = self.ln1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = saved_x + x # residual connection

        # x -> Layer norm 2 -> feed forward -> dropout -> residual connection
        saved_x = x
        x = self.ln2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = saved_x + x # residual connection

        # You can do the above with two lines:
        # x = x + self.dropout(self.attn(self.ln1(x)))
        # x = x + self.dropout(self.ff(self.ln2(x)))
        return x

class Simple_GPT(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config["vocab_size"], config["n_embd"])
        self.position_embedding = nn.Embedding(config["context_length"], config["n_embd"])
        self.dropout = nn.Dropout(config["dropout_rate"])
        self.blocks = nn.Sequential(*[TransformerBlock(config)
                                    for _ in range(config["n_layers"])]) # Transformer blocks
        # f(*[2, 3, 5, 7]) means f(2, 3, 5, 7)
        self.ln_f = LayerNorm(config) # Final layer norm
        self.prediction_layer = nn.Linear(config["n_embd"], config["vocab_size"])
        # Linear mapping to vocab size

        # Register buffer torch.arange(N) to prevent issues with device
        self.register_buffer("pos_range", torch.arange(config["context_length"]))

    def forward(self, x):
        B, N = x.shape      # B is batch size, N is context length
        token_embeddings = self.token_embedding(x)  # [B, N, n_embd]
        position_embeddings = self.position_embedding(self.pos_range[:N])  # [N, n_embd]
        x = token_embeddings + position_embeddings  # Full embeddings; [B, N, n_embd]
        x = self.dropout(x)  # Apply dropout
        x = self.blocks(x)  # Apply transformer blocks; [B, N, n_embd]
        x = self.ln_f(x) # Final layer norm
        logits = self.prediction_layer(x)   # [B, N, vocab_size]
        return logits

In [69]:
config = {
    "vocab_size": 50257,
    "context_length": 1024,
    "n_embd": 768,
    "n_heads": 12,
    "n_layers": 12,
    "dropout_rate": 0.0,
    "qkv_bias": False
}

We will be using the `cuda` device.

In [70]:
device = "cuda"

Recall the text generation process:

In [4]:
def generate_text_sample(model, idx, max_new_tokens, context_length):
    # max_new_tokens is the number of tokens we want to generate
    # idx is the array of indices in the current context
    # idx has size [batch_size, n_tokens]
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]     # Takes the latest context window
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]       #   last token in new context window
        # we want to keep batch and vocab dimension same
        probs = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probs, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)     # dim=1 for the context window
    return idx

In [5]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In order to train our model, we need data. A dataset class and a dataloader function will be helpful.

In [6]:
from torch.utils.data import Dataset, DataLoader
# Dataset class
class MyData(Dataset):
    # Init function, called when the dataset is created
    # dataset = MyData(text, tokenizer, context_length=4, stride=1)
    def __init__(self, text, tokenizer, context_length, stride=1):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text)
        for i in range(0, len(token_ids) - context_length, stride):
            self.input_ids.append(torch.tensor(token_ids[i : i + context_length]))
            self.target_ids.append(torch.tensor(token_ids[i + 1 : i + context_length + 1]))

    # Length function
    # len(dataset)
    def __len__(self):
        return len(self.input_ids)

    # Get item function
    # dataset[idx]
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def my_batch(text, batch_size, context_length, stride, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create the dataset object
    dataset = MyData(text, tokenizer, context_length, stride)

    # Use the DataLoader library to create a dataloader that batches the data
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last=drop_last,
                            num_workers=num_workers)

    return dataloader

Cross entropy loss function used to train the model.

In [7]:
def calculate_loss(dataloader, model, device="cpu", num_batches=None): # 1 epoch average loss
    # number of batches in dataset is not included as a dimension in any tensor
    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))
    model.eval()
    total_loss = 0.0
    for i, (input, target) in enumerate(dataloader): # i is batch index
        if i >= num_batches:
            break

        input = input.to(device) # Move input to appropriate device
        logits = model(input) # Obtain output logits of the model
        target = target.to(device) # Move target to appropriate device

        loss = nn.functional.cross_entropy(logits.flatten(0, 1), target.flatten()) # Use cross entropy loss
        # cross_entropy takes in 2D tensor for logits
        # and 1D tensor for targets

        total_loss += loss.item()
        # .item() extracts a numerical value from a 0D scalar tensor
    return total_loss / num_batches # len(dataloader) is number of batches

## Training

In [8]:
GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'context_length': 256,
    'n_embd': 768,
    'n_heads': 12,
    'n_layers': 12,
    'dropout_rate': 0.1,
    'qkv_bias': False,
    'device': 'cuda'
}

Useful helper function that can convert text into token IDs:

In [9]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)
    return torch.tensor(encoded).unsqueeze(0) #unsqueeze adds batch dimension 1

Helper function used to generate and print the output:

In [10]:
def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):
    model.eval()
    context_size = model.position_embedding.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_sample(model, encoded, max_new_tokens, context_size)
    decoded = tokenizer.decode(token_ids[0].squeeze(0).tolist())
    print(decoded.replace("\n", " "))
    model.train()

Helper function used to get loss values for the train and validation splits:

In [11]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calculate_loss(train_loader, model, device, eval_iter)
        val_loss = calculate_loss(val_loader, model, device, eval_iter)
    model.train()
    return train_loss, val_loss

Training loop:

In [12]:
def training_loop(model, train_dataloader, val_dataloader,
                  optimizer, device, num_epochs,
                  eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1
    for epoch in range(num_epochs):
        model.train() # Puts the model in training mode
        for input_batch, target_batch in train_dataloader:
            optimizer.zero_grad() # Zeros gradient calculations

            input_batch = input_batch.to(device) # Move to proper device
            target_batch = target_batch.to(device) # Move to proper device
            logits = model(input_batch)
            loss = nn.functional.cross_entropy(logits.flatten(0, 1),
                                               target_batch.flatten())

            # we are updating based on single batch here
            loss.backward() # computes the gradients
            optimizer.step() # updates the model parameters (optimizer is linked to model)
            # forward means passing through the model
            # backward means I compute the gradient of the loss wrt the parameters
            # Update by -lr * gradient

            tokens_seen += input_batch.numel() # number of elements
            # train_losses.append(loss.item())
            global_step += 1 # number of batches trained
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_dataloader, val_dataloader, device, eval_iter)
                val_losses.append(val_loss)
                train_losses.append(train_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f},"
                      f"Val loss {val_loss:.3f}")

        # Generate and print a sample for each epoch:
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen

Training a GPT-2 model on a dataset:

In [13]:
my_config = {
    'vocab_size': 50257,
    'context_length': 256,
    'n_embd': 768,
    'n_heads': 12,
    'n_layers': 12,
    'dropout_rate': 0.1,
    'qkv_bias': False,
    'device': 'cuda'
}

In [15]:
with open('training_data_3.txt', 'r', encoding="utf-8") as file:
    text_data_2 = file.read()

train_ratio = 0.8
split_idx = int(train_ratio * len(text_data_2))
train_data_2 = text_data_2[:split_idx]
val_data_2 = text_data_2[split_idx:]

train_dataloader_2 = my_batch(train_data_2, batch_size=20,
                            context_length=my_config['context_length'] // 2,
                            stride=my_config['context_length'] // 2,
                            shuffle=True, drop_last=True, num_workers=0)

val_dataloader_2 = my_batch(val_data_2, batch_size=20,
                          context_length=my_config['context_length'] // 2,
                          stride=my_config['context_length'] // 2,
                          shuffle=False, drop_last=False, num_workers=0)

model_2 = Simple_GPT(my_config)
model_2.to(my_config["device"])

optimizer_2 = torch.optim.AdamW(model_2.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 2 # When actually training, change to be more
start_context = "Once upon a time," # Replace

train_losses, val_losses, tokens_seen = training_loop(
    model_2, train_dataloader_2, val_dataloader_2, optimizer_2,
    my_config["device"], num_epochs,
    eval_freq=1, eval_iter=5, start_context=start_context, tokenizer=tokenizer
) # Run the training loop


Epoch 1 (Step 000000): Train loss 9.847,Val loss 9.846
Epoch 1 (Step 000001): Train loss 9.376,Val loss 9.403
Epoch 1 (Step 000002): Train loss 9.140,Val loss 9.249
Epoch 1 (Step 000003): Train loss 8.819,Val loss 8.904
Epoch 1 (Step 000004): Train loss 8.452,Val loss 8.546
Epoch 1 (Step 000005): Train loss 8.174,Val loss 8.261
Epoch 1 (Step 000006): Train loss 7.916,Val loss 8.046
Epoch 1 (Step 000007): Train loss 7.627,Val loss 7.781
Epoch 1 (Step 000008): Train loss 7.352,Val loss 7.567
Epoch 1 (Step 000009): Train loss 7.166,Val loss 7.420
Epoch 1 (Step 000010): Train loss 7.081,Val loss 7.295
Epoch 1 (Step 000011): Train loss 6.893,Val loss 7.181
Epoch 1 (Step 000012): Train loss 6.772,Val loss 7.080
Once upon a time, and the.                                               
Epoch 2 (Step 000013): Train loss 6.699,Val loss 7.009
Epoch 2 (Step 000014): Train loss 6.623,Val loss 6.968
Epoch 2 (Step 000015): Train loss 6.640,Val loss 6.948
Epoch 2 (Step 000016): Train loss 6.548,Val lo

# Loading weights from OpenAI

For this notebook, we will be using code from the book "Build a Large Language Model (From Scratch)" (https://github.com/rasbt/LLMs-from-scratch).

Downloading the textbook code used to download and GPT:

In [16]:
import urllib.request
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
filename = url.split("/")[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7fde6a841550>)

Downloading GPT:

In [29]:
from gpt_download import download_and_load_gpt2
settings, params = download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 168kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 622kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 211kiB/s]
model.ckpt.data-00000-of-00001:  92%|█████████▏| 456M/498M [04:27<00:24, 1.70MiB/s]


Primary URL (https://openaipublic.blob.core.windows.net/gpt-2/models/124M/model.ckpt.data-00000-of-00001) failed. Attempting backup URL: https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2/124M/model.ckpt.data-00000-of-00001


model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:32<00:00, 15.4MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 12.6MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:01<00:00, 395kiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:01<00:00, 382kiB/s]


In [19]:
print("Settings:", settings)
print("Parameter dictionary keys:", params.keys())

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 1024, 'n_head': 16, 'n_layer': 24}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


Configs for various model sizes:

In [30]:
model_configs = {
    "gpt2-small (124M)": {"n_embd": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"n_embd": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"n_embd": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"n_embd": 1600, "n_layers": 48, "n_heads": 25},
}

In [31]:
model_name = "gpt2-small (124M)"
# model_name = "gpt2-medium (355M)"
# model_name = "gpt2-large (774M)"
# model_name = "gpt2-xl (1558M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024})
NEW_CONFIG.update({"qkv_bias": True})
good_gpt = Simple_GPT(NEW_CONFIG)
good_gpt.eval()
0

0

Helper function that returns the right tensor given that the left and right tensors have the same size:

In [22]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                         f"Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

The following is a function used to load weights into a GPT-2 model. Note that the downloaded GPT must have the **exact** same structure as the passed in GPT model. The function loads each parameter group separately.

In [23]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.position_embedding.weight = assign(gpt.position_embedding.weight, params['wpe'])
    gpt.token_embedding.weight = assign(gpt.token_embedding.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.blocks[b].attn.W_query.weight = assign(gpt.blocks[b].attn.W_query.weight, q_w.T)
        gpt.blocks[b].attn.W_key.weight = assign(gpt.blocks[b].attn.W_key.weight, k_w.T)
        gpt.blocks[b].attn.W_value.weight = assign(gpt.blocks[b].attn.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.blocks[b].attn.W_query.bias = assign(gpt.blocks[b].attn.W_query.bias, q_b)
        gpt.blocks[b].attn.W_key.bias = assign(gpt.blocks[b].attn.W_key.bias, k_b)
        gpt.blocks[b].attn.W_value.bias = assign(gpt.blocks[b].attn.W_value.bias, v_b)

        gpt.blocks[b].attn.projection.weight = assign(gpt.blocks[b].attn.projection.weight,
                                                      params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.blocks[b].attn.projection.bias = assign(gpt.blocks[b].attn.projection.bias,
                                                    params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.blocks[b].ff.layers[0].weight = assign(gpt.blocks[b].ff.layers[0].weight,
                                                   params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.blocks[b].ff.layers[0].bias = assign(gpt.blocks[b].ff.layers[0].bias,
                                                   params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.blocks[b].ff.layers[2].weight = assign(gpt.blocks[b].ff.layers[2].weight,
                                                   params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.blocks[b].ff.layers[2].bias = assign(gpt.blocks[b].ff.layers[2].bias,
                                                   params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.blocks[b].ln1.gamma = assign(gpt.blocks[b].ln1.gamma, params["blocks"][b]["ln_1"]["g"])
        gpt.blocks[b].ln1.beta = assign(gpt.blocks[b].ln1.beta, params["blocks"][b]["ln_1"]["b"])
        gpt.blocks[b].ln2.gamma = assign(gpt.blocks[b].ln2.gamma, params["blocks"][b]["ln_2"]["g"])
        gpt.blocks[b].ln2.beta = assign(gpt.blocks[b].ln2.beta, params["blocks"][b]["ln_2"]["b"])

    gpt.ln_f.gamma = assign(gpt.ln_f.gamma, params["g"])
    gpt.ln_f.beta = assign(gpt.ln_f.beta, params["b"])
    gpt.prediction_layer.weight = assign(gpt.prediction_layer.weight, params["wte"])


In [32]:
load_weights_into_gpt(good_gpt, params)
good_gpt.to(device)
0

0

Once the weights are loaded into the model, the model performs very well.

In [34]:
generate_and_print_sample(good_gpt, tokenizer, device, start_context="Once upon a time,", max_new_tokens=100)

Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world


# Fine-tuning for classification

Here, we will fine-tune our GPT model to classify whether a text message is spam or not.

## Spam detection dataset

Downloading the spam dataset:

In [36]:
# We always need a specific dataset in order to fine-tune
# The dataset must be relevant to our task
# For example:
# Dataset where each datapoint is:
# (input: text, ground truth: yes/no)
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "spam_collection.zip"
extracted_path = "spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists.")
        return
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)

    print(f"Data downloaded and extracted to {extracted_path}.")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

spam_collection/SMSSpamCollection.tsv already exists.


Data is formatted as a pandas dataframe initially. `ham` means not spam.

In [37]:
import pandas as pd
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
# df.columns = ["label", "text"]
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


There are a lot more non-spam messages than spam messages in the dataset.

In [38]:
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


Function used to balance the dataset (equal numbers of spam and non-spam messages):

In [39]:
def create_balanced_spam_dataset(df):
    num_spam = df[df["Label"] == "spam"].shape[0]
    ham_subset = df[df["Label"] == "ham"].sample(
        num_spam, random_state=123
    )
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])
    return balanced_df

balanced_df = create_balanced_spam_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [40]:
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

Function used to create train, val, and test splits from the dataset:

In [42]:
from operator import index
# Split dataset into train, validation, test

def random_split(df, train_ratio=0.8, val_ratio=0.1):
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    train_end = int(train_ratio * len(df))
    val_end = train_end + int(val_ratio * len(df))

    train_df = df[:train_end]
    val_df = df[train_end:val_end] # Includes train_end but not val_end
    test_df = df[val_end:]

    return train_df, val_df, test_df

In [43]:
train_df, val_df, test_df = random_split(balanced_df)
train_df.to_csv("train.csv", index=None)
val_df.to_csv("val.csv", index=None)
test_df.to_csv("test.csv", index=None)

Often, we want to **pad** our dataset.

Padding example:
If the padding token is \`, then our message will look something like
"This is a text message \` \` \` \` \` \` \` \` ..."

Padding ensures that text messages are the same length.

Padding can allow everything to be grouped nicely into batches.

In [48]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# text = "Free money now! Enter the lottery system by giving your social security number"
text = "Free money now!"
tokens = tokenizer.encode(text)
print(tokens)

# Example: Truncating a sequence
max_length = 10
tokens = tokens[:max_length]
print(tokens)

# Example: Padding a sequence
pad_token_id = 50256
padded = tokens + [pad_token_id] * (max_length - len(tokens))

print(padded)

[11146, 1637, 783, 0]
[11146, 1637, 783, 0]
[11146, 1637, 783, 0, 50256, 50256, 50256, 50256, 50256, 50256]


**Exercise 1:** Fill in missing parts of the SpamDataset class.

In [49]:
class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
         # TODO: Tokenize the text
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # TODO: Truncate the text
            self.encoded_texts = [encoded_text[:self.max_length]
                                  for encoded_text in self.encoded_texts]

        # TODO: Pad the text
        self.encoded_texts = [encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
                              for encoded_text in self.encoded_texts]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoded_text = torch.tensor(self.encoded_texts[idx], dtype=torch.long)
        label = torch.tensor(self.data.iloc[idx]["Label"], dtype=torch.long)
        return encoded_text, label

    def _longest_encoded_length(self):
        return max(len(encoded_text) for encoded_text in self.encoded_texts)

Creating train, val, and test dataloaders of the Spam dataset:

In [50]:
tokenizer = tiktoken.get_encoding("gpt2")
max_length = None
train_dataset = SpamDataset(csv_file="train.csv", tokenizer=tokenizer, max_length=max_length)
val_dataset = SpamDataset(csv_file="val.csv", tokenizer=tokenizer, max_length=max_length)
test_dataset = SpamDataset(csv_file="test.csv", tokenizer=tokenizer, max_length=max_length)

In [51]:
num_workers = 0
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [56]:
for input_batch, target_batch in train_dataloader:
    # print(input_batch.shape, target_batch.shape)
    # print(input_batch)
    # print(target_batch)
    break
    pass


Input datapoint: text

Target datapoint: 0 or 1

## Fine-tuning

In [84]:
settings, params = download_and_load_gpt2(
    model_size="355M", models_dir="gpt2"
)

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


Initializing our model and optimizer:

In [96]:


# spam_model_name = "gpt2-small (124M)"
spam_model_name = "gpt2-medium (355M)"
# spam_model_name = "gpt2-large (774M)"
# spam_model_name = "gpt2-xl (1558M)"
SPAM_CONFIG = GPT_CONFIG_124M.copy()
SPAM_CONFIG.update(model_configs[spam_model_name])
SPAM_CONFIG.update({"context_length": 1024})
SPAM_CONFIG.update({"qkv_bias": True})


spam_model = Simple_GPT(SPAM_CONFIG)
load_weights_into_gpt(spam_model, params)
spam_model.to(device)

optimizer = torch.optim.AdamW(spam_model.parameters(), lr=0.0004, weight_decay=0.1)

SPAM_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'n_embd': 1024,
 'n_heads': 16,
 'n_layers': 24,
 'dropout_rate': 0.1,
 'qkv_bias': True,
 'device': 'cuda'}

Simple_GPT output size is 50257. We want the output size to be 2 for classification. For any model instance, we can replace portions of the model. We will replace the prediction layer to make it perform 2-way classification instead of next token prediction (50257-way classification).

In [97]:
# Change prediction layer into classification layer
num_classes = 2
spam_model.prediction_layer = nn.Linear(SPAM_CONFIG["n_embd"], num_classes)
spam_model.to(device)
0

0

In [98]:
for p in spam_model.prediction_layer.parameters():
    print(p.shape)

torch.Size([2, 1024])
torch.Size([2])


`requires_grad` determines whether a parameter group is trainable (`True` means trainable; `False` means frozen).

We only want to train part of the model when fine-tuning since we start from a pre-trained model. We can freeze certain parameters of the model.

In [99]:
# Freeze all model parameters
for param in spam_model.parameters():
    param.requires_grad = False

**Exercise 2:** Make the last transformer block and classification layer trainable without touching other parameters.

In [100]:
# Make last transformer block and classification layer trainable
for param in spam_model.blocks[-1].parameters():
    param.requires_grad = True

for param in spam_model.prediction_layer.parameters():
    param.requires_grad = True

**Exercise 3:** Fill missing parts of the accuracy calculation function:

In [101]:
def calculate_spam_accuracy(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_batch, target_batch in dataloader:
            # TODO: Move input and target batch to proper device
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            # TODO: Obtain logits
            logits = model(input_batch) # B x N x 2, where 2 is # classes

            # TODO: Get last token from each context window
            last_logits = logits[:, -1, :] # B x 2

            # TODO: Use argmax to get predicted labels
            predicted_labels = torch.argmax(last_logits, dim=-1)

            total += predicted_labels.shape[0] # total += Batch size
            correct += (predicted_labels == target_batch).sum().item()
    accuracy = correct / total
    return accuracy

In [102]:
train_accuracy = calculate_spam_accuracy(spam_model, train_dataloader, device)
val_accuracy = calculate_spam_accuracy(spam_model, val_dataloader, device)
test_accuracy = calculate_spam_accuracy(spam_model, test_dataloader, device)
print(f"Train accuracy: {train_accuracy}")
print(f"Val accuracy: {val_accuracy}")
print(f"Test accuracy: {test_accuracy}")

Train accuracy: 0.499581589958159
Val accuracy: 0.47651006711409394
Test accuracy: 0.5266666666666666


**Exercise 4:** Fill in missing parts about calculating spam loss:

In [103]:
# Calculate the loss for a single batch
def calculate_spam_loss_batch(input_batch, target_batch, model, device):
    # TODO: Move batches to the proper device
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    # TODO: Obtain logits
    logits = model(input_batch)[:, -1, :]
    # Unlike our original training, we only want the last token in each context window
    # Originally: Predict next token -> correctly predicting earlier tokens is some measure
    # of model's performance
    # Now: Binary classification -> only care about the final classification
    # We don't care about classifications with incomplete information

    # TODO: Calculate loss
    loss = nn.functional.cross_entropy(logits, target_batch)
    return loss

In [92]:
# Calculate overall spam loss
def calculate_spam_loss(model, dataloader, device):
    total_loss = 0.0
    total_tokens = 0
    if len(dataloader) == 0:
        return float("nan")

    for input_batch, target_batch in dataloader:
        # TODO: Calculate batch loss
        loss = calculate_spam_loss_batch(input_batch, target_batch, model, device)

        # TODO: Update total_loss
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [104]:
with torch.no_grad():
    train_loss = calculate_spam_loss(spam_model, train_dataloader, device)
    val_loss = calculate_spam_loss(spam_model, val_dataloader, device)
    test_loss = calculate_spam_loss(spam_model, test_dataloader, device)
print(f"Train loss: {train_loss}")
print(f"Val loss: {val_loss}")
print(f"Test loss: {test_loss}")

Train loss: 2.520157130236427
Val loss: 2.6221918369594372
Test loss: 2.495323482312654


**Exercise 5:** Fill in missing parts of the training loop.

In [105]:
def train_spam_classifier(model, train_dataloader, val_dataloader,
                          optimizer, device, num_epochs, eval_freq=50, eval_iter=5):
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []

    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_dataloader:
            # TODO: Perform one parameter update
            # (you can use last lecture's training loop as reference)
            optimizer.zero_grad()
            loss = calculate_spam_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            global_step += 1
            examples_seen += input_batch.shape[0] # batch size

            if global_step % eval_freq == 0:
                with torch.no_grad():
                    train_loss = calculate_spam_loss(model, train_dataloader, device)
                    val_loss = calculate_spam_loss(model, val_dataloader, device)
                train_losses.append(train_loss)
                val_losses.append(val_loss)

                print(f"Epoch {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f},"
                      f"Val loss {val_loss:.3f}")

        train_acc = calculate_spam_accuracy(model, train_dataloader, device)
        val_acc = calculate_spam_accuracy(model, val_dataloader, device)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        print(f"Train accuracy: {train_acc}")
        print(f"Val accuracy: {val_acc}")

    return train_losses, val_losses, train_accs, val_accs

In [106]:
spam_optimizer = torch.optim.AdamW(spam_model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5

train_losses, val_losses, train_accs, val_accs = train_spam_classifier(
    spam_model, train_dataloader, val_dataloader,
    spam_optimizer, device, num_epochs)

Epoch 1 (Step 000000): Train loss 2.073,Val loss 2.237
Epoch 1 (Step 000050): Train loss 0.718,Val loss 0.814
Epoch 1 (Step 000100): Train loss 0.591,Val loss 0.664
Train accuracy: 0.797489539748954
Val accuracy: 0.5167785234899329
Epoch 2 (Step 000150): Train loss 0.536,Val loss 0.781
Epoch 2 (Step 000200): Train loss 0.510,Val loss 0.934
Epoch 2 (Step 000250): Train loss 0.499,Val loss 0.990
Train accuracy: 0.8359832635983263
Val accuracy: 0.5167785234899329
Epoch 3 (Step 000300): Train loss 0.473,Val loss 0.674
Epoch 3 (Step 000350): Train loss 0.480,Val loss 0.858
Epoch 3 (Step 000400): Train loss 0.475,Val loss 0.646
Train accuracy: 0.8267782426778243
Val accuracy: 0.5167785234899329
Epoch 4 (Step 000450): Train loss 0.442,Val loss 0.800
Epoch 4 (Step 000500): Train loss 0.426,Val loss 0.524
Epoch 4 (Step 000550): Train loss 0.343,Val loss 0.772
Train accuracy: 0.9154811715481171
Val accuracy: 0.5167785234899329
Epoch 5 (Step 000600): Train loss 0.264,Val loss 0.604
Epoch 5 (Step 

In [107]:
calculate_spam_accuracy(spam_model, test_dataloader, device)

0.5133333333333333

Using our spam classification model for inference:

In [81]:
def classify_spam_text(text, model, tokenizer, device, max_length=None,
                       pad_token_id=50256):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer.encode(text)
        supported_context_length = model.config["context_length"]
        if max_length is None:
            max_length = supported_context_length
        input_ids = input_ids[:min(max_length, supported_context_length)]
        input_ids += [pad_token_id] * (max_length - len(input_ids))

        input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)
        logits = model(input_tensor)[:, -1, :]
        predicted_label = torch.argmax(logits, dim=-1).item()
    return ("spam" if predicted_label == 1 else "not spam", torch.softmax(logits, dim=-1))

In [82]:
sample_text_1 = "You are a winner you have been specially selected to receive $1000"
sample_text_2 = "Are you coming home tonight"
sample_text_3 = """
Pennsylvania (DMV) Final Notice: Enforcement Begins August 6nd. \
Our records indicate that as of today, you still have an outstanding traffic ticket.
"""
sample_text_4 = "MIT Alert: Gas leak in Building 46. Responders on scene. Vassar Street closed."
for text in [sample_text_1, sample_text_2, sample_text_3, sample_text_4]:
    print(classify_spam_text(text, spam_model, tokenizer, device,
                         max_length=train_dataset.max_length))

('spam', tensor([[0.0038, 0.9962]], device='cuda:0'))
('not spam', tensor([[0.9976, 0.0024]], device='cuda:0'))
('spam', tensor([[0.0731, 0.9269]], device='cuda:0'))
('spam', tensor([[0.1085, 0.8915]], device='cuda:0'))


# Attribution

Portions of this notebook are adapted from LLMs from Scratch by Sebastian Raschka
https://github.com/rasbt/LLMs-from-scratch

Licensed under the Apache License 2.0