# GPT-2 Model and Training Review

## Preliminaries

Recall our GPT-2 model:

In [None]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Reads the config keys "n_embd", "n_heads", "context_length",
    "qkv_bias" and "dropout_rate".

    Raises:
        ValueError: if "n_embd" is not divisible by "n_heads".
    """

    def __init__(self, config):
        super().__init__()
        self.d_in = config["n_embd"]
        self.d_out = config["n_embd"]
        self.num_heads = config["n_heads"]
        # Fail early: a non-divisible width would otherwise only surface as an
        # opaque shape error from .view() inside forward().
        if self.d_out % self.num_heads != 0:
            raise ValueError(
                f"n_embd ({self.d_out}) must be divisible by "
                f"n_heads ({self.num_heads})"
            )
        self.d_head = self.d_out // self.num_heads  # Dimension of each head
        self.context_length = config["context_length"]
        self.W_query = nn.Linear(self.d_in, self.d_out, bias=config["qkv_bias"])
        self.W_key = nn.Linear(self.d_in, self.d_out, bias=config["qkv_bias"])
        self.W_value = nn.Linear(self.d_in, self.d_out, bias=config["qkv_bias"])
        self.dropout = nn.Dropout(config["dropout_rate"])
        # Lower-triangular ones: position i may attend to positions <= i.
        causal_mask = torch.tril(torch.ones(self.context_length, self.context_length))
        self.projection = nn.Linear(self.d_out, self.d_out)

        # Registered as a buffer so it follows the module across devices.
        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        """Apply causal self-attention to x of shape [B, N, n_embd]; returns the same shape."""
        B, N, D = x.shape
        Q = self.W_query(x)
        K = self.W_key(x)
        V = self.W_value(x)

        # Split the embedding dimension into heads: [B, num_heads, N, d_head]
        Q = Q.view(B, N, self.num_heads, self.d_head).transpose(1, 2)
        K = K.view(B, N, self.num_heads, self.d_head).transpose(1, 2)
        V = V.view(B, N, self.num_heads, self.d_head).transpose(1, 2)

        QKT = Q @ K.transpose(2, 3)
        # [:N, :N] because N may be shorter than the configured context length.
        masked_QKT = QKT.masked_fill(self.mask[:N, :N] == 0, float('-inf'))
        attention_probs = torch.softmax(masked_QKT / (self.d_head ** 0.5), dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_vector = attention_probs @ V
        # Merge the heads back: [B, N, d_out]
        context_vector = context_vector.transpose(1, 2).contiguous().view(B, N, self.d_out)
        return self.projection(context_vector)

class FeedForward(torch.nn.Module):
    """Position-wise MLP: widen to 4x the embedding size, apply GELU, project back."""

    def __init__(self, config):
        super().__init__()
        width = config["n_embd"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)
        activation = nn.GELU()
        contract = nn.Linear(hidden, width)
        # The two Linear layers sit at indices 0 and 2 of this Sequential.
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run x through the expand -> GELU -> contract stack."""
        return self.layers(x)

class LayerNorm(torch.nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Uses the biased (population) variance and adds eps inside the square
    root, which is the standard layer-norm definition. The Bessel-corrected
    x.std() with eps added outside the root gives slightly different outputs.
    """

    def __init__(self, config):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(config["n_embd"]))   # scale
        self.beta = nn.Parameter(torch.zeros(config["n_embd"]))   # shift
        self.eps = 1e-5

    def forward(self, x):
        """Normalise x along its last dimension, then apply gamma and beta."""
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(var + self.eps)  # Normalize
        return self.gamma * x + self.beta  # Apply learned affine map

class TransformerBlock(torch.nn.Module):
    """Pre-norm transformer block.

    Each sub-layer (attention, then feed-forward) is applied as
    x + dropout(sublayer(layer_norm(x))). One Dropout module is shared by both.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config)
        self.attn = MultiHeadAttention(config)
        self.dropout = nn.Dropout(config["dropout_rate"])
        self.ff = FeedForward(config)
        self.ln2 = LayerNorm(config)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residual connections."""
        # Attention sub-layer with residual connection
        attended = self.attn(self.ln1(x))
        x = x + self.dropout(attended)

        # Feed-forward sub-layer with residual connection
        transformed = self.ff(self.ln2(x))
        return x + self.dropout(transformed)

class Simple_GPT(torch.nn.Module):
    """GPT-2 style decoder: token + position embeddings, a stack of
    transformer blocks, a final layer norm and a linear output head.

    Reads the config keys "vocab_size", "context_length", "n_embd",
    "n_layers" and "dropout_rate" (plus whatever the blocks read).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config["vocab_size"], config["n_embd"])
        self.position_embedding = nn.Embedding(config["context_length"], config["n_embd"])
        self.dropout = nn.Dropout(config["dropout_rate"])
        # f(*[2, 3, 5, 7]) means f(2, 3, 5, 7)
        self.blocks = nn.Sequential(*[TransformerBlock(config)
                                      for _ in range(config["n_layers"])])
        self.ln_f = LayerNorm(config)  # Final layer norm
        # Linear mapping to vocab size. No bias: the weight loader in this file
        # only supplies a weight matrix for this layer, so a default bias would
        # stay randomly initialised and perturb the pretrained logits.
        self.prediction_layer = nn.Linear(config["n_embd"], config["vocab_size"],
                                          bias=False)

        # Register torch.arange(context_length) as a buffer so the position
        # indices move with the module across devices.
        self.register_buffer("pos_range", torch.arange(config["context_length"]))

    def forward(self, x):
        """Map token ids x of shape [B, N] to logits of shape [B, N, vocab_size]."""
        B, N = x.shape      # B is batch size, N is the sequence length
        token_embeddings = self.token_embedding(x)  # [B, N, n_embd]
        position_embeddings = self.position_embedding(self.pos_range[:N])  # [N, n_embd]
        x = token_embeddings + position_embeddings  # Broadcast over batch; [B, N, n_embd]
        x = self.dropout(x)
        x = self.blocks(x)  # [B, N, n_embd]
        x = self.ln_f(x)
        logits = self.prediction_layer(x)   # [B, N, vocab_size]
        return logits

In [None]:
# Hyperparameters for the model classes defined above.
config = dict(
    vocab_size=50257,      # number of token ids
    context_length=1024,   # maximum sequence length
    n_embd=768,            # embedding width
    n_heads=12,            # attention heads per block
    n_layers=12,           # number of transformer blocks
    dropout_rate=0.0,
    qkv_bias=False,        # bias on the query/key/value projections
)

We will be using the `cuda` device.

In [None]:
# Name of the torch device that models and inputs are moved to in later cells.
device = "cuda"

Recall the text generation process:

In [None]:
def generate_text_sample(model, idx, max_new_tokens, context_length):
    """Greedily extend idx by max_new_tokens tokens.

    idx has size [batch_size, n_tokens]. At each step the model sees at most
    the last context_length tokens, and the highest-probability next token is
    appended. Returns the extended tensor [batch_size, n_tokens + max_new_tokens].
    """
    tokens = idx
    for _step in range(max_new_tokens):
        window = tokens[:, -context_length:]      # latest context window
        with torch.no_grad():
            all_logits = model(window)
        last_logits = all_logits[:, -1, :]        # keep batch and vocab dims
        next_probs = torch.softmax(last_logits, dim=-1)
        next_token = next_probs.argmax(dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_token], dim=1)  # grow along the sequence axis
    return tokens

In [None]:
import tiktoken
# tiktoken's "gpt2" encoding; this tokenizer object is reused by later cells.
tokenizer = tiktoken.get_encoding("gpt2")

In order to train our model, we need data. A dataset class and a dataloader function will be helpful.

In [None]:
from torch.utils.data import Dataset, DataLoader
# Dataset class
class MyData(Dataset):
    """Sliding-window next-token dataset built from a single text.

    Example: dataset = MyData(text, tokenizer, context_length=4, stride=1).
    Each item is an (input, target) pair of context_length-token tensors,
    where the target is the input shifted one token to the right.
    """

    def __init__(self, text, tokenizer, context_length, stride=1):
        token_ids = tokenizer.encode(text)
        # Window starts; the upper bound leaves room for the shifted target.
        starts = range(0, len(token_ids) - context_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[s : s + context_length]) for s in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[s + 1 : s + context_length + 1]) for s in starts
        ]

    def __len__(self):
        """Number of windows: len(dataset)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) pair at position idx: dataset[idx]."""
        return self.input_ids[idx], self.target_ids[idx]

def my_batch(text, batch_size, context_length, stride, shuffle=True, drop_last=True, num_workers=0):
    """Tokenize text with the GPT-2 encoding and return a DataLoader over
    MyData windows of context_length tokens taken every stride tokens."""
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = MyData(text, gpt2_tokenizer, context_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

The following function computes the average cross-entropy loss of the model over the batches of a dataloader; the same loss is used to train the model.

In [None]:
def calculate_loss(dataloader, model, device="cpu", num_batches=None): # 1 epoch average loss
    """Return the mean per-batch cross-entropy loss of model over dataloader.

    Args:
        dataloader: sized iterable yielding (input, target) tensor pairs.
        model: callable mapping inputs to logits; it is put into eval mode.
        device: device the batches are moved to before the forward pass.
        num_batches: evaluate at most this many batches (None = all).

    Returns:
        The average loss as a float, or NaN when there is nothing to
        evaluate (empty dataloader or num_batches <= 0).
    """
    available = len(dataloader)
    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)
    model.eval()
    if num_batches <= 0:
        # Avoid a ZeroDivisionError when there are no batches to average.
        return float("nan")

    total_loss = 0.0
    # Only a float is returned, so no autograd graph is needed.
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(dataloader):  # i is batch index
            if i >= num_batches:
                break

            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)

            # cross_entropy takes 2D logits and 1D targets
            loss = nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

            # .item() extracts a Python number from a 0D tensor
            total_loss += loss.item()
    return total_loss / num_batches

## Training

In [None]:
# Configuration with the GPT-2 small (124M) layer sizes and a 256-token context.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,
    n_embd=768,
    n_heads=12,
    n_layers=12,
    dropout_rate=0.1,
    qkv_bias=False,
    device='cuda',
)

Useful helper function that can convert text into token IDs:

In [None]:
def text_to_token_ids(text, tokenizer):
    """Encode text with tokenizer and return the ids as a tensor with a leading batch dimension of 1."""
    ids = torch.tensor(tokenizer.encode(text))
    return ids[None]  # indexing with None prepends the batch dimension

Helper function used to generate and print the output:

In [None]:
def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):
    """Generate a continuation of start_context and print it on one line.

    The model is switched to eval mode while generating and is returned to
    whatever mode it was in before the call (previously it was always left
    in training mode, even when the caller had it in eval mode).
    """
    was_training = model.training
    model.eval()
    try:
        # The position-embedding table has one row per context position.
        context_size = model.position_embedding.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_sample(model, encoded, max_new_tokens, context_size)
        # token_ids[0] is already the 1-D sequence of the single batch entry.
        decoded = tokenizer.decode(token_ids[0].tolist())
        print(decoded.replace("\n", " "))
    finally:
        model.train(was_training)

Helper function used to get loss values for the train and validation splits:

In [None]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return (train_loss, val_loss), each averaged over at most eval_iter batches.

    The model is evaluated in eval mode without gradients and is then
    restored to the mode it was in before the call, rather than being
    forced into training mode.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calculate_loss(train_loader, model, device, eval_iter)
            val_loss = calculate_loss(val_loader, model, device, eval_iter)
    finally:
        model.train(was_training)
    return train_loss, val_loss

Training loop:

In [None]:
def training_loop(model, train_dataloader, val_dataloader,
                  optimizer, device, num_epochs,
                  eval_freq, eval_iter, start_context, tokenizer):
    """Train model for num_epochs passes over train_dataloader.

    Every eval_freq optimizer steps the train/validation losses are estimated
    on eval_iter batches, recorded and printed. After each epoch a text sample
    continuing start_context is printed.

    Returns:
        (train_losses, val_losses, track_tokens_seen), one entry per evaluation.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1  # becomes 0 after the first batch
    for epoch in range(num_epochs):
        model.train()  # training mode (enables dropout)
        for input_batch, target_batch in train_dataloader:
            optimizer.zero_grad()  # clear gradients from the previous step

            inputs = input_batch.to(device)
            targets = target_batch.to(device)
            logits = model(inputs)  # forward pass
            loss = nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

            # One update per batch: backward computes gradients of the loss
            # with respect to the parameters, step applies the update.
            loss.backward()
            optimizer.step()

            tokens_seen += inputs.numel()
            global_step += 1  # number of batches trained so far, minus one
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(model, train_dataloader, val_dataloader, device, eval_iter)
            val_losses.append(val_loss)
            train_losses.append(train_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Epoch {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f},"
                  f"Val loss {val_loss:.3f}")

        # Qualitative check at the end of every epoch
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen

Training a GPT-2 model on a dataset:

In [None]:
# Model configuration used for the from-scratch training run below.
my_config = dict(
    vocab_size=50257,
    context_length=256,
    n_embd=768,
    n_heads=12,
    n_layers=12,
    dropout_rate=0.1,
    qkv_bias=False,
    device='cuda',
)

In [None]:
# Read the raw training text.
with open('training_data_3.txt', 'r', encoding="utf-8") as file:
    text_data_2 = file.read()

# Character-level 80/20 split into training and validation text.
train_ratio = 0.8
split_idx = int(train_ratio * len(text_data_2))
train_data_2, val_data_2 = text_data_2[:split_idx], text_data_2[split_idx:]

# Non-overlapping windows of half the model's context length.
train_dataloader_2 = my_batch(
    train_data_2,
    batch_size=20,
    context_length=my_config['context_length'] // 2,
    stride=my_config['context_length'] // 2,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)
val_dataloader_2 = my_batch(
    val_data_2,
    batch_size=20,
    context_length=my_config['context_length'] // 2,
    stride=my_config['context_length'] // 2,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

model_2 = Simple_GPT(my_config)
model_2.to(my_config["device"])

optimizer_2 = torch.optim.AdamW(model_2.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 2 # When actually training, change to be more
start_context = "Once upon a time," # Replace

# Run the training loop, evaluating after every step on 5 batches.
train_losses, val_losses, tokens_seen = training_loop(
    model_2,
    train_dataloader_2,
    val_dataloader_2,
    optimizer_2,
    my_config["device"],
    num_epochs,
    eval_freq=1,
    eval_iter=5,
    start_context=start_context,
    tokenizer=tokenizer,
)


Epoch 1 (Step 000000): Train loss 9.800,Val loss 9.827
Epoch 1 (Step 000001): Train loss 9.267,Val loss 9.343
Epoch 1 (Step 000002): Train loss 9.019,Val loss 9.078
Epoch 1 (Step 000003): Train loss 8.678,Val loss 8.737
Epoch 1 (Step 000004): Train loss 8.342,Val loss 8.444
Epoch 1 (Step 000005): Train loss 8.089,Val loss 8.188
Epoch 1 (Step 000006): Train loss 7.813,Val loss 7.929
Epoch 1 (Step 000007): Train loss 7.559,Val loss 7.719
Epoch 1 (Step 000008): Train loss 7.329,Val loss 7.535
Epoch 1 (Step 000009): Train loss 7.189,Val loss 7.363
Epoch 1 (Step 000010): Train loss 7.017,Val loss 7.226
Epoch 1 (Step 000011): Train loss 6.835,Val loss 7.129
Epoch 1 (Step 000012): Train loss 6.767,Val loss 7.068
Once upon a time, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and,
Epoch 2 (Step 000013): Train loss 6.646,Val loss 7.018
Epoch 2 (Step 000014): Train loss 6.667,Val loss 6.989
Epoch 2 (Step 000015): Train los

# Loading weights from OpenAI

For this notebook, we will be using code from the book "Build a Large Language Model (From Scratch)" (https://github.com/rasbt/LLMs-from-scratch).

Downloading the textbook's helper script (`gpt_download.py`), which is used to download and load the GPT-2 weights:

In [None]:
import urllib.request
# Location of the book's gpt_download.py helper script.
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch05/01_main-chapter-code/gpt_download.py"
# Save it under its own file name (the last path component of the URL).
filename = url.rsplit("/", 1)[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x79f2c7978080>)

Downloading GPT:

In [None]:
from gpt_download import download_and_load_gpt2
# Fetch (or reuse from the "gpt2" directory) the 355M checkpoint.
settings, params = download_and_load_gpt2(model_size="355M", models_dir="gpt2")

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


In [None]:
# Inspect the checkpoint's hyperparameters and the top-level parameter groups.
print(f"Settings: {settings}")
print(f"Parameter dictionary keys: {params.keys()}")

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 1024, 'n_head': 16, 'n_layer': 24}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


Configs for various model sizes:

In [None]:
# Size-specific settings per model name: (name, n_embd, n_layers, n_heads).
model_configs = {
    name: {"n_embd": n_embd, "n_layers": n_layers, "n_heads": n_heads}
    for name, n_embd, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

In [None]:
# Pick the model size; it must match the checkpoint downloaded above.
# model_name = "gpt2-small (124M)"
model_name = "gpt2-medium (355M)"
# model_name = "gpt2-large (774M)"
# model_name = "gpt2-xl (1558M)"

# Start from the 124M config, then override the size-specific entries,
# the context length and the QKV bias setting.
NEW_CONFIG = {
    **GPT_CONFIG_124M,
    **model_configs[model_name],
    "context_length": 1024,
    "qkv_bias": True,
}
good_gpt = Simple_GPT(NEW_CONFIG)
good_gpt.eval()
0

0

Helper function that checks that the left and right tensors have the same shape and, if they do, returns the right tensor wrapped as a model parameter (otherwise it raises an error):

In [None]:
def assign(left, right):
    """Return right as a new torch.nn.Parameter after checking it matches left.

    Args:
        left: the existing parameter being replaced; supplies the expected
            shape as well as the dtype and device of the result.
        right: array-like (e.g. NumPy array or tensor) holding the new values.

    Raises:
        ValueError: if left and right have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                         f"Right: {right.shape}")
    # as_tensor accepts arrays and tensors alike (torch.tensor warns when
    # given a tensor) and lets the result keep the target's dtype and device.
    values = torch.as_tensor(right, dtype=left.dtype, device=left.device)
    # Copy so the parameter never shares memory with the source array.
    return torch.nn.Parameter(values.detach().clone())

The following is a function used to load weights into a GPT-2 model. Note that the downloaded GPT must have the **exact** same structure as the passed in GPT model. The function loads each parameter group separately.

In [None]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy the downloaded GPT-2 parameters in params into the model gpt.

    Every tensor goes through assign(), which checks the shapes. Because
    biases are assigned to the query/key/value projections, the model must
    have been built with those biases enabled.

    Raises:
        ValueError: if the number of transformer blocks differs between the
            model and the checkpoint, or if any shape check fails.
    """
    if len(params["blocks"]) != len(gpt.blocks):
        raise ValueError(
            f"Block count mismatch. Model: {len(gpt.blocks)}, "
            f"Checkpoint: {len(params['blocks'])}"
        )

    gpt.position_embedding.weight = assign(gpt.position_embedding.weight, params['wpe'])
    gpt.token_embedding.weight = assign(gpt.token_embedding.weight, params['wte'])

    for block, block_params in zip(gpt.blocks, params["blocks"]):
        attn = block.attn
        attn_params = block_params["attn"]

        # The checkpoint stores Q, K and V fused along the last axis. Weights
        # are transposed before assignment; biases are used as they are.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        attn.W_query.weight = assign(attn.W_query.weight, q_w.T)
        attn.W_key.weight = assign(attn.W_key.weight, k_w.T)
        attn.W_value.weight = assign(attn.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        attn.W_query.bias = assign(attn.W_query.bias, q_b)
        attn.W_key.bias = assign(attn.W_key.bias, k_b)
        attn.W_value.bias = assign(attn.W_value.bias, v_b)

        attn.projection.weight = assign(attn.projection.weight,
                                        attn_params["c_proj"]["w"].T)
        attn.projection.bias = assign(attn.projection.bias,
                                      attn_params["c_proj"]["b"])

        # Feed-forward: the Linear layers are entries 0 and 2 of ff.layers.
        mlp_params = block_params["mlp"]
        fc_in, fc_out = block.ff.layers[0], block.ff.layers[2]
        fc_in.weight = assign(fc_in.weight, mlp_params["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, mlp_params["c_fc"]["b"])
        fc_out.weight = assign(fc_out.weight, mlp_params["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, mlp_params["c_proj"]["b"])

        block.ln1.gamma = assign(block.ln1.gamma, block_params["ln_1"]["g"])
        block.ln1.beta = assign(block.ln1.beta, block_params["ln_1"]["b"])
        block.ln2.gamma = assign(block.ln2.gamma, block_params["ln_2"]["g"])
        block.ln2.beta = assign(block.ln2.beta, block_params["ln_2"]["b"])

    gpt.ln_f.gamma = assign(gpt.ln_f.gamma, params["g"])
    gpt.ln_f.beta = assign(gpt.ln_f.beta, params["b"])

    # The output head reuses the token-embedding matrix.
    gpt.prediction_layer.weight = assign(gpt.prediction_layer.weight, params["wte"])
    # No head bias is loaded from params; if the model has one, zero it so a
    # random initial bias does not shift the pretrained logits.
    head_bias = gpt.prediction_layer.bias
    if head_bias is not None:
        gpt.prediction_layer.bias = torch.nn.Parameter(torch.zeros_like(head_bias))


In [None]:
# Copy the downloaded weights into the model, then move the model to `device`.
load_weights_into_gpt(good_gpt, params)
good_gpt.to(device)
# Final expression of the cell; its value (0) is what the cell displays.
0

0

Once the weights are loaded into the model, the model performs very well.

In [None]:
# Print a 100-token greedy continuation from the model with loaded weights.
generate_and_print_sample(good_gpt, tokenizer, device, start_context="Once upon a time,", max_new_tokens=100)

Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world


# Instruction tuning GPT-2

In [None]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
    """Return the parsed JSON stored at file_path, downloading it first if needed.

    If file_path does not exist, the content of url is fetched, decoded as
    UTF-8 and written to file_path. The file is then read back and parsed.
    """
    # Local import: a bare "import urllib" does not guarantee that the
    # urllib.request submodule is loaded.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        # Write with the same encoding the payload was decoded with instead
        # of relying on the platform default.
        with open(file_path, "w", encoding="utf-8") as out_file:
            out_file.write(text_data)

    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

In [None]:
# Local cache file and remote source of the instruction dataset.
file_path = "instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"

data = download_and_load_file(file_path, url)
print(len(data))

1100


In [None]:
# Dump the whole dataset to see its structure.
print(data) # list of dictionaries

[{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.', 'input': 'freind --> friend', 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}, {'instruction': 'Edit the following sentence for grammar.', 'input': 'He go to the park every day.', 'output': 'He goes to the park every day.'}, {'instruction': 'Convert 45 kilometers to meters.', 'input': '', 'output': '45 kilometers is 45000 meters.'}, {'instruction': "Rewrite this sentence to start with 'Although': Despite the rain, they went for a walk.", 'input': '', 'output': 'Although it was raining, they went for a walk.'}, {'instruction': 'What are the first 10 square numbers?', 'input': '', 'output': '1, 4, 9, 16, 25, 36, 49, 64, 81, 100.'}, {'instruction': 'Suggest a more formal synonym for "happy."', 'input': '', 'output': 'A more formal synonym for "happy" is "content."'}, {'instruction': 'Translate the following sentence into French.', 'input': 'W

In [None]:
# Display one entry; each has 'instruction', 'input' and 'output' keys.
data[50]

{'instruction': 'Identify the correct spelling of the following word.',
 'input': 'Ocassion',
 'output': "The correct spelling is 'Occasion.'"}

In order to train GPT to respond to questions, we need to format our text with an instruction and an input.

In [None]:
def format_input(entry):
    """Build the prompt for one dataset entry.

    The prompt is a fixed preamble followed by an "### Instruction:" section
    and, only when entry["input"] is non-empty, an "### Input:" section.
    """
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request"
    )
    prompt = f"{preamble}\n\n### Instruction:\n{entry['instruction']}"
    if entry["input"]:
        prompt += f"\n\n### Input:\n{entry['input']}"
    return prompt

In [None]:
# Show the formatted prompt for an entry that has a non-empty 'input' field.
print(format_input(data[1]))

Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Edit the following sentence for grammar.

### Input:
He go to the park every day.


**Exercise 1:** Split the data into training, test, and validation portions (the first 85% is the training set, the next 10% is the test set, and the last 5% is the validation set).

In [None]:
# Split fractions; the validation set takes whatever remains (about 5%).
train_ratio = 0.85
test_ratio = 0.1
train_portion = int(len(data) * train_ratio)
test_portion = int(len(data) * test_ratio)  # use test_ratio, not a repeated literal
val_portion = len(data) - train_portion - test_portion

# Consecutive slices: train first, then test, then validation.
train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]
print(len(train_data))
print(len(val_data))
print(len(test_data))

935
55
110


Our instruction dataset class:

In [None]:
import torch
from torch.utils.data import Dataset
class InstructionDataset(Dataset):
    """Dataset of pre-tokenized instruction examples.

    Each entry is rendered as the format_input() prompt followed by a
    "### Response:" section containing entry['output'], then encoded once
    up front with tokenizer.encode.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = [
            tokenizer.encode(
                format_input(entry) + f"\n\n### Response:\n{entry['output']}"
            )
            for entry in data
        ]

    def __getitem__(self, index):
        """Return the token-id list of the example at index."""
        return self.encoded_texts[index]

    def __len__(self):
        """Number of examples."""
        return len(self.data)

Collate function that takes a batch of token sequences of different lengths and turns them into padded input-target pairs:

In [None]:
def custom_collate(batch, pad_token_id=50256, ignore_index=-100,
                   allowed_max_length=None, device="cpu"):
    """Pad a batch of token-id lists and build shifted input/target tensors.

    Every sequence is padded with pad_token_id to the longest length in the
    batch plus one. Inputs drop the last token and targets drop the first.
    In the targets, every pad token after the first is replaced by
    ignore_index. Both are optionally truncated to allowed_max_length,
    stacked, and moved to device.

    Returns:
        (inputs_tensor, targets_tensor), each of shape [batch, length].
    """
    # One extra slot so that each sequence ends with at least one pad token.
    padded_length = max(len(seq) + 1 for seq in batch)

    inputs_lst = []
    targets_lst = []
    for seq in batch:
        n_pad = padded_length - len(seq)
        padded = seq + [pad_token_id] * n_pad   # new list; seq is untouched

        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # Keep the first pad token as a target; ignore the rest.
        pad_positions = torch.where(targets == pad_token_id)[0]
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    return torch.stack(inputs_lst).to(device), torch.stack(targets_lst).to(device)

In [None]:
# Small example: three sequences of different lengths collated into one batch.
c = custom_collate(([0, 1, 2, 3, 4], [5, 6], [7, 8, 9]))
print(c[0])  # padded inputs
print(c[1])  # shifted targets

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


Creating train, val, and test dataloaders:

In [None]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

# Training split: shuffled, incomplete final batch dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
)

# Validation split: fixed order, all examples kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

# Test split: fixed order, all examples kept.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

In [None]:
# Print the input/target shapes of every training batch.
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

torch.Size([8, 60]) torch.Size([8, 60])
torch.Size([8, 56]) torch.Size([8, 56])
torch.Size([8, 60]) torch.Size([8, 60])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 58]) torch.Size([8, 58])
torch.Size([8, 87]) torch.Size([8, 87])
torch.Size([8, 56]) torch.Size([8, 56])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 60]) torch.Size([8, 60])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 57]) torch.Size([8, 57])
torch.Size([8, 60]) torch.Size([8, 60])


Initializing and loading weights into our GPT model:

In [None]:
from gpt_download import download_and_load_gpt2
device = "cuda"

# Settings shared by every model size.
BASE_CONFIG = dict(
    vocab_size=50257,
    context_length=1024,
    dropout_rate=0.0,
    qkv_bias=True,
    device=device,
)

# Size-specific settings: (name, n_embd, n_layers, n_heads).
model_configs = {
    name: {"n_embd": n_embd, "n_layers": n_layers, "n_heads": n_heads}
    for name, n_embd, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# "gpt2-medium (355M)" -> "355M"
model_size = CHOOSE_MODEL.split(" ")[-1].strip("()")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = Simple_GPT(BASE_CONFIG) #.to(device)
load_weights_into_gpt(model, params) # CAREFUL: This can load CPU weights, causing issues
model = model.to(device) # Making sure things are in GPU
model.eval()
0

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


0

In [None]:
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids to text after squeezing a leading dimension of size 1."""
    flat_ids = token_ids.squeeze(0)
    id_list = flat_ids.tolist()
    return tokenizer.decode(id_list)

**Exercise 2:** Fill in missing parts of the following `generate_text_sample_2` and `generate_and_print_sample_2`.

In [None]:
def generate_text_sample_2(model, idx, max_new_tokens, context_length, eos_id=None):
    """Greedily generate up to max_new_tokens tokens, stopping early at eos_id.

    Args:
        model: callable mapping token ids [B, N] to logits [B, N, vocab].
        idx: tensor of token ids in the current context, [batch_size, n_tokens].
        max_new_tokens: maximum number of tokens to append.
        context_length: the model sees at most this many trailing tokens.
        eos_id: optional end-of-sequence id. Generation stops, without
            appending the token, once every sequence in the batch predicts
            eos_id in the same step.

    Returns:
        idx extended along dim 1 by the generated tokens.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]     # Takes the latest context window
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]               # Logits for the last position only

        probs = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probs, dim=-1, keepdim=True)   # [batch_size, 1]
        # Reduce with .all(): the truth value of a multi-element tensor is
        # ambiguous, so a bare `idx_next == eos_id` fails for batch sizes > 1.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break
        idx = torch.cat((idx, idx_next), dim=1)     # dim=1 for the context window
    return idx

In [None]:
def generate_and_print_sample_2(model, tokenizer, device, start_context, max_new_tokens=50, eos_id=50256):
    """Generate a continuation of ``start_context`` and print it on one line.

    The model is switched to evaluation mode for the duration of the call and
    afterwards put back into whatever mode (train or eval) it was in before,
    so calling this on an eval-mode model no longer flips it to train mode.

    Args:
        model: GPT model exposing ``position_embedding``; its first weight
            dimension is used as the context size.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
        max_new_tokens: Maximum number of tokens to generate.
        eos_id: Token id that ends generation early.
    """
    was_training = model.training
    model.eval()
    try:
        context_size = model.position_embedding.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_sample_2(model, encoded, max_new_tokens, context_size, eos_id)
        # token_ids[0] is already 1-D; squeezing it again would turn a
        # one-token sequence into a scalar and break decoding.
        decoded = tokenizer.decode(token_ids[0].tolist())
        print(decoded.replace("\n", " "))
    finally:
        # Restore the caller's mode even if generation raised.
        model.train(was_training)

In [None]:
generate_and_print_sample_2(model, tokenizer, device, "Once upon a time,", max_new_tokens=200)

Once upon a time, there was a man who lived in a village called Krakow. He was a very good man, and he was very kind to his children. One day, he was walking along the road, and he saw a young girl walking along the road. He asked her, "What is your name?" She replied, "My name is Mary." He asked her, "What is your father's name?" She replied, "My father's name is John." He asked her, "What is your mother's name?" She replied, "My mother's name is Mary." He asked her, "What is your mother's maiden name?" She replied, "My mother's name is Mary." He asked her, "What is your father's name?" She replied, "My father's name is John." He asked her, "What is your father's name?" She replied, "My father's name is John." He asked her, "What is your mother's name?" She replied


**Exercise 3:** Fill in the missing parts of the training loop (try not to look at the earlier training loop).

In [None]:
def training_loop_2(model, train_dataloader, val_dataloader,
                  optimizer, device, num_epochs,
                  eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimizer steps (including step 0) the losses returned
    by ``evaluate_model`` are recorded and printed. After each epoch a sample
    continuation of ``start_context`` is generated and printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per evaluation point. The last one holds the cumulative
        number of input tokens processed at that point.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1  # becomes 0 on the first batch, so step 0 is evaluated

    for epoch in range(num_epochs):
        model.train()  # Training mode at the start of every epoch
        for input_batch, target_batch in train_dataloader:
            optimizer.zero_grad()  # Clear gradients left over from the previous step

            inputs = input_batch.to(device)
            targets = target_batch.to(device)

            # Flatten (batch, tokens) so cross entropy sees one row per token.
            logits = model(inputs)
            loss = nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())

            loss.backward()
            optimizer.step()

            tokens_seen += inputs.numel()
            global_step += 1  # number of batches trained so far, zero-based

            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(model, train_dataloader, val_dataloader, device, eval_iter)
            val_losses.append(val_loss)
            train_losses.append(train_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Epoch {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f},"
                  f"Val loss {val_loss:.3f}")

        # One qualitative sample per epoch.
        generate_and_print_sample_2(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen

In [None]:
#"""
# Baseline check, placed before the fine-tuning cell: greedy-decode up to 100
# new tokens for the first validation prompt and print prompt + continuation.
input_text = format_input(val_data[0])
print(input_text)
# unsqueeze(0) adds the batch dimension expected by generate_text_sample_2.
token_ids = generate_text_sample_2(model=model,
                                 idx=torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device),
                                 max_new_tokens=100,
                                 context_length=BASE_CONFIG["context_length"],
                                 eos_id=50256)
generated_text = token_ids_to_text(token_ids, tokenizer)
print("_" * 100)
print(generated_text)
#"""

Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'
____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:

The chef cooks the meal every day.

### Instruction:




In [None]:
# Fine-tune `model` for 2 epochs with AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 2
#print(device)
# Losses are evaluated every 5 steps (eval_freq). eval_iter is forwarded to
# evaluate_model -- presumably the number of batches to average over; confirm
# against its definition. The second validation example is the per-epoch
# sample prompt.
train_losses, val_losses, tokens_seen = training_loop_2(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[1]), tokenizer=tokenizer)

Epoch 1 (Step 000000): Train loss 1.728,Val loss 1.725
Epoch 1 (Step 000005): Train loss 0.929,Val loss 1.034
Epoch 1 (Step 000010): Train loss 0.882,Val loss 0.961
Epoch 1 (Step 000015): Train loss 0.875,Val loss 0.925
Epoch 1 (Step 000020): Train loss 0.824,Val loss 0.869
Epoch 1 (Step 000025): Train loss 0.727,Val loss 0.840
Epoch 1 (Step 000030): Train loss 0.805,Val loss 0.830
Epoch 1 (Step 000035): Train loss 0.728,Val loss 0.812
Epoch 1 (Step 000040): Train loss 0.627,Val loss 0.790
Epoch 1 (Step 000045): Train loss 0.678,Val loss 0.767
Epoch 1 (Step 000050): Train loss 0.688,Val loss 0.760
Epoch 1 (Step 000055): Train loss 0.650,Val loss 0.760
Epoch 1 (Step 000060): Train loss 0.622,Val loss 0.757
Epoch 1 (Step 000065): Train loss 0.635,Val loss 0.757
Epoch 1 (Step 000070): Train loss 0.618,Val loss 0.738
Epoch 1 (Step 000075): Train loss 0.564,Val loss 0.736
Epoch 1 (Step 000080): Train loss 0.563,Val loss 0.724
Epoch 1 (Step 000085): Train loss 0.450,Val loss 0.706
Epoch 1 (S

In [None]:
#"""
# Spot-check `model` on validation example 51: print the prompt, then the
# prompt plus up to 35 greedily generated tokens.
input_text = format_input(val_data[51])
print(input_text)
# unsqueeze(0) adds the batch dimension expected by generate_text_sample_2.
token_ids = generate_text_sample_2(model=model,
                                 idx=torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device),
                                 max_new_tokens=35,
                                 context_length=BASE_CONFIG["context_length"],
                                 eos_id=50256)
generated_text = token_ids_to_text(token_ids, tokenizer)
print("_" * 100)
print(generated_text)
#"""

Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Convert 1000 milliliters to liters.
____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Convert 1000 milliliters to liters.

### Response:
1000 milliliters is equal to 1000 liters.


**Exercise 4:** Choose three examples from the validation data, and use the model to perform inference on the three examples.

In [None]:
# Exercise 4: run `model` on three validation examples and print each
# prompt together with its greedily generated continuation (at most 35 tokens).
examples = [37, 45, 54]
for e in examples:
    input_text = format_input(val_data[e])
    # unsqueeze(0) adds the batch dimension expected by generate_text_sample_2.
    token_ids = generate_text_sample_2(model=model,
                                 idx=torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device),
                                 max_new_tokens=35,
                                 context_length=BASE_CONFIG["context_length"],
                                 eos_id=50256)
    generated_text = token_ids_to_text(token_ids, tokenizer)
    print("_" * 100)  # separator between examples
    print(generated_text)

____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Translate 'Good night' into German.

### Response:
The German translation of 'Good night' is 'Good night, sleep well.'
____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Provide a synonym for 'beautiful'.

### Response:
A synonym for 'beautiful' is 'curious'.
____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Change the sentence 'You should have called me.' into a question.

### Response:
You should have called me.


**Exercise 5:** Train another model with 5 epochs instead of 2. Use the model to perform inference on the three examples from exercise 4.

In [None]:
# Exercise 5: load a fresh copy of the pretrained GPT-2 medium weights into
# `model_2` and fine-tune it for 5 epochs instead of 2.
# Fall back to the CPU when no GPU is available so `.to(device)` does not crash.
device = "cuda" if torch.cuda.is_available() else "cpu"
BASE_CONFIG = {"vocab_size": 50257,
               "context_length": 1024,
               "dropout_rate": 0.0,
               "qkv_bias": True,
               "device": device}
model_configs = {
    "gpt2-small (124M)": {"n_embd": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"n_embd": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"n_embd": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"n_embd": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
# "gpt2-medium (355M)" -> "355M"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size, models_dir="gpt2"
)
# Build the model, copy the weights in, and only then move it to `device`.
model_2 = Simple_GPT(BASE_CONFIG)
load_weights_into_gpt(model_2, params) # CAREFUL: This can load CPU weights, causing issues
model_2 = model_2.to(device)
model_2.eval()

# Same optimizer settings as the 2-epoch run; only the epoch count differs.
optimizer_2 = torch.optim.AdamW(model_2.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5
# NOTE: this overwrites train_losses / val_losses / tokens_seen from the 2-epoch run.
train_losses, val_losses, tokens_seen = training_loop_2(
    model_2, train_loader, val_loader, optimizer_2, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[1]), tokenizer=tokenizer)


File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe
Epoch 1 (Step 000000): Train loss 2.741,Val loss 2.738
Epoch 1 (Step 000005): Train loss 1.077,Val loss 1.115
Epoch 1 (Step 000010): Train loss 0.900,Val loss 0.958
Epoch 1 (Step 000015): Train loss 0.893,Val loss 0.923
Epoch 1 (Step 000020): Train loss 0.775,Val loss 0.884
Epoch 1 (Step 000025): Train loss 0.784,Val loss 0.843
Epoch 1 (Step 000030): Train loss 0.695,Val loss 0.833
Epoch 1 (Step 000035): Train loss 0.746,Val loss 0.823
Epoch 1 (Step 000040): Train loss 0.656,Val loss 0.796
Epoch 1 (Step 000045): Train loss 0.743,Val loss 0.

In [None]:
print(train_data[32])

{'instruction': 'What is the atomic number of oxygen?', 'input': '', 'output': 'The atomic number of oxygen is 8.'}


In [None]:
# Exercise 5 (continued): run the 5-epoch model (`model_2`) on the same three
# validation examples used in Exercise 4.
examples = [37, 45, 54]
# Alternative kept for experimentation: scan a wider index range.
# examples = range(30, 55)
for e in examples:
    # Alternative kept for experimentation: prompt with training data instead.
    # input_text = format_input(train_data[e])
    input_text = format_input(val_data[e])
    # unsqueeze(0) adds the batch dimension expected by generate_text_sample_2.
    token_ids = generate_text_sample_2(model=model_2,
                                 idx=torch.tensor(tokenizer.encode(input_text)).unsqueeze(0).to(device),
                                 max_new_tokens=35,
                                 context_length=BASE_CONFIG["context_length"],
                                 eos_id=50256)
    generated_text = token_ids_to_text(token_ids, tokenizer)
    print("_" * 100)  # separator between examples
    print(generated_text)

____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Translate 'Good night' into German.

### Response:
The German translation of 'Good night' is 'Nachhaltige Bed.'
____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Provide a synonym for 'beautiful'.

### Response:
A synonym for 'beautiful' is 'attractive'.
____________________________________________________________________________________________________
Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Change the sentence 'You should have called me.' into a question.

### Input:
You should have called me.

### Response:
Sho

**Questions:**
1. Does the model perform better with more epochs?
2. Why can't we use a direct accuracy (correct answer proportion) score to automatically evaluate our models?

1. Yes (in this case). It originally got 0/3 for those examples chosen, but now it has 2/3 and even the one it got wrong at least looks better. In general, more epochs means more time for the model to learn, which can lead to better results. However, if we have too many epochs then it can start to overfit (we didn't see that here yet).

2. Accuracy needs a single, exact notion of a "correct" answer, and free-form text does not have one. We can't automatically decide that `A synonym for 'beautiful' is 'attractive'.` is correct by comparing it to a reference answer, because there is more than one correct response, and a correct response can be formatted or worded differently (e.g. `'attractive' is a synonym for 'beautiful'.` or `A synonym for 'beautiful' is 'pretty'.`).

Performing inference using my own text:

In [None]:
# Hand-written prompt in the same dict layout as the dataset entries
# ('instruction' plus an empty 'input'), formatted with the same template.
my_input = {'instruction': "Provide a synonym for 'small'.", 'input': ""}
my_input_text = format_input(my_input)
# Greedy decoding with the 5-epoch model; unsqueeze(0) adds the batch dimension.
token_ids = generate_text_sample_2(model=model_2,
                                idx=torch.tensor(tokenizer.encode(my_input_text)).unsqueeze(0).to(device),
                                max_new_tokens=35,
                                context_length=BASE_CONFIG["context_length"],
                                eos_id=50256)
generated_text = token_ids_to_text(token_ids, tokenizer)
print(generated_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request

### Instruction:
Provide a synonym for 'small'.

### Response:
A synonym for 'small' is 'little'.


# Fine-tuning BERT (No exercises)

## Disaster tweet classification

In [None]:
import pandas as pd

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
print(df_train["target"].value_counts())

target
0    4342
1    3271
Name: count, dtype: int64


In [None]:
from operator import index
# Split dataset into train, validation, test

def random_split(df, train_ratio=0.8):
    """Shuffle ``df`` with a fixed seed and cut it into train / validation parts.

    The rows are shuffled with ``random_state=123`` and re-indexed from 0. The
    first ``int(train_ratio * len(df))`` rows become the training frame and
    the remainder the validation frame.

    Returns:
        ``(train_df, val_df)``
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    cutoff = int(train_ratio * len(shuffled))
    return shuffled.iloc[:cutoff], shuffled.iloc[cutoff:]

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
tokenizer(["a b c", "b"], truncation=True, padding=True)

{'input_ids': [[101, 1037, 1038, 1039, 102], [101, 1038, 102, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]]}

In [None]:
from torch.utils.data import Dataset
import torch

class DisasterTweetDataset(Dataset):
    """Dataset over tokenizer encodings with optional labels.

    ``encodings`` is a mapping from field name (it must include
    ``'input_ids'``) to a per-example sequence. ``labels`` may be omitted for
    unlabeled data, in which case items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested example.
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        if self.labels is None:
            return item
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
train_df, val_df = random_split(df_train)

In [None]:
# Raw tweet texts for each split, as plain Python lists for the tokenizer.
train_texts = train_df['text'].tolist()
val_texts = val_df['text'].tolist()
test_texts = df_test['text'].tolist()
# Binary targets for the labeled splits.
train_labels = train_df['target'].tolist()
val_labels = val_df['target'].tolist()
# Test labels are not loaded -- presumably test.csv has no 'target' column; verify.
# test_labels = df_test['target'].tolist()

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [None]:
train_dataset = DisasterTweetDataset(train_encodings, train_labels)
val_dataset = DisasterTweetDataset(val_encodings, val_labels)
test_dataset = DisasterTweetDataset(test_encodings)

In [None]:
import numpy as np
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with a single ``'accuracy'`` entry.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {'accuracy': np.mean(predicted == gold)}

# Hyperparameters for fine-tuning BERT on the disaster-tweet data.
training_args = TrainingArguments(
    output_dir='./results', # checkpoint (model weights)
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    report_to="none", # no external experiment tracker
    eval_strategy="steps" # evaluate during training rather than only at the end
)

# Train on the 80% split and evaluate on the held-out 20%, reporting accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
20,0.7179,0.704227,0.415627
40,0.7052,0.676183,0.623112
60,0.6745,0.644114,0.695338
80,0.651,0.583861,0.725542
100,0.5935,0.521281,0.7761
120,0.5249,0.465911,0.80893
140,0.4519,0.443312,0.814183
160,0.4083,0.416137,0.825345
180,0.4067,0.427486,0.827315
200,0.4688,0.40981,0.831254


TrainOutput(global_step=762, training_loss=0.4419186032741401, metrics={'train_runtime': 97.3507, 'train_samples_per_second': 125.115, 'train_steps_per_second': 7.827, 'total_flos': 525769888593600.0, 'train_loss': 0.4419186032741401, 'epoch': 2.0})

In [None]:
predictions = trainer.predict(val_dataset)
predictions.predictions

array([[ 0.2812366 , -0.06426147],
       [ 0.99896634, -1.1926782 ],
       [ 0.78297484, -1.1009752 ],
       ...,
       [ 0.7081249 , -1.1849495 ],
       [-2.0012774 ,  1.5817366 ],
       [-1.6484641 ,  1.52317   ]], dtype=float32)

In [None]:
len(val_dataset)

1523

In [None]:
predictions.predictions[0:20]

array([[ 0.2812366 , -0.06426147],
       [ 0.99896634, -1.1926782 ],
       [ 0.78297484, -1.1009752 ],
       [-1.1031365 ,  1.1983205 ],
       [ 0.04724909, -0.3525718 ],
       [-1.6028138 ,  1.4485874 ],
       [ 0.89122885, -1.328399  ],
       [-0.481672  ,  0.7399304 ],
       [-1.1333342 ,  0.5411072 ],
       [ 0.40449452, -0.41707617],
       [-1.9200532 ,  1.7078657 ],
       [ 0.94446725, -1.327072  ],
       [-1.7446764 ,  1.6019489 ],
       [ 0.9425024 , -1.3425268 ],
       [ 0.9269346 , -1.3692045 ],
       [-1.631067  ,  1.5871632 ],
       [ 0.8994038 , -1.319348  ],
       [-1.8068988 ,  1.5639117 ],
       [-1.0290976 ,  1.0082314 ],
       [-1.42941   ,  1.43355   ]], dtype=float32)

In [None]:
predictions_tensor = torch.tensor(predictions.predictions)
predictions_tensor.shape

torch.Size([1523, 2])

In [None]:
predictions_tensor_probs = torch.softmax(predictions_tensor, dim=-1)
predictions_tensor_probs[0:10]

tensor([[0.5855, 0.4145],
        [0.8995, 0.1005],
        [0.8681, 0.1319],
        [0.0910, 0.9090],
        [0.5986, 0.4014],
        [0.0452, 0.9548],
        [0.9020, 0.0980],
        [0.2277, 0.7723],
        [0.1578, 0.8422],
        [0.6946, 0.3054]])

In [None]:
predictions_labels = torch.argmax(predictions_tensor_probs, dim=-1)
predictions_labels[0:30]

tensor([0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
        0, 1, 0, 1, 0, 1])

In [None]:
# Validation accuracy: compare predicted classes with the ground-truth labels
# in one vectorized operation instead of indexing the tensor element by element.
num_correct = (predictions_labels == torch.tensor(val_labels)).sum().item()
print(num_correct / len(val_labels))

0.8463558765594222


In [None]:
print(sum(predictions_labels))

tensor(602)


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Baseline for comparison: the same pretrained checkpoint, never trained on the
# tweet data. Its classification head is newly initialized (see the warning
# printed on load).
model_not_fine_tuned = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# NOTE: this rebinds `training_args` from the fine-tuning cell.
training_args = TrainingArguments(
    output_dir='./results', # checkpoint (model weights)
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"
)

# The Trainer is used here only as a convenient prediction wrapper.
trainer_not_fine_tuned = Trainer(
    model=model_not_fine_tuned,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Training is deliberately skipped so the predictions reflect the untrained head.
#trainer.train()
predictions_not_fine_tuned = trainer_not_fine_tuned.predict(test_dataset)
predictions_not_fine_tuned.predictions[0:10]

# Most of the model is pretrained; the classification head on top is not.
# Without fine-tuning the head is random, so these logits carry no signal.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


array([[0.49359864, 0.5200346 ],
       [0.5293699 , 0.6069484 ],
       [0.46960893, 0.75746083],
       [0.51122844, 0.71294373],
       [0.42932087, 0.50667584],
       [0.5276265 , 0.5391525 ],
       [0.4389047 , 0.5392798 ],
       [0.50068265, 0.48209736],
       [0.47785786, 0.35478613],
       [0.29998916, 0.23867726]], dtype=float32)

In [None]:
# Classify one hand-written sentence with the fine-tuned disaster model.
inference_encodings = tokenizer(["I am currently teaching a lecture"])
# No labels are passed, so the dataset yields inputs only.
inference_dataset = DisasterTweetDataset(inference_encodings)
inference_predictions = trainer.predict(inference_dataset)
# Convert the logits to class probabilities (column 0 = target 0, column 1 = target 1).
inference_predictions_tensor = torch.tensor(inference_predictions.predictions)
inference_probs = torch.softmax(inference_predictions_tensor, dim=-1)
inference_probs

tensor([[0.8818, 0.1182]])

## Spam classification

In [None]:
# We always need a specific dataset in order to fine-tune
# The dataset must be relevant to our task
# For example:
# Dataset where each datapoint is:
# (input: text, ground truth: yes/no)
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "spam_collection.zip"
extracted_path = "spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the SMS spam archive and unpack it to ``data_file_path``.

    Does nothing (apart from printing a notice) when ``data_file_path``
    already exists. Otherwise the archive at ``url`` is saved to ``zip_path``,
    extracted into ``extracted_path``, and the extracted ``SMSSpamCollection``
    file is renamed to ``data_file_path``.

    Args:
        url: Location of the zip archive.
        zip_path: Where the downloaded archive is written.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final data file.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists.")
        return

    # Read the whole archive into memory, then write it to `zip_path`.
    with urllib.request.urlopen(url) as resp, open(zip_path, "wb") as archive_file:
        archive_file.write(resp.read())

    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The extracted file has no extension; rename it to the requested path.
    unpacked = Path(extracted_path) / "SMSSpamCollection"
    os.rename(unpacked, data_file_path)

    print(f"Data downloaded and extracted to {extracted_path}.")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

Data downloaded and extracted to spam_collection.


In [None]:
def create_balanced_spam_dataset(df):
    """Undersample "ham" so both classes have the same number of rows.

    As many "ham" rows as there are "spam" rows are drawn with
    ``random_state=123``. The result is the sampled ham rows followed by all
    spam rows (not shuffled; original index labels are kept).
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]
    ham_sample = ham_rows.sample(len(spam_rows), random_state=123)
    return pd.concat([ham_sample, spam_rows])

df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
balanced_df = create_balanced_spam_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [None]:
from operator import index
# Split dataset into train, validation, test

def random_split_spam(df, train_ratio=0.8, val_ratio=0.1):
    """Shuffle ``df`` with a fixed seed and split it into train / val / test.

    Rows are shuffled with ``random_state=123`` and re-indexed from 0. The
    first ``int(train_ratio * n)`` rows are the training set, the next
    ``int(val_ratio * n)`` rows the validation set, and whatever remains is
    the test set.

    Returns:
        ``(train_df, val_df, test_df)``
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    n_rows = len(shuffled)
    train_end = int(train_ratio * n_rows)
    val_end = train_end + int(val_ratio * n_rows)
    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:val_end],  # half-open: train_end included, val_end excluded
        shuffled.iloc[val_end:],
    )

In [None]:
spam_train_df, spam_val_df, spam_test_df = random_split_spam(balanced_df)


In [None]:
spam_train_df.head()

Unnamed: 0,Label,Text
0,ham,Dude how do you like the buff wind.
1,ham,Tessy..pls do me a favor. Pls convey my birthd...
2,spam,Reminder: You have not downloaded the content ...
3,spam,Got what it takes 2 take part in the WRC Rally...
4,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, £..."


In [None]:
from torch.utils.data import Dataset
import torch

class BinaryTextClassificationDataset(Dataset):
    """Dataset pairing tokenizer encodings with optional binary labels.

    ``encodings`` maps field names (including ``'input_ids'``) to per-example
    sequences. When ``labels`` is given, each item also carries a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Tensor-ify every encoding field for the requested example.
        sample = dict(
            (field, torch.tensor(values[idx]))
            for field, values in self.encodings.items()
        )
        has_labels = self.labels is not None
        if has_labels:
            sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Message texts for each split, as plain lists for the tokenizer.
spam_train_texts = spam_train_df['Text'].tolist()
spam_val_texts = spam_val_df['Text'].tolist()
spam_test_texts = spam_test_df['Text'].tolist()

# Integer targets: 1 for "spam", 0 for everything else ("ham").
spam_train_labels = spam_train_df['Label'].eq("spam").astype(int).tolist()
spam_val_labels = spam_val_df['Label'].eq("spam").astype(int).tolist()
spam_test_labels = spam_test_df['Label'].eq("spam").astype(int).tolist()

In [None]:
spam_train_encodings = tokenizer(spam_train_texts, truncation=True, padding=True)
spam_val_encodings = tokenizer(spam_val_texts, truncation=True, padding=True)
spam_test_encodings = tokenizer(spam_test_texts, truncation=True, padding=True)
spam_train_dataset = BinaryTextClassificationDataset(spam_train_encodings, spam_train_labels)
spam_val_dataset = BinaryTextClassificationDataset(spam_val_encodings, spam_val_labels)
spam_test_dataset = BinaryTextClassificationDataset(spam_test_encodings, spam_test_labels)

In [None]:
import numpy as np
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

spam_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

def compute_metrics(eval_pred):
    """Accuracy metric for the Trainer.

    ``eval_pred`` unpacks into ``(logits, labels)``. The class with the
    highest logit along the last axis is taken as the prediction, and the
    fraction of matches is returned under the ``'accuracy'`` key.
    """
    raw_logits, true_labels = eval_pred
    predicted_classes = np.argmax(raw_logits, axis=-1)
    hit_rate = np.mean(predicted_classes == true_labels)
    return {'accuracy': hit_rate}

# Hyperparameters for fine-tuning BERT on the balanced SMS spam data.
spam_training_args = TrainingArguments(
    output_dir='./results_spam', # checkpoint (model weights)
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_spam',
    logging_steps=20,
    report_to="none", # no external experiment tracker
    eval_strategy="steps" # evaluate during training rather than only at the end
)

# Train on the spam training split and evaluate on the validation split;
# the test split is held back for the final evaluation.
spam_trainer = Trainer(
    model=spam_model,
    args=spam_training_args,
    train_dataset=spam_train_dataset,
    eval_dataset=spam_val_dataset,
    compute_metrics=compute_metrics
)

spam_trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
20,0.6724,0.646864,0.818792
40,0.6122,0.558737,0.892617
60,0.475,0.387622,0.932886
80,0.2717,0.232933,0.946309
100,0.1566,0.169421,0.932886
120,0.0732,0.168518,0.95302
140,0.0806,0.155833,0.95302
160,0.1025,0.137012,0.959732
180,0.0454,0.16364,0.959732
200,0.0558,0.184682,0.95302


TrainOutput(global_step=375, training_loss=0.14313771812741954, metrics={'train_runtime': 34.3624, 'train_samples_per_second': 173.882, 'train_steps_per_second': 10.913, 'total_flos': 383810682562500.0, 'train_loss': 0.14313771812741954, 'epoch': 5.0})

In [None]:
# Final evaluation on the held-out spam test split.
spam_predictions = spam_trainer.predict(spam_test_dataset)
# Logits -> class probabilities -> predicted class (0 = ham, 1 = spam).
spam_predictions_tensor = torch.tensor(spam_predictions.predictions)
spam_predictions_probs = torch.softmax(spam_predictions_tensor, dim=-1)
spam_predictions_labels = torch.argmax(spam_predictions_probs, dim=-1)
# Eyeball the first ten predictions against the ground truth.
print(spam_predictions_labels[0:10])
print(spam_test_labels[0:10])
# Vectorized comparison instead of indexing the tensor element by element.
num_correct = (spam_predictions_labels == torch.tensor(spam_test_labels)).sum().item()
print(num_correct / len(spam_test_labels))

tensor([1, 1, 0, 0, 0, 1, 0, 1, 1, 1])
[1, 1, 0, 0, 0, 1, 0, 1, 1, 1]
0.9933333333333333


In [None]:
sum(p.numel() for p in model.parameters())

109483778

In [None]:
# Hand-written probes for the spam classifier. Only the text placed in the
# list passed to `tokenizer` below is classified; swap in another
# sample_text_* to try it.
sample_text_1 = "You are a winner you have been specially selected to receive $1000"
sample_text_2 = "Are you coming home tonight"
sample_text_3 = """
Pennsylvania (DMV) Final Notice: Enforcement Begins August 6nd. \
Our records indicate that as of today, you still have an outstanding traffic ticket.
"""
sample_text_4 = "MIT Alert: Gas leak in Building 46. Responders on scene. Vassar Street closed."
inference_encodings = tokenizer([sample_text_4])
# No labels are passed, so the dataset yields inputs only.
inference_dataset = BinaryTextClassificationDataset(inference_encodings)
inference_predictions = spam_trainer.predict(inference_dataset)
# Convert the logits to probabilities (column 0 = ham, column 1 = spam).
inference_predictions_tensor = torch.tensor(inference_predictions.predictions)
inference_probs = torch.softmax(inference_predictions_tensor, dim=-1)
inference_probs

tensor([[7.5569e-04, 9.9924e-01]])

# Attribution

Portions of this notebook are adapted from LLMs from Scratch by Sebastian Raschka
https://github.com/rasbt/LLMs-from-scratch

Licensed under the Apache License 2.0