In [1]:
import json
import os
import urllib

## Data

In [2]:
def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode('utf-8')
        
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text_data)
    
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    return data

In [3]:
# instruction fine-tuning dataset based on the book

file_path = 'instruction-data.json'
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print(f'Total Entries: {len(data)}')

Total Entries: 1100


In [4]:
print(f'Example:\n{data[50]}')

Example:
{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [None]:
print(f'Example:\n{data[999]}')  # "input" field may be empty in the JSON

Example:
{'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


### Prompt style

In [6]:
def format_input(entry):
    # alpaca style

    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # input_text will be empty if entry['input'] is empty
    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""

    return instruction_text + input_text

In [7]:
model_input = format_input(data[50])
desired_response = f'\n\n### Response:\n{data[50]['output']}'
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [8]:
# empty "input"
model_input = format_input(data[999])
desired_response = f'\n\n### Response:\n{data[50]['output']}'
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
The correct spelling is 'Occasion.'


### Train/Val/Test Splits

In [9]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print(f'Train length: {len(train_data)}')
print(f'Val length: {len(val_data)}')
print(f'Test length: {len(test_data)}')

Train length: 935
Val length: 55
Test length: 110


### Batching

PyTorch handles batching using a collate function, like in spam classification. <br>
Collate function takes a list of samples and merges them into a batch. <br>
IFT requires a more complex batching process. <br>

In [10]:
# step 1 and 2 - format into template and tokenize

import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded = []

        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f'\n\n### Response:\n{entry["output"]}'
            full_text = instruction_plus_input + response_text
            
            self.encoded.append(tokenizer.encode(full_text))
    
    def __getitem__(self, index):
        return self.encoded[index]
    
    def __len__(self):
        return len(self.data)

In [11]:
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode('<|endoftext|>', allowed_special={'<|endoftext|>'}))

[50256]


In [21]:
# step 3 - padding

def custom_collate_draft_1(batch, pad_token_id=50256, device='cpu'):
    batch_max_len = max(len(item)+1 for item in batch)
    inputs_list = []

    for item in batch:
        new_item = item.copy()
        # extra pad
        new_item += [pad_token_id]

        padded = new_item + [pad_token_id]*(batch_max_len - len(new_item))

        # remove extra pad
        inputs = torch.tensor(padded[:-1])
        inputs_list.append(inputs)
    
    inputs_tensor = torch.stack(inputs_list).to(device)
    return inputs_tensor

In [22]:
inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [7,8,9]

batch = (inputs_1, inputs_2, inputs_3)
print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [23]:
# step 4 - target token IDs

def custom_collate_draft_2(batch, pad_token_id=50256, device='cpu'):
    batch_max_len = max(len(item)+1 for item in batch)
    inputs_list, targets_list = [], []

    for item in batch:
        new_item = item.copy()
        # extra pad for target
        new_item += [pad_token_id]

        padded = new_item + [pad_token_id]*(batch_max_len - len(new_item))

        # remove extra pad for input
        inputs = torch.tensor(padded[:-1])
        # shift by 1, and keep extra pad for target
        targets = torch.tensor(padded[1:])
        inputs_list.append(inputs)
        targets_list.append(targets)
    
    inputs_tensor = torch.stack(inputs_list).to(device)
    targets_tensor = torch.stack(targets_list).to(device)
    return inputs_tensor, targets_tensor

In [24]:
inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [26]:
# step 5 - mask pad tokens in targets (except the first one)
# the first pad token helps LLM know when to generate <|endoftext|>

def custom_collate_fn(
        batch, pad_token_id=50256, ignore_index=-100,
        allowed_max_len=None, device='cpu',
):
    batch_max_len = max(len(item)+1 for item in batch)
    inputs_list, targets_list = [], []

    for item in batch:
        new_item = item.copy()
        # extra pad for target
        new_item += [pad_token_id]

        padded = new_item + [pad_token_id]*(batch_max_len - len(new_item))

        # remove extra pad for input
        inputs = torch.tensor(padded[:-1])
        # shift by 1, and keep extra pad for target
        targets = torch.tensor(padded[1:])

        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index
        
        # for datasets where length exceeds context size of 1024
        if allowed_max_len is not None:
            inputs = inputs[:allowed_max_len]
            targets = targets[:allowed_max_len]


        inputs_list.append(inputs)
        targets_list.append(targets)
    
    inputs_tensor = torch.stack(inputs_list).to(device)
    targets_tensor = torch.stack(targets_list).to(device)
    return inputs_tensor, targets_tensor

In [27]:
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [29]:
# reason for masking

logits_1 = torch.tensor(
    [[-1.0, 1.0],
     [-0.5, 1.5]]
)
targets_1 = torch.tensor([0,1])
loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [30]:
# adding an additional token ID affects loss

logits_2 = torch.tensor(
    [[-1.0, 1.0],
     [-0.5, 1.5],
     [-0.5, 1.5]]
)
targets_2 = torch.tensor([0,1,1])
loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
print(loss_2)

tensor(0.7936)


In [32]:
# if target is -100, then PyTorch ignores that

targets_3 = torch.tensor([0,1,-100])
loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3)
print(loss_3)
print(f"Loss 1 == Loss 3: {loss_1 == loss_3}")

tensor(1.1269)
Loss 1 == Loss 3: True


### Data Loaders

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# for Apple Silicon
if torch.backends.mps.is_available():
    device = torch.device('mps')

print(f'Device: {device}')

Device: cpu


In [34]:
from functools import partial

# fix these args so we don't need to specify
customized_collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_len=1024,
)

In [35]:
from torch.utils.data import DataLoader

n_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=n_workers,
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=n_workers,
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=n_workers,
)

In [None]:
print('Train Loader:')
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)  # variable lengths

Train Loader:
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 68]) torch.

## Pretrained LLM

In [38]:
from gpt_download import download_and_load_gpt2
from gpt_model import GPTModel
from load_weights import load_weights_into_gpt

BASE_CONFIG = {
    'vocab_size': 50257,
    'context_len': 1024,
    'drop_rate': 0.0,
    'qkv_bias': True,
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# book uses medium
MODEL = "gpt2-small (124M)"
BASE_CONFIG.update(model_configs[MODEL])

model_size = MODEL.split(' ')[-1].lstrip('(').rstrip(')')
settings, params = download_and_load_gpt2(model_size, models_dir='gpt2')

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [40]:
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_feat

In [41]:
# check pretrained model's output

torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [44]:
from generate_text import generate
from text_token_id_conversion import text_to_token_ids, token_ids_to_text

token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG['context_len'],
    eos_id=50256,
)

generated_text = token_ids_to_text(token_ids, tokenizer)

In [48]:
# remove input text
response_text = generated_text[len(input_text):].strip()
print(response_text)

### Instruction:

Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Instruction:

Convert the active


## Fine-Tuning

In [49]:
from train_utils import calc_loss_loader, train_model_simple

model.to(device)
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print(f'Training Loss: {train_loss}')
print(f'Val Loss: {val_loss}')

Training Loss: 4.167123603820801
Val Loss: 4.050918817520142


In [None]:
import time

start = time.time()

torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
n_epochs = 2

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=n_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer,
)

end = time.time()
print(f'Training took {(end - start) / 60:.3f}m')

In [None]:
from plot import plot_losses

epochs_tensor = torch.linspace(0, n_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

## Extract Responses

In [51]:
torch.manual_seed(123)

for entry in test_data[:3]:
    input_text = format_input(entry)
    token_ids = generate(
        model,
        text_to_token_ids(input_text, tokenizer),
        max_new_tokens=256,
        context_size=BASE_CONFIG['context_len'],
        eos_id=50256,
    )

    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = generated_text[len(input_text):] \
                    .replace("### Response:", "") \
                    .strip()
    
    print(input_text)
    print(f'\nCorrect:\n>>{entry['output']}')
    print(f'\nModel:\n>>{response_text}')
    print('-'*50)


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct:
>>The car is as fast as lightning.

Model:
>>The car is very fast.
--------------------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct:
>>The type of cloud typically associated with thunderstorms is cumulonimbus.

Model:
>>A type of cloud is typically associated with thunderstorms.
--------------------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Name the author of 'Pride and Prejudice'.

Correct:
>>Jane Austen.

Model:
>>The author of 'Pride and Prejudice' is William Shakespeare.
---------------------