# Building a GPT model from scratch using pytorch


Sources:
* https://github.com/karpathy/nanoGPT
* [GPT Model Behind the Scene: Exploring it from scratch with Pytorch](https://ai.plainenglish.io/creating-and-exploring-gpt-from-scratch-ffe84ac415a9)

In [1]:
import math
# import inspect
# from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch import Tensor
# from typing import Tuple
from typing import Optional
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

import os
import time
# import pickle
# from contextlib import nullcontext
from pathlib import Path
import fitz
import zipfile
import re
from collections import defaultdict

import numpy as np

# import requests
import tiktoken
# from tqdm import tqdm

# import random
# from sklearn.model_selection import train_test_split

# 1) Pre-train custom GPT model

## 1.1) Prepare datasets

### 1.1.1) Define paths

In [2]:
root_path=Path(os.getcwd())
data_path=root_path.joinpath('data')
corpus_path=data_path.joinpath('corpus')
pdf_path=data_path.joinpath('apra_pdfs')
model_path=root_path.joinpath('model')

Path(data_path).mkdir(parents=True, exist_ok=True)
Path(corpus_path).mkdir(parents=True, exist_ok=True)
Path(pdf_path).mkdir(parents=True, exist_ok=True)
Path(model_path).mkdir(parents=True, exist_ok=True)

### 1.1.2) Split shakespeare into 20 parts

In [3]:
def split_file(input_filename: str,
               source_path: Path,
               target_path: Path,
               num_chunks: Optional[int]=20
               ):
    
    # Read all the lines from the input file
    with open(source_path.joinpath(input_filename), 'r') as file:
        lines = file.readlines()

    # Calculate the number of lines per chunk
    total_lines = len(lines)
    lines_per_chunk = total_lines // num_chunks
    remainder = total_lines % num_chunks  # For uneven splits

    for i in range(num_chunks):
        # Calculate start and end indices for each chunk
        start = i * lines_per_chunk + min(i, remainder)
        end = start + lines_per_chunk + (1 if i < remainder else 0)

        # Generate a chunk filename
        chunk_filename = target_path.joinpath(f"{input_filename[:-4]}_{i+1:02}.txt")

        # Write the current chunk to a new file
        with open(chunk_filename, 'w') as chunk_file:
            chunk_file.writelines(lines[start:end])

        print(f"Written: {chunk_filename}")
        
split_file('shakespear.txt',data_path,corpus_path)


Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_01.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_02.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_03.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_04.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_05.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_06.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_07.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_08.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_09.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_10.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_11.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_12.txt
Written: d:\repos\Transformers_from_scratch\data\corpus\shakespear_13.txt
Written: d:\repos\Transformers_from_sc

### 1.1.3) Unzip pdfs and convert them to text files

In [4]:
with zipfile.ZipFile(data_path.joinpath('APRA_Guidelines.zip'), 'r') as zip_ref:
    zip_ref.extractall(pdf_path)
    print(f"Extracted 'APRA_Guidelines.zip' to '{pdf_path}'.")

def pdf_to_text(pdf_path: Path, txt_path: Path):
    # Open the PDF
    pdf_document = fitz.open(pdf_path)

    # Create a text file to store the extracted text
    with open(txt_path, "w", encoding="utf-8") as text_file:
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            text = page.get_text()
            text_file.write(text)

    # Close the PDF
    pdf_document.close()

def convert_pdfs_in_folder(source_path: Path, target_path: Path):
    for file in os.listdir(source_path):
        if file.endswith('.pdf'):
            pdf_to_text(source_path.joinpath(file), target_path.joinpath(re.sub('.pdf','.txt',file)))
            print(f'Created {target_path.joinpath(re.sub('.pdf','.txt',file))}')

convert_pdfs_in_folder(pdf_path,corpus_path)

Extracted 'APRA_Guidelines.zip' to 'd:\repos\Transformers_from_scratch\data\apra_pdfs'.
Created d:\repos\Transformers_from_scratch\data\corpus\3PS 222 Intra-group Transactions and Exposures.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APG 223 Residential Mortgage Lending.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 110 Capital Adequacy.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 112 Capital Adequacy Standardised Approach.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 113 Capital Adequacy Internal Ratings-based.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 180 Capital Adequacy Counterparty Credit Risk.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 210 Liquidity.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 220 Credit Risk Management.txt
Created d:\repos\Transformers_from_scratch\data\corpus\APS 222 Associations with Related Entities.txt
Created d:\repos\Transformers_from_

### 1.1.4) Split corpus into train, test and validation sets
Splitting was later found to be unnecessary, as full dataset will be used to perform unsupervised training. Validation will come later in the fine-tuning stage.

In [5]:
file_list = [corpus_path.joinpath(file) for file in os.listdir(corpus_path) if file.endswith('.txt')]

# Set the random seed for reproducibility
# random_seed=1986
# random.seed(random_seed)
# train_files, val_files = train_test_split(file_list, test_size=0.2, random_state=random_seed)
train_files = file_list

In [6]:
print('===================================================')
print("Documents in training corpus")
print('===================================================')
train_files

Documents in training corpus


[WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/3PS 222 Intra-group Transactions and Exposures.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APG 223 Residential Mortgage Lending.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 110 Capital Adequacy.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 112 Capital Adequacy Standardised Approach.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 113 Capital Adequacy Internal Ratings-based.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 180 Capital Adequacy Counterparty Credit Risk.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 210 Liquidity.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 220 Credit Risk Management.txt'),
 WindowsPath('d:/repos/Transformers_from_scratch/data/corpus/APS 222 Associations with Related Entities.txt'),
 WindowsPath('d:/repos/Transformers_from_s

In [7]:
# print('===================================================')
# print("Documents in validation corpus")
# print('===================================================')
# val_files

### 1.1.5) Concatenate individual files together

In [8]:
def concatenate_files_to_output(file_list, output_file):
    """
    Concatenate text from multiple files and write to a single output file.
    
    Parameters:
    - file_list: List of paths to the input text files.
    - output_file: Path to the output text file where combined text will be saved.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file_path in file_list:
            # Ensure the file exists
            if os.path.exists(file_path):
                with open(file_path, 'r', encoding='utf-8') as infile:
                    # Read the contents of the file and write to output file
                    outfile.write(infile.read() + "\n")  # Add a newline to separate contents
            else:
                print(f"File not found: {file_path}")

train_text_path = data_path.joinpath('train.txt')
concatenate_files_to_output(train_files, train_text_path)
print(f"Combined text has been written to: {train_text_path}")

# No longer needed
# val_text_path = data_path.joinpath('val.txt')
# concatenate_files_to_output(val_files, val_text_path)
# print(f"Combined text has been written to: {val_text_path}")


Combined text has been written to: d:\repos\Transformers_from_scratch\data\train.txt


### 1.1.6) Apply GPT-2 byte pair encoding

In [9]:
enc = tiktoken.get_encoding("gpt2")

def apply_gpt2_byte_encoding(data_path, source_file, target_file):
    with open(data_path.joinpath(source_file), 'r', encoding='utf-8') as f:
        text = f.read()

        # Some low level text cleaning
        text = re.sub(r'\n\s*\n', '\n', text.strip())
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # encode with tiktoken gpt2 bpe
        encoded_text = enc.encode_ordinary(text)
        print(f"{source_file} has {len(encoded_text):,} tokens")
        encoded_text = np.array(encoded_text, dtype=np.uint16)
        encoded_text.tofile(data_path.joinpath(target_file))
        return len(encoded_text)

train_bte_path = data_path.joinpath('train.bin')
# val_bte_path   = data_path.joinpath('val.bin')

train_vocab_size=apply_gpt2_byte_encoding(data_path,'train.txt','train.bin')
# val_vocab_size=apply_gpt2_byte_encoding(data_path,'val.txt','val.bin')

train.txt has 618,255 tokens


## 1.2) Model Configuration

In [10]:
class GPTConfig:
    def __init__(self, vocab_size, **kwargs):
        self.vocab_size = vocab_size
        for key, value in kwargs.items():
            setattr(self, key, value)

class CustomConfig(GPTConfig):
    n_layer = 8
    n_head = 8
    n_embd = 256
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1
    dropout = 0.1
    compile = False
    device = 'cuda'
    num_workers = 0
    max_iters = 2e4
    batch_size = 4
    block_size = 128
    learning_rate = 6e-4
    betas = (0.9, 0.95)
    weight_decay = 1e-1
    grad_norm_clip = 1.0
    bias = True

config = CustomConfig(vocab_size=train_vocab_size)

## 1.3) Dataset constructor and Dataloader objects

In [11]:
# read data from .bin
train_data = np.memmap(train_bte_path, dtype=np.uint16, mode='r')
# val_data   = np.memmap(val_bte_path, dtype=np.uint16, mode='r')

class TextDataset(Dataset):
    def __init__(self, split, block_size=128, device_type='cuda'):
        assert split in {'train', 'test'}
        self.split = split
        self.block_size = block_size
        self.device_type = device_type
        # self.data = train_data if split == 'train' else val_data
        # Removed option for val_data
        self.data = train_data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = torch.from_numpy(self.data[idx : idx + self.block_size].astype(np.int64))
        y = torch.from_numpy(self.data[idx + 1 : idx + 1 + self.block_size].astype(np.int64)) 

        if self.device_type == 'cuda':
            # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
            x, y = x.pin_memory().to('cuda', non_blocking=True), y.pin_memory().to('cuda', non_blocking=True)
        else:
            x, y = x.to('cpu'), y.to('cpu')
        return x, y

# create dataset and dataloader
train_dataset = TextDataset('train', config.block_size, config.device)

# Trainer() constructor does not use this data loader
# train_loader = DataLoader(train_dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False)

# val_dataset = TextDataset('val', config.block_size, config.device)
# val_loader = DataLoader(val_dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False)

## 1.4) Model Implementation

### 1.4.1) Causal Multi-head Attention

**torch.nn.functional.scaled_dot_product_attention** is equivalent to the following:

In [None]:
def matts_scaled_dot_product_attention(query:     Tensor,
                                       key:       Tensor,
                                       value:     Tensor,
                                       mask:      Optional[Tensor]=None,
                                       dropout_p: Optional[float]=0.0,
                                       is_causal: Optional[bool]=False,
                                       scale:     Optional[float]=None
                                       ) -> Tensor:
    """
    Combine three tensors; query, key, and value; to generate an output tensor of scaled dot product attention.

    Parameters:
    - query (Tensor)              - shape (N x ... x L x E)
    - key (Tensor)                - shape (N x ... x S x E)
    - value (Tensor)              - shape (N x ... x S x Ev)
    - mask (optional Tensor)      - shape (N x ... x L x S)

            mask; shape must be broadcastable to the shape of attention weights.
                            Two types of masks are supported.
                                1) A boolean mask where a value of True indicates that the element should take part in attention.
                                2) A float mask of the same type as query, key, value that is added to the attention score.

    - dropout_p (float)           - Dropout probability; if greater than 0.0, dropout is applied

    Returns:
    - Attention output (Tensor)   - shape (N x ... x L x Ev)


    Shape legend:
    - N:    Batch size
    - ...:  Any number of other batch dimensions (optional)
    - S:    Source sequence length
    - L:    Target sequence length
    - E:    Embedding dimension of the query and key
    - Ev:   Embedding dimension of the value
    """
    L, S = query.size(-2), key.size(-2)

    # Calculate scaling factor ahead of time
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale

    # Pre-define attn_bias as zero-weighted tensor
    # this allows it to be included in the attn_weight
    # calculation regardless of being defined
    attn_bias = torch.zeros(L, S, dtype=query.dtype)

    if is_causal:
        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
        attn_bias.to(query.dtype)

    if mask is not None:
        if mask.dtype == torch.bool:
            attn_bias.masked_fill_(mask.logical_not(), float("-inf"))
        else:
            attn_bias += mask

    # Compute attention weights
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight += attn_bias

    # Apply softmax to the attention weights
    attn_weight = torch.softmax(attn_weight, dim=-1)

    # Apply dropout if specified
    if dropout_p > 0:
        attn_weight = torch.dropout(attn_weight, dropout_p, train=True)

    # Compute the final output
    return attn_weight @ value

In [12]:
class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection 
    at the end.
    It's important in decoder block to have diagonal mask
    It is also possible to use torch.nn.MultiheadAttention.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.dropout = config.dropout
        self.n_head = config.n_head
        self.n_embd = config.n_embd

        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(
                        torch.nn.functional, 
                        'scaled_dot_product_attention')
        if not self.flash:
            print(
              "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer(
                "mask", 
                torch.tril(torch.ones(config.block_size, config.block_size)
            ).view(1, 1, config.block_size, config.block_size))
            
    def forward(self, x, attention_mask=None):
        # batch_size, seq_len, emb_dim
        B, T, C = x.size() 

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # (b, seq_len, emb_dim) --> (b, seq_len, emb_dim * 3) --> (b, seq_len, emb_dim)
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(

                # Hack to excluding use of attention_mask as it only leads to problems when using transformers data loader and trainer
                # q, k, v, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0, is_causal=True
                q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True
            )
        else:
            # (b, h, seq_len, d_k) matmul (b, h, d_k, seq_len) --> (b, h, seq_len, seq_len)
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            # diagonal mask
            # fill 0 mask with super small number so it wont affect the softmax weight
            # (batch_size, h, seq_len, seq_len)
            att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)

            # (b, h, seq_len, seq_len) matmul (b, h, seq_len, d_k) --> (b, h, seq_len, d_k)
            y = att @ v 

        # (b, h, seq_len, d_k) --> (b, seq_len, h, d_k) --> (b, seq_len, d_model)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

### 1.4.2) Decoder Block

Custom GELU class to match Google BERT repo

In [13]:
class NewGELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

Multi-layer perceptron model that processes information from the Multi-head Attention layer.

In [14]:
class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = NewGELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

Decoder Block

In [15]:
class Block(nn.Module):
    """ GPT decoder block"""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x, attention_mask=None):
        x = x + self.attn(self.ln_1(x), attention_mask)
        x = x + self.mlp(self.ln_2(x))
        return x

### 1.4.3) GPT Model

In [16]:
class GPT(nn.Module):
    """ GPT Language Model """

    def __init__(self, config):
        super().__init__()

        self.config = config
        self.block_size = config.block_size

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def configure_optimizers(self, train_config):

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        
        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, input_ids, attention_mask=None, targets=None):
        device = input_ids.device
        b, t = input_ids.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"

        # positional token, shape (1, t)
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) 

        # forward the GPT model itself
        tok_emb = self.transformer.wte(input_ids) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x, attention_mask)
        
        x = self.transformer.ln_f(x)
        # (b, t, n_embd) -- > # (b, t, vocab_size)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        # -1 at output will be ignored
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b, t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

## 1.5) Trainer Constructor

In [17]:
class Trainer:

    def __init__(self, config, model, train_dataset):
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)
        self.device = config.device
        self.model = self.model.to(self.device)

        # variables that will be assigned to trainer class later for logging and etc
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0

    def add_callback(self, onevent: str, callback):
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        for callback in self.callbacks.get(onevent, []):
            callback(self)

    def run(self):
        model, config = self.model, self.config

        # setup the optimizer
        self.optimizer = model.configure_optimizers(config)

        # setup the dataloader
        train_loader = DataLoader(
            self.train_dataset,
            sampler=torch.utils.data.RandomSampler(self.train_dataset, replacement=True, num_samples=int(1e10)),
            shuffle=False,
            # pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()
        data_iter = iter(train_loader)
        while True:

            # fetch the next batch (x, y) and re-init iterator if needed
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(train_loader)
                batch = next(data_iter)
            batch = [t.to(self.device) for t in batch]
            x, y = batch

            # forward the model
            logits, self.loss = model(input_ids=x, targets=y)

            # backprop and update the parameters
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1
            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # termination conditions
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                torch.cuda.empty_cache()
                break

## 1.6) Execute model pre-training

In [18]:
model_mattGPT = GPT(config).to(config.device)
if config.compile:
     model_mattGPT = torch.compile(model_mattGPT)
trainer = Trainer(config, model_mattGPT, train_dataset)

def batch_end_callback(trainer):
    if trainer.iter_num % 500 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)
trainer.run()

number of parameters: 164.62M


  y = torch.nn.functional.scaled_dot_product_attention(


iter_dt 0.00ms; iter 0: train loss 13.41142
iter_dt 203.05ms; iter 500: train loss 5.78267
iter_dt 202.80ms; iter 1000: train loss 5.41902
iter_dt 214.38ms; iter 1500: train loss 4.79306
iter_dt 201.72ms; iter 2000: train loss 4.59108
iter_dt 204.96ms; iter 2500: train loss 4.67543
iter_dt 202.17ms; iter 3000: train loss 4.43193
iter_dt 213.41ms; iter 3500: train loss 4.40250
iter_dt 199.89ms; iter 4000: train loss 3.94002
iter_dt 215.62ms; iter 4500: train loss 3.00203
iter_dt 201.11ms; iter 5000: train loss 3.60044
iter_dt 204.08ms; iter 5500: train loss 4.29090
iter_dt 204.07ms; iter 6000: train loss 3.82599
iter_dt 203.79ms; iter 6500: train loss 3.80347
iter_dt 198.82ms; iter 7000: train loss 3.92197
iter_dt 214.21ms; iter 7500: train loss 3.60690
iter_dt 203.65ms; iter 8000: train loss 3.34533
iter_dt 202.43ms; iter 8500: train loss 4.37069
iter_dt 204.28ms; iter 9000: train loss 4.18926
iter_dt 202.53ms; iter 9500: train loss 4.16536
iter_dt 211.23ms; iter 10000: train loss 3.18

In [24]:
# save model
torch.save(model_mattGPT,model_path.joinpath('model_mattGPT_1.pkl'))
# torch.save(model_mattGPT,model_path.joinpath('model_mattGPT.pkl'))

In [25]:
torch.cuda.empty_cache()

## 1.7) Initial look at generated text samples

In [24]:
# Reload model if running from new session
if 'model_mattGPT' not in globals():
    model_path=Path(os.getcwd()).joinpath('model')
    model_mattGPT = torch.load(model_path.joinpath('model_mattGPT_1.pkl'), weights_only=False)

def print_mattGPT(prompt):
    sample_ids = torch.Tensor(enc.encode_ordinary(prompt)).long()
    sample_ids = torch.unsqueeze(sample_ids, 0).to(config.device)
    result = model_mattGPT.generate(sample_ids, max_new_tokens=50, temperature=1, do_sample=False, top_k=None)
    print(enc.decode(result.detach().cpu().tolist()[0]))

Shakespearian language prompt

In [21]:
print_mattGPT('Lord:\nRise! My people, conquer the north!')

Lord:
Rise! My people, conquer the north!
RIVERS:
I thank you not, sir; I have you a little word,
And I will be so still my wife.
KING RICHARD III:
I am a man, sir, that I may live to die.
PETRUCHIO:
I thank you, sir; I beseech you, sir.
LADY ANNE:
I am a kind of kings, that I must die.
KING RICHARD III:


APRA Banking regulation prompt

In [27]:
print_mattGPT('An ADI must have an independent credit risk control unit that is responsible for')

An ADI must have an independent credit risk control unit that is responsible for 
the ADI to use the internal assessment and approval process for determining the 
ADIs credit risk management strategy and risk management. 
The valuation process must be documented and documented and documented and appropriate 
to the ADIs credit risk


Mixed shakespearian language + banking regulation prompt - 1

In [34]:
print_mattGPT('KING RICHARD III:\n The key requirements of this Prudential Standard are that Montague hath breathed his last loan-to-value ratio')

KING RICHARD III:
 The key requirements of this Prudential Standard are that Montague hath breathed his last loan-to-value ratio.
RIVERS:
I think the king is notional amount of all the ADIs:
And I, that I mean the king.
KING RICHARD III:
I hope the king, that was the king, and the


Mixed shakespearian language + banking regulation prompt - 2

In [51]:
print_mattGPT('First Citizen:\nWe are accounted poor citizens and believe that special purpose vehicles (SPVs) holding securitised assets may be')

First Citizen:
We are accounted poor citizens and believe that special purpose vehicles (SPVs) holding securitised assets may be 
eligible for regulatory capital and APS 120. 
Authorised Version F2022L01578 registered 05/12/2022
January 2023 
CPS 230.0 - 3 
Prudential Standard APS


Dr Seuss

In [54]:
print_mattGPT('I do not eat green eggs and ham')

I do not eat green eggs and ham her,
And I will be married to her.
I am a kind of kings,
And, when my name was not a fool!
I, that I was, that I was,
I was the man that I had been


Star Wars

In [55]:
print_mattGPT('Luke, I am your father')

Luke, I am your father.
First Lord:
I am a man that I have to die, and
I am a man of you.
First Gentleman:
I am a kind of kings, which
say is a man that I was,
and a


Philosophy

In [56]:
print_mattGPT('Truth is in the eye')

Truth is in the eye
Of the world, and the king,
And all the king, the king, and the king,
And his son, his son, his son, his son,
And his son, his son, his son, his son,


Physics

In [63]:
print_mattGPT('Newtons third law states that')

Newtons third law states that he hath
His oath to be a king.
BUCKINGHAM:
My lord, I beseech you,
I may not be so, my lord.
BUCKINGHAM:
My lord, I will not be


# 2) Fine-tune model and compare

Sources:
* [Fine-Tuning GPT-2 for Sentiment Analysis](https://drlee.io/fine-tuning-gpt-2-for-sentiment-analysis-94ebdd7b5b24)
* [Train and Deploy Fine-Tuned GPT-2 Model Using PyTorch on Amazon SageMaker to Classify News Articles](https://towardsdatascience.com/train-and-deploy-fine-tuned-gpt-2-model-using-pytorch-on-amazon-sagemaker-to-classify-news-articles-612f9957c7b)

## 2.1) Attempt to test custom GPT

### 2.1.1) Download imdb and load dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("imdb")

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 2.1.2) Tokenise IMBD dataset using GPT-2 byte-pair encoding
Customised the block size to the custom GPT size

In [33]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=config.block_size)
tokenized_datasets = dataset.map(tokenize_function, batched=True)




### 2.1.3) Add output linear layer to custom GPT

The below code was sourced from Chat GPT using several very detailed prompts, all documented in the history link below.
</br>https://chatgpt.com/share/670b1688-2560-8000-ae98-96986809c16c

In [34]:
class CustomGPTForClassification(nn.Module):
    def __init__(self, base_gpt, num_classes):
        super().__init__()
        self.base_gpt = base_gpt  # Pretrained GPT model
        self.classifier_head = nn.Linear(base_gpt.config.n_embd, num_classes)
        
        # Add a config attribute similar to Hugging Face models
        self.config = base_gpt.config
        # self.config.pad_token_id = self.config.eos_token_id  
        # Set pad_token to eos_token

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Pass inputs through the base GPT model
        outputs = self.base_gpt(input_ids, attention_mask=attention_mask)
        hidden_state = outputs[0]  # Get the last hidden states (batch_size, seq_len, hidden_dim)
        
        # Get the CLS token's hidden state (or you can pool the hidden states differently)
        pooled_output = hidden_state[:, -1, :]  # Take the last token's hidden state
        
        # Classification head
        logits = self.classifier_head(pooled_output)

        # If labels are provided, compute the loss (used for training)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"logits": logits, "loss": loss} if loss is not None else {"logits": logits}

if 'model_mattGPT' not in globals():
    model_path=Path(os.getcwd()).joinpath('model')
    model_mattGPT = torch.load(model_path.joinpath('model_mattGPT_1.pkl'), weights_only=False)


model_mattGPT_classifier = CustomGPTForClassification(model_mattGPT, 2)
model_mattGPT_classifier.to(device)
# model_mattGPT_classifier.config.pad_token_id = model_mattGPT_classifier.config.eos_token_id

CustomGPTForClassification(
  (base_gpt): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(618255, 256)
      (wpe): Embedding(128, 256)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-7): 8 x Block(
          (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (attn): CausalSelfAttention(
            (c_attn): Linear(in_features=256, out_features=768, bias=True)
            (c_proj): Linear(in_features=256, out_features=256, bias=True)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Linear(in_features=256, out_features=1024, bias=True)
            (gelu): NewGELU()
            (c_proj): Linear(in_features=1024, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f):

### 2.1.4) Set training arguments

In [35]:
import transformers

training_args = transformers.TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    # num_train_epochs=3,
    # Doing a really really short training run due to limited time available
    max_steps=100,
    weight_decay=0.01,
)

### 2.1.5) Finetuning and evaluating MattGPT

In [36]:
trainer_mattGPT = transformers.Trainer(
    model=model_mattGPT_classifier,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
trainer_mattGPT.train()

max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/100 [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x618255 and 256x2)

Clearly the above model is incompatible with the hugging face transformers dataset and data loader.

Unfortunately as of writing, I had run out of time to debug how to proceed and cannot perform any further fine-tuning on this model.

One possible way forward would be to convert the IMDB dataset into a format that is accepted by the custom dataset constructor above and modify the custom Trainer 

In [37]:
# Evaluation commented out as finetuning failed
# trainer.evaluate()

Performing garbage collection

In [38]:
import gc
del trainer_mattGPT
del model_mattGPT_classifier
del tokenized_datasets

gc.collect()
print("Garbage collection thresholds:",gc.get_threshold())

torch.cuda.empty_cache()

Garbage collection thresholds: (700, 10, 10)


## 2.2) Test GPT-2 model from Hugging Face

### 2.2.1) Download imdb and load dataset

In [1]:
from datasets import load_dataset
# import torch

dataset = load_dataset("imdb")

# device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 2.2.2) Tokenise IMBD dataset using GPT-2 byte-pair encoding

In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)



### 2.2.3) Define GPT-2 model with classifier head
Added fix defining pad_token_id as per solution provided in https://stackoverflow.com/questions/68084302/assertionerror-cannot-handle-batch-sizes-1-if-no-padding-token-is-defined

In [3]:
from transformers import GPT2ForSequenceClassification

model_gpt2_classifier = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# set the pad token of the model's configuration
model_gpt2_classifier.config.pad_token_id = model_gpt2_classifier.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2.2.4) Set training arguments

In [4]:
import transformers

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

training_args = transformers.TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=1,
    compute_metrics=compute_metrics,
    # Doing a really really short training run due to limited time available
    # max_steps=100,
    weight_decay=0.01,
)

### 2.2.5) Finetuning and evaluating GPT-2

In [7]:
trainer_gpt2 = transformers.Trainer(
    model=model_gpt2_classifier,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
trainer_gpt2.train()

In [6]:
trainer_gpt2.evaluate()

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.18055514991283417,
 'eval_runtime': 17148.762,
 'eval_samples_per_second': 1.458,
 'eval_steps_per_second': 0.182,
 'epoch': 1.0}

### 2.2.6) Evaluate specific sentiment examples

In [None]:
def get_sentiment(sentence,model):
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    prediction = outputs.logits.argmax(-1).item()
    return "positive" if prediction == 1 else "negative"

In [None]:
sentence = "I loved this movie!"
print("GPT-2:",get_sentiment(sentence,model_gpt2_classifier))

## 2.3) Test GPT-2 on list of generated text samples

In [14]:
from transformers import pipeline
import torch

generator = pipeline('text-generation', 
                     model='gpt2', 
                     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), 
                     max_length=50,
                     truncation=True
                     )

def return_generated_text(prompt):
    print(generator(prompt, pad_token_id=generator.tokenizer.eos_token_id)[0]['generated_text'])

Shakespearian language prompt

In [15]:
return_generated_text('Lord:\nRise! My people, conquer the north!')

Lord:
Rise! My people, conquer the north!

Flee; flee, I will make war for you.

The great beast was taken, and was taken captive. [Gryffindor died in the Dragon


APRA Banking regulation prompt

In [16]:
return_generated_text('An ADI must have an independent credit risk control unit that is responsible for')

An ADI must have an independent credit risk control unit that is responsible for the assessment of the creditworthiness of an affected individual and must review, in writing, every application to increase the risk that the applicant will default. In 2013 and 2014 an AD


Mixed shakespearian language + banking regulation prompt - 1

In [17]:
return_generated_text('KING RICHARD III:\n The key requirements of this Prudential Standard are that Montague hath breathed his last loan-to-value ratio')

KING RICHARD III:
 The key requirements of this Prudential Standard are that Montague hath breathed his last loan-to-value ratio and that the Treasury do not own any of Rothschild's properties because the Federal Reserve has control of its


Mixed shakespearian language + banking regulation prompt - 2

In [22]:
return_generated_text('First Citizen:\nWe are accounted poor citizens and believe that special purpose vehicles (SPVs) holding securitised assets may be')

First Citizen:
We are accounted poor citizens and believe that special purpose vehicles (SPVs) holding securitised assets may be prohibited as the subject of investigation.

The SPV will be inspected for any potential illegal conduct.


Dr Seuss

In [24]:
return_generated_text('I do not eat green eggs and ham')

I do not eat green eggs and ham, instead I use spinach, eggs and tomatoes. I also only buy green potatoes and green peppers (also green and sunflower seeds). There is not much difference in these two. All of my products are made


Philosophy

In [26]:
return_generated_text('Truth is in the eye')

Truth is in the eye of the beholder."-Charles Darwin

"To think the greatest known example of a true theory of nature and phenomena is that which can only be understood by the greatest number of rational beings."-Charles Darwin


