In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass
import math

In [3]:
class SDPMHA(nn.Module):
    """
    Scaled Dot Product Multihead Attention
    Implementation from page 5 from: https://arxiv.org/pdf/1706.03762
    """
    def __init__(self, n_embed, seq_len, n_head, flash=True):
        super().__init__()
        assert n_embed%n_head == 0, "n_embed must be divisible by n_head"
        self.n_head = n_head
        self.q_projection = nn.Linear(n_embed, n_embed)
        self.k_projection = nn.Linear(n_embed, n_embed)
        self.v_projection = nn.Linear(n_embed, n_embed)
        self.out_projection  = nn.Linear(n_embed, n_embed)
        self.flash = flash

        # this will be used as mask in attention:
        # "We need to prevent leftward information flow in the decoder to preserve 
        # the auto-regressive property. We implement this inside of scaled 
        # dot-product attention by masking out (setting to −∞) all values 
        # in the input of the softmax which correspond to illegal connections." (from paper)
        # Mathematicaly this means: set upper triangle matrix to -inf. values.
        self.register_buffer("bias",torch.tril(torch.ones(seq_len,
                                                          seq_len)).view(1,1,seq_len, seq_len))

    def forward(self, x, mask=None):
        batch_size, seq_len, emb_dim = x.size()
        # below you will see that we use transpose so that the final shape will be:
        # (batch_size, sel.n_head, seq_len, emb_dim//self.n_head)
        # from now on: emb_dim//self.n_head = head_size
        # (batch_size, sel.n_head, seq_len, head_size)
        Q = self.q_projection(x)
        K = self.k_projection(x)
        V = self.v_projection(x)
        
        Q = Q.view(batch_size, seq_len, self.n_head, emb_dim//self.n_head).transpose(1,2) 
        K = K.view(batch_size, seq_len, self.n_head, emb_dim//self.n_head).transpose(1,2) 
        V = V.view(batch_size, seq_len, self.n_head, emb_dim//self.n_head).transpose(1,2) 


        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(Q, K, V, attn_mask=None, dropout_p=0, is_causal=True)
        else:
            # below transpose is needed to transpose along last two dimensions only
            attention = Q@K.transpose(-2, -1) * (1.0 / math.sqrt(K.size(-1)))
            if mask:
                attention = attention.masked_fill(self.bias[:,:,:seq_len,:seq_len]==0, float('-inf'))
            # softmax along last dimension (emb_dim)
            attention = F.softmax(attention, dim=-1)
    
            # Just as in Karpathy code
            # (batch_size, n_head, seq_len, seq_len) x (batch_size, n_head, seq_len, head_size)
            # results in (batch_size, n_head, seq_len, head_size)
            y = attention@V
        # concat of all heads to recover final dimensions
        # (batch_size, n_head, seq_len, head_size) -> (batch_size, seq_len, n_head*head_size)
        # first transpose to (batch_size, seq_len, n_head, head_size)
        # then reshape to x shape
        # contiguous forces to have correct format in computer memory
        y = y.transpose(1,2).contiguous().view(batch_size, seq_len, emb_dim)
        y = self.out_projection(y)
        # dont return attention here, as it is multiheaded and not easy to understand and use
        # later in code
        return y

class Position_Wise_Feed_Forward(nn.Module):
    """
    Implementation of eq 2 from: https://arxiv.org/pdf/1706.03762
    """
    def __init__(self, 
                 d_model: int = 512, 
                 activation: nn.Module = nn.GELU(approximate='tanh')):
        """
        d_model = model dimensionality
        d_ff = inner-layer dimensioality
        activation = activation function, OpenAI uses GeLU with tanh aproximation
        """
        super().__init__()
        d_ff = 4 * d_model
        self.inner = nn.Linear(d_model, d_ff)
        self.outer = nn.Linear(d_ff, d_model)
        self.activation = activation

    def forward(self, x):
        y = self.outer(self.activation(self.inner(x)))
        return y


class DecoderBlock(nn.Module):
    # implementation from Figure 1 
    # from Attention is all you need: https://arxiv.org/pdf/1706.03762
    # this is not the implementation of Karpathy
    # and not the way OpenAI did it.
    def __init__(self, seq_len, emb_dim, n_head):
        super().__init__()
        self.mha1 = SDPMHA(emb_dim, seq_len, n_head)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.mha2 = SDPMHA(emb_dim, seq_len, n_head)
        self.ln2 = nn.LayerNorm(emb_dim)
        self.ff = Position_Wise_Feed_Forward(d_model=emb_dim)
        self.ln3 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        x = self.ln1(x+self.mha1(x, mask=False))
        x = self.ln2(x+self.mha2(x, mask=False))
        x = self.ln3(self.ff(x))
        return x

class DecoderBlockOpenAI(nn.Module):
    # Implementation smillar to Karpathy
    # and not the full way the OpenAI did it.
    def __init__(self, seq_len, emb_dim, n_head, activation=nn.GELU(approximate='tanh')):
        super().__init__()
        self.mha1 = SDPMHA(emb_dim, seq_len, n_head)
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ff = Position_Wise_Feed_Forward(d_model=emb_dim, activation=activation)
        self.ln2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        x = self.ln1(x+self.mha1(x, mask=True))
        x = self.ln2(self.ff(x))
        return x

In [4]:
@dataclass
class GPTConfig:
    context_size: int = 1024
    vocab_size: int = 50304 # Just like Karpathy I pad it to nearest multiple of 64
    n_layers: int = 12
    n_head: int = 12
    n_embeding: int = 768
    

class GPT2(nn.Module):
    def __init__(self, config: GPTConfig) -> None:
        super().__init__()
        
        self.config = config
        
        # All we need to do  is to specify a single valabre with whoel transformer
        # with a flexbile size
        self.transformer = nn.ModuleDict(dict(
            word_embeding = nn.Embedding(config.vocab_size, config.n_embeding),
            positional_embeding = nn.Embedding(config.context_size, config.n_embeding),
            blocks = nn.ModuleList(
                [DecoderBlockOpenAI(seq_len=config.context_size, 
                                    emb_dim=config.n_embeding,
                                    n_head=config.n_head) for _ in range(config.n_layers)]
            ),
            layer_norm = nn.LayerNorm(config.n_embeding),
        ))
        # the full body of transformer is completed now we need a head
        self.lm_head = nn.Linear(config.n_embeding, config.vocab_size, bias=False)
        
        # Just like Karpathy said wrote in his repository the idea is to set
        # the weights of word_embeding and lm_head to be the same
        # source: https://paperswithcode.com/method/weight-tying
        self.transformer.word_embeding.weight = self.lm_head.weight
        
        # The most important step in all machine learning
        # initialize all weights correctly.
        self.apply(self._init_weights)
        #  Additionaly follow GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('out_projection.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layers))
                
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            
            
    def forward(self, tokens, targets=None):
        device = tokens.device
        batch_size, tokens_size = tokens.size()
        assert tokens_size <= self.config.context_size, f"Input tokens size: {tokens_size} is larger then context size {self.config.context_size}!"
        tokens_position = torch.arange(0, tokens_size, dtype=torch.long, device=device)
        
        # forward the input throght model
        token_embbeding = self.transformer.word_embeding(tokens)
        position_embeding = self.transformer.positional_embeding(tokens_position)
        x = token_embbeding + position_embeding
        for block in self.transformer.blocks:
            x = block(x)
        x = self.transformer.layer_norm(x)
        
        # tricks from Karpathy source code
        if targets is not None:
            # calcualte the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

In [5]:
# @dataclass
# class GPTConfig:
#     context_size: int = 1024
#     vocab_size: int = 50304 # Just like Karpathy I pad it to nearest multiple of 64
#     n_layers: int = 12
#     n_head: int = 12
#     n_embeding: int = 768
    

gpt2_standard_config = dict(n_layers = 12, n_head=12, n_embeding=768)
gpt2_standard_config['vocab_size'] = 50304 # always 50257 for GPT model checkpoints
gpt2_standard_config['context_size'] = 1024 # always 1024 for GPT model checkpoints
model  = GPT2(config=GPTConfig(**gpt2_standard_config))

In [6]:
test_input = torch.randint(0, 50304, (1, 1024))

In [7]:
model.forward(test_input)[0].size()

torch.Size([1, 1, 50304])

# train tokenizer

In [8]:
from datasets import load_dataset

In [9]:
dataset = load_dataset("Skylion007/openwebtext")

Using the latest cached version of the module from /Users/dwojcik/.cache/huggingface/modules/datasets_modules/datasets/Skylion007--openwebtext/6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521 (last modified on Sat Nov 16 10:37:31 2024) since it couldn't be found locally at Skylion007/openwebtext, or remotely on the Hugging Face Hub.


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [10]:
# Pretrain tokenizer
from tokenizers import ByteLevelBPETokenizer
from transformers import AutoTokenizer

tokenizer = ByteLevelBPETokenizer()

In [11]:
# Extract text data from the dataset
sample_size = 100_000
text_column_name = 'text'  # Replace with the name of your text column
texts = dataset['train'][:sample_size][text_column_name]

In [None]:
vocab_size = 50304
tokenizer.train_from_iterator(texts, 
                              vocab_size=vocab_size, 
                              min_frequency=2)

In [None]:
tokenizer.save_model("tokenizer")

['tokenizer/vocab.json', 'tokenizer/merges.txt']

# train GPT2 model

In [None]:
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2Model, GPT2TokenizerFast, Trainer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import Trainer, TrainingArguments
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import get_cosine_schedule_with_warmup
import gc
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

In [1]:
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2Model, GPT2TokenizerFast, Trainer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import Trainer, TrainingArguments
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import get_cosine_schedule_with_warmup
import gc

vocab_size = 50304
gpt2_config = GPT2Config(vocab_size=vocab_size,n_layer=4, n_head=4, n_positions=256)
model = GPT2LMHeadModel(config=gpt2_config)
tokenizer = GPT2TokenizerFast(vocab_file='tokenizer/vocab.json', merges_file='tokenizer/merges.txt')
tokenizer.pad_token = tokenizer.eos_token

def tokenize(element):
    outputs = tokenizer(
        element[0]["text"],
        truncation=True,
        padding=True,
        max_length=gpt2_config.n_positions,
        return_tensors="pt",
    )
    # Pad the sequences to length n_positions
    input_ids_padded = torch.nn.functional.pad(outputs['input_ids'], (0, gpt2_config.n_positions - len(outputs['input_ids'])))
    attention_mask = torch.ones_like(input_ids_padded)
    return {"input_ids": input_ids_padded, "attention_mask": attention_mask}

dataset = load_dataset("Skylion007/openwebtext")

dataloader = DataLoader(dataset=dataset['train'], 
                        collate_fn=tokenize, 
                        batch_size=64, 
                        pin_memory=False, 
                        )

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model, tokenizer, data loader and the args here as shown in the code above.
model.train() # Set the model to training mode.
model = model.to(device)
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")
optimizer = torch.optim.AdamW(model.parameters(), lr=5.e-4) # Define an optimizer.
# Create a learning rate scheduler that starts with a low learning rate and increases it over time.
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=100,
    num_training_steps=len(dataset['train'])
)


loss_values = []
steps = []



global_step = 0
loop = tqdm(dataloader, leave=True)
for batch in loop:
    input_ids = batch["input_ids"].to(device) # Move the data to the correct device (CPU/GPU/MPS).
    attention_mask = batch["attention_mask"].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids) # Forward pass.
    
    loss = outputs.loss
    
    loss.backward() # Backward pass.

    optimizer.step() # Update the parameters.
    lr_scheduler.step()
    
    # Manually delete input_ids and attention_mask to free up memory
    del input_ids
    del attention_mask  
    gc.collect()
    torch.mps.empty_cache()
    
    optimizer.zero_grad() # Reset gradients to zero for the next iteration.
    
    loop.set_description_str(f"Loss: {loss.item():.5f}, lr = {lr_scheduler.get_last_lr()[0]:.2e}")  

    loss_values.append(loss.item())
    steps.append(global_step)
    global_step += 1
    
    if global_step > 500:
        break

Using the latest cached version of the module from /Users/dwojcik/.cache/huggingface/modules/datasets_modules/datasets/Skylion007--openwebtext/6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521 (last modified on Sat Nov 16 10:37:31 2024) since it couldn't be found locally at Skylion007/openwebtext, or remotely on the Hugging Face Hub.


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

Model size: 67.2M parameters


  0%|          | 0/125216 [00:00<?, ?it/s]

In [3]:
def generate_text(prompt, max_length=50):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("mps")  # Assuming MPS is available; change to "cuda" or "cpu" if needed
    
    # Generate text
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return generated_text

# Example usage
prompt = "Once upon a time, there was a"
completed_text = generate_text(prompt)
print(completed_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Once upon a time, there was a the the the the the the the the the the the the the the the the the the the the the the the the the the the U, the the the the the the the the the the the the the


In [1]:
import tiktoken
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4o")

In [5]:
enc = tiktoken.get_encoding("p50k_base")

In [6]:
enc.n_vocab

50281