In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import math
import torch
import torch.nn as nn
from torch.nn import functional as F

class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension


class GPT(nn.Module):
    def __init__(self, config):
        self.config = config

        self.transformer = nn.ModuleDict(dict(
                wte = nn.Embedding(config.vocab_size, config.n_embd),
                wpe = nn.Embedding(config.block_size, config.n_embd),
                h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f = nn.LayerNorm(config.n_embd),
            ))
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.LayerNorm(config.n_embed)
        self.attn = CausalSelfAttention(config)
        self.linear_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.linear_1(x))
        x = x + self.mlp(self.linear_2(x))
        return x

In [None]:
# !pip install tiktoken
import tiktoken
tiktoken.__version__

In [None]:
# https://tiktokenizer.vercel.app/?model=codellama%2FCodeLlama-7b-hf

tokenizer = tiktoken.get_encoding("cl100k_base")

text = ("''' below is the python code for sum of two number''' def add_two(a, b):\n    return a+b")
tokenizer.n_vocab

In [None]:
tokenizer.n_vocab

In [None]:
tokenizer.decode([19317,
 3770,
 374,
 279,
 10344,
 2082,
 369,
 2694,
 315,
 1403,
 1396,
 19317,
 711,
 923,
 24120,
 2948,
 11,
 293,
 997,
 262,
 471,
 264,
 36193])

In [None]:
# embeddings
import torch
import torch.nn as nn

vocab_size = 10
output_dim = 3 # 256

torch.manual_seed(23)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer.weight

In [None]:
nn.Linear(10,3).weight

In [None]:
nn.Linear(10,3).weight

In [None]:
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim) # (512, )

In [None]:
import torch
inputs = torch.Tensor([
    [1,2,1],
    [2,1,2],
    [2,1,2]
 
])

x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 6

In [None]:
inputs.shape

In [None]:
torch.manual_seed(123)

Wq = torch.nn.Parameter(torch.rand(d_in, d_out))
Wk = torch.nn.Parameter(torch.rand(d_in, d_out))
Wv = torch.nn.Parameter(torch.rand(d_in, d_out))

In [None]:
keys = inputs @ Wk
query = inputs @ Wq

keys, query

In [None]:
d_k = keys.shape[1]
attention_scores = query @ keys.T
attention_scores_norm = torch.softmax(attention_scores / d_k**0.5, dim=-1)
attention_scores_norm

In [None]:
query

In [None]:

class SelfAttention(nn.Module):
    def __init__(self, d_in, d_out, d_k):
        super().__init__()
        # self.Wq = torch.nn.Parameter(torch.rand(d_in, d_out))
        # self.Wk = torch.nn.Paramter(torch.rand(d_in, d_out))
        # self.Wv = torch.nn.Paramter(torch.rand(d_in, d_out))

        self.Wq = torch.nn.Linear(torch.rand(d_in, d_out, bias = False))
        self.Wk = torch.nn.Linear(torch.rand(d_in, d_out, bias = False))
        self.Wv = torch.nn.Linear(torch.rand(d_in, d_out, bias = False))

    def forward(self, x):
        key = inputs @ Wk
        query = inputs @ Wq
        value = inputs @ Wv

        attention_scores = query @ key.T
        attention_scores_norm = torch.softmax(attention_scores / d_k**0.5, dim=-1)

        context_vector = attention_scores_norm @ value

        return context_vector

        

In [None]:

class CasualSelfAttention(nn.Module):
    def __init__(self, d_in, d_out, d_k, context_length):
        super().__init__()


        self.Wq = torch.nn.Linear(torch.rand(d_in, d_out, bias = False))
        self.Wk = torch.nn.Linear(torch.rand(d_in, d_out, bias = False))
        self.Wv = torch.nn.Linear(torch.rand(d_in, d_out, bias = False))

    def forward(self, x):
        
        key = inputs @ Wk
        query = inputs @ Wq
        value = inputs @ Wv

        attention_scores = query @ key.T

        mask = torch.tril(torch.ones(context_length, context_length))
        mask_attention = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_scores_norm = torch.softmax(mask_attention / d_k**0.5, dim=-1)

        context_vector = attention_scores_norm @ value
        return context_vector

        

In [None]:
import torch
import torch.nn as nn

class CasualAttention(nn.Module):
    def __init__(self, d_in, d_out, dropout, context_length):
        super().__init__()


        self.Wq = torch.nn.Linear(d_in, d_out, bias = False)
        self.Wk = torch.nn.Linear(d_in, d_out, bias = False)
        self.Wv = torch.nn.Linear(d_in, d_out, bias = False)
        self.dropout = torch.nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
        
    def forward(self, x):
        B, num_tokens, d_int = x.shape
        key = self.Wk(x)
        query = self.Wq(x)
        value = self.Wv(x)

        attention_scores = query @ key.transpose(1, 2)

        # mask = torch.tril(torch.ones(context_length, context_length))
        mask_attention = attention_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attention_scores_norm = torch.softmax(mask_attention / (d_out)**0.5, dim=-1)

        attention_scores_dropout = self.dropout(attention_scores_norm)

        context_vector = attention_scores_dropout @ value
        return context_vector

        

In [None]:
import torch
torch.manual_seed(123)
inputs = torch.Tensor([
    [1,2,1],
    [2,1,2],
    [2,1,2],
    [1,2,1],
    [2,1,2],
    [2,1,2]
 
 
])
batch = torch.stack((inputs, inputs), dim=0)


In [None]:
# context_length = batch.shape[1]
# dropout = 0.3
# d_in = inputs.shape[1]
# d_out = 6
# d_k = d_in
# ca = CasualAttention(d_in, d_out, dropout, context_length)
# ca(batch)

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, dropout, context_length, num_heads):
        super().__init__()
        self.heads = nn.ModuleList([CasualAttention(d_in, d_out, dropout, context_length) for _ in range(num_heads)])

    def forward(self, x):
        return torch.concat([head(x) for head in self.heads])

# **GPT2 Execution**

In [2]:
import torch
import torch.nn as nn

torch.manual_seed(123)
inputs = torch.Tensor([
    [1,2,1],
    [2,1,2],
    [2,1,2],
    [1,2,1],
    [2,1,2],
    [2,1,2]
])
batch = torch.stack((inputs, inputs), dim=0)
# batch = torch.Tensor(inputs.reshape(-1))
context_length = batch.shape[1]
dropout = 0.3
d_in = inputs.shape[1]
d_out = 6
d_k = d_in
num_heads = 4

In [3]:
# batch.shape

In [4]:
# torch.manual_seed(123)
# mha = MultiHeadAttention(d_in, d_out, dropout, context_length, num_heads)
# mha_out = mha(batch[:1])

In [None]:
# mha_out

In [5]:
class MultiHeadCasualAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec


# mha_ch03 = MultiHeadCasualAttention(
#     d_in=3,
#     d_out=24,
#     context_length=12,
#     dropout=0.0,
#     num_heads=12,
#     qkv_bias=False
# ).to("cpu")

# out = mha_ch03(batch[:1])
# print(out.shape)

In [6]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 512, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [7]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attention = MultiHeadCasualAttention(
            d_in= cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"]
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # This block does nothing and just returns its input.
        shortcut = x
        x = self.norm1(x)
        x = self.attention(x)
        x = self.drop_shortcut(x)

        x = x + shortcut

        shortcut = x

        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x


class LayerNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        

    def forward(self, x):
        # This layer does nothing and just returns its input.

        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim=-1,keepdim = True, unbiased = False)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale * norm_x + self.shift

In [8]:
torch.manual_seed(123)

x = torch.rand(2, 4, 768)  # Shape: [batch_size, num_tokens, emb_dim]
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [9]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device = in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)

        logits = self.out_head(x)
        return logits

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

In [None]:
# torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M)

# out = model(batch)
# print("Input batch:\n", batch)
# print("\nOutput shape:", out.shape)
# print(out)

In [None]:
# total_params = sum(p.numel() for p in model.parameters())
# print(f"Total number of parameters: {total_params:,}")

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

In [None]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

In [None]:
logits.shape

In [None]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        
        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]  

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [None]:
start_context = "Hello, I am"

encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

In [None]:
# model.eval() # disable dropout

# out = generate_text_simple(
#     model=model,
#     idx=encoded_tensor, 
#     max_new_tokens=16, 
#     context_size=GPT_CONFIG_124M["context_length"]
# )

# print("Output:", out)
# decoded_text = tokenizer.decode(out.squeeze(0).tolist())
# print(decoded_text)
# print("Output length:", len(out[0]))

In [None]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataloader

In [None]:
import torch
# If the `previous_chapters.py` file is not available locally,
# you can import it from the `llms-from-scratch` PyPI package.
# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg
# E.g.,
# from llms_from_scratch.ch04 import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 512, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [None]:
import tiktoken

# Alternatively:
# from llms_from_scratch.ch04 import generate_text_simple

def text_to_token_ids(text, tokenizer):
    # encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [None]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) 

In [None]:
# with torch.no_grad():
#     logits = model(inputs)

# probas = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary
# print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)

In [None]:
# token_ids = torch.argmax(probas, dim=-1, keepdim=True)
# print("Token IDs:\n", token_ids)

In [None]:
# print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
# print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

In [None]:
# import os
# import urllib.request

# file_path = "the-verdict.txt"
# url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# if not os.path.exists(file_path):
#     with urllib.request.urlopen(url) as response:
#         text_data = response.read().decode('utf-8')
#     with open(file_path, "w", encoding="utf-8") as file:
#         file.write(text_data)
# else:
#     with open(file_path, "r", encoding="utf-8") as file:
#         text_data = file.read()

# # flytech/python-codes-25k

In [None]:
from datasets import load_dataset

train_dataset = load_dataset('flytech/python-codes-25k', split='train')
# train_dataset = load_dataset('flytech/python-codes-25k', split='test')

# One can map the dataset in any way, for the sake of example:
dataset = train_dataset.map(lambda example: {'text': example['instruction'] + ' ' + example['input'] + ' ' + example['output']})['text']
# Remember that you don't need to map if the dataset has a "text" field already:)

combined_dataset = "<|endoftext|>".join(dataset[:])

In [None]:
print(dataset[:1][0])

In [None]:
len(dataset)
encoded = tokenizer.encode(combined_dataset, allowed_special={'<|endoftext|>'})

In [None]:
total_characters = len(combined_dataset)
total_tokens = len(tokenizer.encode(combined_dataset, allowed_special={'<|endoftext|>'}))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

In [None]:
text_data = combined_dataset

In [None]:
from datasets import load_dataset

train_dataset = load_dataset('flytech/python-codes-25k', split='train')
dataset = train_dataset.map(lambda example: {'text': example['instruction'] + ' ' + example['input'] + ' ' + example['output']})['text']

combined_dataset = "<|endoftext|>".join(dataset[:])

train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [None]:
split_idx

In [None]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [None]:
train_loader

In [None]:
print("Train loader:")

train_ds = 0
test_ds = 0
for x, y in train_loader:
    # print(x.shape, y.shape)
    train_ds += 1

print("\nValidation loader:")
for x, y in val_loader:
    # print(x.shape, y.shape)
    test_ds += 1

print(train_ds, test_ds)

In [None]:
14*32 + 2*32

In [3]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [4]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss




def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M)
# model.to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# num_epochs = 10
# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device,
#     num_epochs=num_epochs, eval_freq=5, eval_iter=5,
#     start_context="Give me a python program to add two numbers", tokenizer=tokenizer
# )

In [None]:
combined_dataset[:100]

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # only show integer labels on x-axis

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig("loss-plot.pdf")
    plt.show()

epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
model.to("cpu")
model.eval()

tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [6]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [7]:
torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=50,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

NameError: name 'model' is not defined

In [2]:
# %%writefile gpt2_multigpu.py
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.optim as optim
import tiktoken
from torch.utils.data import DataLoader, Dataset
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn.functional as F

GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 512, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attention = MultiHeadCasualAttention(
            d_in= cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"]
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # This block does nothing and just returns its input.
        shortcut = x
        x = self.norm1(x)
        x = self.attention(x)
        x = self.drop_shortcut(x)

        x = x + shortcut

        shortcut = x

        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

class LayerNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        # This layer does nothing and just returns its input.

        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim=-1,keepdim = True, unbiased = False)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale * norm_x + self.shift

class MultiHeadCasualAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

        self.rope = RoPE(self.head_dim, context_length)

    
    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        queries, keys = self.rope(queries, keys)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        
        # Use scaled_dot_product_attention which can leverage Flash Attention
        attn_output = F.scaled_dot_product_attention(
            queries, keys, values,
            attn_mask=mask_bool,
            dropout_p=self.dropout.p if self.training else 0.0, # Apply dropout if in training mode
            is_causal=False # Mask is provided, so is_causal is False
        )

        # Shape: (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = attn_output.transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)

        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        # attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # # Original mask truncated to the number of tokens and converted to boolean
        # mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # # Use the mask to fill attention scores
        # attn_scores.masked_fill_(mask_bool, -torch.inf)

        # attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        # attn_weights = self.dropout(attn_weights)

        # # Shape: (b, num_tokens, num_heads, head_dim)
        # context_vec = (attn_weights @ values).transpose(1, 2)

        # # Combine heads, where self.d_out = self.num_heads * self.head_dim
        # context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        # context_vec = self.out_proj(context_vec)  # optional projection

        # return context_vec

import torch
import torch.nn as nn
import math

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., :x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
    # The first two dimensions of q and k should be (batch, num_heads).
    # The last dimension is the embedding dimension, which we'll rotate.
    q_rotated = (q * cos[:, offset:offset + q.shape[2], :]) + (rotate_half(q) * sin[:, offset:offset + q.shape[2], :])
    k_rotated = (k * cos[:, offset:offset + k.shape[2], :]) + (rotate_half(k) * sin[:, offset:offset + k.shape[2], :])
    return q_rotated, k_rotated

class RoPE(nn.Module):
    def __init__(self, dim, context_length, base=10000):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        t = torch.arange(context_length, device=inv_freq.device)
        freqs = torch.outer(t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :])
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :])

    def forward(self, q, k):
        # q and k have shapes (batch, num_heads, seq_len, head_dim)
        seq_len = q.shape[2]
        cos = self.cos_cached[:, :, :seq_len, :]
        sin = self.sin_cached[:, :, :seq_len, :]
        q_rotated, k_rotated = apply_rotary_pos_emb(q, k, cos, sin)
        return q_rotated, k_rotated


# Assuming you have your GPT-2 model defined already
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        # print(in_idx.shape)
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device = in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)

        logits = self.out_head(x)
        return logits

# Custom Dataset (assuming you have this)
class CustomDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = CustomDataset(txt, tokenizer, max_length, stride)

    # Create dataloader
    # dataloader = DataLoader(
    #     dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataset


def load_train_val_dataset():
    from datasets import load_dataset

    train_dataset = load_dataset('flytech/python-codes-25k', split='train')
    # train_dataset = load_dataset('flytech/python-codes-25k', split='test')

    # One can map the dataset in any way, for the sake of example:
    dataset = train_dataset.map(lambda example: {'text': example['instruction'] + ' ' + example['input'] + ' ' + example['output']})['text']
    # Remember that you don't need to map if the dataset has a "text" field already:)
    train_ratio = 0.90
    # text_data = "<|endoftext|>".join(dataset[:50])
    dataset = dataset[:]
    split_idx = int(train_ratio * len(dataset))

    train_data = dataset[:split_idx]
    val_data = dataset[split_idx:]
    train_data = "<|endoftext|>".join(train_data[:])
    val_data = "<|endoftext|>".join(val_data[:])
    # print("split_idx: ", split_idx)

    torch.manual_seed(123)

    train_loader = create_dataloader_v1(
        train_data,
        batch_size=8,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
    )

    val_loader = create_dataloader_v1(
        val_data,
        batch_size=8,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
    )
    # print("length of dataset", len(train_data), len(val_data))
    # print("length of loader", len(train_loader), len(val_loader))
    return train_loader, val_loader

def text_to_token_ids(text, tokenizer):
    # encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

def calc_loss_batch(input_batch, target_batch, model, device):
    try:
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)
        # with torch.no_grad():
        logits = model(input_batch)
        loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
        del input_batch, target_batch, logits
        torch.cuda.empty_cache()
        return loss
    except Exception as exp:
        print(exp)
        return torch.tensor(0)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


def setup_ddp(rank, world_size):
    """Initialize the distributed environment"""
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Clean up the distributed environment"""
    dist.destroy_process_group()

def plot_losses(train_losses, val_losses, epochs):
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Train Loss (per batch)')
    plt.plot([i * (len(train_losses) // epochs) for i in range(epochs + 1)], [val_losses[0]] + val_losses, label='Validation Loss (per epoch)', marker='o')
    plt.xlabel('Batch / Epoch')
    plt.ylabel('Loss')
    plt.title('Train and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.draw()
    plt.pause(0.1)

def train(batch_size=8, epochs=15):
    from tqdm.notebook import tqdm
    """Training function with DDP"""
    # Setup DDP
    # setup_ddp(rank, world_size)
    
    # Set device
    # device = torch.device(f'cuda:{rank}')
    # torch.cuda.set_device(device)
    device = torch.device("cuda")
    
    # Create model and move to device
    model = GPTModel(GPT_CONFIG_124M).to(device)
    
    # Wrap model with DDP
    # model = DDP(model, device_ids=[rank])
    
    # Create optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
    
    # Prepare data
    train_data, val_data = load_train_val_dataset()
    # dataset = CustomDataset(train_data)
    # Use DistributedSampler for DDP
    # train_sampler = torch.utils.data.distributed.DistributedSampler(
    #     train_data,
    #     num_replicas=world_size,
    #     rank=rank
    # )
    train_dataloader = DataLoader(
        train_data,
        batch_size=batch_size,
        # sampler=train_sampler,
        num_workers=4
    )

    # val_sampler = torch.utils.data.distributed.DistributedSampler(
    #     val_data,
    #     num_replicas=world_size,
    #     rank=rank,
    #     shuffle=False
    # )
    val_loader = DataLoader(
        val_data,
        batch_size=batch_size,
        num_workers=4,
        drop_last=False
    )
    print("train_data: ",  len(train_dataloader), " val_data: ", len(val_loader))
    # val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True, drop_last=False, num_workers=4)
    
    # if rank == 0:
    #     plt.ion()
        
    # Training loop
    model.train()
    training_loss = []
    validation_loss = []
    # with tqdm(total=epochs * len(train_dataloader)) as pbar:
    for epoch in tqdm(range(epochs), desc='Epochs'):
        # train_sampler.set_epoch(epoch)  # Ensure shuffling works properly across epochs
        
        total_loss = 0
        batch_idx = 0
        for input_batch, target_batch in tqdm(train_dataloader, desc='Batch', leave=False):
            optimizer.zero_grad()
            batch_idx += 1

            loss = calc_loss_batch(input_batch, target_batch, model, device)  # Assuming your model returns a loss
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            # pbar.update(1)
            
            if batch_idx % 100 == 0:
                print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')
        
        # if rank == 0:
        avg_loss = total_loss / len(train_dataloader)
        # print(f'Epoch {epoch} completed. Average Train Loss: {avg_loss:.4f}')
        training_loss.append(avg_loss)

        total_val_loss = 0
        val_idx = 0
        with torch.no_grad():
            for input_batch, target_batch in (val_loader):
                model.eval()
                val_loss = calc_loss_batch(input_batch, target_batch, model, device)
                val_idx += 1
                total_val_loss += (val_loss.item())

        if val_idx:
            val_loss = total_val_loss/(val_idx)
            validation_loss.append(val_loss)
            print(f'Epoch {epoch} completed. Average Train Loss: {avg_loss:.4f}. Validation Loss: {val_loss:.4f}')
        print("---"*20)

    
    # Save model (only from rank 0)
    # if rank == 0:
    # torch.save(model.module.state_dict(), 'gpt2_ddp_model_3.pth')
        torch.save(model.to(torch.device("cpu")).state_dict(), f'gpt2_ddp_model_4_{epoch}.pth')
        model.to(torch.device("cuda"))
    torch.save({"train_loss": training_loss, "validation_loss": validation_loss}, f"loss_gpt2_ddp_model_4.pth")
        # plot_losses(training_loss, validation_loss, epoch + 1)
        # plt.ioff() # Turn off interactive mode
        # plt.show()
    
        # Cleanup
        # cleanup()
# 
def main():
    # Model configuration
    GPT_CONFIG_124M = {
                        "vocab_size": 50257,   # Vocabulary size
                        "context_length": 512, # Shortened context length (orig: 1024)
                        "emb_dim": 768,        # Embedding dimension
                        "n_heads": 12,         # Number of attention heads
                        "n_layers": 12,        # Number of layers
                        "drop_rate": 0.1,      # Dropout rate
                        "qkv_bias": False      # Query-key-value bias
                    }

    # world_size = torch.cuda.device_count()
    
    # # Launch DDP training
    # torch.multiprocessing.spawn(
    #     train,
    #     args=(world_size,),
    #     nprocs=world_size,
    #     join=True
    # )

# if __name__ == "__main__":
    # Set multiprocessing start methodpbar.update(1)
    # torch.multiprocessing.set_start_method('spawn')
    # main()

In [3]:
train(batch_size=10, epochs=1)

README.md:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

python-codes-25k.json:   0%|          | 0.00/26.4M [00:00<?, ?B/s]

python-codes-25k.jsonl:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49626 [00:00<?, ? examples/s]

Map:   0%|          | 0/49626 [00:00<?, ? examples/s]

train_data:  1338  val_data:  176


Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Batch:   0%|          | 0/1338 [00:00<?, ?it/s]

Epoch: 0, Batch: 100, Loss: 3.1769
Epoch: 0, Batch: 200, Loss: 0.7039
Epoch: 0, Batch: 300, Loss: 0.3968
Epoch: 0, Batch: 400, Loss: 0.2091
Epoch: 0, Batch: 500, Loss: 0.2319
Epoch: 0, Batch: 600, Loss: 0.5465
Epoch: 0, Batch: 700, Loss: 0.1304


KeyboardInterrupt: 

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 512, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

snapshot_model = GPTModel(GPT_CONFIG_124M)

# /kaggle/input/py-gpt2/pytorch/default/1/gpt2_ddp_model_1.pth

snapshot_model.load_state_dict(torch.load("/kaggle/working/gpt2_ddp_model_3_14.pth", weights_only=True))
# snapshot_model.eval()
device = torch.device("cpu")
snapshot_model.to(device)
torch.save(snapshot_model.state_dict(), f'gpt2_ddp_model_3_14_cpu.pth')

In [14]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # idx.to(device)
    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx


In [15]:
torch.manual_seed(123)
tokenizer = tiktoken.get_encoding("gpt2")
import time
start = time.time()

token_ids = generate(
    model=snapshot_model.to(torch.device("cpu")),
    idx=text_to_token_ids("Python program to calculate the average", tokenizer),
    max_new_tokens=250,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=5,
    temperature=0
)
end = time.time()
print("Time taken: ", end-start)
print("Tokens/s: ", 250/(end-start))
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


Time taken:  63.73817420005798
Tokens/s:  3.922296224164083
Output text:
 Python program to calculate the average value of three numbers?  ```python
# Function to calculate the average of three numbers

def sum_of_num(num_list):
  
  
   
 # Calculate the average_average
   total = num_sum(1)
 
   
   a = num_sum(num_list_of_list)
      
   
  for num in num_list:
        print(num_list)
```<|endoftext|>Develop a machine learning model in Python to predict the values of a type given input data Input: Input: [type: "price' and the number of people
 {%s: 1},
 {2: '%s', 3: 'price'},
 {'name: '%' : 'John', 4: 'Developer'},
: {'number: 4: 'Joe', ' lambda 1: 'price'},
]  ```python
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Function to classify
def


In [14]:
%%time
%%script bash
python3 gpt2_multigpu.py

length of dataset 30180 3447
length of loader 19 2
val_data:  1


Using the latest cached version of the dataset since flytech/python-codes-25k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/0ed98ff2a76c5d133d8c157b814189a5a17ebd20 (last modified on Sun Mar 30 08:15:53 2025).
Traceback (most recent call last):
  File "/kaggle/working/gpt2_multigpu.py", line 456, in <module>
    main()
  File "/kaggle/working/gpt2_multigpu.py", line 451, in main
    train(batch_size=8, epochs=20)
  File "/kaggle/working/gpt2_multigpu.py", line 391, in train
    loss.backward()
  File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 581, in backward
    torch.autograd.backward(
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 347, in backward
    _engine_run_backward(
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 825, in _engine_run_backward
    return Variable.

CalledProcessError: Command 'b'python3 gpt2_multigpu.py\n'' returned non-zero exit status 1.

In [None]:
from tqdm.contrib import itertools
import time
for i1, i2 in itertools.product(range(5), range(300)):
    # do something, e.g. sleep
    time.sleep(0.01)

  0%|          | 0/1500 [00:00<?, ?it/s]

In [22]:
import tiktoken
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 512, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

# Alternatively:
# from llms_from_scratch.ch04 import generate_text_simple

def text_to_token_ids(text, tokenizer):
    # encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [23]:
device = torch.device("cpu")
device

device(type='cpu')

In [24]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        device = torch.device("cpu")
        idx.to(device)
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [25]:
snapshot_model = GPTModel(GPT_CONFIG_124M)

snapshot_model.load_state_dict(torch.load("/kaggle/working/gpt2_ddp_model_2_cpu.pth", weights_only=True))
# snapshot_model.eval()
device = torch.device("cpu")
snapshot_model.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(512, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attention): MultiHeadCasualAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attention): MultiHeadCasualAttention(
        (W_qu

In [26]:
# torch.save(snapshot_model.state_dict(), 'gpt2_ddp_model_2_cpu.pth')

In [27]:
# device = next(snapshot_model.parameters()).device
# print(f"Model is on: {device}")

In [28]:
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate(
    model=snapshot_model,
    idx=text_to_token_ids("logic to return if a number is prime or not: ", tokenizer),
    max_new_tokens=256,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 logic to return if a number is prime or not:  ```python
def is_prime(n):
 if n == 1:
 return False
 for i in range(2, int(n**0.5) + 1):
 if n % i == 0:
 return False
 return True

num = int(input("Enter a number: "))
print("The number is", is_prime(num))
```<|endoftext|>Implement a basic chatbot in Python using the NLTK library for a library  ```python
import json
import nltk
from nltk.tokenize import word_tokenize

# load the user response
def get_response(user_input):
    response = get_response(user_input=True)
    if response.status_code == 200:
        return 'Error: Failed message'
    else:
        return 'Not a response message'

# respond to get user input
def respond(user_input):
      if user_input in responses:
        return 'I'm sorry, I don't understand.'


In [None]:
#

In [None]:
def evaluate_model_whole(model, train_loader, val_loader, device, eval_iter):
    
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
            
    model.train()
    return train_loss, val_loss

def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.

    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))

    print(num_batches)
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch.reshape(1, 512), target_batch.reshape(1, 512), model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [None]:
class CustomDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = CustomDataset(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataset

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
from datasets import load_dataset

train_dataset = load_dataset('flytech/python-codes-25k', split='train')
dataset = train_dataset.map(lambda example: {'text': example['instruction'] + ' ' + example['input'] + ' ' + example['output']})['text']

combined_dataset = "<|endoftext|>".join(dataset[:])

train_ratio = 0.90
split_idx = int(train_ratio * len(combined_dataset))
train_data = combined_dataset[:split_idx]
val_data = combined_dataset[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [None]:
len(train_loader), len(val_loader)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
%%time
train_losses, val_losses = evaluate_model(
                    snapshot_model.to(device), 
                    val_loader, 
                    val_loader, 
                    device, 
                    eval_iter=None)

In [None]:
val_losses

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

num_epochs = 10
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # only show integer labels on x-axis

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig("loss-plot.pdf")
    plt.show()

epochs_tensor = torch.linspace(0, num_epochs, len(train_loss))
plot_losses(epochs_tensor, tokens_seen, train_loss, val_loss)

In [None]:
train_loss

In [None]:
import torch
import torch.nn.functional as F
from torch.cuda.amp import autocast

def evaluate(model, val_loader, device, context_size, loss_fn=F.cross_entropy):
    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    total_tokens = 0
    
    with torch.no_grad():  # Disable gradient computation
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Ensure inputs respect context size
            if inputs.size(1) > context_size:
                inputs = inputs[:, -context_size:]
                targets = targets[:, -context_size:]
            
            # Forward pass with mixed precision (optional)
            with autocast():
                logits = model(inputs)  # Shape: [batch_size, seq_len, vocab_size]
                loss = loss_fn(logits.view(-1, logits.size(-1)), targets.view(-1))
            
            total_loss += loss.item() * targets.numel()  # Weighted by number of tokens
            total_tokens += targets.numel()
    
    # Compute average loss and perplexity
    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    
    model.train()  # Switch back to training mode
    return avg_loss, perplexity

# Example usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = snapshot_model.to(device)

# Assuming you have a validation DataLoader (val_loader)
# val_loader could contain tokenized sequences (inputs, targets)
avg_loss, perplexity = evaluate(
    model=model,
    val_loader=val_loader,
    device=device,
    context_size=512  # Adjust to your model's context size
)
print(f"Validation Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}")

## Generate

In [9]:
import tiktoken
def text_to_token_ids(text, tokenizer):
    # encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

tokenizer = tiktoken.get_encoding("gpt2")

def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx


In [None]:
import torch
# If the `previous_chapters.py` file is not available locally,
# you can import it from the `llms-from-scratch` PyPI package.
# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg
# E.g.,
# from llms_from_scratch.ch04 import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 512, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M)
# model.eval();  # Disable dropout during inference

In [14]:
import torch
import torch.nn as nn
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.attention = MultiHeadCasualAttention(
            d_in= cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"]
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # This block does nothing and just returns its input.
        shortcut = x
        x = self.norm1(x)
        x = self.attention(x)
        x = self.drop_shortcut(x)

        x = x + shortcut

        shortcut = x

        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x


class LayerNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        

    def forward(self, x):
        # This layer does nothing and just returns its input.

        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim=-1,keepdim = True, unbiased = False)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale * norm_x + self.shift

class MultiHeadCasualAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec



class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device = in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)

        logits = self.out_head(x)
        return logits

snapshot_model = GPTModel(GPT_CONFIG_124M)
snapshot_model.load_state_dict(torch.load("/kaggle/input/gpt2-cpu/pytorch/default/1/gpt2_ddp_model_2_cpu.pth", weights_only=True))
# snapshot_model.eval()
device = torch.device("cpu")
snapshot_model.to(device)
print()




In [None]:
# snapshot_model = GPTModel(GPT_CONFIG_124M)

# snapshot_model.load_state_dict(torch.load("/kaggle/working/gpt2_ddp_model_2_cpu.pth", weights_only=True))
# # snapshot_model.eval()
# device = torch.device("cpu")
# snapshot_model.to(device)

In [None]:
# token_ids = generate(
#     model=snapshot_model,
#     idx=text_to_token_ids("logic to return if a number is prime or not: ", tokenizer),
#     max_new_tokens=256,
#     context_size=GPT_CONFIG_124M["context_length"]
# )

# print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


In [15]:
import time
start = time.time()

token_ids = generate(
    model=snapshot_model,
    idx=text_to_token_ids("logic to return if a number is prime or not: ", tokenizer),
    max_new_tokens=250,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=5,
    temperature=0
)
end = time.time()
print("Time taken: ", end-start)
print("Tokens/s: ", 250/(end-start))
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


Time taken:  67.82775211334229
Tokens/s:  3.685806947903009
Output text:
 logic to return if a number is prime or not:  ```python
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num**0.5)+1):
        if num % i == 0:
            return False
    return True
```<|endoftext|>Create a Python script to generate a histogram given a set of data [2, 5, 8, 10, 3, 9]  ```python
import matplotlib.pyplot as plt

def histogram(data):
    plt.hist(data)
    plt.xlabel('Index')
    plt.ylabel('Frequency')
    plt.title('Frequency')
    plt.show()

data = [2, 5, 8, 10, 3, 9]
plt.plot(data)
```<|endoftext|>Create a basic web-based web server in Python that can serve static pages from a given URL  ```python

