In [189]:
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F

In [190]:
class Tokenizer:
    def __init__(self, model_name: str):
        self.encoding = tiktoken.get_encoding(model_name)

    def encode(self, text: str):
        return self.encoding.encode(text)

    def decode(self, tokens: list[int]):
        return self.encoding.decode(tokens)

In [191]:
config = {
    "model_name": "gpt2",
    "vocab_size": 50257,  # GPT-2 vocab size
    "context_length": 1024,  # GPT-2 max position
    "n_heads": 12,
    "hidden_dim": 3072,
    "embedding_dim": 768,
    "n_layers": 12
}

In [192]:
class Embedding(nn.Module):
    def __init__(self, model_name: str, config):
        super().__init__()
        self.model_name = model_name
        self.config = config
        self.tokenizer = Tokenizer(model_name) 
        self.embedding = nn.Embedding(config["vocab_size"], config["embedding_dim"])
        self.positional_encoding = nn.Embedding(config["context_length"], config["embedding_dim"])

    def forward(self, inputs):
        # inputs: token IDs, shape (batch, seq_len) or (seq_len,)
        ey = self.embedding(inputs)
        
        # Create position indices: [0, 1, 2, 3, ..., seq_len-1]
        seq_len = inputs.shape[-1]
        positions = torch.arange(seq_len, dtype=torch.long)
        pex = self.positional_encoding(positions)
        
        x = ey + pex
        return x

In [193]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.q = nn.Linear(config["embedding_dim"], config["embedding_dim"])
        self.k = nn.Linear(config["embedding_dim"], config["embedding_dim"])
        self.v = nn.Linear(config["embedding_dim"], config["embedding_dim"])
        self.out_proj = nn.Linear(config["embedding_dim"], config["embedding_dim"])

    def forward(self, input_text):
        # Ensure input_text has the correct shape (T, D)
        if input_text.dim() == 2:  # If input_text is 2D (T, D)
            text = input_text.unsqueeze(0)  # Add batch dimension (1, T, D)
        else:
            text = input_text
        
        B, T, D = text.shape
        H = self.config["n_heads"]
        d_head = D // H

        Q = self.q(text)
        K = self.k(text)
        V = self.v(text)

        Q = Q.view(B, T, H, d_head).transpose(1, 2)
        K = K.view(B, T, H, d_head).transpose(1, 2)
        V = V.view(B, T, H, d_head).transpose(1, 2)

        scores = (Q @ K.transpose(-2, -1)) / (d_head ** 0.5)
        
        # Apply causal mask BEFORE softmax
        mask = torch.tril(torch.ones(T, T)).unsqueeze(0).unsqueeze(0)
        scores = scores.masked_fill(mask == 0, float('-inf'))
        
        weights = F.softmax(scores, dim=-1)

        out = weights @ V                # (B, H, T, d_head)

        out = out.transpose(1, 2).contiguous().view(B, T, D)
        out = self.out_proj(out)

        return out        

In [194]:
class LayerNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta

In [195]:
class FeedForward(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.fc1 = nn.Linear(config["embedding_dim"], config["hidden_dim"])
        self.fc2 = nn.Linear(config["hidden_dim"], config["embedding_dim"])
        self.activation = nn.GELU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.fc2(x)
        return x

In [196]:
class Dropout(nn.Module):
    def __init__(self, p: float = 0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        if not self.training or self.p == 0.0:
            return x
        mask = (torch.rand_like(x) > self.p).float()
        return x * mask / (1.0 - self.p)


In [197]:
class Llm(nn.Module):
    def __init__(self, model_name: str, config):
        super().__init__()
        self.embedding = Embedding(model_name, config)
        self.n_layers = config["n_layers"]
        self.transformers = nn.ModuleList([MultiHeadAttention(config) for _ in range(self.n_layers)])
        self.feedforwards = nn.ModuleList([FeedForward(config) for _ in range(self.n_layers)])
        self.layernorms1 = nn.ModuleList([LayerNorm(config["embedding_dim"]) for _ in range(self.n_layers)])
        self.layernorms2 = nn.ModuleList([LayerNorm(config["embedding_dim"]) for _ in range(self.n_layers)])
        self.dropouts = nn.ModuleList([nn.Dropout(p=0.1) for _ in range(self.n_layers)])
        self.output_layer = nn.Linear(config["embedding_dim"], config["vocab_size"])

    def forward(self, input_text):
        # Embedding layer
        x = self.embedding(input_text)
        
        for i in range(self.n_layers):
            # Post-LN Attention
            residual = x
            attn_in = self.layernorms1[i](x)
            attn_out = self.transformers[i](attn_in)
            x = attn_out + residual
            
            # Post-LN FFN
            residual = x
            ff_in = self.layernorms2[i](x)
            ff_out = self.feedforwards[i](ff_in)
            x = ff_out + residual
        
        # Output layer to convert back to vocabulary
        logits = self.output_layer(x)
        return logits        


In [198]:
# Create your custom model and load GPT-2 weights
from transformers import GPT2LMHeadModel

llm = Llm("gpt2", config)
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

def load_gpt2_weights(custom_model, gpt2_model):
    print(f"Loading weights for {custom_model.n_layers} layers")
    
    # Load embedding weights
    custom_model.embedding.embedding.load_state_dict({'weight': gpt2_model.transformer.wte.weight})
    custom_model.embedding.positional_encoding.load_state_dict({'weight': gpt2_model.transformer.wpe.weight})
    
    # Load weights for each layer
    for i in range(custom_model.n_layers):
        print(f"Loading layer {i}")
        # GPT-2 combines Q, K, V in c_attn: [768, 2304] = [768, 768*3]
        c_attn_weight = gpt2_model.transformer.h[i].attn.c_attn.weight  # [768, 2304]
        c_attn_bias = gpt2_model.transformer.h[i].attn.c_attn.bias      # [2304]
        
        # Split into Q, K, V weights (each [768, 768]) and transpose for PyTorch Linear
        q_weight = c_attn_weight[:, :768].t()      # [768, 768]
        k_weight = c_attn_weight[:, 768:1536].t()  # [768, 768] 
        v_weight = c_attn_weight[:, 1536:].t()     # [768, 768]
        
        q_bias = c_attn_bias[:768]             # [768]
        k_bias = c_attn_bias[768:1536]         # [768]
        v_bias = c_attn_bias[1536:]            # [768]
        
        custom_model.transformers[i].q.load_state_dict({'weight': q_weight, 'bias': q_bias})
        custom_model.transformers[i].k.load_state_dict({'weight': k_weight, 'bias': k_bias})
        custom_model.transformers[i].v.load_state_dict({'weight': v_weight, 'bias': v_bias})
        
        # Load output projection
        custom_model.transformers[i].out_proj.load_state_dict({
            'weight': gpt2_model.transformer.h[i].attn.c_proj.weight.t(),  # [768, 768]
            'bias': gpt2_model.transformer.h[i].attn.c_proj.bias       # [768]
        })
        
        # Load feedforward weights (need to transpose)
        custom_model.feedforwards[i].fc1.load_state_dict({
            'weight': gpt2_model.transformer.h[i].mlp.c_fc.weight.t(),  # [3072, 768]
            'bias': gpt2_model.transformer.h[i].mlp.c_fc.bias
        })
        custom_model.feedforwards[i].fc2.load_state_dict({
            'weight': gpt2_model.transformer.h[i].mlp.c_proj.weight.t(),  # [768, 3072]
            'bias': gpt2_model.transformer.h[i].mlp.c_proj.bias
        })
        
        # Load layer norm weights
        custom_model.layernorms1[i].load_state_dict({
            'gamma': gpt2_model.transformer.h[i].ln_1.weight,
            'beta': gpt2_model.transformer.h[i].ln_1.bias
        })
        custom_model.layernorms2[i].load_state_dict({
            'gamma': gpt2_model.transformer.h[i].ln_2.weight,
            'beta': gpt2_model.transformer.h[i].ln_2.bias
        })
    
    # Load output layer weights (language modeling head)
    custom_model.output_layer.load_state_dict({
        'weight': gpt2_model.lm_head.weight
    }, strict=False)  # GPT-2 LM head has no bias, so use strict=False
    
    print("GPT-2 weights loaded successfully!")
    return custom_model

# Load the weights into the llm instance
llm = load_gpt2_weights(llm, gpt2_model)

Loading weights for 12 layers
Loading layer 0
Loading layer 1
Loading layer 2
Loading layer 3
Loading layer 4
Loading layer 5
Loading layer 6
Loading layer 7
Loading layer 8
Loading layer 9
Loading layer 10
Loading layer 11
GPT-2 weights loaded successfully!


In [199]:
def generate_text_simple(model, idx, max_new_tokens, context_size, temperature=1.0):
    model.eval()  # Disable dropout for inference
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        
        # Crop current context if it exceeds the supported context size
        idx_cond = idx[:, -context_size:]
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)  # (batch, n_tokens, vocab_size)
        
        # Focus only on the last time step
        logits = logits[:, -1, :]  # (batch, vocab_size)
        
        # Apply temperature scaling
        logits = logits / temperature
        
        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)
        
        # Sample from the distribution instead of argmax
        idx_next = torch.multinomial(probas, num_samples=1)  # (batch, 1)
        
        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)
    
    return idx

In [200]:
# Embedding_layer = Embedding("gpt-2", config)
input_text = "Hello, how are you?"
token=Tokenizer("gpt2")
input_text=token.encode(input_text)
input_text

[15496, 11, 703, 389, 345, 30]

In [201]:
out = generate_text_simple(
model=llm,
idx=torch.tensor([input_text], dtype=torch.long),
max_new_tokens=6,
context_size=config["context_length"],
temperature=1.2
)

for i in range(out.shape[1]):
    print(token.decode([out[0, i].item()]), end='')

Hello, how are you? the the the the the the