In [409]:
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F

In [410]:
class Tokenizer:
    """Thin wrapper around a tiktoken encoding object.

    `model_name` is forwarded unchanged to `tiktoken.get_encoding`, so it has
    to be a name that call accepts (this notebook passes "cl100k_base").
    """

    def __init__(self, model_name: str):
        # Resolve the encoding once and keep it on the instance for reuse.
        self.encoding = tiktoken.get_encoding(model_name)

    def encode(self, text: str):
        """Return whatever the wrapped encoding's `encode` yields for `text`."""
        encoder = self.encoding
        return encoder.encode(text)

    def decode(self, tokens: list[int]):
        """Return whatever the wrapped encoding's `decode` yields for `tokens`."""
        decoder = self.encoding
        return decoder.decode(tokens)

In [411]:
# Hyperparameters read by the model classes defined in the cells below.
config = dict(
    model_name="cl100k_base",  # encoding name; same string the later cells pass explicitly
    vocab_size=100277,         # rows of the token-embedding table and width of the output logits
    context_length=12,         # rows of the positional-embedding table
    n_heads=12,                # number of attention heads
    hidden_dim=3072,           # inner width of the feed-forward block
    embedding_dim=768,         # width of every token representation
)

In [412]:
class Embedding(nn.Module):
    """Token embedding plus a learned positional embedding.

    Args:
        model_name: Encoding name handed to `Tokenizer`. The tokenizer is
            stored on the module but is not used by `forward`.
        config: Mapping providing "vocab_size", "context_length" and
            "embedding_dim".
    """

    def __init__(self, model_name: str, config):
        super().__init__()
        self.model_name = model_name
        self.config = config
        self.tokenizer = Tokenizer(model_name)
        self.embedding = nn.Embedding(config["vocab_size"], config["embedding_dim"])
        self.positional_encoding = nn.Embedding(config["context_length"], config["embedding_dim"])

    def forward(self, inputs):
        """Embed token ids and add one position vector per sequence slot.

        Args:
            inputs: Integer tensor of token ids whose last dimension is the
                sequence length, i.e. (batch, seq_len) or (seq_len,).

        Returns:
            Tensor shaped like `inputs` with a trailing `embedding_dim` axis.

        Raises:
            IndexError: If the sequence is longer than the positional table.
        """
        token_vectors = self.embedding(inputs)

        seq_len = inputs.shape[-1]
        max_len = self.positional_encoding.num_embeddings
        if seq_len > max_len:
            # Fail with a readable message instead of an opaque lookup error
            # (or a device-side assert when running on an accelerator).
            raise IndexError(
                f"sequence length {seq_len} exceeds context_length {max_len}"
            )

        # Position indices 0..seq_len-1, created on the same device as the
        # inputs so the lookup also works after the module is moved off CPU.
        positions = torch.arange(seq_len, dtype=torch.long, device=inputs.device)
        position_vectors = self.positional_encoding(positions)

        # (seq_len, dim) broadcasts over any leading batch dimension.
        return token_vectors + position_vectors

In [413]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with an output projection.

    Args:
        config: Mapping providing "embedding_dim" and "n_heads".

    Raises:
        ValueError: If "embedding_dim" is not divisible by "n_heads".
    """

    def __init__(self, config):
        super().__init__()
        if config["embedding_dim"] % config["n_heads"] != 0:
            # Otherwise the per-head reshape in forward() cannot work.
            raise ValueError(
                f"embedding_dim ({config['embedding_dim']}) must be divisible "
                f"by n_heads ({config['n_heads']})"
            )
        self.config = config
        self.q = nn.Linear(config["embedding_dim"], config["embedding_dim"])
        self.k = nn.Linear(config["embedding_dim"], config["embedding_dim"])
        self.v = nn.Linear(config["embedding_dim"], config["embedding_dim"])
        self.out_proj = nn.Linear(config["embedding_dim"], config["embedding_dim"])

    def forward(self, input_text):
        """Apply masked self-attention.

        Args:
            input_text: Tensor of shape (T, D) or (B, T, D).

        Returns:
            Tensor of shape (B, T, D). A 2-D input is treated as a batch of
            one, so its result has shape (1, T, D).
        """
        # Promote an unbatched (T, D) input to (1, T, D).
        text = input_text.unsqueeze(0) if input_text.dim() == 2 else input_text

        B, T, D = text.shape
        H = self.config["n_heads"]
        d_head = D // H

        def split_heads(t):
            # (B, T, D) -> (B, H, T, d_head)
            return t.view(B, T, H, d_head).transpose(1, 2)

        Q = split_heads(self.q(text))
        K = split_heads(self.k(text))
        V = split_heads(self.v(text))

        scores = (Q @ K.transpose(-2, -1)) / (d_head ** 0.5)  # (B, H, T, T)

        # Causal mask, built on the scores' device so the module also works
        # off-CPU. True marks the positions a query is allowed to attend to;
        # everything above the diagonal is blocked before the softmax.
        allowed = torch.tril(torch.ones(T, T, dtype=torch.bool, device=scores.device))
        scores = scores.masked_fill(~allowed, float('-inf'))

        weights = F.softmax(scores, dim=-1)

        out = weights @ V  # (B, H, T, d_head)

        # Merge the heads back: (B, H, T, d_head) -> (B, T, D)
        out = out.transpose(1, 2).contiguous().view(B, T, D)
        return self.out_proj(out)

In [414]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learned affine map.

    Args:
        dim: Length of the `gamma` (scale) and `beta` (shift) parameters.
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Scale starts at one and shift at zero.
        self.gamma = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        """Normalise `x` along its last axis, then scale by gamma and add beta."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N rather than N - 1).
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return self.gamma * normalised + self.beta

In [415]:
class FeedForward(nn.Module):
    """Position-wise MLP: linear -> GELU -> linear.

    Args:
        config: Mapping providing "embedding_dim" (input and output width)
            and "hidden_dim" (inner width).
    """

    def __init__(self,config):
        super().__init__()
        model_width, inner_width = config["embedding_dim"], config["hidden_dim"]
        self.fc1 = nn.Linear(model_width, inner_width)
        self.fc2 = nn.Linear(inner_width, model_width)
        self.activation = nn.GELU()

    def forward(self, x):
        """Expand to the inner width, apply GELU, project back."""
        return self.fc2(self.activation(self.fc1(x)))

In [416]:
class Dropout(nn.Module):
    """Inverted dropout: zero elements with probability `p` while training.

    Surviving elements are scaled by 1 / (1 - p). In eval mode, or when
    p == 0, the input is returned unchanged.

    Args:
        p: Probability of zeroing an element; must lie in [0, 1].

    Raises:
        ValueError: If `p` is outside [0, 1].
    """

    def __init__(self, p: float = 0.5):
        super().__init__()
        if not 0.0 <= p <= 1.0:
            raise ValueError(
                f"dropout probability has to be between 0 and 1, but got {p}"
            )
        self.p = p

    def forward(self, x):
        if not self.training or self.p == 0.0:
            return x
        if self.p == 1.0:
            # Everything is dropped; returning zeros avoids the 0 / 0 = NaN
            # that the rescaling below would otherwise produce.
            return torch.zeros_like(x)
        # Keep the mask in x's dtype so half/bfloat16 activations are not
        # silently promoted to float32.
        keep = (torch.rand_like(x) > self.p).to(x.dtype)
        return x * keep / (1.0 - self.p)


In [417]:
class Llm(nn.Module):
    """Single-block language model.

    Pipeline: embedding -> attention -> dropout -> residual -> layer norm ->
    feed-forward -> dropout -> residual -> layer norm -> vocabulary projection.

    Args:
        model_name: Encoding name forwarded to the embedding module.
        config: Mapping providing "embedding_dim" and "vocab_size", plus
            whatever keys the submodules read from it.
    """

    def __init__(self, model_name: str, config):
        super().__init__()
        self.embedding = Embedding(model_name, config)
        self.transformer = MultiHeadAttention(config)
        self.dropout = nn.Dropout(p=0.1)
        self.layernorm1 = LayerNorm(config["embedding_dim"])
        self.layernorm2 = LayerNorm(config["embedding_dim"])
        self.feedforward = FeedForward(config)
        self.output_layer = nn.Linear(config["embedding_dim"], config["vocab_size"])

    def forward(self, input_text):
        """Map token ids to next-token logits.

        Args:
            input_text: Tensor of token ids (passed straight to the embedding).

        Returns:
            Logits whose last dimension is "vocab_size"; the leading
            dimensions follow the embedding output.
        """
        # Submodules are invoked via __call__ (not .forward) so that any
        # registered forward hooks run.
        embedded_text = self.embedding(input_text)

        # Attention sub-layer.
        attended = self.dropout(self.transformer(embedded_text))
        # The attention output can carry an extra leading batch dimension of
        # size one when the embedding is unbatched; bring it back to the
        # embedding's shape explicitly before adding the residual.
        x = attended.reshape(embedded_text.shape) + embedded_text
        x = self.layernorm1(x)

        # Feed-forward sub-layer, again with a residual connection.
        x = self.dropout(self.feedforward(x)) + x
        x = self.layernorm2(x)

        # Project back to vocabulary size.
        return self.output_layer(x)


In [418]:
# Build the (untrained, randomly initialised) model used by the cells below.
llm = Llm(model_name="cl100k_base", config=config)

In [419]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend each sequence in `idx` by `max_new_tokens` tokens.

    Args:
        model: Callable mapping a (batch, n_tokens) tensor of token ids to
            logits of shape (batch, n_tokens, vocab_size). If it exposes
            `training` / `eval()` / `train()` (e.g. an `nn.Module`), it is put
            in eval mode for the duration of the call and restored afterwards.
        idx: (batch, n_tokens) tensor of token ids forming the prompt(s), e.g.
            tensor([[6109, 3626, 6100,  345],
                    [6109, 1110, 6622,  257]])
        max_new_tokens: Number of tokens to append to every row.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        (batch, n_tokens + max_new_tokens) tensor: the prompt followed by the
        generated ids.
    """
    # Generation must not apply dropout, so leave training mode temporarily.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop the context if it exceeds the supported size, e.g. with
                # context_size=5 only the last 5 tokens are used.
                idx_cond = idx[:, -context_size:]

                # (batch, n_tokens) -> (batch, n_tokens, vocab_size)
                logits = model(idx_cond)

                # Keep only the last time step: (batch, vocab_size)
                last_logits = logits[:, -1, :]

                # Softmax is monotonic, so the argmax of the raw logits is the
                # most probable token; keepdim gives (batch, 1) for the cat.
                idx_next = torch.argmax(last_logits, dim=-1, keepdim=True)

                # Append the chosen ids: (batch, n_tokens + 1)
                idx = torch.cat((idx, idx_next), dim=1)
    finally:
        if was_training:
            model.train()

    return idx

In [420]:
# Standalone embedding layer; none of the cells shown here read it afterwards.
Embedding_layer = Embedding(model_name="cl100k_base", config=config)

# Tokenise the prompt; `input_text` ends up holding the list of token ids.
token = Tokenizer(model_name="cl100k_base")
input_text = token.encode("Hello, how are you?")
input_text

[9906, 11, 1268, 527, 499, 30]

In [422]:
# Greedily append six tokens to the encoded prompt.
out = generate_text_simple(
    model=llm,
    idx=torch.tensor([input_text], dtype=torch.long),
    max_new_tokens=6,
    context_size=config["context_length"],
)

# Decode the whole sequence in one call. Decoding one id at a time corrupts
# any character whose UTF-8 bytes are split across several tokens.
print(token.decode(out[0].tolist()), end='')

Hello, how are you?_gui linkingWizard lock vanished municipality