In [6]:
import torch

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [7]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # assert (d_out % num_heads == 0), 
        #     "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = torch.nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = torch.nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) 
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2) 
        
        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

In [8]:
class GELU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [9]:
class Feed_Forward(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.cfg = cfg
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),
            GELU(),
            torch.nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])
        )

    def forward(self,x):
        return self.layer(x)

In [10]:
class Layer_Norm(torch.nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.eps = 6e-8
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean = x.mean(dim=-1,keepdim = True)
        var = x.var(dim = -1,keepdim = True)
        out = (x - mean)/torch.sqrt(var + self.eps)
        out = out * self.scale + self.shift

        return out

In [11]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length is the length of the input sequence means no of tokens in each input sequence
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [12]:
class Transformer(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.attn = MultiHeadAttention(
            d_in = cfg['emb_dim'],
            d_out = cfg['emb_dim'],#this will be the context vector dim for each token
            context_length= cfg['context_length'],#total no of tokens in the input sequence
            dropout=cfg['drop_rate'],
            num_heads= cfg['n_heads'],
            qkv_bias=cfg['qkv_bias']
        )
        self.ff = Feed_Forward(cfg)
        self.norm_1 = Layer_Norm(cfg['emb_dim'])
        self.norm_2 = Layer_Norm(cfg['emb_dim'])
        self.dropout_shortcut = torch.nn.Dropout(cfg['drop_rate'])

    def forward(self,x):
        shortcut = x
        x = self.norm_1(x)
        x = self.attn(x)
        x = self.dropout_shortcut(x)
        x = x + shortcut
        # now ff
        shortcut = x
        x = self.norm_2(x)
        x = self.ff(x)
        x = self.dropout_shortcut(x)
        x = x+shortcut

        return x
        
        
    

In [13]:
class GPT_Model(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.cfg = cfg
        self.tok_emb = torch.nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb = torch.nn.Embedding(cfg['context_length'],cfg['emb_dim'])
        self.dropout = torch.nn.Dropout(cfg['drop_rate'])
        self.tranformer = torch.nn.Sequential(
            *[
               Transformer(cfg) for _ in range(self.cfg['n_layers'])
            ]
        )
        self.out_norm = Layer_Norm(cfg['emb_dim'])
        self.out_head = torch.nn.Linear(cfg['emb_dim'],cfg['vocab_size'],bias=False)


    def forward(self,inp_idx):
        batch,seq_len = inp_idx.shape
        x = self.tok_emb(inp_idx)
        x_pos = self.pos_emb(torch.arange(seq_len,device=inp_idx.device))
        x = x+x_pos
        x = self.dropout(x)
        x = self.tranformer(x)
        x = self.out_norm(x)
        logits = self.out_head(x)
        return logits


In [14]:
batch = torch.tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [15]:
torch.manual_seed(123)
model = GPT_Model(GPT_CONFIG_124M)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3612,  0.4223, -0.0709,  ...,  0.3479,  0.4655, -0.2833],
         [-0.1785, -0.5656, -0.9477,  ...,  0.0476,  0.5173, -0.3160],
         [ 0.7118,  0.0335,  0.1078,  ...,  0.1020, -0.4331, -0.2547],
         [-1.0068,  0.3420, -0.1191,  ...,  0.7193,  0.4018,  0.0532]],

        [[-0.2562,  0.0899,  0.0337,  ...,  0.2659,  0.4448, -0.6800],
         [ 0.1230,  0.3651, -0.2071,  ...,  0.7704,  0.2702,  0.2250],
         [ 1.0555,  1.0312, -0.2797,  ...,  0.6934,  0.3201, -0.3172],
         [-0.1559,  0.3922,  0.3286,  ...,  1.2627, -0.1862,  0.0391]]],
       grad_fn=<UnsafeViewBackward0>)


In [16]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")csaf

SyntaxError: invalid syntax (3560178217.py, line 2)

In [None]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        
        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]#takes only last context size tokens as input in each batch
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)# gets batch*no_tokens*vocab_size
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]  #takes only the last row of input sequence

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)
        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)#appens the output token to input and iterate again to predict nexxt one

    return idx

In [None]:
# pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp314-cp314-win_amd64.whl.metadata (6.9 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2025.11.3-cp314-cp314-win_amd64.whl.metadata (41 kB)
Collecting requests>=2.26.0 (from tiktoken)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Using cached charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Using cached urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->tiktoken)
  Using cached certifi-2026.1.4-py3-none-any.whl.metadata (2.5 kB)
Downloading tiktoken-0.12.0-cp314-cp314-win_amd64.whl (921 kB)
   ---------------------------------------- 0.0/921.1 kB ? eta -:--:--
   ------------

In [None]:
import tiktoken

In [None]:
tokenizer  = tiktoken.get_encoding("gpt2")

In [17]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) #A
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [18]:
model.eval() #A
out = generate_text_simple(
model=model,
idx=encoded_tensor,
max_new_tokens=6,
context_size=GPT_CONFIG_124M["context_length"]
)
print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10


In [20]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue
