### Resources I used to learn the Transformer model
- Transformers Explained | Simple Explanation of Transformers  
  [code basics](https://youtu.be/ZhAz268Hdpw?si=tjJOYzC5AurHZF5Z)
- Neural Networks  
  [3Blue1Brown](https://youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi&si=lGqZJfoAP0BBXEHh)
- Coding a Transformer From Scratch (Line By Line)  
  [Dev G](https://youtu.be/kNf7VdUAVS8?si=NOa3pI6ztiST8pBS)
- Self Attention - NeetCode  
  [NeetCode](https://neetcode.io/problems/self-attention)

- GPT Dataset Problem - NeetCode  
  [NeetCode](https://neetcode.io/problems/gpt-dataset)
- Code GPT - NeetCode  
  [NeetCode](https://neetcode.io/problems/code-gpt)
- Make GPT Talk Back - NeetCode  
  [NeetCode](https://neetcode.io/problems/make-gpt-talk-back)
- Learn How LLM Transformer Models Work with Interactive Visualization  
  [https://poloclub.github.io/transformer-explainer/](https://poloclub.github.io/transformer-explainer/)

In [1]:
pip install torchtyping

Collecting torchtyping
  Downloading torchtyping-0.1.5-py3-none-any.whl.metadata (9.5 kB)
Collecting typeguard<3,>=2.11.1 (from torchtyping)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading torchtyping-0.1.5-py3-none-any.whl (17 kB)
Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, torchtyping
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.4.4
    Uninstalling typeguard-4.4.4:
      Successfully uninstalled typeguard-4.4.4
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
inflect 7.5.0 requires typeguard>=4.0.1, but you have typeguard 2.13.3 which is incompatible.
Successfully installed torchtyping-0.1.5 typeguard-2.13.3


**GPT Dataset Problem**

In [2]:
import torch
from typing import Tuple, List

class Solution:
    """Samples random (input, target) word windows from a raw text corpus."""

    def batch_loader(self, raw_dataset: str, context_length: int, batch_size: int) -> Tuple[List[List[str]], List[List[str]]]:
        """Draw ``batch_size`` random training windows from ``raw_dataset``.

        The text is tokenised by whitespace. For every sampled start index
        ``i`` the input window is ``words[i : i + context_length]`` and the
        target window is the same span shifted one word to the right.

        Args:
            raw_dataset: Corpus text; words are separated by whitespace.
            context_length: Number of words in every window.
            batch_size: Number of (input, target) pairs to sample.

        Returns:
            ``(X, Y)`` where both are lists of ``batch_size`` word lists of
            length ``context_length``.

        Note:
            The corpus must contain more than ``context_length`` words;
            otherwise the sampling range is empty and ``torch.randint``
            rejects it.
        """
        # Fixed seed so the sampled windows are reproducible between calls.
        torch.manual_seed(0)
        words = raw_dataset.split()
        # Exclusive upper bound leaves room for the shifted target window.
        starts = torch.randint(
            low=0, high=len(words) - context_length, size=(batch_size,)
        ).tolist()
        X = [words[start:start + context_length] for start in starts]
        Y = [words[start + 1:start + 1 + context_length] for start in starts]
        return X, Y

**Self Attention Problem -  Single Headed**

In [3]:
import torch
import torch.nn as nn
from torchtyping import TensorType
import math
class SSSingleHeadAttention(nn.Module):
    """Single causal self-attention head using scaled dot-product attention.

    The key/query/value projections are ``nn.Linear`` layers (with bias) that
    map ``embedding_dim`` features to ``attention_dim`` features.
    """

    def __init__(self, embedding_dim: int, attention_dim: int):
        """Create the three projections with a fixed seed for reproducibility."""
        super().__init__()
        torch.manual_seed(0)
        self.get_keys = nn.Linear(embedding_dim, attention_dim)
        self.get_queries = nn.Linear(embedding_dim, attention_dim)
        self.get_values = nn.Linear(embedding_dim, attention_dim)

    def forward(self, embedded: TensorType[float]) -> TensorType[float]:
        """Apply causal self-attention to ``embedded``.

        Args:
            embedded: Tensor of shape (B, T, embedding_dim).

        Returns:
            Tensor of shape (B, T, attention_dim), rounded to 4 decimals.
        """
        k = self.get_keys(embedded)  # (B, T, A)
        q = self.get_queries(embedded)
        v = self.get_values(embedded)

        _, T, A = k.shape
        # Scale by sqrt(A) so the logits do not grow with the head size.
        scores = (q @ torch.transpose(k, 1, 2)) / (A ** 0.5)  # (B, T, T)

        # Build the mask on the input's device; a default-device mask would
        # fail in masked_fill when the module runs on an accelerator.
        lower = torch.tril(torch.ones(T, T, device=embedded.device))
        future = lower == 0  # True where a position would look ahead
        scores = scores.masked_fill(future, float('-inf'))

        weights = nn.functional.softmax(scores, dim=2)  # (B, T, T)
        transformed = weights @ v
        return torch.round(transformed, decimals=4)

**2nd Method** [just small changes]

In [4]:
# ------ OR ---------

import torch
import torch.nn as nn
from torchtyping import TensorType
import math

class SingleHeadAttention(nn.Module):
    """One causal self-attention head with bias-free K/Q/V projections."""

    def __init__(self, embedding_dim: int, attention_dim: int):
        """Build the key, query and value projections under a fixed seed."""
        super().__init__()
        torch.manual_seed(0)

        # Layers are created in key -> query -> value order so the seeded
        # weight initialisation stays the same.
        for name in ("key", "query", "value"):
            setattr(self, name, nn.Linear(embedding_dim, attention_dim, bias=False))

    def forward(self, embedded: TensorType[float]) -> TensorType[float]:
        """Return the attention output for a (B, T, embedding_dim) input.

        Each position attends only to itself and earlier positions. The
        result has shape (B, T, attention_dim).
        """
        _, seq_len, _ = embedded.shape

        queries = self.query(embedded)  # (B, T, A)
        keys = self.key(embedded)       # (B, T, A)
        values = self.value(embedded)   # (B, T, A)

        # Scaled dot-product similarity between every pair of positions.
        scale = math.sqrt(keys.size(-1))
        logits = torch.matmul(queries, keys.transpose(1, 2)) / scale  # (B, T, T)

        # Strictly-upper triangle marks the "future" entries to hide.
        future = torch.ones(seq_len, seq_len, device=embedded.device).triu(diagonal=1).bool()
        logits = logits.masked_fill(future, float('-inf'))

        weights = logits.softmax(dim=-1)
        return torch.matmul(weights, values)  # (B, T, A)


**Multi Headed Self Attention**

In [5]:
class MultiHeadedSelfAttention(nn.Module):
    """Multi-head causal self-attention assembled from independent heads.

    Every head projects the input to ``attention_dim // num_heads`` features.
    The head outputs are concatenated along the feature axis, so the output
    width equals ``attention_dim`` only when it is divisible by ``num_heads``.
    """

    def __init__(self, embedding_dim: int, num_heads: int, attention_dim: int):
        """Create ``num_heads`` attention heads.

        Each head reseeds the global RNG with 0 in its own constructor, so
        all heads start from identical weights.
        """
        super().__init__()
        torch.manual_seed(0)
        head_size = attention_dim // num_heads
        # Use the head class nested in this class rather than a module-level
        # name, so the class works on its own.
        self.heads = nn.ModuleList(
            [self.SingleHeadAttention(embedding_dim, head_size) for _ in range(num_heads)]
        )

    def forward(self, embedded: TensorType[float]) -> TensorType[float]:
        """Run every head on ``embedded`` (B, T, embedding_dim) and concatenate.

        Returns:
            Tensor of shape (B, T, num_heads * head_size), rounded to 4 decimals.
        """
        # Each head yields (B, T, head_size); concatenate on the feature axis.
        head_outputs = [head(embedded) for head in self.heads]
        cated = torch.cat(head_outputs, dim=2)
        return torch.round(cated, decimals=4)

    class SingleHeadAttention(nn.Module):
        """One causal self-attention head with bias-free K/Q/V projections."""

        def __init__(self, embedding_dim: int, attention_dim: int):
            super().__init__()
            torch.manual_seed(0)

            # Linear layers for Key, Query, Value
            self.key = nn.Linear(embedding_dim, attention_dim, bias=False)
            self.query = nn.Linear(embedding_dim, attention_dim, bias=False)
            self.value = nn.Linear(embedding_dim, attention_dim, bias=False)

        def forward(self, embedded: TensorType[float]) -> TensorType[float]:
            """Return causal attention output of shape (B, T, attention_dim)."""
            B, T, _ = embedded.shape  # (batch, seq_len, embedding_dim)

            # 1. Compute Q, K, V
            Q = self.query(embedded)  # (B, T, A)
            K = self.key(embedded)    # (B, T, A)
            V = self.value(embedded)  # (B, T, A)

            # 2. Scaled dot-product attention scores
            scores = Q @ K.transpose(1, 2) / math.sqrt(K.size(-1))  # (B, T, T)

            # 3. Causal mask: future positions get -inf before softmax
            mask = torch.tril(torch.ones(T, T, device=embedded.device))
            scores = scores.masked_fill(mask == 0, float('-inf'))

            # 4. Softmax to get attention weights
            attn_weights = torch.softmax(scores, dim=-1)

            # 5. Weighted sum of values
            return attn_weights @ V  # (B, T, A)

**Transformer Block**

In [6]:
import torch
import torch.nn as nn
from torchtyping import TensorType
import math

class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and MLP, each with a skip.

    ``forward`` computes ``x + MHSA(LN(x))`` followed by ``y + FF(LN(y))`` and
    rounds the final result to 4 decimals.
    """

    def __init__(self, model_dim: int, num_heads: int):
        """Build the sub-layers.

        Args:
            model_dim: Feature width of the block input and output.
            num_heads: Number of attention heads; each head gets
                ``model_dim // num_heads`` features.
        """
        super().__init__()
        torch.manual_seed(0)
        # The attention width must equal model_dim so the residual add in
        # forward() lines up; pass it explicitly as attention_dim.
        self.mhsa = self.MultiHeadedSelfAttention(model_dim, num_heads, model_dim)
        self.first_ln = nn.LayerNorm(model_dim)
        self.second_ln = nn.LayerNorm(model_dim)
        self.ff = self.VanillaNeuralNetwork(model_dim)

    def forward(self, embedded: TensorType[float]) -> TensorType[float]:
        """Transform ``embedded`` (B, T, model_dim) and return the same shape."""
        # Reseed so the dropout mask is reproducible on every call.
        torch.manual_seed(0)
        # Attention sub-layer with its skip connection (no feed-forward yet).
        first_part = embedded + self.mhsa(self.first_ln(embedded))
        # Feed-forward sub-layer with its skip connection.
        res = first_part + self.ff(self.second_ln(first_part))
        return torch.round(res, decimals=4)

    class MultiHeadedSelfAttention(nn.Module):
        """Concatenation of ``num_heads`` independent causal attention heads."""

        def __init__(self, embedding_dim: int, num_heads: int, attention_dim: int):
            super().__init__()
            torch.manual_seed(0)
            head_size = attention_dim // num_heads
            # Reference the head class nested in TransformerBlock instead of
            # relying on a module-level class with the same name.
            self.heads = nn.ModuleList(
                [
                    TransformerBlock.SingleHeadAttention(embedding_dim, head_size)
                    for _ in range(num_heads)
                ]
            )

        def forward(self, embedded: TensorType[float]) -> TensorType[float]:
            """Return the concatenated head outputs, (B, T, num_heads * head_size)."""
            # No rounding here: only the block's final output is rounded, so
            # rounding error is not fed into the residual path.
            return torch.cat([head(embedded) for head in self.heads], dim=2)

    class SingleHeadAttention(nn.Module):
        """One causal self-attention head with bias-free K/Q/V projections."""

        def __init__(self, embedding_dim: int, attention_dim: int):
            super().__init__()
            torch.manual_seed(0)

            # Linear layers for Key, Query, Value
            self.key = nn.Linear(embedding_dim, attention_dim, bias=False)
            self.query = nn.Linear(embedding_dim, attention_dim, bias=False)
            self.value = nn.Linear(embedding_dim, attention_dim, bias=False)

        def forward(self, embedded: TensorType[float]) -> TensorType[float]:
            """Return causal attention output of shape (B, T, attention_dim)."""
            B, T, _ = embedded.shape  # (batch, seq_len, embedding_dim)

            # 1. Compute Q, K, V
            Q = self.query(embedded)  # (B, T, A)
            K = self.key(embedded)    # (B, T, A)
            V = self.value(embedded)  # (B, T, A)

            # 2. Scaled dot-product attention scores
            scores = Q @ K.transpose(1, 2) / math.sqrt(K.size(-1))  # (B, T, T)

            # 3. Causal mask: future positions get -inf before softmax
            mask = torch.tril(torch.ones(T, T, device=embedded.device))
            scores = scores.masked_fill(mask == 0, float('-inf'))

            # 4. Softmax to get attention weights
            attn_weights = torch.softmax(scores, dim=-1)

            # 5. Weighted sum of values
            return attn_weights @ V  # (B, T, A)

    class VanillaNeuralNetwork(nn.Module):
        """Position-wise MLP: expand 4x, ReLU, project back, dropout."""

        def __init__(self, model_dim: int):
            super().__init__()
            torch.manual_seed(0)
            self.up_projection = nn.Linear(model_dim, model_dim * 4)
            self.relu = nn.ReLU()
            self.down_projection = nn.Linear(model_dim * 4, model_dim)
            self.dropout = nn.Dropout(0.2)  # using p = 0.2

        def forward(self, x: TensorType[float]) -> TensorType[float]:
            """Apply the MLP to ``x``; the output has the same shape as ``x``."""
            # Reseed so the dropout mask is reproducible on every call.
            torch.manual_seed(0)
            return self.dropout(self.down_projection(self.relu(self.up_projection(x))))

**[GPT Class](https://neetcode.io/problems/code-gpt)** :-

In [7]:
import torch
import torch.nn as nn
from torchtyping import TensorType

# 1. Remember to include an additional LayerNorm after the block sequence and before the final linear layer
# 2. Instantiate in the following order: Word embeddings, position embeddings, transformer blocks, final layer norm, and vocabulary projection.
class GPT(nn.Module):
    """Decoder-only transformer that maps token ids to next-token probabilities.

    Sub-modules are instantiated in this order: token embeddings, position
    embeddings, transformer blocks, final layer norm, vocabulary projection.
    """

    def __init__(self, vocab_size: int, context_length: int, model_dim: int, num_blocks: int, num_heads: int):
        """Build the model.

        Args:
            vocab_size: Number of distinct token ids.
            context_length: Size of the position-embedding table, i.e. the
                longest sequence ``forward`` can embed.
            model_dim: Embedding / hidden width.
            num_blocks: Number of stacked transformer blocks.
            num_heads: Attention heads per block.
        """
        super().__init__()
        torch.manual_seed(0)
        self.token_embeddings = nn.Embedding(vocab_size, model_dim)
        self.pos_embeddings = nn.Embedding(context_length, model_dim)
        self.blocks = nn.Sequential(
            *[self.TransformerBlock(model_dim, num_heads) for _ in range(num_blocks)]
        )
        self.final_ln = nn.LayerNorm(model_dim)  # B, T, D -> B, T, D
        self.vocab_projection = nn.Linear(model_dim, vocab_size)

    def forward(self, context: TensorType[int]) -> TensorType[float]:
        """Return next-token probabilities for every position.

        Args:
            context: Integer token ids of shape (B, T).

        Returns:
            Tensor of shape (B, T, vocab_size) holding softmax probabilities
            rounded to 4 decimals.
        """
        torch.manual_seed(0)
        token_embeds = self.token_embeddings(context)  # B, T, D
        B, T, D = token_embeds.shape
        # Position ids must live on the same device as the token ids,
        # otherwise the embedding lookup fails off-CPU.
        positions = torch.arange(T, device=context.device)
        pos_embeds = self.pos_embeddings(positions)  # T, D (broadcast over B)
        total_embeddings = token_embeds + pos_embeds

        un_normalized = self.vocab_projection(self.final_ln(self.blocks(total_embeddings)))
        probs = nn.functional.softmax(un_normalized, dim=-1)
        return torch.round(probs, decimals=4)

    # Reference building blocks supplied with the exercise. The only change
    # from the template is that the causal mask is created on the input's
    # device; results on CPU are unchanged.
    class TransformerBlock(nn.Module):
        """Pre-LayerNorm block: attention and MLP, each with a skip connection."""

        class MultiHeadedSelfAttention(nn.Module):
            """Concatenation of independent causal attention heads."""

            class SingleHeadAttention(nn.Module):
                """One causal scaled dot-product attention head."""

                def __init__(self, model_dim: int, head_size: int):
                    super().__init__()
                    torch.manual_seed(0)
                    self.key_gen = nn.Linear(model_dim, head_size, bias=False)
                    self.query_gen = nn.Linear(model_dim, head_size, bias=False)
                    self.value_gen = nn.Linear(model_dim, head_size, bias=False)

                def forward(self, embedded: TensorType[float]) -> TensorType[float]:
                    k = self.key_gen(embedded)
                    q = self.query_gen(embedded)
                    v = self.value_gen(embedded)

                    scores = q @ torch.transpose(k, 1, 2)  # @ is the same as torch.matmul()
                    context_length, attention_dim = k.shape[1], k.shape[2]
                    scores = scores / (attention_dim ** 0.5)

                    # Mask is built on the input's device so masked_fill
                    # does not hit a device mismatch.
                    lower_triangular = torch.tril(
                        torch.ones(context_length, context_length, device=embedded.device)
                    )
                    mask = lower_triangular == 0
                    scores = scores.masked_fill(mask, float('-inf'))
                    scores = nn.functional.softmax(scores, dim=2)

                    return scores @ v

            def __init__(self, model_dim: int, num_heads: int):
                super().__init__()
                torch.manual_seed(0)
                self.att_heads = nn.ModuleList()
                for i in range(num_heads):
                    self.att_heads.append(self.SingleHeadAttention(model_dim, model_dim // num_heads))

            def forward(self, embedded: TensorType[float]) -> TensorType[float]:
                head_outputs = []
                for head in self.att_heads:
                    head_outputs.append(head(embedded))
                concatenated = torch.cat(head_outputs, dim=2)
                return concatenated

        class VanillaNeuralNetwork(nn.Module):
            """Position-wise MLP: expand 4x, ReLU, project back, dropout."""

            def __init__(self, model_dim: int):
                super().__init__()
                torch.manual_seed(0)
                self.up_projection = nn.Linear(model_dim, model_dim * 4)
                self.relu = nn.ReLU()
                self.down_projection = nn.Linear(model_dim * 4, model_dim)
                self.dropout = nn.Dropout(0.2)  # using p = 0.2

            def forward(self, x: TensorType[float]) -> TensorType[float]:
                torch.manual_seed(0)
                return self.dropout(self.down_projection(self.relu(self.up_projection(x))))

        def __init__(self, model_dim: int, num_heads: int):
            super().__init__()
            torch.manual_seed(0)
            self.attention = self.MultiHeadedSelfAttention(model_dim, num_heads)
            self.linear_network = self.VanillaNeuralNetwork(model_dim)
            self.first_norm = nn.LayerNorm(model_dim)
            self.second_norm = nn.LayerNorm(model_dim)

        def forward(self, embedded: TensorType[float]) -> TensorType[float]:
            torch.manual_seed(0)
            embedded = embedded + self.attention(self.first_norm(embedded))  # skip connection
            embedded = embedded + self.linear_network(self.second_norm(embedded))  # another skip connection
            return embedded


[Make GPT Talk Back](https://neetcode.io/problems/make-gpt-talk-back)

In [8]:
import torch
import torch.nn as nn
from torchtyping import TensorType

class Solution:
    """Autoregressive text sampling from a next-token model."""

    def generate(self, model, new_chars: int, context: TensorType[int], context_length: int, int_to_char: dict) -> str:
        """Sample ``new_chars`` tokens from ``model`` and decode them to a string.

        Args:
            model: Callable taking a (B, T) tensor of token ids and returning
                pre-softmax scores of shape (B, T, vocab_size).
            new_chars: Number of tokens to generate.
            context: Starting token ids, shape (B, T). Each sampled id is
                read with ``.item()``, so a single sequence is expected.
            context_length: Maximum number of trailing tokens fed to ``model``.
            int_to_char: Mapping from token id to its text.

        Returns:
            The generated tokens joined into one string.
        """
        # Seed and remember the generator state; it is restored after every
        # draw to keep the output reproducible for testing.
        generator = torch.manual_seed(0)
        initial_state = generator.get_state()

        pieces = []
        for _ in range(new_chars):
            # Feed the model only the most recent ``context_length`` tokens.
            if context.size(-1) > context_length:
                context = context[:, -context_length:]

            logits = model(context)[:, -1, :]  # (B, V): scores at the last step
            probs = torch.softmax(logits, dim=-1)

            # Weighted random draw of the next token id, shape (B, 1).
            next_char = torch.multinomial(probs, 1, generator=generator)
            generator.set_state(initial_state)

            context = torch.cat((context, next_char), dim=-1)  # (B, T) -> (B, T+1)
            pieces.append(int_to_char[next_char.item()])

        return ''.join(pieces)

        # Once your code passes the test, check out the Colab link and hit Run to see your code generate new Drake lyrics!
        # Your code's output, ran in this sandbox will be boring because of the computational limits in this sandbox
