In [5]:
pip install torchtyping

Collecting torchtyping
  Downloading torchtyping-0.1.5-py3-none-any.whl.metadata (9.5 kB)
Collecting typeguard<3,>=2.11.1 (from torchtyping)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading torchtyping-0.1.5-py3-none-any.whl (17 kB)
Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, torchtyping
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.4.4
    Uninstalling typeguard-4.4.4:
      Successfully uninstalled typeguard-4.4.4
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
inflect 7.5.0 requires typeguard>=4.0.1, but you have typeguard 2.13.3 which is incompatible.
Successfully installed torchtyping-0.1.5 typeguard-2.13.3


**GPT Dataset Problem**

In [10]:
import torch
from typing import Tuple, List

class Solution:
  """Builds (input, target) word-window batches for next-word prediction."""

  def batch_loader(self, raw_dataset: str, context_length: int, batch_size: int) -> Tuple[List[List[str]], List[List[str]]]:
    """Sample `batch_size` random context windows and their shifted targets.

    Args:
      raw_dataset: Text corpus; it is tokenised by whitespace splitting.
      context_length: Number of words in every window.
      batch_size: Number of windows to sample.

    Returns:
      A pair ``(X, Y)``. ``X[i]`` is a list of `context_length` consecutive
      words and ``Y[i]`` is the same window shifted one word to the right.

    Note:
      The global torch RNG is re-seeded with 0 on every call, so the same
      arguments always yield the same batch. The corpus must contain more
      than `context_length` words; otherwise the sampling range is empty and
      `torch.randint` is expected to raise.
    """
    torch.manual_seed(0)
    words = raw_dataset.split()
    # Exclusive upper bound len(words) - context_length guarantees that the
    # target window (which ends one word later) never runs past the corpus.
    starts = torch.randint(
        low=0, high=len(words) - context_length, size=(batch_size,)
    ).tolist()  # plain ints are cheaper and clearer to slice with than 0-d tensors
    X = [words[start:start + context_length] for start in starts]
    Y = [words[start + 1:start + 1 + context_length] for start in starts]
    return X, Y

**Self Attention Problem**

In [7]:
import torch
import torch.nn as nn
from torchtyping import TensorType
import math
class SingleHeadAttention(nn.Module):
    """Single causal (masked) self-attention head with biased K/Q/V projections.

    The global torch RNG is seeded with 0 in the constructor, so the three
    projection layers always start from the same weights.
    """

    def __init__(self, embedding_dim: int, attention_dim: int):
        """Create the key, query and value projections.

        Args:
            embedding_dim: Size of the last dimension of the input embeddings.
            attention_dim: Size of the projected key/query/value vectors.
        """
        super().__init__()
        torch.manual_seed(0)
        # Creation order (keys, queries, values) determines which seeded
        # random draws each layer receives; keep it stable.
        self.get_keys = nn.Linear(embedding_dim, attention_dim)
        self.get_queries = nn.Linear(embedding_dim, attention_dim)
        self.get_values = nn.Linear(embedding_dim, attention_dim)

    def forward(self, embedded: TensorType[float]) -> TensorType[float]:
        """Apply causal scaled dot-product attention.

        Args:
            embedded: Tensor of shape (..., T, embedding_dim); the usual case
                is (B, T, embedding_dim).

        Returns:
            Tensor of shape (..., T, attention_dim), rounded to 4 decimals.
        """
        k = self.get_keys(embedded)     # (..., T, A)
        q = self.get_queries(embedded)  # (..., T, A)
        v = self.get_values(embedded)   # (..., T, A)

        T, A = k.shape[-2:]
        scores = (q @ k.transpose(-2, -1)) / (A ** 0.5)  # (..., T, T)

        # Build the mask on the same device as the scores; a default-device
        # mask would not match inputs that live on an accelerator.
        lower_triangle = torch.tril(torch.ones(T, T, device=scores.device))
        future_positions = lower_triangle == 0
        # -inf scores become exactly zero weight after the softmax.
        scores = scores.masked_fill(future_positions, float('-inf'))
        weights = nn.functional.softmax(scores, dim=-1)  # (..., T, T)
        transformed = weights @ v
        return torch.round(transformed, decimals=4)

**Self Attention Problem: 2nd Method**

In [9]:
# ------ OR ---------

import torch
import torch.nn as nn
from torchtyping import TensorType
import math

class SingleHeadAttention(nn.Module):
    """One causal self-attention head using bias-free K/Q/V projections.

    Constructing the module seeds the global torch RNG with 0, which makes
    the initial projection weights reproducible.
    """

    def __init__(self, embedding_dim: int, attention_dim: int):
        """Set up the three projections from `embedding_dim` to `attention_dim`."""
        super().__init__()
        torch.manual_seed(0)

        # Instantiated in key -> query -> value order; the order fixes which
        # seeded random draws end up in which layer.
        self.key = nn.Linear(embedding_dim, attention_dim, bias=False)
        self.query = nn.Linear(embedding_dim, attention_dim, bias=False)
        self.value = nn.Linear(embedding_dim, attention_dim, bias=False)

    def forward(self, embedded: TensorType[float]) -> TensorType[float]:
        """Return causally masked attention output.

        Args:
            embedded: Tensor of shape (batch, seq_len, embedding_dim).

        Returns:
            Tensor of shape (batch, seq_len, attention_dim).
        """
        _, seq_len, _ = embedded.shape

        # Project the embeddings; each result is (batch, seq_len, attention_dim).
        queries = self.query(embedded)
        keys = self.key(embedded)
        values = self.value(embedded)

        # Scaled dot-product similarities: (batch, seq_len, seq_len).
        similarity = queries @ keys.transpose(1, 2)
        scaled = similarity / math.sqrt(keys.size(-1))

        # Positions above the diagonal are "future" tokens; give them -inf so
        # the softmax assigns them zero weight.
        visible = torch.ones(seq_len, seq_len, device=embedded.device).tril()
        masked = scaled.masked_fill(visible == 0, float('-inf'))

        # Normalise across the key axis, then mix the value vectors.
        weights = masked.softmax(dim=-1)
        return weights @ values
