# Assignment #6

Vision Transformer

---

# Part A - Data and tokens

Q1. Build a tiny toy dataset with pandas
Create a pandas DataFrame with columns text and label.
- Include at least 12 short sentences (3-10 words each).
- The label is 0/1 (e.g., positive vs negative sentiment).
- Shuffle rows and split into train/test (80/20) using a fixed random seed.
Return: df_train, df_test.

In [1]:
import pandas as pd

def make_toy_dataset(seed: int = 42):
    """Build a 12-sentence sentiment toy set and split it 80/20.

    Args:
        seed: Random state passed to the pandas shuffle so the split is
            reproducible.

    Returns:
        (df_train, df_test): two DataFrames with columns ``text`` (str) and
        ``label`` (int, 1 = positive, 0 = negative), each with a fresh
        0-based index.
    """
    positive_sentences = [
        "I love this movie",
        "This food tastes amazing",
        "What a wonderful day",
        "I enjoy learning new things",
        "The service was excellent",
        "This product is very good",
    ]
    negative_sentences = [
        "I hate this weather",
        "This is a terrible mistake",
        "The experience was bad",
        "I feel very sad today",
        "This movie is boring",
        "The food was disgusting",
    ]

    # Positives first, then negatives, labelled 1 and 0 respectively.
    frame = pd.DataFrame({
        "text": positive_sentences + negative_sentences,
        "label": [1] * len(positive_sentences) + [0] * len(negative_sentences),
    })

    # Shuffle every row deterministically, then renumber the index.
    shuffled = frame.sample(frac=1, random_state=seed).reset_index(drop=True)

    # First 80% of the shuffled rows go to train, the rest to test.
    n_train = int(0.8 * len(shuffled))
    train_part = shuffled.iloc[:n_train].reset_index(drop=True)
    test_part = shuffled.iloc[n_train:].reset_index(drop=True)

    return train_part, test_part


Q2. Clean and tokenize text

Implement a basic cleaner: lowercase, strip, replace multiple spaces with one, and remove punctuation
(.,!?;:).
Tokenize by whitespace.
Add a new column tokens that stores a list of tokens per row.
Return the updated DataFrame.

In [2]:
import re
import pandas as pd

def clean_text(s: str) -> str:
    """Normalize a sentence for whitespace tokenization.

    The text is lowercased, the punctuation characters ``. , ! ? ; :`` are
    deleted (not replaced by a space), surrounding whitespace is stripped and
    every remaining whitespace run is collapsed to a single space.

    Args:
        s: Raw input text.

    Returns:
        The cleaned string.
    """
    lowered = s.lower()
    without_punct = re.sub(r"[.,!?;:]", "", lowered)
    trimmed = without_punct.strip()
    return re.sub(r"\s+", " ", trimmed)


def add_tokens_column(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with an extra ``tokens`` column.

    Each entry of ``tokens`` is the list of whitespace-separated tokens of the
    cleaned ``text`` value of that row. The input frame is not modified.
    """

    def _tokenize(raw_text):
        # Clean first, then split on whitespace.
        return clean_text(raw_text).split()

    return df.assign(tokens=df["text"].apply(_tokenize))


Q3. Build a vocabulary + token/id mappings

Build token2id and id2token using the training tokens.
Include special tokens: [PAD], [UNK], [BOS], [EOS] at the beginning.
Add tokens that occur at least min_freq times.
Return: token2id (dict), id2token (list).

In [3]:
from collections import Counter
from typing import Dict, List

SPECIALS = ['[PAD]', '[UNK]', '[BOS]', '[EOS]']

def build_vocab(list_of_token_lists, min_freq: int = 1):
    """Build the vocabulary mappings from tokenized training sequences.

    The special tokens in ``SPECIALS`` always occupy the first ids, in order.
    Corpus tokens follow in first-seen order, provided they occur at least
    ``min_freq`` times. A corpus token that is itself a special token (for
    example when ``[BOS]``/``[EOS]`` were already inserted into the
    sequences) is not added again, so every token has exactly one id and
    ``id2token[token2id[t]] == t`` holds for all tokens.

    Args:
        list_of_token_lists: Iterable of token sequences.
        min_freq: Minimum total count a token needs to enter the vocabulary.

    Returns:
        (token2id, id2token): a dict mapping token -> id and a list mapping
        id -> token.
    """
    # Count token frequencies over the whole corpus.
    counter = Counter()
    for tokens in list_of_token_lists:
        counter.update(tokens)

    # Specials come first; copy so the module-level constant is never mutated.
    id2token = list(SPECIALS)
    reserved = set(SPECIALS)

    # Append frequent-enough tokens, skipping anything already reserved so
    # special-token ids stay fixed and id2token has no duplicates.
    id2token.extend(
        token
        for token, freq in counter.items()
        if freq >= min_freq and token not in reserved
    )

    token2id = {token: idx for idx, token in enumerate(id2token)}

    return token2id, id2token


Q4. Convert tokens to ids + pad to a batch

Implement tokens_to_ids for one sequence.
Implement pad_batch that takes a list of id sequences and returns:
- X: int array (B,T) padded with pad_id
- pad_mask: bool array (B,T) where True means 'real token' and False means padding

In [4]:
import numpy as np

def tokens_to_ids(tokens, token2id, unk_token='[UNK]'):
    """Map a token sequence to vocabulary ids.

    Tokens missing from ``token2id`` are replaced by the id of ``unk_token``.
    ``unk_token`` itself must be present in ``token2id``.

    Returns:
        A list of ids, one per input token.
    """
    fallback_id = token2id[unk_token]
    lookup = token2id.get
    return [lookup(token, fallback_id) for token in tokens]


def pad_batch(id_seqs, pad_id: int):
    """Right-pad variable-length id sequences into a rectangular batch.

    Args:
        id_seqs: List of id sequences (one per example). May be empty.
        pad_id: Id written into the padded positions.

    Returns:
        X: int array (B, T) where T is the longest sequence length; shorter
            rows are filled with ``pad_id`` on the right.
        pad_mask: bool array (B, T), True for real tokens, False for padding.
        For an empty batch both arrays have shape (0, 0).
    """
    batch_size = len(id_seqs)
    # default=0 keeps an empty batch from raising in max().
    max_len = max((len(seq) for seq in id_seqs), default=0)

    # Start fully padded / fully masked, then fill in the real tokens.
    X = np.full((batch_size, max_len), pad_id, dtype=int)
    pad_mask = np.zeros((batch_size, max_len), dtype=bool)

    for row, seq in enumerate(id_seqs):
        length = len(seq)
        X[row, :length] = seq
        pad_mask[row, :length] = True

    return X, pad_mask


# Part B - Core Transformer math

Q5. Embedding lookup

Implement an embedding table E of shape (V,D) initialized from a normal distribution (mean 0, std 0.02).
Given token ids X (B,T), return embeddings of shape (B,T,D) using NumPy indexing.


In [5]:
import numpy as np

def init_embeddings(vocab_size: int, d_model: int, seed: int = 0):
    """Create the embedding table E of shape (V, D).

    Entries are drawn from a normal distribution with mean 0 and standard
    deviation 0.02, using a ``RandomState`` seeded with ``seed`` so the table
    is reproducible.
    """
    table_shape = (vocab_size, d_model)
    generator = np.random.RandomState(seed)
    return generator.normal(0.0, 0.02, table_shape)


def embed(X: np.ndarray, E: np.ndarray):
    """Look up embeddings for a batch of token ids.

    Args:
        X: (B, T) integer token ids.
        E: (V, D) embedding table.

    Returns:
        (B, T, D) array where position [b, t] holds row ``X[b, t]`` of ``E``.
    """
    # Integer-array indexing gathers one row of E per id.
    return E[X]


Q6. Sinusoidal positional encoding

Implement the classic sinusoidal positional encoding PE of shape (T,D).
Then add it to token embeddings (B,T,D).
Make sure your implementation works for both even and odd D.

In [6]:
import numpy as np

def sinusoidal_positional_encoding(T: int, D: int):
    """Return the sinusoidal positional encoding table of shape (T, D).

    For every even dimension index ``i`` the frequency is
    ``exp(-ln(10000) * i / D)``; column ``i`` holds ``sin(pos * freq)`` and
    column ``i + 1`` (when it exists) holds ``cos(pos * freq)``. With an odd
    ``D`` the last column is a sine with no cosine partner.

    The table is computed with vectorized NumPy operations instead of a
    Python loop over every (position, dimension) pair.

    Args:
        T: Number of positions.
        D: Model dimension (even or odd).

    Returns:
        Float array PE of shape (T, D).
    """
    PE = np.zeros((T, D))

    positions = np.arange(T, dtype=float)[:, None]       # (T, 1)
    even_dims = np.arange(0, D, 2, dtype=float)          # (ceil(D/2),)
    div_term = np.exp(-np.log(10000.0) * even_dims / D)  # one frequency per sin/cos pair
    angles = positions * div_term                        # (T, ceil(D/2))

    PE[:, 0::2] = np.sin(angles)
    # Odd D has one fewer cosine column than sine columns, so trim the angles.
    PE[:, 1::2] = np.cos(angles[:, : D // 2])

    return PE


def add_positional_encoding(X_emb: np.ndarray, PE: np.ndarray):
    """Add a positional-encoding table to a batch of embeddings.

    Args:
        X_emb: (B, T, D) token embeddings.
        PE: (T, D) positional encodings.

    Returns:
        New (B, T, D) array ``X_emb + PE``; the inputs are left untouched.
    """
    # A leading length-1 axis lets PE broadcast over the batch dimension.
    batched_pe = PE[np.newaxis, :, :]
    return X_emb + batched_pe


Q7. Scaled dot-product attention with masking

Implement scaled dot-product attention:
Attention(Q,K,V) = softmax((Q @ K^T) / sqrt(dk) + mask) @ V
Inputs: Q,K,V are (B,H,T,Dh). Mask is boolean broadcastable to (B,H,T,T) where False means 'mask out'.
Requirements:
- Use a numerically stable softmax (subtract max).
- Convert boolean mask to large negative values before softmax.
Return: context (B,H,T,Dh) and attention weights (B,H,T,T).

In [7]:
import numpy as np

def softmax(x: np.ndarray, axis: int = -1):
    """Numerically stable softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiation so large
    inputs do not overflow; the result sums to 1 along ``axis``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    numerator = np.exp(shifted)
    denominator = np.sum(numerator, axis=axis, keepdims=True)
    return numerator / denominator


def scaled_dot_product_attention(Q, K, V, mask=None):
    """Scaled dot-product attention over 4-D head tensors.

    Args:
        Q, K, V: arrays of shape (B, H, T, Dh).
        mask: optional boolean array broadcastable to (B, H, T, T); positions
            where it is False have their score replaced by -1e9 before the
            softmax.

    Returns:
        context: (B, H, T, Dh) attention-weighted sum of ``V``.
        attn: (B, H, T, T) attention weights (softmax over the key axis).
    """
    head_dim = Q.shape[-1]

    # Swap the last two axes of K so that Q @ K^T yields (B, H, T, T).
    keys_t = np.transpose(K, (0, 1, 3, 2))
    logits = np.matmul(Q, keys_t) / np.sqrt(head_dim)

    if mask is not None:
        # Keep scores where mask is True, push the rest to a large negative.
        logits = np.where(mask, logits, -1e9)

    weights = softmax(logits, axis=-1)
    attended = np.matmul(weights, V)

    return attended, weights


Q8. Multi-head self-attention (MHA)

Implement multi-head self-attention for input X (B,T,D).
- Project to Q,K,V using weight matrices Wq,Wk,Wv each (D,D).
- Reshape/split into heads -> (B,H,T,Dh) where Dh=D/H.
- Apply scaled dot-product attention with a pad mask (B,T) (broadcast it appropriately).
- Concatenate heads and apply output projection Wo (D,D).
Return: out (B,T,D) and attention weights (B,H,T,T).

In [8]:
import numpy as np

def linear(x: np.ndarray, W: np.ndarray, b=None):
    """Apply an affine map: ``x @ W``, plus ``b`` when a bias is given."""
    projected = x @ W
    return projected if b is None else projected + b


def split_heads(x: np.ndarray, n_heads: int):
    """Split the model dimension into attention heads.

    (B, T, D) -> (B, H, T, Dh) with H = ``n_heads`` and Dh = D // H.
    """
    batch, seq_len, d_model = x.shape
    head_dim = d_model // n_heads

    # Carve D into (H, Dh), then move the head axis in front of the time axis.
    per_head = x.reshape(batch, seq_len, n_heads, head_dim)
    return np.swapaxes(per_head, 1, 2)


def combine_heads(xh: np.ndarray):
    """Merge attention heads back into the model dimension.

    (B, H, T, Dh) -> (B, T, H * Dh); head ``h`` occupies the slice
    ``[h * Dh : (h + 1) * Dh]`` of the last axis.
    """
    batch, n_heads, seq_len, head_dim = xh.shape

    # Put time before heads, then flatten (H, Dh) into one axis.
    time_major = np.swapaxes(xh, 1, 2)
    return time_major.reshape(batch, seq_len, n_heads * head_dim)


def mha_self_attention(X, Wq, Wk, Wv, Wo, n_heads: int, pad_mask=None):
    """Multi-head self-attention (bias-free projections).

    Args:
        X: (B, T, D) input sequence.
        Wq, Wk, Wv: projection matrices for queries, keys and values.
        Wo: output projection applied after the heads are merged.
        n_heads: Number of attention heads H.
        pad_mask: optional (B, T) boolean array, True = real token,
            False = padding. It is applied along the key axis.

    Returns:
        out: (B, T, D) attention output after the ``Wo`` projection.
        attn: (B, H, T, T) attention weights.
    """
    # Project X three ways and split each projection into heads.
    q_heads, k_heads, v_heads = (
        split_heads(linear(X, weight), n_heads) for weight in (Wq, Wk, Wv)
    )

    # (B, T) -> (B, 1, 1, T): broadcast over heads and query positions.
    key_mask = None if pad_mask is None else pad_mask[:, None, None, :]

    context, attn = scaled_dot_product_attention(q_heads, k_heads, v_heads, key_mask)

    # Merge heads and apply the output projection.
    out = linear(combine_heads(context), Wo)

    return out, attn


Q9. LayerNorm + residual connection

Implement LayerNorm for X (B,T,D) using learnable gamma and beta of shape (D,).
Then implement residual_add_and_norm(Y, X, gamma, beta) that returns LayerNorm(X + Y).

In [9]:
import numpy as np

def layer_norm(X: np.ndarray, gamma: np.ndarray, beta: np.ndarray, eps: float = 1e-5):
    """Layer normalization over the last axis.

    Each feature vector is shifted to zero mean and scaled by
    ``1 / sqrt(var + eps)`` (population variance), then scaled by ``gamma``
    and shifted by ``beta``.

    Args:
        X: (B, T, D) input.
        gamma, beta: (D,) scale and shift.
        eps: Added to the variance to avoid division by zero.

    Returns:
        Array with the same shape as ``X``.
    """
    mu = X.mean(axis=-1, keepdims=True)
    sigma_sq = X.var(axis=-1, keepdims=True)
    normalized = (X - mu) / np.sqrt(sigma_sq + eps)
    return gamma * normalized + beta


def residual_add_and_norm(Y: np.ndarray, X: np.ndarray, gamma: np.ndarray, beta: np.ndarray):
    """Apply the residual connection followed by layer normalization.

    Computes ``layer_norm(X + Y, gamma, beta)`` where ``X`` is the sub-layer
    input and ``Y`` the sub-layer output.
    """
    residual_sum = X + Y
    return layer_norm(residual_sum, gamma, beta)


Q10. Position-wise FeedForward network

Implement FFN: FFN(X) = relu(X @ W1 + b1) @ W2 + b2
Shapes: X (B,T,D), W1 (D,Dff), b1 (Dff,), W2 (Dff,D), b2 (D,)
Return: (B,T,D).

In [10]:
import numpy as np

def relu(x: np.ndarray):
    """Element-wise rectified linear unit: max(0, x)."""
    rectified = np.maximum(0, x)
    return rectified


def feed_forward(X: np.ndarray, W1: np.ndarray, b1: np.ndarray,
                 W2: np.ndarray, b2: np.ndarray):
    """Position-wise feed-forward network: ``relu(X @ W1 + b1) @ W2 + b2``.

    Args:
        X: (B, T, D) input.
        W1, b1: first affine layer (D -> Dff).
        W2, b2: second affine layer (Dff -> D).

    Returns:
        (B, T, D) output.
    """
    pre_activation = X @ W1 + b1
    activated = relu(pre_activation)
    return activated @ W2 + b2


# Part C - Putting it together

Q11. One Transformer encoder block (forward)

Implement a single encoder block forward pass:
1) MHA = MultiHeadSelfAttention(X) with pad_mask
2) X1 = LayerNorm(X + MHA)
3) FFN = FeedForward(X1)
4) X2 = LayerNorm(X1 + FFN)
Return X2.
You may pass all parameters explicitly (weights, gamma/beta).

In [11]:
def encoder_block_forward(X, params, n_heads: int, pad_mask=None):
    """Forward pass of one post-norm Transformer encoder block.

    Steps: multi-head self-attention -> add & norm -> feed-forward ->
    add & norm. The attention weights are computed but not returned.

    Args:
        X: (B, T, D) input.
        params: dict with keys ``Wq``, ``Wk``, ``Wv``, ``Wo`` (attention),
            ``gamma1``, ``beta1`` (first LayerNorm), ``W1``, ``b1``, ``W2``,
            ``b2`` (feed-forward) and ``gamma2``, ``beta2`` (second
            LayerNorm).
        n_heads: Number of attention heads.
        pad_mask: optional (B, T) boolean mask, True = real token.

    Returns:
        X2: (B, T, D) block output.
    """
    # Sub-layer 1: self-attention, then residual + LayerNorm.
    attn_out, _ = mha_self_attention(
        X,
        params["Wq"], params["Wk"], params["Wv"], params["Wo"],
        n_heads=n_heads,
        pad_mask=pad_mask,
    )
    X1 = residual_add_and_norm(attn_out, X, params["gamma1"], params["beta1"])

    # Sub-layer 2: feed-forward, then residual + LayerNorm.
    ffn_out = feed_forward(X1, params["W1"], params["b1"], params["W2"], params["b2"])
    return residual_add_and_norm(ffn_out, X1, params["gamma2"], params["beta2"])


Q12. Sequence classification head + end-to-end demo

Create an end-to-end forward pass for a tiny classifier:
- Input ids -> embeddings + positional enc
- One encoder block
- Pooling: take the [BOS] position (t=0) as the sequence representation
- Linear head: logits = h0 @ Wcls + bcls with Wcls (D,2), bcls (2,)
- Softmax to probabilities
Write predict_proba that takes a batch of texts and returns probs (B,2).
Include simple sanity checks: shapes, probabilities sum to 1, and masking doesn't crash for different
lengths.

In [12]:
def predict_proba(texts, token2id, E, PE, params, Wcls, bcls, n_heads: int):
    """Run the tiny classifier end to end and return class probabilities.

    Pipeline: clean + whitespace-tokenize each text, wrap it in
    ``[BOS]`` / ``[EOS]``, map tokens to ids, pad to a batch, embed, add
    positional encodings, apply one encoder block, pool the ``[BOS]``
    position (t=0), apply the linear head and a softmax.

    Args:
        texts: list[str] of raw input texts. May be empty.
        token2id: vocabulary mapping; must contain ``[PAD]`` and ``[UNK]``.
        E: (V, D) embedding table.
        PE: (T_max, D) positional-encoding table; only the first T rows are
            used, where T is the padded sequence length of this batch.
        params: encoder-block parameters (see ``encoder_block_forward``).
        Wcls: (D, 2) classification weights.
        bcls: (2,) classification bias.
        n_heads: Number of attention heads.

    Returns:
        probs: (B, 2) array whose rows sum to 1; shape (0, 2) for an empty
        batch.

    Raises:
        ValueError: if ``PE`` has fewer rows than the padded sequence length.
    """
    texts = list(texts)

    # An empty batch has nothing to pad or attend over; return early instead
    # of failing inside the padding step.
    if not texts:
        return np.zeros((0, 2))

    # ---------- Step 1: Tokenize texts and add [BOS] / [EOS] ----------
    tokenized = [
        ["[BOS]", *clean_text(text).split(), "[EOS]"]
        for text in texts
    ]

    # ---------- Step 2: Convert tokens -> ids ----------
    id_seqs = [tokens_to_ids(seq, token2id) for seq in tokenized]

    # ---------- Step 3: Pad batch ----------
    pad_id = token2id["[PAD]"]
    X_ids, pad_mask = pad_batch(id_seqs, pad_id)

    # Sanity check: mask and ids describe the same (B, T) grid.
    B, T = X_ids.shape
    assert pad_mask.shape == (B, T)

    # Fail with a clear message rather than an opaque broadcasting error
    # when the positional table is too short for this batch.
    if PE.shape[0] < T:
        raise ValueError(
            f"positional encoding has {PE.shape[0]} positions but the "
            f"longest sequence in the batch has {T} tokens"
        )

    # ---------- Step 4: Embedding lookup ----------
    X_emb = embed(X_ids, E)  # (B,T,D)

    # ---------- Step 5: Add positional encoding ----------
    X_emb_pe = add_positional_encoding(X_emb, PE[:T])

    # ---------- Step 6: Encoder block ----------
    X_enc = encoder_block_forward(
        X_emb_pe,
        params,
        n_heads=n_heads,
        pad_mask=pad_mask,
    )

    # ---------- Step 7: Pooling ([BOS] token at t=0) ----------
    h0 = X_enc[:, 0, :]   # (B,D)

    # ---------- Step 8: Classification head ----------
    logits = h0 @ Wcls + bcls   # (B,2)

    # ---------- Step 9: Softmax probabilities ----------
    probs = softmax(logits, axis=-1)

    # ---------- Sanity checks ----------
    assert probs.shape == (B, 2)
    assert np.allclose(np.sum(probs, axis=1), 1.0)

    return probs
