# **Attention**

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# from this runtime (the next cell changes into a folder under this mount).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# IPython magic: make the Drive "Colab Notebooks" folder the working directory.
%cd /content/drive/MyDrive/'Colab Notebooks'

/content/drive/MyDrive/Colab Notebooks


In [None]:
def sdpa(Wq, Wk, Wv, text):
    """Single-head scaled dot-product self-attention over the embeddings of `text`.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` with ``Q = E Wq``, ``K = E Wk``,
    ``V = E Wv`` where ``E = get_embeddings(text)``.

    Args:
        Wq, Wk, Wv: projection matrices right-multiplied onto the embeddings.
        text: input forwarded to ``get_embeddings``
            (assumed to return a (batch, seq, dim) tensor -- TODO confirm).

    Returns:
        The attention output; its last dimension is the output width of ``Wv``.
    """
    embed = get_embeddings(text)
    Q = torch.matmul(embed, Wq)
    K = torch.matmul(embed, Wk)
    V = torch.matmul(embed, Wv)
    print(Q.shape,K.shape,embed.shape)

    # d_k is the width of the query/key projection. It only coincides with the
    # embedding width when Wq/Wk are square, so read it from K rather than embed.
    d_k = K.shape[-1]
    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
    print("attn_scores shape",attn_scores.shape)

    # dim=-1 normalizes over the key axis, so each query's weights sum to 1.
    attn_weights = torch.softmax(attn_scores, dim=-1)
    attn = torch.matmul(attn_weights, V)

    return attn

# Three random (untrained) 768x768 projections, drawn in Wq, Wk, Wv order.
# Assumes get_embeddings yields 768-wide vectors -- TODO confirm.
Wq, Wk, Wv = (torch.rand(768, 768) for _ in range(3))

# Smoke test: only the output shape is inspected.
print(sdpa(Wq, Wk, Wv, "hello").shape)



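$$\text{MultiHead}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \text{Concat}(\text{head}_1, \text{head}_2, \dots, \text{head}_h)\mathbf{W}^O, \quad \text{where } \text{head}_i = \text{Attention}(\mathbf{Q}\mathbf{W}_i^Q, \mathbf{K}\mathbf{W}_i^K, \mathbf{V}\mathbf{W}_i^V)$$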

$$\text{Attention}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \text{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^T}{\sqrt{d_k}}\right)\mathbf{V}$$

In [None]:
# Causal averaging demo, after Karpathy's "Let's build GPT" (around 1:02:00).
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Lower-triangular ones: position t may only see positions <= t.
tril = torch.ones(T, T).tril()
# All-zero scores with -inf above the diagonal; softmax then spreads weight
# uniformly over the visible prefix of every row.
wei = F.softmax(torch.zeros(T, T).masked_fill(tril == 0, float('-inf')), dim=-1)

# (T, T) @ (B, T, C) broadcasts over the batch: each output row is the mean of
# the current and all previous token vectors.
out = wei @ x


Triton SDPA

When $\text{softmax}$ is applied to the causally masked scores, $$\text{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^T}{\sqrt{d_k}} \text{ with causal mask}\right),$$ the $e^{-\infty}$ terms become zero, effectively giving zero attention weight to all future tokens.

In [None]:
# Same causal-averaging example (Karpathy, "Let's build GPT", ~1:02:00),
# repeated here as the reference behaviour for the masked softmax.
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones(T, T))
# -inf on every "future" entry (where tril is 0) becomes weight 0 after softmax.
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf')).softmax(dim=-1)

# Weighted sum over the time axis, broadcast across the batch.
out = torch.matmul(wei, x)


In [None]:
import torch
import triton
import triton.language as tl
import math

# Define the Triton Kernel
@triton.jit
def causal_attention_kernel(
    Q, K, V, O,  # Data pointers for Query, Key, Value, Output
    sm_scale,  # Scaling factor: 1/sqrt(d_k)
    # Row strides (in elements) of Q, K, V, O. Only row strides are passed, so
    # the last (head) dimension is addressed with stride 1, i.e. it must be contiguous.
    Lq, Lk, Lv, Lo,
    N_CTX,  # Sequence length (number of rows of Q/K/V/O)
    D_HEAD,  # Head dimension (d_k); not read here, BLOCK_DMODEL carries it at compile time
    # Block dimensions (fixed by the caller at launch)
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
):
    """
    Computes causal scaled dot-product attention for one block of query rows.

    Each program handles BLOCK_M query rows and walks over the keys/values in
    tiles of BLOCK_N rows, maintaining an online softmax (running row maximum
    m_i, running denominator l_i and an un-normalised output accumulator) so
    the full N_CTX x N_CTX score matrix is never materialised.

    Requirements:
      * BLOCK_DMODEL must equal the head dimension: head columns are not
        masked, and offsets span exactly BLOCK_DMODEL columns.
      * Q, K, V, O must have unit stride along the head dimension.
    Rows/keys beyond N_CTX are masked, so N_CTX need not be a multiple of
    BLOCK_M or BLOCK_N.
    """
    # 1. Block indexing: program id selects the block of query rows.
    pid_m = tl.program_id(0)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)  # query / output row indices
    offs_d = tl.arange(0, BLOCK_DMODEL)               # head-dimension column indices
    row_ok = offs_m[:, None] < N_CTX                  # guards the ragged last row block

    # Row offset uses the tensor's own row stride; column offset is the plain
    # column index (stride 1).
    Q_ptr = Q + offs_m[:, None] * Lq + offs_d[None, :]
    O_ptr = O + offs_m[:, None] * Lo + offs_d[None, :]

    # Online-softmax state, kept in fp32 regardless of the input dtype.
    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)  # running row max
    l_i = tl.full([BLOCK_M], 0.0, dtype=tl.float32)            # running sum of exp
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)  # un-normalised output

    # 2. Load the query block once; it is reused for every key tile.
    # q keeps the input dtype so both tl.dot operands have the same type.
    q = tl.load(Q_ptr, mask=row_ok, other=0.0)

    # 3. Iterate over key/value tiles along the sequence dimension.
    # Tiles entirely in the future contribute exactly zero (p == 0, alpha == 1).
    for start_n in range(0, N_CTX, BLOCK_N):
        offs_n = start_n + tl.arange(0, BLOCK_N)
        key_ok = offs_n < N_CTX

        # K is loaded already transposed, as [BLOCK_DMODEL, BLOCK_N], so that
        # q @ k yields scores of shape [BLOCK_M, BLOCK_N].
        K_ptr = K + offs_n[None, :] * Lk + offs_d[:, None]
        V_ptr = V + offs_n[:, None] * Lv + offs_d[None, :]
        k = tl.load(K_ptr, mask=key_ok[None, :], other=0.0)
        v = tl.load(V_ptr, mask=key_ok[:, None], other=0.0)

        # 4. Scores S = (Q K^T) * sm_scale; the scale is applied to the fp32
        # product rather than to q, which keeps q in its storage dtype.
        s = tl.dot(q, k) * sm_scale

        # 5. Causal mask: a query at row i may attend only to keys j <= i.
        # Entries with key index > query index (future tokens) and padded keys
        # are set to -inf so they get zero weight.
        keep = (offs_m[:, None] >= offs_n[None, :]) & key_ok[None, :]
        s = tl.where(keep, s, float("-inf"))

        # 6. Online softmax update.
        # Key 0 is visible to every query row, so after the first tile m_new is
        # finite and the exponentials below are well defined.
        m_new = tl.maximum(m_i, tl.max(s, 1))
        alpha = tl.exp(m_i - m_new)        # rescales everything accumulated so far
        p = tl.exp(s - m_new[:, None])     # un-normalised weights for this tile
        l_i = l_i * alpha + tl.sum(p, 1)
        # Cast p to V's dtype: tl.dot needs matching operand types.
        acc = acc * alpha[:, None] + tl.dot(p.to(v.dtype), v)
        m_i = m_new

    # 7. Normalise once at the end and write back (store casts to O's dtype).
    tl.store(O_ptr, acc / l_i[:, None], mask=row_ok)

# ----------------------------------------------------------------------
# Python Host Wrapper
# ----------------------------------------------------------------------

def run_attention_kernel(q, k, v, is_causal=True):
    """
    Runs the Triton causal attention kernel on single-head (L, D) tensors.

    Args:
        q (torch.Tensor): Query tensor (L, D).
        k (torch.Tensor): Key tensor (L, D).
        v (torch.Tensor): Value tensor (L, D).
        is_causal (bool): Must be True. The kernel always applies the causal
            mask, so a non-causal request is rejected instead of silently
            returning causally masked results.

    Returns:
        torch.Tensor: attention output with the same shape and dtype as ``q``.

    Raises:
        NotImplementedError: if ``is_causal`` is False.
        AssertionError: if the shapes differ, the inputs are not 2-D, or the
            inputs are not CUDA tensors.
    """
    if not is_causal:
        raise NotImplementedError(
            "causal_attention_kernel always applies the causal mask; "
            "is_causal=False is not supported."
        )
    assert q.shape == k.shape == v.shape, "Q, K, V must have the same shape (L, D) for self-attention."
    assert q.dim() == 2, "Q, K, V must be 2-D tensors of shape (L, D)."
    assert q.is_cuda and k.is_cuda and v.is_cuda, "Inputs must be on the GPU."

    # Only stride(0) is handed to the kernel, so the head dimension has to be
    # densely packed; .contiguous() is a no-op for already-contiguous inputs.
    q, k, v = q.contiguous(), k.contiguous(), v.contiguous()

    N_CTX, D_HEAD = q.shape[0], q.shape[1]

    # Hyperparameters: Tune these for performance
    BLOCK_M = 64  # Block size for the Query dimension (rows)
    BLOCK_N = 64  # Block size for the Key dimension (columns)
    BLOCK_DMODEL = D_HEAD  # The whole head dimension is processed as one block

    # Define the scaling factor: 1/sqrt(d_k)
    sm_scale = 1.0 / math.sqrt(D_HEAD)

    # Output tensor initialization (N_CTX, D_HEAD)
    o = torch.empty_like(q)

    # 1D launch grid: one program per block of BLOCK_M query rows
    grid = lambda META: (triton.cdiv(N_CTX, META['BLOCK_M']),)

    # Kernel call
    causal_attention_kernel[grid](
        q, k, v, o,  # Pointers
        sm_scale,  # Scaling factor
        q.stride(0), k.stride(0), v.stride(0), o.stride(0),  # Row strides
        N_CTX, D_HEAD,
        BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_DMODEL=BLOCK_DMODEL,
        num_warps=4,
        num_stages=3
    )
    return o

# ----------------------------------------------------------------------
# Example Usage
# ----------------------------------------------------------------------
if __name__ == '__main__':
    # Half-precision CUDA tensors (float16 is the usual choice for performance).
    dtype, device = torch.float16, 'cuda'

    # Sequence length and head dimension of the example problem.
    N_CTX, D_HEAD = 256, 64

    # Independent random Q, K, V of shape [Sequence Length, Head Dimension],
    # drawn in that order.
    Q_data, K_data, V_data = (
        torch.randn(N_CTX, D_HEAD, dtype=dtype, device=device) for _ in range(3)
    )

    # Run the Triton kernel
    output_triton = run_attention_kernel(Q_data, K_data, V_data)

    print(f"Input Shape (Q, K, V): {Q_data.shape}")
    print(f"Output Shape (O):      {output_triton.shape}")
    print(f"\nExample Output (First 5 elements of first row):\n{output_triton[0, :5]}")

    # To validate, compare against a trusted reference such as
    # torch.nn.functional.scaled_dot_product_attention(..., is_causal=True);
    # the two results should agree within floating-point tolerance.

Strategies for cleaning data. Do these work?

In [None]:
# Each policy down-weights samples whose confidence is below `theta` to
# `noisy_weight`; the pairs below are (theta, noisy_weight).
candidate_policies = [
    {"theta": theta, "noisy_weight": noisy_weight}
    for theta, noisy_weight in (
        (0.05, 0.5),
        (0.10, 0.5),
        (0.10, 0.2),
        (0.15, 0.2),
        (0.20, 0.0),  # drop most uncertain
        (0.30, 0.0),
    )
]

def apply_policy(policy, conf):
    """Turn a confidence vector into per-sample weights.

    Samples with ``conf < policy["theta"]`` get ``policy["noisy_weight"]``;
    all others get 1.0. Returns a new float array shaped like ``conf``.
    """
    threshold, low_weight = policy["theta"], policy["noisy_weight"]
    weights = np.ones_like(conf, dtype=float)
    weights[conf < threshold] = low_weight
    return weights

# Epsilon-greedy bandit over candidate_policies: each trial picks a policy,
# trains with the resulting sample weights and uses the validation score as reward.
n_trials = 20
epsilon = 0.3
K = len(candidate_policies)
Q = np.zeros(K)  # running mean reward per policy
N = np.zeros(K)  # number of times each policy has been tried

for t in range(n_trials):
    # Explore a random policy with probability epsilon, otherwise exploit the
    # policy with the best estimate so far.
    k = np.random.randint(K) if np.random.rand() < epsilon else np.argmax(Q)

    policy = candidate_policies[k]
    sample_weight = apply_policy(policy, conf)

    clf = train_model(X_train, y_train, sample_weight=sample_weight)
    reward = eval_model(clf, X_val, y_val)

    # Incremental mean: Q <- Q + (r - Q) / n
    N[k] += 1
    Q[k] += (reward - Q[k]) / N[k]

    print(f"Iter {t:02d} | trial policy {k} {policy} | F1={reward:.4f}")

# Report the policy with the highest estimated reward next to the baseline.
best_k = np.argmax(Q)
best_policy = candidate_policies[best_k]
print("\nBase F1:", base_f1)
print("Best policy from bandit:", best_policy, "Estimated F1:", Q[best_k])

In [None]:
from typing import Optional, Literal, Dict, Any, Tuple, Callable

class NoisyDatasetCleaner:
    """
    A generic wrapper that learns per-sample weights for noisy supervised datasets.

    Strategies:
      - 'none': every sample gets weight 1.
      - 'bandit_weight': epsilon-greedy search over (theta, noisy_weight)
        policies that down-weight low-confidence samples.
      - 'cleanlab': samples flagged by cleanlab's CleanLearning get weight 0.3.
    You can extend it with active-relabelling logic.

    Recognised ``strategy_kwargs`` for 'bandit_weight':
      - candidate_policies: list of {"theta": float, "noisy_weight": float}
      - n_trials (default 10), epsilon (default 0.3)
      - random_state: optional seed for the bandit's own RNG; when omitted the
        global ``np.random`` state is used.
    """

    # Default (theta, noisy_weight) policies used when none are supplied.
    _DEFAULT_POLICIES = (
        {"theta": 0.05, "noisy_weight": 0.5},
        {"theta": 0.10, "noisy_weight": 0.5},
        {"theta": 0.10, "noisy_weight": 0.2},
        {"theta": 0.15, "noisy_weight": 0.2},
        {"theta": 0.20, "noisy_weight": 0.0},
        {"theta": 0.30, "noisy_weight": 0.0},
    )

    def __init__(self,
                 strategy: Literal["none", "bandit_weight", "cleanlab"] = "none",
                 clf_factory: Optional[Callable[[], Any]] = None,
                 strategy_kwargs: Optional[Dict[str, Any]] = None):
        self.strategy = strategy
        # The default factory is resolved lazily, so LogisticRegression is only
        # needed when a classifier is actually built.
        self.clf_factory = clf_factory or (lambda: LogisticRegression(max_iter=300, n_jobs=-1))
        self.strategy_kwargs = strategy_kwargs or {}
        self.sample_weight_ = None
        self._cleanlab_model = None

    def fit(self, X, y) -> "NoisyDatasetCleaner":
        """Learn ``sample_weight_`` for (X, y) with the configured strategy.

        Returns ``self``. Raises ``ValueError`` for an unknown strategy.
        """
        if self.strategy == "none":
            self.sample_weight_ = np.ones(len(y), dtype=float)
        elif self.strategy == "bandit_weight":
            self._fit_bandit_weight(X, y)
        elif self.strategy == "cleanlab":
            self._fit_cleanlab(X, y)
        else:
            raise ValueError(f"Unknown strategy: {self.strategy}")
        return self

    @staticmethod
    def _policy_weights(policy, conf_vec):
        """Weight 1.0 everywhere except ``policy['noisy_weight']`` where conf < theta."""
        w = np.ones_like(conf_vec, dtype=float)
        w[conf_vec < policy["theta"]] = policy["noisy_weight"]
        return w

    def _fit_bandit_weight(self, X, y):
        """Pick a down-weighting policy by epsilon-greedy search on a held-out split."""
        # Train a base model and use |p(class 1) - 0.5| as a confidence proxy.
        base_clf = self.clf_factory()
        base_clf.fit(X, y)
        proba = np.asarray(base_clf.predict_proba(X))
        if proba.ndim != 2 or proba.shape[1] != 2:
            # The confidence proxy and the binary F1 reward below only make
            # sense for two classes.
            raise ValueError(
                "bandit_weight requires a binary classifier; "
                f"predict_proba returned shape {proba.shape}"
            )
        conf = np.abs(proba[:, 1] - 0.5)

        candidate_policies = (
            self.strategy_kwargs.get("candidate_policies") or list(self._DEFAULT_POLICIES)
        )
        n_trials = self.strategy_kwargs.get("n_trials", 10)
        epsilon = self.strategy_kwargs.get("epsilon", 0.3)

        # Optional private RNG for reproducible searches; falls back to the
        # global numpy RNG so existing seeded notebooks behave as before.
        seed = self.strategy_kwargs.get("random_state")
        rng = np.random if seed is None else np.random.RandomState(seed)

        X_train, X_val, y_train, y_val, conf_train = train_test_split(
            X, y, conf, test_size=0.2, random_state=0, stratify=y
        )

        K = len(candidate_policies)
        Q = np.zeros(K)  # running mean reward per policy
        N = np.zeros(K)  # pull counts

        for _ in range(n_trials):
            if rng.rand() < epsilon:
                k = rng.randint(K)
            else:
                k = np.argmax(Q)

            policy = candidate_policies[k]
            weights_train = self._policy_weights(policy, conf_train)

            clf = self.clf_factory()
            clf.fit(X_train, y_train, sample_weight=weights_train)
            reward = f1_score(y_val, clf.predict(X_val))

            N[k] += 1
            Q[k] += (reward - Q[k]) / N[k]

        best_policy = candidate_policies[np.argmax(Q)]

        # Final weights on the full data, using the base model's confidences.
        self.sample_weight_ = self._policy_weights(best_policy, conf)

    def _fit_cleanlab(self, X, y):
        """Down-weight the samples that cleanlab's CleanLearning flags as label issues."""
        # Imported lazily so cleanlab is only required for this strategy.
        from cleanlab.classification import CleanLearning

        base_clf = self.clf_factory()
        cl = CleanLearning(clf=base_clf)
        cl.fit(X, y)
        self._cleanlab_model = cl

        # Label issues summary -> weights.
        issues = cl.get_label_issues()
        is_issue = issues["is_label_issue"].values
        # Simple scheme: 0.3 weight for suspected issues, 1.0 otherwise.
        w = np.ones(len(y), dtype=float)
        w[is_issue] = 0.3
        self.sample_weight_ = w

    def get_weights(self) -> np.ndarray:
        """Return the learned sample weights; raises RuntimeError before fit()."""
        if self.sample_weight_ is None:
            raise RuntimeError("Call fit() first")
        return self.sample_weight_

    def fit_clean_model(self, X, y):
        """Train a final classifier using the learned sample weights.

        (X, y) must be the data the weights were fitted on; a length mismatch
        raises ValueError before the classifier is built.
        """
        w = self.get_weights()
        if len(w) != len(y):
            raise ValueError(
                f"Got {len(y)} labels but {len(w)} learned sample weights; "
                "pass the same data that was given to fit()."
            )
        clf = self.clf_factory()
        clf.fit(X, y, sample_weight=w)
        return clf

In [None]:
# Bandit-based re-weighting: learn the weights, retrain with them and score
# the result on the validation split.
cleaner = NoisyDatasetCleaner(strategy="bandit_weight").fit(X_train, y_train)
weights = cleaner.get_weights()

clf_clean = cleaner.fit_clean_model(X_train, y_train)
f1_clean = eval_model(clf_clean, X_val, y_val)
print("F1 with NoisyDatasetCleaner (bandit_weight):", f1_clean)

# Or: the same flow with the cleanlab strategy.
cleaner_cl = NoisyDatasetCleaner(strategy="cleanlab").fit(X_train, y_train)
clf_cleanlab = cleaner_cl.fit_clean_model(X_train, y_train)
print("F1 with NoisyDatasetCleaner (cleanlab):", eval_model(clf_cleanlab, X_val, y_val))

Create a KV-cache streaming server without vLLM


In [None]:
# tinyllama_server_kv_stream.py
import asyncio
import json
import time
import uuid
from typing import List, Optional

import torch
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# ---------------------------------------------------------
# Model load
# ---------------------------------------------------------
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print(f"Loading model: {MODEL_NAME}")

# Use half precision on the GPU, full precision on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # generate() is given pad_token_id, so fall back to EOS when no pad token is set.
    tokenizer.pad_token = tokenizer.eos_token

# With device_map="auto" the weights are placed on the GPU(s) at load time.
device_map = "auto" if device == "cuda" else None
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    device_map=device_map,
)
if device_map is None:
    # Only move the model ourselves when no device map placed it; calling
    # .to() on an already dispatched model is redundant and can conflict with
    # the placement chosen by device_map.
    model.to(device)
model.eval()

# ---------------------------------------------------------
# FastAPI + schemas
# ---------------------------------------------------------
# Application object; the chat-completions route further down registers
# itself on it via the @app.post decorator.
app = FastAPI(title="TinyLlama KV streaming demo")

# One chat turn. (Documented with comments rather than a docstring so the
# generated schema description is left untouched.)
class ChatMessage(BaseModel):
    # Speaker label such as "user" or "assistant"; build_prompt capitalizes it
    # and uses it as the line prefix.
    role: str
    # Message text, inserted verbatim into the prompt.
    content: str

# Request body for POST /v1/chat/completions.
class ChatRequest(BaseModel):
    # Echoed back in responses; MODEL_NAME is used when omitted.
    model: Optional[str] = None
    # Conversation so far, rendered into a single prompt by build_prompt.
    messages: List[ChatMessage]
    # Upper bound on the number of newly generated tokens.
    max_tokens: int = 128
    # Sampling temperature; values <= 0 select greedy decoding.
    temperature: float = 0.7
    # Nucleus-sampling threshold; 1.0 samples from the full distribution.
    top_p: float = 1.0
    # NOTE(review): `n` is accepted but not read by either handler in this file.
    n: int = 1
    # True -> server-sent-event streaming, False -> one JSON response.
    stream: bool = False
    # NOTE(review): `stop` is accepted but not read by either handler in this file.
    stop: Optional[List[str]] = None

# ---------------------------------------------------------
# Prompt formatting
# ---------------------------------------------------------
def build_prompt(messages: List[ChatMessage]) -> str:
    """Render the chat history as "Role: content" lines plus a trailing "Assistant:" cue."""
    # str.capitalize() already maps "user" -> "User" and "assistant" ->
    # "Assistant", so one format string covers every role.
    lines = [f"{msg.role.capitalize()}: {msg.content}" for msg in messages]
    lines.append("Assistant:")
    return "\n".join(lines)

# ---------------------------------------------------------
# Sampling helpers
# ---------------------------------------------------------

def sample_next_token(
    logits: torch.Tensor,
    temperature: float,
    top_p: float,
) -> int:
    """
    Pick the next token id from the logits of a single position.

    Args:
        logits: [vocab_size] scores for one position (any float dtype).
        temperature: <= 0 selects greedy argmax; otherwise logits are divided
            by it before the softmax.
        top_p: nucleus threshold; values < 1.0 restrict sampling to the
            smallest set of most likely tokens whose probability reaches top_p.

    Returns:
        int token id
    """
    if temperature <= 0:
        # greedy
        return int(torch.argmax(logits, dim=-1).item())

    # Work in float32: half-precision logits divided by a small temperature
    # can overflow to inf (NaN after softmax), and a half-precision cumulative
    # sum over a large vocabulary is too coarse for the top_p comparison.
    probs = torch.softmax(logits.float() / temperature, dim=-1)

    if top_p >= 1.0:
        # plain multinomial over full vocab
        return int(torch.multinomial(probs, 1).item())

    # nucleus sampling
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)

    # A token is dropped when the mass *before* it already exceeds top_p, so
    # the token that crosses the threshold is kept and the most likely token
    # always survives.
    outside_nucleus = cumulative - sorted_probs > top_p
    sorted_probs = sorted_probs.masked_fill(outside_nucleus, 0.0)
    sorted_probs = sorted_probs / sorted_probs.sum()

    choice = torch.multinomial(sorted_probs, 1)
    return int(sorted_indices[choice].item())

# ---------------------------------------------------------
# Non-streaming (simple single-call generate)
# ---------------------------------------------------------

async def handle_non_stream(req: ChatRequest):
    """Generate the whole completion in one ``model.generate`` call.

    Returns an OpenAI-style ``chat.completion`` dict with the decoded text,
    a ``finish_reason`` ("stop" when generation ended on EOS, "length" when
    the ``max_tokens`` budget was exhausted) and token usage counts.

    NOTE: ``model.generate`` runs synchronously inside this coroutine, so the
    event loop is blocked for the duration of the generation.
    """
    prompt = build_prompt(req.messages)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    gen_kwargs = {
        "max_new_tokens": req.max_tokens,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if req.temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=req.temperature, top_p=req.top_p)
    else:
        # Greedy decoding: sampling parameters are not used, and passing them
        # together with do_sample=False triggers generation-config warnings.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        gen_ids = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + continuation; keep only the new tokens.
    input_len = inputs["input_ids"].shape[1]
    new_tokens = gen_ids[0, input_len:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    prompt_tokens = int(inputs["input_ids"].numel())
    completion_tokens = int(new_tokens.numel())
    total_tokens = prompt_tokens + completion_tokens

    # Mirror the streaming handler: "stop" on EOS, "length" when the token
    # budget ran out before EOS was produced.
    ended_on_eos = (
        completion_tokens > 0 and int(new_tokens[-1]) == tokenizer.eos_token_id
    )
    hit_limit = completion_tokens >= req.max_tokens and not ended_on_eos
    finish_reason = "length" if hit_limit else "stop"

    resp = {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model or MODEL_NAME,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": finish_reason,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        },
    }
    return resp

# ---------------------------------------------------------
# Real per-token streaming with KV cache
# ---------------------------------------------------------

async def handle_stream(req: ChatRequest):
    """Stream a completion token by token as OpenAI-style server-sent events.

    The prompt is run through the model once; every later step feeds only the
    newly sampled token together with the cached keys/values. The stream
    consists of a role chunk, zero or more content chunks, a final chunk
    carrying ``finish_reason`` ("stop" on EOS, "length" when ``max_tokens`` is
    reached) and the ``[DONE]`` sentinel.
    """
    prompt = build_prompt(req.messages)
    request_id = f"chatcmpl-{uuid.uuid4().hex}"
    model_name = req.model or MODEL_NAME
    created = int(time.time())

    def sse_chunk(delta, finish_reason=None) -> str:
        """Format one ``chat.completion.chunk`` as an SSE ``data:`` frame."""
        payload = {
            "id": request_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model_name,
            "choices": [
                {
                    "index": 0,
                    "delta": delta,
                    "finish_reason": finish_reason,
                }
            ],
        }
        return f"data: {json.dumps(payload)}\n\n"

    async def event_stream():
        # 1) initial input: full prompt
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = inputs["input_ids"]  # [1, seq_len]
        attention_mask = inputs["attention_mask"]
        eos_id = tokenizer.eos_token_id

        # Send initial chunk with role (OpenAI-style)
        yield sse_chunk({"role": "assistant"})

        past_key_values = None
        generated = []       # sampled token ids, EOS excluded
        emitted_text = ""    # text already sent to the client
        finish_reason = "length"

        for _ in range(req.max_tokens):
            # 2) forward pass: full prompt on the first step, afterwards only
            # the last token plus the cached keys/values
            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    past_key_values=past_key_values,
                    use_cache=True,
                )
            past_key_values = outputs.past_key_values

            # 3) sample next token from the last position's logits
            next_token_id = sample_next_token(
                outputs.logits[0, -1, :], req.temperature, req.top_p
            )

            if next_token_id == eos_id:
                finish_reason = "stop"
                break
            generated.append(next_token_id)

            # 4) decode everything generated so far and emit only the new
            # suffix. Decoding a single token in isolation drops the leading
            # space of word-initial SentencePiece pieces and splits multi-byte
            # characters; decoding the running sequence keeps the streamed
            # text identical to what the non-streaming handler returns.
            text_so_far = tokenizer.decode(generated, skip_special_tokens=True)
            # A trailing U+FFFD means the last character is still incomplete;
            # hold it back until the following token(s) finish it.
            if not text_so_far.endswith("\ufffd"):
                new_text = text_so_far[len(emitted_text):]
                if new_text:
                    emitted_text = text_so_far
                    yield sse_chunk({"content": new_text})

            # 5) prepare for next step: feed only this token, the prompt is
            # already in the KV cache
            input_ids = torch.tensor([[next_token_id]], device=device)
            attention_mask = None  # not strictly needed when using past with single token

            # let event loop breathe a bit
            await asyncio.sleep(0)

        # Flush any text that was held back when generation stopped.
        final_text = tokenizer.decode(generated, skip_special_tokens=True)
        if len(final_text) > len(emitted_text):
            yield sse_chunk({"content": final_text[len(emitted_text):]})

        # final empty delta with finish_reason
        yield sse_chunk({}, finish_reason)
        yield "data: [DONE]\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")

# ---------------------------------------------------------
# Main endpoint
# ---------------------------------------------------------

@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    # Route on the `stream` flag: SSE streaming handler or single-response handler.
    handler = handle_stream if req.stream else handle_non_stream
    return await handler(req)