<a href="https://colab.research.google.com/github/ciro-greco/AI-engineering-IEOR4574E001/blob/main/week_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language models are completion engines


# Building a Bigram Language Model from Scratch


In [None]:
"""
Bigram Language Model (from scratch)

What this cell does
- Tokenizes a tiny corpus into words
- Builds bigram counts: P(next_word | previous_word) estimated by relative frequency
- Provides helper functions to inspect next-word probabilities and to predict greedily

How to extend
- Add more sentences to the corpus to get richer predictions
- Swap in a different tokenizer (e.g., handle punctuation, numbers, accents)
- Turn counts into smoothed probabilities (e.g., add-1/Laplace, Kneser-Ney) for unseen words
"""

from collections import defaultdict, Counter
import re

# A tiny toy corpus. In practice, expand this to improve estimates.
corpus = """
new york city is big
new york times is popular
the cat sat on the mat
the cat sat on the floor
peanut butter and jelly is tasty
peanut butter and bananas are great
i love large language models
large language models predict the next token
language models are completion engines
"""

def tokenize(text):
    """
    Turn raw text into a list of lowercase word tokens.

    Steps:
    - Lowercase the input
    - Drop every character that is not an ASCII letter (a-z) or whitespace,
      so digits, punctuation and accented letters are removed
      (e.g. "can't" becomes "cant")
    - Split on runs of whitespace

    Returns a list of tokens (an empty list for empty or blank input).
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    # str.split() with no arguments never yields empty strings,
    # so no additional filtering of the pieces is required.
    return letters_only.split()

# Tokenize the entire corpus once (flat token list, kept for inspection)
tokens = tokenize(corpus)

# Bigram frequency table:
# For each previous word, store a Counter of next words and their counts.
# Example: bigrams['new'] might be Counter({'york': 2})
#
# Bigrams are counted one corpus line (sentence) at a time so that the last
# word of one sentence is never paired with the first word of the next one
# (e.g. "big" -> "new" is not a real transition in this corpus).
bigrams = defaultdict(Counter)
for line in corpus.splitlines():
    line_tokens = tokenize(line)
    # zip stops at the shorter sequence, so empty/one-word lines add nothing
    for prev, nxt in zip(line_tokens, line_tokens[1:]):
        bigrams[prev][nxt] += 1

def next_word_probs(prev_word, topk=5):
    """
    Look up the empirical next-word distribution that follows prev_word.

    Returns a list of at most topk (word, probability) tuples, most frequent
    first, where

        P(next | prev) = count(prev, next) / sum_over_v count(prev, v)

    An empty list is returned when prev_word has no recorded successors.
    """
    followers = bigrams.get(prev_word)
    if not followers:
        # Word never appeared as a "previous" word
        return []
    denominator = sum(followers.values())
    if not denominator:
        # No outgoing transitions with a positive total count
        return []
    # most_common() yields (word, count) pairs ordered by descending count
    ranked = followers.most_common()
    return [(word, n / denominator) for word, n in ranked][:topk]

def predict_next(prev_word):
    """
    Greedy one-step prediction.

    Asks next_word_probs for the single best continuation of prev_word and
    returns that word, or None when nothing is known about prev_word.
    """
    top = next_word_probs(prev_word, topk=1)
    if not top:
        return None
    best_word, _probability = top[0]
    return best_word

def suggest_after(prefix):
    """
    Suggest next words for a (possibly multi-word) prefix.

    The prefix is tokenized, and because this is a bigram model only its
    final token is used as the conditioning word.

    Example: suggest_after("the cat sat on the")
             conditions on 'the'

    Returns the list produced by next_word_probs, or [] when the prefix
    contains no tokens.
    """
    prefix_tokens = tokenize(prefix)
    # No tokens -> nothing to condition on
    return next_word_probs(prefix_tokens[-1]) if prefix_tokens else []

# --- Demo: explore conditional next-word probabilities ---
# suggest_after() conditions only on the LAST token of each prefix, so
# "new york" is looked up as 'york' and "peanut butter and" as 'and'.
print("After 'new york' →", suggest_after("new york"))
print("After 'the cat sat on the' →", suggest_after("the cat sat on the"))
print("After 'peanut butter and' →", suggest_after("peanut butter and"))

# --- Demo: greedy one-step predictions from single words ---
# predict_next() returns the single most frequent follower, or None if unseen.
print("\nGreedy prediction examples:")
print("new →", predict_next("new"))
print("peanut →", predict_next("peanut"))
print("language →", predict_next("language"))


After 'new york' → [('city', 0.5), ('times', 0.5)]
After 'the cat sat on the' → [('cat', 0.4), ('mat', 0.2), ('floor', 0.2), ('next', 0.2)]
After 'peanut butter and' → [('jelly', 0.5), ('bananas', 0.5)]

Greedy prediction examples:
new → york
peanut → butter
language → models


# TODO: fix the following code, which needs a Hugging Face API key

In [None]:
!pip install transformers accelerate sentencepiece torch --upgrade



In [None]:
import torch
from transformers import pipeline

# Use the first CUDA GPU when one is available; otherwise run on CPU (-1).
# Hard-coding device=0 assumes a GPU runtime, which is not guaranteed.
device = 0 if torch.cuda.is_available() else -1

# Small baseline (public checkpoint, no authentication needed)
small = pipeline("text-generation", model="gpt2", device=device)

# Larger instruct model (about 7B parameters) – adjust to your GPU quota if needed.
# NOTE: "mistralai/Mistral-7B-Instruct-v0.2" is a gated repo on the Hugging Face Hub.
# Loading it fails with "OSError: You are trying to access a gated repo" unless you
# have been granted access on the model page and are authenticated, e.g. by
# storing a token as the `HF_TOKEN` secret in Colab and restarting the session.
larger = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", device=device)

prompt_translate = "Translate to French: The book is on the table."
prompt_summarize = "Summarize in one short sentence: Large language models can write code and translate."

# Each pipeline call returns a list with one dict per prompt; "generated_text"
# holds the model's output text.
print("=== SMALL (gpt2) → translation ===")
print(small(prompt_translate, max_new_tokens=40)[0]["generated_text"], "\n")

print("=== LARGER (Mistral-7B-Instruct) → translation ===")
print(larger(prompt_translate, max_new_tokens=40)[0]["generated_text"], "\n")

print("=== SMALL (gpt2) → summarize ===")
print(small(prompt_summarize, max_new_tokens=40)[0]["generated_text"], "\n")

print("=== LARGER (Mistral-7B-Instruct) → summarize ===")
print(larger(prompt_summarize, max_new_tokens=40)[0]["generated_text"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2.
401 Client Error. (Request ID: Root=1-68c6f149-06855b55409ad42e45fb12e9;145da015-3b75-4175-91ba-9cc1c210228b)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in.

# Tokenization


In [None]:
!pip install transformers torch --quiet

In [None]:
from transformers import AutoTokenizer

# Load GPT-2's tokenizer and split one sentence into subword tokens.
tok = AutoTokenizer.from_pretrained("gpt2")  # small, ungated
text = "I can't wait to build AI applications."
# text -> list of integer token ids
ids = tok.encode(text)
# ids -> their string pieces.
# NOTE: this rebinds the global name `tokens` that the bigram cell also uses.
tokens = tok.convert_ids_to_tokens(ids)

print("TEXT:", text)
print("TOKENS:", tokens)
print("Token count:", len(tokens))


TEXT: I can't wait to build AI applications.
TOKENS: ['I', 'Ġcan', "'t", 'Ġwait', 'Ġto', 'Ġbuild', 'ĠAI', 'Ġapplications', '.']
Token count: 9


# Stochasticity

In [None]:
!pip install transformers torch sentencepiece --quiet

In [None]:
from transformers import pipeline

# Text-to-text pipeline backed by google/flan-t5-large; device=-1 keeps it on CPU.
t5 = pipeline("text2text-generation", model="google/flan-t5-large", device=-1)

# Each call returns a list with one dict per input; we print the first
# result's "generated_text" for a translation prompt and a summary prompt.
print("FR:", t5("Translate to French: The book is on the table.", max_new_tokens=40)[0]["generated_text"])
print("SUM:", t5("Summarize in one sentence: Large language models can write code and translate.",
                max_new_tokens=40)[0]["generated_text"])


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


FR: L'ouvrage est sur la table.
SUM: Large language models can write code and translate.


# Representing language as vectors: Bag of words

In [None]:
"""
Bag of Words in two parts:
1) Build a BoW representation from scratch to see every decision.
2) Repeat with scikit-learn's CountVectorizer and TfidfVectorizer.

Students can paste this into a notebook and run as-is.
"""

import re
import numpy as np
from collections import Counter

# --- 0. Toy corpus: 4 short "documents" that reuse the word 'bank' in two senses ---
docs = [
    "The bank raised interest rates today.",
    "Sailing along the river bank is relaxing.",
    "The central bank sets policy rates each quarter.",
    "We sat on the riverbank and watched boats."
]

# --- 1. Normalization: lowercase the text to reduce trivial variants ('Bank' vs 'bank') ---
docs_lower = [d.lower() for d in docs]

# --- 2. Tokenization: split on word boundaries.
# We keep letters and digits; this treats 'riverbank' as ONE token and 'river bank' as TWO.
# This is intentional, so we can talk about how tokenization choices change the vocabulary.
def tokenize(text):
    """Return every maximal run of word characters in text, in order."""
    # \w+ matches one or more word characters (letters, digits, underscore);
    # the surrounding \b anchors pin each match to word boundaries.
    return [match.group(0) for match in re.finditer(r"\b\w+\b", text)]

tokenized = [tokenize(d) for d in docs_lower]

# --- 3. Optional stopword removal.
# For pedagogy we use a tiny, explicit list so students see the effect.
# In practice you might use scikit-learn's 'english' list or a domain-specific list.
stopwords = set("""
a an the and or of to in is are was were be being been on each
""".split())

tokenized_no_stop = [[t for t in toks if t not in stopwords] for toks in tokenized]

# --- 4. Build the vocabulary: a sorted list of unique tokens across the corpus.
# Sorting stabilizes the feature order for display. Frequency-based orders are also common.
all_tokens = [t for doc in tokenized_no_stop for t in doc]
vocab = sorted(set(all_tokens))                 # list of unique tokens
token2id = {t: i for i, t in enumerate(vocab)}  # mapping token -> column index

print("Vocabulary size:", len(vocab))
print("Vocabulary (token -> id):", token2id, "\n")

# --- 5. Vectorize: fill a document-term matrix X of raw counts (n_docs rows, |V| columns).
X = np.zeros((len(docs), len(vocab)), dtype=int)

for row_idx, doc_tokens in enumerate(tokenized_no_stop):
    # Counter gives token -> frequency within this one document
    for token, n in Counter(doc_tokens).items():
        X[row_idx, token2id[token]] = n

# --- 6. Print the matrix as a tab-separated table: a header row of vocabulary
# terms, then one row per document labelled d1, d2, ...
header = ["doc", *vocab]
print("\t".join(header))
for doc_number, counts_row in enumerate(X, start=1):
    print("\t".join([f"d{doc_number}", *map(str, counts_row)]))

# --- 7. Optional: show a quick cosine similarity between documents in count space.
# Cosine highlights direction rather than magnitude (common for sparse text features).
def cosine(u, v):
    """Cosine similarity of arrays u and v; 0.0 when either has zero norm."""
    dot = (u * v).sum()
    scale = np.linalg.norm(u) * np.linalg.norm(v)
    if scale == 0:
        # Avoid dividing by zero for an all-zero vector
        return 0.0
    return float(dot / scale)

print("\nCosine similarity d1 vs d3 (both about monetary policy):",
      round(cosine(X[0], X[2]), 3))
print("Cosine similarity d1 vs d2 (finance vs river):",
      round(cosine(X[0], X[1]), 3))

# === Part 2: Use scikit-learn to do the same thing with one-liners =====================

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# CountVectorizer builds the vocabulary and document-term counts automatically.
# We supply the same token pattern and stopwords to mirror the manual build.
count_vec = CountVectorizer(
    lowercase=True,
    token_pattern=r"\b\w+\b",
    stop_words=list(stopwords)  # could also use 'english' for a standard list
)

X_counts = count_vec.fit_transform(docs)  # sparse matrix shape: (n_docs, |V|)
feature_names = count_vec.get_feature_names_out()

print("\n[scikit-learn] CountVectorizer features:", feature_names.tolist())
print("[scikit-learn] Document-term matrix shape:", X_counts.shape)

# TF-IDF re-weights terms: common across the corpus get down-weighted,
# rare but discriminative terms get up-weighted.
tfidf_vec = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"\b\w+\b",
    stop_words=list(stopwords),
    norm="l2"  # unit-length rows; good default for cosine similarity
)

X_tfidf = tfidf_vec.fit_transform(docs)
tfidf_features = tfidf_vec.get_feature_names_out()

# Show the top 5 TF-IDF terms per document to illustrate re-weighting.
for i, d in enumerate(docs):
    row = X_tfidf[i].toarray().ravel()          # dense 1-D weights for document i
    top_idx = row.argsort()[::-1][:5]           # indices of the 5 largest weights
    # Cast to a built-in float so the list prints as 0.5 rather than np.float64(0.5);
    # zero-weight terms (absent from the document) are dropped.
    top_terms = [(tfidf_features[j], round(float(row[j]), 3)) for j in top_idx if row[j] > 0]
    print(f"\n[TF-IDF] Doc {i+1} top terms:", top_terms)

# Teaching notes (kept as comments so the notebook does not echo them as cell output):
# - Change 'riverbank' to 'river bank' and re-run to show how tokenization choices affect features.
# - Swap the stopword list for 'english' to show vocabulary shrinkage.
# - Add bigrams with CountVectorizer(ngram_range=(1,2)) to show how word order can be incorporated.
# - Emphasize that BoW is sparse and high-dimensional, which motivates dense embeddings next.


Vocabulary size: 18
Vocabulary (token -> id): {'along': 0, 'bank': 1, 'boats': 2, 'central': 3, 'interest': 4, 'policy': 5, 'quarter': 6, 'raised': 7, 'rates': 8, 'relaxing': 9, 'river': 10, 'riverbank': 11, 'sailing': 12, 'sat': 13, 'sets': 14, 'today': 15, 'watched': 16, 'we': 17} 

doc	along	bank	boats	central	interest	policy	quarter	raised	rates	relaxing	river	riverbank	sailing	sat	sets	today	watched	we
d1	0	1	0	0	1	0	0	1	1	0	0	0	0	0	0	1	0	0
d2	1	1	0	0	0	0	0	0	0	1	1	0	1	0	0	0	0	0
d3	0	1	0	1	0	1	1	0	1	0	0	0	0	0	1	0	0	0
d4	0	0	1	0	0	0	0	0	0	0	0	1	0	1	0	0	1	1

Cosine similarity d1 vs d3 (both about monetary policy): 0.365
Cosine similarity d1 vs d2 (finance vs river): 0.2

[scikit-learn] CountVectorizer features: ['along', 'bank', 'boats', 'central', 'interest', 'policy', 'quarter', 'raised', 'rates', 'relaxing', 'river', 'riverbank', 'sailing', 'sat', 'sets', 'today', 'watched', 'we']
[scikit-learn] Document-term matrix shape: (4, 18)

[TF-IDF] Doc 1 top terms: [('raised', np.float64

"\nTeaching notes:\n- Change 'riverbank' to 'river bank' and re-run to show how tokenization choices affect features.\n- Swap the stopword list for 'english' to show vocabulary shrinkage.\n- Add bigrams with CountVectorizer(ngram_range=(1,2)) to show how word order can be incorporated.\n- Emphasize that BoW is sparse and high-dimensional, which motivates dense embeddings next.\n"

# Dense language embeddings

In [None]:
"""
What this cell shows:
A) Compare sparse TF-IDF vs dense sentence embeddings using cosine similarity.
B) Show contextual word embeddings for the token 'bank' in two different sentences.

Install once in a fresh environment:
!pip install scikit-learn transformers sentence-transformers torch --quiet

Notes:
- First run will download model weights and tokenizers.
- GPU is optional. CPU works for this small demo.
"""

import numpy as np

# -------------------------------
# Utilities: cosine similarity
# -------------------------------
def cosine(u, v):
    """
    Cosine similarity between two array-likes.

    Both inputs are converted to float NumPy arrays first. Returns a Python
    float, or 0.0 when the product of the two norms is zero.
    """
    a, b = (np.asarray(x).astype(float) for x in (u, v))
    dot = float((a * b).sum())
    scale = float(np.linalg.norm(a) * np.linalg.norm(b))
    if scale == 0.0:
        return 0.0
    return dot / scale

# ---------------------------------------------
# Part A. Sparse TF-IDF vs dense sentence embeddings
# ---------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Two pairs of sentences.
# Pair 1: financial sense of "bank"
# Pair 2: river sense of "bank"
s1 = "The central bank raised interest rates."
s2 = "The national bank increased borrowing costs."
r1 = "We had a picnic by the river bank."
r2 = "They launched the kayak from the river bank."

corpus = [s1, s2, r1, r2]

# --- Sparse baseline: TF-IDF over the tiny corpus ---
tfidf = TfidfVectorizer(lowercase=True)
X = tfidf.fit_transform(corpus).toarray()

# Similarities with sparse vectors
sim_s_sparse = cosine(X[0], X[1])  # finance vs finance
sim_r_sparse = cosine(X[2], X[3])  # river vs river
sim_cross_sparse = cosine(X[0], X[2])  # finance vs river

print("A) TF-IDF cosine similarities (sparse counts)")
print(f"   finance vs finance: {sim_s_sparse:.3f}")
print(f"   river vs river:     {sim_r_sparse:.3f}")
print(f"   finance vs river:   {sim_cross_sparse:.3f}")

# --- Dense sentence embeddings: pre-trained encoder ---
# Good default for classroom demos: small, fast, widely used
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
E = encoder.encode(corpus, normalize_embeddings=True)  # L2-normalized works well with cosine

sim_s_dense = cosine(E[0], E[1])
sim_r_dense = cosine(E[2], E[3])
sim_cross_dense = cosine(E[0], E[2])

print("\nA) Sentence-embedding cosine similarities (dense)")
print(f"   finance vs finance: {sim_s_dense:.3f}")
print(f"   river vs river:     {sim_r_dense:.3f}")
print(f"   finance vs river:   {sim_cross_dense:.3f}")

"""
Interpretation:
- TF-IDF often does OK when words overlap, but it still relies on exact tokens.
- Dense embeddings capture paraphrase and synonymy, so 'raised interest rates' and
  'increased borrowing costs' are very close even with fewer shared words.
- Cross-sense similarity drops because the surrounding contexts differ.
"""

# ---------------------------------------------------
# Part B. Contextual token embeddings with BERT
# ---------------------------------------------------
from transformers import AutoTokenizer, AutoModel
import torch

# We will extract the vector for the token 'bank' in two different sentences
sent_fin = "The central bank raised interest rates today."
sent_riv = "They sat by the river bank and watched the boats."

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
mdl = AutoModel.from_pretrained("bert-base-uncased")
mdl.eval()  # we are only encoding

def token_vector_for_bank(text):
    """
    Extract the contextual embedding of the first 'bank' token in text.

    Uses the module-level tokenizer `tok` and model `mdl`.

    Args:
      text: a sentence expected to contain the word 'bank'

    Returns:
      token_vec: the contextual embedding (last hidden state) for the first 'bank' subword,
                 as a NumPy array
      tokens: the list of WordPiece tokens for inspection
      idx: the index of the 'bank' token we used

    Raises:
      ValueError: if no token equal to or containing 'bank' is found
    """
    # Encode with special tokens and get model outputs
    encoded = tok(text, return_tensors="pt")
    with torch.no_grad():
        out = mdl(**encoded)  # last_hidden_state shape: [1, seq_len, hidden_dim]
    hidden = out.last_hidden_state[0]  # [seq_len, 768] for BERT base

    # Convert ids to tokens so we can find 'bank'
    tokens = tok.convert_ids_to_tokens(encoded["input_ids"][0])
    if "bank" in tokens:
        # Exact match: first position where the token equals 'bank'
        idx = tokens.index("bank")
    else:
        # Fallback: 'bank' may be part of a larger piece or a '##' continuation piece
        idx = next(
            (i for i, t in enumerate(tokens) if "bank" in t.replace("##", "")),
            None,
        )
        if idx is None:
            # Fail with a clear message instead of leaking a bare StopIteration
            raise ValueError(f"No token containing 'bank' found in: {text!r}")
    # detach/cpu make the conversion to NumPy safe regardless of grad mode or device
    token_vec = hidden[idx].detach().cpu().numpy()
    return token_vec, tokens, idx

vec_fin, toks_fin, i_fin = token_vector_for_bank(sent_fin)
vec_riv, toks_riv, i_riv = token_vector_for_bank(sent_riv)

sim_bank = cosine(vec_fin, vec_riv)

print("\nB) Contextual token embeddings with BERT")
print("   Finance sentence tokens:", toks_fin)
print("   River sentence tokens:  ", toks_riv)
print(f"   Cosine between 'bank' vectors (finance vs river): {sim_bank:.3f}")

# Interpretation (kept as comments so the notebook does not echo it as cell output):
# - Same surface word 'bank', two different meanings.
# - Because BERT conditions on context, the 'bank' vector in a finance sentence
#   differs from the 'bank' vector in a river sentence, so cosine similarity is lower.
# - This is the key value of contextual embeddings vs static word embeddings.


A) TF-IDF cosine similarities (sparse counts)
   finance vs finance: 0.120
   river vs river:     0.259
   finance vs river:   0.112


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


A) Sentence-embedding cosine similarities (dense)
   finance vs finance: 0.644
   river vs river:     0.445
   finance vs river:   0.171


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


B) Contextual token embeddings with BERT
   Finance sentence tokens: ['[CLS]', 'the', 'central', 'bank', 'raised', 'interest', 'rates', 'today', '.', '[SEP]']
   River sentence tokens:   ['[CLS]', 'they', 'sat', 'by', 'the', 'river', 'bank', 'and', 'watched', 'the', 'boats', '.', '[SEP]']
   Cosine between 'bank' vectors (finance vs river): 0.440


"\nInterpretation:\n- Same surface word 'bank', two different meanings.\n- Because BERT conditions on context, the 'bank' vector in a finance sentence\n  differs from the 'bank' vector in a river sentence, so cosine similarity is lower.\n- This is the key value of contextual embeddings vs static word embeddings.\n"

# Tokenization


In [None]:
"""
Slide 5 demo: Tokenization 101

What this cell shows:
A) Bytes vs characters vs words on a tricky string.
B) Subword tokenizers: BPE (GPT-2), WordPiece (BERT), SentencePiece (T5).
C) How token counts differ across strategies and why it matters.

Run once to install in a fresh environment:
!pip install transformers sentencepiece --quiet
# Note: First run will download tokenizers. Internet is required for that step.

This cell is designed for quick classroom runs on CPU.
"""

import re
import unicodedata

# --------------------------
# A) Bytes vs characters vs words
# --------------------------
text = (
    "I can't believe Llama 3.1 handles accents naïvely, "
    "URLs like https://exámple.com/docs?id=123, and Chinese 測試."
)

print("Original text:")
print(text)

# Unicode normalization helps expose hidden differences.
# 'naïve' can be precomposed (NFC) or decomposed (NFD). Many pipelines normalize to NFC.
text_nfc = unicodedata.normalize("NFC", text)
text_nfd = unicodedata.normalize("NFD", text)
print("\nUnicode normalization:")
print(" - NFC equals NFD?", text_nfc == text_nfd)

# Bytes: show how the string looks when encoded to UTF-8
b = text.encode("utf-8")
print("\nA) Bytes")
print(" - Number of bytes:", len(b))
print(" - First 24 bytes:", list(b[:24]))

# Characters: one codepoint per element
chars = list(text)
print("\nA) Characters")
print(" - Number of characters:", len(chars))
print(" - First 24 characters:", chars[:24])

# Words: a simple regex-based tokenizer
# Note: This is just for illustration. Real languages need more robust rules.
words = re.findall(r"\b\w+\b", text.lower())
print("\nA) Words")
print(" - Number of words:", len(words))
print(" - Sample words:", words[:12])

# --------------------------
# B) Subword tokenizers via Hugging Face
#    BPE (GPT-2), WordPiece (BERT), SentencePiece (T5)
# --------------------------
from transformers import AutoTokenizer

tokenizers = {
    "BPE (GPT-2)": "gpt2",                       # byte-level BPE
    "WordPiece (BERT base uncased)": "bert-base-uncased",
    "SentencePiece (T5-small)": "t5-small"
}

print("\nB) Subword tokenization")
for label, model_id in tokenizers.items():
    tok = AutoTokenizer.from_pretrained(model_id)
    enc = tok(text)  # enc is a dict with 'input_ids' and 'attention_mask'
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)
    print(f" - {label}: {len(ids)} tokens")
    # Show a small slice to keep output readable
    print("   Sample tokens:", toks[:16])

# --------------------------
# C) Why token counts matter
# --------------------------
print("\nC) Practical notes")
print(" - Different tokenizers produce different lengths on the same text.")
print(" - Longer sequences mean more compute and possibly higher API cost.")
print(" - Byte-level BPE and SentencePiece handle non-Latin text and URLs more gracefully.")
print(" - Always inspect splits for your domain strings (code, logs, product names, multi-lingual input).")

# Teaching variations (kept as comments so the notebook does not echo them as cell output):
# - Swap the sample text for something from your domain, for example log lines or product SKUs.
# - Change normalization to 'NFD' and re-tokenize to see how accent decomposition affects splits.
# - Try longer numeric strings or camelCase to see how word vs subword tokenizers differ.
# - For students who are curious, compare token counts across models you plan to use in projects.


Original text:
I can't believe Llama 3.1 handles accents naïvely, URLs like https://exámple.com/docs?id=123, and Chinese 測試.

Unicode normalization:
 - NFC equals NFD? False

A) Bytes
 - Number of bytes: 115
 - First 24 bytes: [73, 32, 99, 97, 110, 39, 116, 32, 98, 101, 108, 105, 101, 118, 101, 32, 76, 108, 97, 109, 97, 32, 51, 46]

A) Characters
 - Number of characters: 109
 - First 24 characters: ['I', ' ', 'c', 'a', 'n', "'", 't', ' ', 'b', 'e', 'l', 'i', 'e', 'v', 'e', ' ', 'L', 'l', 'a', 'm', 'a', ' ', '3', '.']

A) Words
 - Number of words: 21
 - Sample words: ['i', 'can', 't', 'believe', 'llama', '3', '1', 'handles', 'accents', 'naïvely', 'urls', 'like']

B) Subword tokenization


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

 - BPE (GPT-2): 40 tokens
   Sample tokens: ['I', 'Ġcan', "'t", 'Ġbelieve', 'ĠLl', 'ama', 'Ġ3', '.', '1', 'Ġhandles', 'Ġaccents', 'ĠnaÃ¯ve', 'ly', ',', 'ĠURLs', 'Ġlike']
 - WordPiece (BERT base uncased): 40 tokens
   Sample tokens: ['[CLS]', 'i', 'can', "'", 't', 'believe', 'll', '##ama', '3', '.', '1', 'handles', 'accents', 'naive', '##ly', ',']


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

 - SentencePiece (T5-small): 46 tokens
   Sample tokens: ['▁I', '▁can', "'", 't', '▁believe', '▁L', 'l', 'am', 'a', '▁', '3.1', '▁handles', '▁accent', 's', '▁', 'n']

C) Practical notes
 - Different tokenizers produce different lengths on the same text.
 - Longer sequences mean more compute and possibly higher API cost.
 - Byte-level BPE and SentencePiece handle non-Latin text and URLs more gracefully.
 - Always inspect splits for your domain strings (code, logs, product names, multi-lingual input).


"\nTeaching variations:\n- Swap the sample text for something from your domain, for example log lines or product SKUs.\n- Change normalization to 'NFD' and re-tokenize to see how accent decomposition affects splits.\n- Try longer numeric strings or camelCase to see how word vs subword tokenizers differ.\n- For students who are curious, compare token counts across models you plan to use in projects.\n"

In [None]:
"""
Tokenization 101 - Companion visualizer

What this cell does
- Shows how different tokenizers split the same string.
- Compares three families:
  * BPE (GPT-2)
  * WordPiece (BERT base uncased)
  * SentencePiece (T5-small)
- Prints a readable token stream with ' | ' separators and a token-count summary.

Why this matters
- Token boundaries differ across models, which affects sequence length, cost, and behavior.
- This demo complements the slide by making those differences concrete.

One-time setup (in a fresh environment):
!pip install transformers sentencepiece --quiet
"""

import re
import unicodedata
from typing import List, Dict

from transformers import AutoTokenizer

# ---------------------------
# 1) Inputs to probe
#    - Include accents, URLs, code, non-Latin script, and a polysemous example ("river bank").
#    - Avoid emojis to keep output consistent in all terminals.
# ---------------------------
texts: List[str] = [
    "They sat by the river bank and watched the boats.",
    "I cant believe tokenizers handle accents näively and names like Zürich.",
    "Visit https://exámple.com/docs?id=123&lang=en for details.",
    "银行业务很复杂",
    "def hello_world(): print('HelloWorld123')"
]

# Normalize to NFC (common production choice); toggle to "NFD" to show decomposition effects.
NORMALIZE = "NFC"  # or None, or "NFD"

# ---------------------------
# 2) Tokenizers to compare
# ---------------------------
tokenizers: Dict[str, str] = {
    "BPE (GPT-2)": "gpt2",                         # byte-level BPE
    "WordPiece (BERT base uncased)": "bert-base-uncased",
    "SentencePiece (T5-small)": "t5-small"
}

# ---------------------------
# 3) Helpers
# ---------------------------
def clean_tokens(tokens: List[str]) -> List[str]:
    """
    Make tokens more readable without losing boundaries.

    - Strip the leading WordPiece '##' continuation marker from a piece; token
      boundaries stay explicit because the caller joins pieces with separators.
      Only the prefix is removed, so any later '##' inside the piece survives.
    - A token that is exactly '##' is kept unchanged so the result never
      contains an empty piece.
    - Keep every other token as-is, including special tokens
      (e.g., [CLS], [SEP], </s>).

    Args:
      tokens: token strings as returned by a tokenizer

    Returns:
      A new list of the same length with the cleaned tokens.
    """
    marker = "##"
    out = []
    for t in tokens:
        if t.startswith(marker) and len(t) > len(marker):
            out.append(t[len(marker):])  # drop only the leading marker
        else:
            out.append(t)
    return out

# Tokenizers already loaded in this session, keyed by model id. Loading is the
# slow part, and tokenize_with is called once per (text, model) pair.
_TOKENIZER_CACHE: Dict[str, object] = {}

def tokenize_with(model_id: str, text: str) -> List[str]:
    """
    Tokenize text with the tokenizer identified by model_id.

    The tokenizer is loaded on first use and reused on later calls.

    Args:
      model_id: identifier passed to AutoTokenizer.from_pretrained
      text: the string to tokenize

    Returns:
      The token strings (special tokens included), passed through clean_tokens.
    """
    tok = _TOKENIZER_CACHE.get(model_id)
    if tok is None:
        tok = AutoTokenizer.from_pretrained(model_id)
        _TOKENIZER_CACHE[model_id] = tok
    # add_special_tokens=True to show model-specific control tokens when applicable
    enc = tok(text, return_tensors=None, add_special_tokens=True)
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)
    return clean_tokens(toks)

def show_for_text(text: str) -> None:
    """
    Print a tokenizer-by-tokenizer breakdown of one input string.

    The text is Unicode-normalized with the module-level NORMALIZE form when
    that setting is truthy. The function then prints the text with its
    character and UTF-8 byte counts, one preview line per entry in the
    module-level `tokenizers` mapping, and a closing table of token counts.
    """
    text_proc = unicodedata.normalize(NORMALIZE, text) if NORMALIZE else text

    # Basic size statistics for the (possibly normalized) string
    n_bytes = len(text_proc.encode("utf-8"))
    n_chars = len(text_proc)

    print("\n" + "=" * 88)
    print("TEXT:", text_proc)
    print(f"chars={n_chars}  bytes(utf-8)={n_bytes}")

    # One preview line per tokenizer family; remember counts for the summary
    summary = []
    for label, model_id in tokenizers.items():
        pieces = tokenize_with(model_id, text_proc)
        n_tokens = len(pieces)
        summary.append((label, n_tokens))
        rendered = " | ".join(pieces)
        # Cap very long previews so the console stays readable
        if len(rendered) > 300:
            rendered = rendered[:300] + " ... "
        print(f" - {label:32s} ({n_tokens:3d} tokens): {rendered}")

    # Aligned summary table
    print("   Token counts:")
    for label, n_tokens in summary:
        print(f"     {label:32s} : {n_tokens:3d}")

# ---------------------------
# 4) Run the comparison
# ---------------------------
for t in texts:
    show_for_text(t)

# Try this (kept as comments so the notebook does not echo it as cell output):
# - Switch NORMALIZE to "NFD" and re-run. Watch token boundaries around accented characters.
# - Replace the URL with something longer or add camelCase identifiers to see how each tokenizer behaves.
# - Add samples from your domain (logs, SKUs, chat excerpts, code snippets) to anticipate sequence lengths.
# - If you plan to use a specific model in projects, add its tokenizer here and compare token counts.



TEXT: They sat by the river bank and watched the boats.
chars=49  bytes(utf-8)=49
 - BPE (GPT-2)                      ( 11 tokens): They | Ġsat | Ġby | Ġthe | Ġriver | Ġbank | Ġand | Ġwatched | Ġthe | Ġboats | .
 - WordPiece (BERT base uncased)    ( 13 tokens): [CLS] | they | sat | by | the | river | bank | and | watched | the | boats | . | [SEP]
 - SentencePiece (T5-small)         ( 14 tokens): ▁They | ▁ | s | at | ▁by | ▁the | ▁river | ▁bank | ▁and | ▁watched | ▁the | ▁boats | . | </s>
   Token counts:
     BPE (GPT-2)                      :  11
     WordPiece (BERT base uncased)    :  13
     SentencePiece (T5-small)         :  14

TEXT: I cant believe tokenizers handle accents näively and names like Zürich.
chars=71  bytes(utf-8)=73
 - BPE (GPT-2)                      ( 17 tokens): I | Ġcant | Ġbelieve | Ġtoken | izers | Ġhandle | Ġaccents | Ġn | Ã¤ | ively | Ġand | Ġnames | Ġlike | ĠZ | Ã¼ | rich | .
 - WordPiece (BERT base uncased)    ( 18 tokens): [CLS] | i | can | t | believe 

'\nTry this:\n- Switch NORMALIZE to "NFD" and re-run. Watch token boundaries around accented characters.\n- Replace the URL with something longer or add camelCase identifiers to see how each tokenizer behaves.\n- Add samples from your domain (logs, SKUs, chat excerpts, code snippets) to anticipate sequence lengths.\n- If you plan to use a specific model in projects, add its tokenizer here and compare token counts.\n'

# Training static embedding with word2vec

In [None]:
!pip install torch --quiet

In [None]:
"""
Word2Vec (Skip-gram with Negative Sampling) from scratch in PyTorch
- Demonstrates model, loss, and backprop.
- Small toy corpus duplicated for quicker convergence.

One-time setup in a clean environment:
!pip install torch --quiet
"""

import math, random
from collections import Counter
import numpy as np
import torch
import torch.nn.functional as F  # <-- use F.logsigmoid here

# Pin every RNG this cell draws from (Python, NumPy, PyTorch) so shuffles,
# negative samples, and weight initialisation repeat from run to run.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# -----------------------------
# 1) Tiny corpus and preprocessing
# -----------------------------
raw_corpus = 200 * [  # repeat the six sentences to give the model more signal without big data
    "king queen prince princess royal palace throne",
    "paris france capital europe city",
    "rome italy capital europe city",
    "man woman boy girl person",
    "river bank water boat shore",
    "finance bank money loan interest",
]

# One token list per sentence, plus the flattened token stream used for counting.
sentences = list(map(str.split, raw_corpus))
words = [token for tokens in sentences for token in tokens]

# Alphabetical vocabulary, so word ids do not depend on corpus order.
freq = Counter()
freq.update(words)
vocab = sorted(freq.keys())
stoi = dict(zip(vocab, range(len(vocab))))  # word -> id
itos = dict(enumerate(vocab))               # id -> word
V = len(stoi)
print(f"Vocab size: {V}")

# -----------------------------
# 2) Build skip-gram training pairs with a symmetric window
# -----------------------------
window = 2  # context words taken on each side of the center word
pairs = []
for sentence in sentences:
    token_ids = [stoi[token] for token in sentence]
    n_tokens = len(token_ids)
    for center_pos, center_id in enumerate(token_ids):
        lo = max(center_pos - window, 0)
        hi = min(center_pos + window + 1, n_tokens)
        # Left neighbours first, then right neighbours; the center itself is skipped.
        context_ids = token_ids[lo:center_pos] + token_ids[center_pos + 1:hi]
        pairs.extend((center_id, context_id) for context_id in context_ids)  # (center, context)

print(f"Positive pairs: {len(pairs):,}")

# -----------------------------
# 3) Negative sampling distribution (unigram^0.75)
# -----------------------------
# Word frequencies in id order, flattened with the 0.75 exponent and
# renormalised so they form a probability distribution over word ids.
counts = np.asarray([freq[itos[idx]] for idx in range(V)], dtype=np.float64)
p_neg = np.power(counts, 0.75)
p_neg = p_neg / p_neg.sum()

def sample_negatives(batch_size, K):
    """
    Draw negative word ids from the global NumPy RNG according to p_neg.

    batch_size: number of positive pairs in the batch
    K:          negatives drawn per positive
    Returns an integer array of shape (batch_size, K) with values in [0, V).
    """
    shape = (batch_size, K)
    return np.random.choice(V, size=shape, p=p_neg)

# -----------------------------
# 4) Model: two embedding tables, input and output
# -----------------------------
class SGNS(torch.nn.Module):
    """
    Skip-gram with negative sampling.

    Holds two [vocab_size, dim] embedding tables: `in_embed` for center words
    and `out_embed` for context / negative words. `forward` returns the batch
    loss directly rather than raw scores.
    """

    def __init__(self, vocab_size, dim):
        super().__init__()
        self.in_embed = torch.nn.Embedding(vocab_size, dim)
        self.out_embed = torch.nn.Embedding(vocab_size, dim)
        # Classroom-stable init: context vectors start at zero, center vectors
        # start small and uniform in [-0.5/dim, 0.5/dim].
        bound = 0.5 / dim
        with torch.no_grad():
            self.out_embed.weight.zero_()
            self.in_embed.weight.uniform_(-bound, bound)

    def forward(self, center, pos, neg):
        """
        center: [B]      indices of center words (Long)
        pos:    [B]      indices of positive context words (Long)
        neg:    [B, K]   indices of negative samples (Long)

        Returns a scalar: the batch mean of
        -(log σ(u_o^T v_c) + Σ_k log σ(-u_k^T v_c)).
        """
        center_vec = self.in_embed(center)       # [B, D]
        context_vec = self.out_embed(pos)        # [B, D]
        negative_vecs = self.out_embed(neg)      # [B, K, D]

        pos_logit = (center_vec * context_vec).sum(dim=1)                   # [B]
        neg_logit = torch.einsum("bkd,bd->bk", negative_vecs, center_vec)   # [B, K]

        # Per-example log-likelihood: one positive term plus K negative terms.
        log_likelihood = F.logsigmoid(pos_logit) + F.logsigmoid(-neg_logit).sum(dim=1)

        # Minimising the negated mean log-likelihood maximises the likelihood.
        return -log_likelihood.mean()

# -----------------------------
# 5) Training loop
# -----------------------------
# Hyperparameters
dim = 50          # embedding width
K = 5             # negatives per positive
batch_size = 512
epochs = 3
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SGNS(V, dim).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.01)

# All (center, context) pairs as one int64 array so rows can be shuffled and sliced.
pairs_arr = np.array(pairs, dtype=np.int64)
num_batches = (len(pairs_arr) + batch_size - 1) // batch_size  # ceil(len / batch_size)

for epoch in range(1, epochs + 1):
    # New row order every epoch.
    pairs_arr = pairs_arr[np.random.permutation(len(pairs_arr))]
    running = 0.0

    for start in range(0, len(pairs_arr), batch_size):
        batch = pairs_arr[start:start + batch_size]  # the final batch may be shorter
        center = torch.tensor(batch[:, 0], dtype=torch.long, device=device)
        pos = torch.tensor(batch[:, 1], dtype=torch.long, device=device)
        neg = torch.tensor(sample_negatives(len(batch), K), dtype=torch.long, device=device)

        opt.zero_grad()
        loss = model(center, pos, neg)
        loss.backward()  # gradients flow into both embedding tables
        opt.step()
        running += loss.item()

    # Average of the per-batch mean losses for this epoch.
    print(f"Epoch {epoch} | loss {running/num_batches:.4f}")

# -----------------------------
# 6) Inspect neighbors with cosine similarity
# -----------------------------
# Scale each row of the trained input embeddings to unit L2 norm once, so a
# plain dot product between rows of E is their cosine similarity.
with torch.no_grad():
    E = F.normalize(model.in_embed.weight.detach().cpu(), p=2, dim=1)   # [V, D]

def nearest(word, k=5):
    """
    Return up to k words whose embeddings are most similar to `word`.

    Similarity is the dot product between rows of the global matrix E, which
    equals cosine similarity when the rows of E are L2-normalized.

    word: a token present in `stoi`; an unknown token raises KeyError
    k:    maximum number of neighbours to return

    Returns a list of (neighbour, similarity) tuples, most similar first.
    The query word is excluded by index rather than by assuming it sorts
    first, so tied similarities (duplicate or all-zero vectors) can neither
    leak the query word into the result nor drop a genuine neighbour.
    """
    i = stoi[word]
    sims = (E @ E[i]).numpy()
    ranked = sims.argsort()[::-1]  # indices by descending similarity
    neighbours = [(itos[int(j)], float(sims[j])) for j in ranked if j != i]
    return neighbours[:max(k, 0)]

# Spot-check a few query words; 'bank' occurs in both the river and the finance sentence.
for w in ("king", "queen", "paris", "bank"):
    neighbours = nearest(w, k=5)
    print(f"\nNearest to '{w}':", neighbours)

print("\nNote: 'bank' mixes finance and river senses because this model has one vector per word.")


Vocab size: 28
Positive pairs: 18,400
Epoch 1 | loss 3.1560
Epoch 2 | loss 2.0720
Epoch 3 | loss 1.3926

Nearest to 'king': [('princess', 0.9184070229530334), ('queen', 0.8116123676300049), ('prince', 0.8041300773620605), ('royal', 0.7505913972854614), ('throne', 0.7146414518356323)]

Nearest to 'queen': [('prince', 0.9011702537536621), ('king', 0.8116123676300049), ('royal', 0.7652005553245544), ('princess', 0.7562607526779175), ('palace', 0.6760899424552917)]

Nearest to 'paris': [('europe', 0.9165871739387512), ('rome', 0.8345823287963867), ('capital', 0.7935218811035156), ('city', 0.7509962320327759), ('france', 0.7334145903587341)]

Nearest to 'bank': [('interest', 0.7990052700042725), ('shore', 0.7587672472000122), ('money', 0.7164175510406494), ('water', 0.7112097144126892), ('finance', 0.6727312803268433)]

Note: 'bank' mixes finance and river senses because this model has one vector per word.


# Inside a pre-trained LM

In [None]:
"""
What is inside a pretrained LM? Inspect GPT-2 (small) for concreteness.
- Prints parameter counts to show embeddings are only part of the model.
- Verifies whether the LM head shares weights with the token embedding table.
- Shows shapes of hidden states to contrast initial embeddings vs contextual layers.

One-time setup in a clean environment:
!pip install transformers torch --quiet
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "gpt2"  # small, no license gates, runs on CPU
# Load the tokenizer and the causal-LM weights registered under model_id.
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# 1) Shapes and parameter counts
# Keep handles to the three modules whose weight matrices are inspected below.
wte = model.transformer.wte           # token embeddings
wpe = model.transformer.wpe           # positional embeddings
lmh = model.lm_head                   # output projection

def nparams(m):
    """Return the total element count over everything yielded by m.parameters()."""
    total_elems = 0
    for param in m.parameters():
        total_elems += param.numel()
    return total_elems
# Totals: the whole model, the two embedding tables together, and the LM head on its own.
total = nparams(model)
embed_params = sum(table.weight.numel() for table in (wte, wpe))
head_params = lmh.weight.numel()  # kept for inspection; not used in the share printed below

print(f"Total parameters:      {total:,}")
print(f"Token embeddings:      {wte.weight.shape} -> {wte.weight.numel():,}")
print(f"Position embeddings:   {wpe.weight.shape} -> {wpe.weight.numel():,}")
print(f"LM head:               {lmh.weight.shape} -> {lmh.weight.numel():,}")
print(f"Share of embeddings (token+pos): {100.0 * embed_params / total:.2f}%")

# 2) Weight tying check: many decoder LMs tie LM head to token embeddings
# Equal data pointers mean both weight tensors start at the same memory address.
tied = wte.weight.data_ptr() == lmh.weight.data_ptr()
print("LM head tied to token embeddings:", tied)

# 3) Forward pass: compare initial embedding output vs last hidden state
# The sentence uses "bank" twice, once in the river sense and once in the finance sense.
text = "The river bank differs from a finance bank."
inputs = tok(text, return_tensors="pt")  # "pt": tokenizer output as PyTorch tensors
# Inference only: no autograd tracking, and ask the model to return every hidden state.
# NOTE(review): use_cache=False presumably skips building the key/value cache,
# which is only useful during generation - confirm against the transformers docs.
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True, use_cache=False)

# Indexed below from the embedding output (index 0) to the final layer (index -1).
hidden_states = out.hidden_states
print(f"Number of hidden states (embeddings + 12 layers for GPT-2): {len(hidden_states)}")
print("Embeddings output shape:", hidden_states[0].shape)   # [1, seq_len, d_model]
print("Last layer shape:      ", hidden_states[-1].shape)   # [1, seq_len, d_model]
print("Logits shape:          ", out.logits.shape)          # [1, seq_len, vocab_size]

"""
Interpretation:
- hidden_states[0] is the token+position embedding output (context-free).
- hidden_states[-1] is contextual. The LM head turns this into logits.
- Embeddings are a minority of total params (about 32% for GPT-2 small); most parameters live in attention and MLPs.
"""


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Total parameters:      124,439,808
Token embeddings:      torch.Size([50257, 768]) -> 38,597,376
Position embeddings:   torch.Size([1024, 768]) -> 786,432
LM head:               torch.Size([50257, 768]) -> 38,597,376
Share of embeddings (token+pos): 31.65%
LM head tied to token embeddings: True
Number of hidden states (embeddings + 12 layers for GPT-2): 13
Embeddings output shape: torch.Size([1, 9, 768])
Last layer shape:       torch.Size([1, 9, 768])
Logits shape:           torch.Size([1, 9, 50257])


'\nInterpretation:\n- hidden_states[0] is the token+position embedding output (context-free).\n- hidden_states[-1] is contextual. The LM head turns this into logits.\n- Embeddings are a small share of total params; most parameters live in attention and MLPs.\n'