<a href="https://colab.research.google.com/github/footballest/ml-blog-jaketae/blob/main/notebooks/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Word2Vec (Skip-Gram with Negative Sampling) – Replication/Notes (test)

**Original post**: <https://jaketae.github.io/study/word2vec/>  
**Your name**: <YOUR_NAME>  
**Last updated**: <YYYY-MM-DD>

---

## Goals
- Summarize key ideas in my own words
- Implement the core algorithm from scratch (where feasible)
- Compare with a reference implementation (if relevant)
- Run a tiny experiment and record results


In [None]:

#@title Setup (mount Drive and create workspace paths)
import os

from google.colab import drive

# Mount first: every workspace path below lives under the Drive mount point.
drive.mount('/content/drive')

# Workspace layout: one root folder with data / images / artifacts subfolders.
BASE_DIR = '/content/drive/MyDrive/ml-blog'
DATA_DIR = f'{BASE_DIR}/data'
IMG_DIR = f'{BASE_DIR}/images'
ART_DIR = f'{BASE_DIR}/artifacts'

# exist_ok=True keeps this cell safe to re-run once the folders already exist.
for workspace_dir in (BASE_DIR, DATA_DIR, IMG_DIR, ART_DIR):
    os.makedirs(workspace_dir, exist_ok=True)
print("Workspace ready:", BASE_DIR)


Mounted at /content/drive
Workspace ready: /content/drive/MyDrive/ml-blog


In [None]:

#@title Environment check (optional)
# Best-effort probe: any failure while importing or querying torch is reported
# on stdout instead of being raised, so the notebook keeps running.
try:
    import torch
    print("PyTorch:", torch.__version__)
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)  # name of CUDA device index 0
        print("CUDA device:", gpu_name)
except Exception as exc:
    print("Torch not available by default on Colab runtimes; that's okay.", exc)


PyTorch: 2.8.0+cu126


## Theory recap
Write the core ideas, equations, and intuition here.

## Implementation (from scratch or minimal lib use)
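See the code cell below: a minimal skip-gram with negative sampling (SGNS) implementation in PyTorch, trained on a tiny three-sentence corpus.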

## Experiments
Describe the dataset, metrics, and small experiments you ran.

## Results & Discussion
What worked? What didn’t? What would you try next?

## References
- Add links to papers, docs, and the original post.

In [1]:
# Minimal Word2Vec (SGNS) on a tiny corpus
import math, random, re, collections
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

text = """
we all learn by building small models from scratch
small models teach big ideas
we learn by repeating ideas and connecting ideas to code
""".strip().lower()

# --- tokenize & vocab ---
# Tokens are runs of lowercase letters/apostrophes; the three lines are treated
# as one continuous token stream (line breaks are not sentence boundaries here).
tokens = re.findall(r"[a-z']+", text)
freq = collections.Counter(tokens)
# Minimum count of 1 keeps every word; ids follow first-occurrence order.
itos = [word for word, n_occurrences in freq.items() if n_occurrences >= 1]
stoi = {word: index for index, word in enumerate(itos)}
ids = [stoi[token] for token in tokens]

# --- negative sampling distribution (unigram^0.75) ---
counts = torch.tensor([freq[word] for word in itos], dtype=torch.float)
smoothed = counts ** 0.75
neg_dist = smoothed / smoothed.sum()

# --- make (center, context) pairs ---
# Each token is paired with every other token at most `window` positions away,
# with the window clipped at both ends of the token stream.
window = 2
pairs = [
    (center, ids[pos])
    for idx, center in enumerate(ids)
    for pos in range(max(0, idx - window), min(len(ids), idx + window + 1))
    if pos != idx
]

class SGNSDataset(Dataset):
    """Map-style dataset of (center, context) id pairs with sampled negatives.

    Args:
        pairs: indexable collection of ``(center_id, context_id)`` tuples.
        neg_dist: 1-D tensor of sampling weights passed to ``torch.multinomial``.
        num_neg: how many negative ids to draw per item.
    """

    def __init__(self, pairs, neg_dist, num_neg=5):
        self.pairs, self.neg_dist, self.num_neg = pairs, neg_dist, num_neg

    def __len__(self):
        """Number of positive (center, context) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(center, context, negatives)`` for pair ``idx``.

        Negatives are re-drawn on every access (with replacement) from
        ``neg_dist``; the draw does not exclude the center or context id.
        """
        center_id, context_id = self.pairs[idx]
        negatives = torch.multinomial(self.neg_dist, self.num_neg, replacement=True)
        return torch.tensor(center_id), torch.tensor(context_id), negatives

# Dataset draws 5 negatives per pair on each lookup; the loader reshuffles the
# pairs every epoch and serves them in batches of up to 64.
ds = SGNSDataset(pairs=pairs, neg_dist=neg_dist, num_neg=5)
dl = DataLoader(dataset=ds, batch_size=64, shuffle=True)

# --- model: two embedding tables ---
class SGNS(nn.Module):
    """Skip-gram with negative sampling: an input and an output embedding table.

    Args:
        vocab_size: number of rows in each embedding table.
        d: embedding dimension.

    The input table is initialised uniformly in ``[-0.5/d, 0.5/d]`` and the
    output table with zeros, so every score starts at 0 and the initial loss
    is ``2 * ln(2)``.
    """

    def __init__(self, vocab_size, d=50):
        super().__init__()
        self.in_emb  = nn.Embedding(vocab_size, d)
        self.out_emb = nn.Embedding(vocab_size, d)
        nn.init.uniform_(self.in_emb.weight,  -0.5/d, 0.5/d)
        nn.init.zeros_(self.out_emb.weight)

    def forward(self, c, pos, neg):
        """Return the scalar SGNS loss for a batch.

        Args:
            c: (B,) center word ids.
            pos: (B,) true context word ids.
            neg: (B, K) sampled negative word ids.

        Returns:
            ``-mean(log sigmoid(pos_score)) - mean(log sigmoid(-neg_score))``,
            where the second mean runs over both the batch and the K negatives.
        """
        vc  = self.in_emb(c)              # (B, d)
        uo  = self.out_emb(pos)           # (B, d)
        un  = self.out_emb(neg)           # (B, K, d)
        pos_score = torch.sum(vc * uo, dim=1)            # (B,)
        neg_score = torch.einsum('bd,bkd->bk', vc, un)   # (B, K)
        # logsigmoid is evaluated in a numerically stable form. The previous
        # log(sigmoid(x) + 1e-9) saturated for large |x|: the loss was clamped
        # near -log(1e-9) and the gradient became exactly zero.
        log_sigmoid = nn.functional.logsigmoid
        loss = -log_sigmoid(pos_score).mean() - log_sigmoid(-neg_score).mean()
        return loss

    def embeddings(self):
        """Return the input embedding table, detached from autograd.

        The returned tensor shares storage with the parameter (no copy).
        """
        # common to use input table as word vectors
        return self.in_emb.weight.detach()

# Seed torch's global RNG before anything random happens in this cell: the
# embedding init, the DataLoader shuffle order and the per-item negative draws
# all consume it. Without a seed the printed losses differ on every run.
# NOTE: this makes CPU runs repeatable; some CUDA kernels may still vary.
SEED = 0
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SGNS(len(itos), d=50).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-2)

model.train()  # explicit training mode (the module defaults to it already)
for epoch in range(20):
    total = 0.0
    for c,o,n in dl:
        c,o,n = c.to(device), o.to(device), n.to(device)
        opt.zero_grad()
        loss = model(c,o,n)
        loss.backward()
        opt.step()
        # Weight each batch's mean loss by its size so the epoch figure is a
        # per-pair average even when the last batch is smaller.
        total += loss.item()*c.size(0)
    print(f"epoch {epoch+1:02d} | loss {total/len(ds):.4f}")

# --- quick nearest neighbors ---
# Row-normalise the learned input embeddings once, on CPU, so that a plain dot
# product between rows is their cosine similarity.
with torch.no_grad():
    E = nn.functional.normalize(model.embeddings().cpu(), dim=1)

def nn_words(query, topk=5):
    """Return up to ``topk`` vocabulary words most cosine-similar to ``query``.

    Args:
        query: a word present in ``stoi``.
        topk: maximum number of neighbours to return.

    Returns:
        List of words ordered by decreasing similarity, excluding ``query``.

    Raises:
        KeyError: if ``query`` is not in the vocabulary.
    """
    q = E[stoi[query]]
    sim = (E @ q)
    # Ask for one extra hit because the query itself normally ranks first.
    # Clamp to the vocabulary size: torch.topk raises if k exceeds it.
    k = min(topk + 1, sim.numel())
    idxs = torch.topk(sim, k).indices
    out = [itos[i] for i in idxs.tolist() if itos[i] != query][:topk]
    return out

print("Neighbors for 'ideas':", nn_words('ideas'))
print("Neighbors for 'learn':", nn_words('learn'))


epoch 01 | loss 1.3863
epoch 02 | loss 1.3851
epoch 03 | loss 1.3802
epoch 04 | loss 1.3701
epoch 05 | loss 1.3592
epoch 06 | loss 1.3384
epoch 07 | loss 1.3150
epoch 08 | loss 1.2928
epoch 09 | loss 1.2723
epoch 10 | loss 1.2257
epoch 11 | loss 1.2243
epoch 12 | loss 1.1830
epoch 13 | loss 1.1780
epoch 14 | loss 1.1348
epoch 15 | loss 1.1266
epoch 16 | loss 1.0702
epoch 17 | loss 1.0821
epoch 18 | loss 1.1103
epoch 19 | loss 1.0532
epoch 20 | loss 1.1340
Neighbors for 'ideas': ['repeating', 'to', 'code', 'and', 'connecting']
Neighbors for 'learn': ['all', 'by', 'repeating', 'we', 'ideas']
