# Project 2: The Geometry of Meaning - SOLUTION
## Training Word2Vec Skip-gram from Scratch

**This notebook contains complete solutions to all tasks.**

## Part 1: Setup and Data Loading

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from collections import Counter
from typing import List, Tuple, Dict

# Set random seed for reproducibility
# Both the PyTorch and the NumPy global RNGs are seeded with the same fixed
# value before any model or data code runs.
torch.manual_seed(42)
np.random.seed(42)

# Record the installed PyTorch version in the cell output.
print(f"PyTorch version: {torch.__version__}")

# Download TinyShakespeare
import urllib.request

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
try:
    urllib.request.urlretrieve(url, "tinyshakespeare.txt")
    print("Downloaded TinyShakespeare corpus")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open("tinyshakespeare.txt", "r", encoding="utf-8") as f:
        text = f.read().lower()
except (OSError, UnicodeDecodeError):
    # Network errors (urllib's URLError derives from OSError), file-system
    # errors and undecodable bytes all fall back to a tiny synthetic corpus.
    # A bare ``except:`` would also have swallowed KeyboardInterrupt and
    # SystemExit, making the cell impossible to interrupt cleanly.
    print("Using sample data")
    text = "the quick brown fox jumps over the lazy dog " * 100

# Simple tokenization (word-level): split the lower-cased text on whitespace.
# Punctuation stays attached to the neighbouring word.
words = text.split()
print(f"Total words: {len(words)}")
print(f"Unique words: {len(set(words))}")

### Build Vocabulary - SOLUTION

In [None]:
def build_vocabulary(words: List[str], min_count: int = 5) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Create word-to-index and index-to-word mappings.

    Words are indexed in descending order of corpus frequency, so index 0 is
    the most frequent retained word and slicing the first ``n`` indices
    yields the ``n`` most frequent words. Words with equal counts keep the
    order in which they first appear in ``words``.

    Args:
        words: List of words
        min_count: Minimum frequency to include word

    Returns:
        word2idx: Word to index mapping
        idx2word: Index to word mapping
    """
    word_counts = Counter(words)

    # most_common() sorts by count (highest first); filtering afterwards
    # preserves that ordering for the retained words.
    vocab_words = [w for w, c in word_counts.most_common() if c >= min_count]

    # Create mappings (indices are contiguous, starting at 0)
    word2idx = {w: i for i, w in enumerate(vocab_words)}
    idx2word = dict(enumerate(vocab_words))

    return word2idx, idx2word

# Build the vocabulary from the tokenized corpus, keeping only words that
# occur at least 5 times.
word2idx, idx2word = build_vocabulary(words, min_count=5)
vocab_size = len(word2idx)

print(f"Vocabulary size: {vocab_size}")
# Shows the first 10 vocabulary entries in index order.
print(f"Sample words: {list(word2idx.keys())[:10]}")

## Part 2: Create Skip-gram Training Data - SOLUTION

In [None]:
def create_skipgram_pairs(words: List[str], word2idx: Dict[str, int], window_size: int = 2) -> List[Tuple[int, int]]:
    """
    Build the (center_idx, context_idx) training pairs for Skip-gram.

    For every in-vocabulary word, each in-vocabulary neighbour at most
    ``window_size`` positions to its left or right contributes one pair.
    The window is measured on the raw token sequence, so out-of-vocabulary
    tokens still occupy a slot in the window; they just never appear in a pair.

    Args:
        words: Token sequence of the corpus.
        word2idx: Vocabulary mapping from word to integer index.
        window_size: Number of positions on each side of the center word.

    Returns:
        List of (center index, context index) tuples, in corpus order.
    """
    total = len(words)
    pairs: List[Tuple[int, int]] = []

    for position, center_word in enumerate(words):
        if center_word in word2idx:
            center_idx = word2idx[center_word]

            # Clamp the window to the bounds of the corpus.
            window_start = max(0, position - window_size)
            window_stop = min(total, position + window_size + 1)

            pairs.extend(
                (center_idx, word2idx[words[neighbor]])
                for neighbor in range(window_start, window_stop)
                # The center position itself is not a context word.
                if neighbor != position and words[neighbor] in word2idx
            )

    return pairs

# Create training pairs
# A window of 2 means up to two neighbours on each side of every center word.
skipgram_pairs = create_skipgram_pairs(words, word2idx, window_size=2)
print(f"Total training pairs: {len(skipgram_pairs)}")
print(f"Sample pairs (as indices): {skipgram_pairs[:5]}")
# Map the same five pairs back to words for a human-readable sanity check.
print(f"Sample pairs (as words): {[(idx2word[c], idx2word[ctx]) for c, ctx in skipgram_pairs[:5]]}")

### Create PyTorch Dataset - SOLUTION

In [None]:
class SkipGramDataset(Dataset):
    """Dataset wrapper exposing (center, context) index pairs as long tensors."""

    def __init__(self, pairs: List[Tuple[int, int]]):
        # The list is stored as given; tensors are built lazily per item.
        self.pairs = pairs

    def __len__(self):
        """Return the number of training pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return pair ``idx`` as two scalar ``torch.long`` tensors."""
        center_id, context_id = self.pairs[idx]
        center_tensor = torch.tensor(center_id, dtype=torch.long)
        context_tensor = torch.tensor(context_id, dtype=torch.long)
        return center_tensor, context_tensor

# Wrap the pairs in a Dataset and serve them in shuffled mini-batches of 256.
dataset = SkipGramDataset(skipgram_pairs)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

print(f"Number of batches: {len(dataloader)}")

## Part 3: Implement Skip-gram Model - SOLUTION

In [None]:
class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear scorer."""

    def __init__(self, vocab_size: int, embedding_dim: int):
        """
        Create the two layers of the model.

        Args:
            vocab_size: Number of unique words
            embedding_dim: Dimension of embedding vectors
        """
        super().__init__()

        # Input side: one trainable vector per vocabulary word.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Output side: scores every vocabulary word as a possible context.
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word: torch.Tensor) -> torch.Tensor:
        """
        Score all vocabulary words as context for each center word.

        Args:
            center_word: Tensor of center word indices [batch_size]

        Returns:
            Logits for context word prediction [batch_size, vocab_size]
        """
        # [batch_size] -> [batch_size, embedding_dim] -> [batch_size, vocab_size]
        return self.output(self.embeddings(center_word))

    def get_embeddings(self) -> np.ndarray:
        """Return the input embedding matrix as a numpy array (detached, on CPU)."""
        weights = self.embeddings.weight
        return weights.detach().cpu().numpy()

# Initialize model
# 128-dimensional embeddings; vocab_size comes from the vocabulary built above.
embedding_dim = 128
model = SkipGramModel(vocab_size, embedding_dim)

# Total count of trainable and non-trainable parameter elements.
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")
print(model)

## Part 4: Train the Model - SOLUTION

In [None]:
def train_skipgram(model, dataloader, epochs=5, lr=0.01):
    """
    Train Skip-gram model.

    Uses cross-entropy loss over the model's logits and the Adam optimizer.
    Prints the mean batch loss after every epoch. The model is switched to
    training mode and is left in that mode on return.

    Args:
        model: Module mapping a batch of center indices to logits.
        dataloader: Iterable yielding (center, context) index tensors.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate passed to Adam.

    Returns:
        List with the mean batch loss of each epoch.

    Raises:
        ValueError: If an epoch yields no batches, since no average loss can
            be computed (previously this surfaced as ZeroDivisionError).
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Adam rejects an empty parameter list, so at least one parameter exists
    # here. Batches are moved to its device so that a model placed on an
    # accelerator still works with a CPU dataloader.
    device = next(model.parameters()).device

    model.train()
    losses = []

    for epoch in range(epochs):
        epoch_loss = 0.0
        num_batches = 0

        for center, context in dataloader:
            center = center.to(device)
            context = context.to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            logits = model(center)

            # Compute loss
            loss = criterion(logits, context)

            # Backward pass
            loss.backward()

            # Update weights
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; cannot compute an average loss")

        # Counting batches (rather than calling len()) also supports
        # iterables that do not define a length.
        avg_loss = epoch_loss / num_batches
        losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    return losses

# Train model
# Ten epochs over the shuffled pair batches; one mean loss value per epoch.
print("Training Skip-gram model...\n")
losses = train_skipgram(model, dataloader, epochs=10, lr=0.01)

# Plot loss curve
# The x axis is the zero-based epoch index, the y axis the mean batch loss.
plt.figure(figsize=(10, 5))
plt.plot(losses, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Skip-gram Training Loss')
plt.grid(True, alpha=0.3)
plt.show()

## Part 5: Visualize Embeddings - SOLUTION

### Task 1: Cosine Similarity Matrix

In [None]:
def plot_cosine_matrix(embeddings: np.ndarray, words: List[str], top_n: int = 50):
    """
    Draw a heatmap of pairwise cosine similarities for the first ``top_n`` words.

    Only the leading ``top_n`` rows of ``embeddings`` and entries of ``words``
    are used; which words those are depends on how the caller ordered the
    vocabulary.

    Args:
        embeddings: Embedding matrix (assumed: row i belongs to ``words[i]``).
        words: Tick labels for the heatmap axes.
        top_n: Number of leading rows/labels to include.
    """
    subset_vectors = embeddings[:top_n]
    subset_labels = words[:top_n]

    # Pairwise similarities between every selected row.
    similarity = cosine_similarity(subset_vectors)

    # Diverging colormap centered on 0, pinned to the [-1, 1] range of cosine.
    heatmap_options = dict(
        xticklabels=subset_labels,
        yticklabels=subset_labels,
        cmap='coolwarm',
        center=0,
        vmin=-1,
        vmax=1,
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(similarity, **heatmap_options)
    plt.title('Cosine Similarity Matrix')
    plt.tight_layout()
    plt.show()

# Get embeddings and word list
# word_list[i] is the word whose embedding is row i of `embeddings`.
embeddings = model.get_embeddings()
word_list = [idx2word[i] for i in range(len(idx2word))]

# Heatmap over the first 30 vocabulary entries.
plot_cosine_matrix(embeddings, word_list, top_n=30)

### Task 2: 2D Projection with t-SNE - SOLUTION

In [None]:
def plot_embeddings_2d(embeddings: np.ndarray, words: List[str], method='tsne', top_n: int = 200):
    """
    Project embeddings to 2D and visualize.

    Only the first ``top_n`` rows of ``embeddings`` (and entries of ``words``)
    are projected and plotted.

    Args:
        embeddings: Embedding matrix (assumed: row i belongs to ``words[i]``).
        words: Labels used to annotate the scatter plot.
        method: 'pca' or 'tsne'. Any value other than 'pca' selects t-SNE.
        top_n: Number of leading rows to project.

    Raises:
        ValueError: If fewer than two embeddings are selected; neither
            reducer can produce a 2D projection from a single point.
    """
    # Select subset
    selected_embeddings = embeddings[:top_n]
    selected_words = words[:top_n]

    n_points = len(selected_embeddings)
    if n_points < 2:
        raise ValueError(f"Need at least 2 embeddings to project, got {n_points}")

    # Dimensionality reduction
    if method == 'pca':
        reducer = PCA(n_components=2)
        title_suffix = 'PCA'
    else:
        # t-SNE requires perplexity < number of samples. The previous
        # hard-coded 30 failed for small vocabularies (e.g. the 8-word
        # fallback corpus), so clamp it to the data size.
        perplexity = min(30, n_points - 1)
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        title_suffix = 't-SNE'

    coords = reducer.fit_transform(selected_embeddings)

    # Plot
    plt.figure(figsize=(15, 12))
    plt.scatter(coords[:, 0], coords[:, 1], alpha=0.5, s=50)

    # Annotate points (only every 3rd to avoid clutter)
    for i in range(0, len(selected_words), 3):
        plt.annotate(selected_words[i],
                     (coords[i, 0], coords[i, 1]),
                     fontsize=8,
                     alpha=0.7)

    plt.title(f'Word Embeddings ({title_suffix})')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.tight_layout()
    plt.show()

# Project the first 100 vocabulary entries with t-SNE and plot them.
print("Generating 2D visualization (this may take a minute)...")
plot_embeddings_2d(embeddings, word_list, method='tsne', top_n=100)

### Task 3: Test Semantic Analogies - SOLUTION

In [None]:
def test_analogy(embeddings, word2idx, idx2word, a: str, b: str, c: str, top_k: int = 5):
    """
    Test analogy: a is to b as c is to ?

    Example: king - man + woman ≈ queen  (a='man', b='king', c='woman')

    Ranks every vocabulary word by cosine similarity to (b - a + c), skips
    the three input words, and prints the best ``top_k`` candidates.

    Args:
        embeddings: Embedding matrix indexed by the values of ``word2idx``.
        word2idx: Word to index mapping.
        idx2word: Index to word mapping.
        a, b, c: Words of the analogy; all must be in ``word2idx``.
        top_k: Maximum number of candidates to report.

    Returns:
        List of ``(word, similarity)`` tuples in the printed order. The list
        is empty when any input word is missing from the vocabulary.
    """
    if a not in word2idx or b not in word2idx or c not in word2idx:
        print("One or more words not in vocabulary")
        return []

    a_vec = embeddings[word2idx[a]]
    b_vec = embeddings[word2idx[b]]
    c_vec = embeddings[word2idx[c]]

    # Compute target vector: b - a + c
    target_vec = b_vec - a_vec + c_vec

    # Similarity of the target to every row of the embedding matrix
    similarities = cosine_similarity([target_vec], embeddings)[0]

    # The input words are trivially close to the target, so never report them.
    excluded = {a, b, c}
    results = []

    print(f"\n'{a}' is to '{b}' as '{c}' is to:")
    # Walk the indices from most to least similar.
    for idx in similarities.argsort()[::-1]:
        if len(results) >= top_k:
            break
        word = idx2word[idx]
        if word in excluded:
            continue
        results.append((word, float(similarities[idx])))
        print(f"  {len(results)}. {word} (similarity: {similarities[idx]:.4f})")

    return results

# Test some analogies
# Note: These may not work well on a small corpus - there is little data from
# which to learn consistent analogy directions, so treat results as exploratory.
# Each analogy below is only run when all of its words are in the vocabulary.

print("Testing semantic analogies...")
print("Note: Results depend on corpus content and may not always be meaningful.")

# Find some common words to test
# These are the first 50 vocabulary entries in index order.
common_words = list(word2idx.keys())[:50]
print(f"\nCommon words in vocabulary: {common_words[:20]}")

# Try some analogies if appropriate words exist
# good : better :: bad : ?
if 'good' in word2idx and 'better' in word2idx and 'bad' in word2idx:
    test_analogy(embeddings, word2idx, idx2word, 'good', 'better', 'bad')

# man : king :: woman : ?
if 'man' in word2idx and 'king' in word2idx and 'woman' in word2idx:
    test_analogy(embeddings, word2idx, idx2word, 'man', 'king', 'woman')

## Part 6: Compare One-Hot vs Learned Embeddings - SOLUTION

In [None]:
class OneHotModel(nn.Module):
    """Baseline model using one-hot encoding.

    There is no embedding layer: each center word is expanded to a one-hot
    vector of length ``vocab_size`` and mapped straight to ``vocab_size``
    logits by a single linear layer.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        # No embedding layer - input is already one-hot
        # Direct mapping from vocab_size to vocab_size
        self.output = nn.Linear(vocab_size, vocab_size)

    def forward(self, center_word_indices: torch.Tensor) -> torch.Tensor:
        """
        Map a batch of word indices to context logits.

        Args:
            center_word_indices: Long tensor of word indices [batch_size]

        Returns:
            Logits [batch_size, vocab_size]
        """
        weight = self.output.weight

        # Build the one-hot batch with the layer's own dtype and device so
        # the model keeps working after .double()/.half() or .to(device).
        # A plain torch.zeros(...) is always float32 on the CPU.
        one_hot = nn.functional.one_hot(
            center_word_indices, num_classes=self.output.in_features
        ).to(device=weight.device, dtype=weight.dtype)

        # Pass through linear layer
        return self.output(one_hot)

# Compare model sizes
# The one-hot baseline has a single vocab_size x vocab_size linear layer; the
# Skip-gram model has an embedding table plus an embedding_dim x vocab_size
# linear layer.
onehot_model = OneHotModel(vocab_size)

skipgram_params = sum(p.numel() for p in model.parameters())
onehot_params = sum(p.numel() for p in onehot_model.parameters())

print(f"\nModel Size Comparison:")
print(f"="*50)
print(f"Vocabulary size: {vocab_size}")
print(f"Embedding dimension: {embedding_dim}")
print(f"\nSkip-gram (with embeddings):")
print(f"  Parameters: {skipgram_params:,}")
# Memory estimate: 4 bytes per FP32 parameter, reported in KB (bytes / 1024).
print(f"  Memory: ~{skipgram_params * 4 / 1024:.2f} KB (FP32)")
print(f"\nOne-hot (no compression):")
print(f"  Parameters: {onehot_params:,}")
print(f"  Memory: ~{onehot_params * 4 / 1024:.2f} KB (FP32)")
# Savings and ratio are both relative to the one-hot baseline's parameter count.
print(f"\nMemory savings: {(1 - skipgram_params/onehot_params)*100:.1f}%")
print(f"Compression ratio: {onehot_params / skipgram_params:.2f}x")

## Part 7: Analysis and Reflection

### Questions and Answers:

#### 1. Why are one-hot vectors orthogonal?

**Answer:** 
One-hot vectors have exactly one element set to 1 and all others to 0. When computing the dot product of two different one-hot vectors:
- `[1,0,0,0] · [0,1,0,0] = 0`
- The dot product is always 0 (orthogonal)
- This means one-hot encoding captures NO semantic similarity between words
- "cat" and "dog" are as different as "cat" and "computer"

#### 2. What does cosine similarity measure in embedding space?

**Answer:**
Cosine similarity measures the angle between two vectors:
- cos(θ) = 1: Vectors point in the same direction (very similar meaning)
- cos(θ) = 0: Vectors are orthogonal (unrelated)
- cos(θ) = -1: Vectors point in opposite directions (opposite meaning)

In embedding space:
- Words with similar contexts get similar embeddings
- High cosine similarity = words are semantically related
- Examples: "king" and "queen", "run" and "running"

#### 3. Did the analogies work? Why or why not?

**Answer:**
Analogies may or may not work well depending on:
- **Corpus size**: TinyShakespeare is small (~1MB). Analogies work better with larger corpora
- **Training time**: More epochs = better embeddings
- **Word frequency**: Rare words have poor embeddings
- **Context diversity**: Words need to appear in varied contexts

The famous "king - man + woman = queen" works on large corpora (Wikipedia, Common Crawl) but may fail on small domain-specific text.

#### 4. What clusters did you observe in the t-SNE plot?

**Answer:**
Expected clusters (depends on corpus):
- **Syntactic clusters**: Verbs group together, nouns group together
- **Semantic clusters**: Related concepts (characters, emotions, actions)
- **Functional words**: "the", "and", "of" may cluster separately
- **Character names**: In Shakespeare, character names may cluster by play or role

The geometry of embeddings captures both semantic and syntactic relationships!

## Part 8: Additional Analysis

In [None]:
# Find nearest neighbors for a word
def find_nearest_neighbors(word: str, embeddings, word2idx, idx2word, top_k=10):
    """
    Find most similar words to a given word.

    Ranks every vocabulary word by cosine similarity to ``word`` and prints
    the best ``top_k``, never listing ``word`` itself.

    Args:
        word: Query word.
        embeddings: Embedding matrix indexed by the values of ``word2idx``.
        word2idx: Word to index mapping.
        idx2word: Index to word mapping.
        top_k: Maximum number of neighbors to report.

    Returns:
        List of ``(neighbor, similarity)`` tuples in the printed order. The
        list is empty when ``word`` is not in the vocabulary.
    """
    if word not in word2idx:
        print(f"'{word}' not in vocabulary")
        return []

    query_idx = word2idx[word]
    word_vec = embeddings[query_idx]
    similarities = cosine_similarity([word_vec], embeddings)[0]

    # Exclude the query by index rather than assuming it ranks first: with
    # duplicate or zero vectors another word can tie with or outrank it, and
    # dropping "position 0" would then discard a real neighbor.
    ranked = [idx for idx in similarities.argsort()[::-1] if idx != query_idx]
    top_indices = ranked[:max(top_k, 0)]

    neighbors = []
    print(f"\nNearest neighbors of '{word}':")
    for i, idx in enumerate(top_indices, 1):
        neighbors.append((idx2word[idx], float(similarities[idx])))
        print(f"  {i}. {idx2word[idx]} (similarity: {similarities[idx]:.4f})")

    return neighbors

# Test with some words
# Words missing from the vocabulary are skipped silently.
test_words = ['love', 'death', 'king', 'good']
for word in test_words:
    if word in word2idx:
        find_nearest_neighbors(word, embeddings, word2idx, idx2word, top_k=5)

## 🎯 Completion Checklist

- ✅ Built vocabulary from corpus
- ✅ Created Skip-gram training pairs
- ✅ Implemented `SkipGramModel`
- ✅ Trained model and plotted loss curve
- ✅ Visualized cosine similarity matrix
- ✅ Created 2D projection (t-SNE/PCA)
- ✅ Tested semantic analogies
- ✅ Compared with one-hot baseline
- ✅ Answered reflection questions

## Key Takeaways

1. **Dense embeddings beat one-hot**: Capture semantic relationships
2. **Context is key**: Words with similar contexts get similar embeddings
3. **Geometry encodes meaning**: Vector arithmetic reveals semantic relationships
4. **Corpus matters**: Quality and size of training data affect results

## 🚀 Next Project
Move to **03_rope_animator** to learn how to add positional information to sequences!