In [17]:
# Install fasttext if not already installed
!pip install fasttext

import numpy as np
import fasttext
import fasttext.util
import os
from collections import defaultdict
from scipy.linalg import orthogonal_procrustes
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim



# Data Preparation

In [20]:
# --- English Model ---
# Fetch the pretrained English fastText binary only when it is not already on disk.
en_model_path = 'cc.en.300.bin'
if not os.path.exists(en_model_path):
    print("Downloading English fastText model...")
    # The previous os.rename('cc.en.300.bin', en_model_path) renamed the file to
    # itself (both names are identical), so it has been dropped.
    fasttext.util.download_model('en', if_exists='ignore')
    print("English model downloaded.")
else:
    print("English model already exists.")

print("Loading English model...")
en_model = fasttext.load_model(en_model_path)
print("English model loaded.")

# Deleting the model file to free up disk space once it is loaded in memory.
# Consequence: the existence check above will fail on the next run, so the
# English model is downloaded again every time this cell is re-executed.
if os.path.exists(en_model_path):
    os.remove(en_model_path)
    print("English model download file deleted.")

# --- Hindi Model ---
# Same flow as above, except that the Hindi file is kept on disk after loading.
hi_model_path = 'cc.hi.300.bin'
if not os.path.exists(hi_model_path):
    print("Downloading Hindi fastText model...")
    # No rename needed here either: source and destination names were identical.
    fasttext.util.download_model('hi', if_exists='ignore')
    print("Hindi model downloaded.")
else:
    print("Hindi model already exists.")

print("Loading Hindi model...")
hi_model = fasttext.load_model(hi_model_path)
print("Hindi model loaded.")

Downloading English fastText model...
English model downloaded.
Loading English model...
English model loaded.
English model download file deleted.
Hindi model already exists.
Loading Hindi model...
Hindi model loaded.


In [21]:
# Sanity check: confirm that both models return a vector for a sample word.
english_word = "hello"
hindi_word = "राजकुमारी"  # repaired from a mis-encoded (mojibake) literal

try:
    english_vector = en_model.get_word_vector(english_word)
    print(f"Vector for '{english_word}': {english_vector[:5]}...")  # Print first 5 elements

    hindi_vector = hi_model.get_word_vector(hindi_word)
    print(f"Vector for '{hindi_word}': {hindi_vector[:5]}...")  # Print first 5 elements

except KeyError as e:
    # str(KeyError) already wraps the key in quotes, so no extra quotes are added here.
    print(f"Error: Word {e} not found in the model vocabulary.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Vector for 'hello': [ 0.15757619  0.04378209 -0.00451272  0.06659314  0.07703468]...
Vector for 'राजकुमारी': [0.0275354  0.00600677 0.00590774 0.03287324 0.01008815]...


In [22]:
def extract_translation_pairs(filepath):
    """
    Read a MUSE-style bilingual dictionary into a list of word pairs.

    Every line is stripped and split on whitespace. Only lines that yield
    exactly two tokens are kept; blank lines and lines with any other token
    count are ignored.

    Args:
        filepath (str): Path to the dictionary file (read as UTF-8).

    Returns:
        List[Tuple[str, str]]: (source_word, target_word) tuples in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        tokenized_lines = (raw_line.strip().split() for raw_line in handle)
        # A two-token line is exactly the case where the pair unpacking succeeds.
        return [
            (tokens[0], tokens[1])
            for tokens in tokenized_lines
            if len(tokens) == 2
        ]

In [23]:
# Download the bilingual dictionaries (you can comment these if already downloaded)
#!curl -Lo hi-en.txt https://dl.fbaipublicfiles.com/arrival/dictionaries/hi-en.txt
#!curl -Lo en-hi.txt https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt

# Extract word pairs (here using English to Hindi pairs)
en_hi_pairs = extract_translation_pairs('en-hi.txt')

print(f"Extracted {len(en_hi_pairs)} English-Hindi pairs.")
print("First 5 English-Hindi pairs:", en_hi_pairs[:5])

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  808k  100  808k    0     0  3530k      0 --:--:-- --:--:-- --:--:-- 3544k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  909k  100  909k    0     0  7153k      0 --:--:-- --:--:-- --:--:-- 7157k
Extracted 38221 English-Hindi pairs.
First 5 English-Hindi pairs: [('and', 'और'), ('was', 'था'), ('was', 'थी'), ('for', 'लिये'), ('that', 'उस')]


In [25]:
def align_embeddings(src_embeddings, tgt_embeddings):
    """
    Map source embeddings onto the target space with Orthogonal Procrustes.

    Args:
        src_embeddings (np.ndarray): Source word embeddings (n x d).
        tgt_embeddings (np.ndarray): Target word embeddings (n x d), row-aligned
            with ``src_embeddings``.

    Returns:
        Tuple[np.ndarray, np.ndarray]: The source embeddings after applying the
        learned matrix, and the matrix itself.
    """
    # orthogonal_procrustes returns (matrix, scale); only the matrix is needed.
    mapping = orthogonal_procrustes(src_embeddings, tgt_embeddings)[0]
    return np.matmul(src_embeddings, mapping), mapping

# Embedding Alignment

In [27]:
# Build row-aligned embedding matrices from the bilingual lexicon: row i of each
# matrix holds the vector for the English / Hindi side of en_hi_pairs[i].
english_embeddings = np.array(
    [en_model.get_word_vector(pair[0]) for pair in en_hi_pairs]
)
hindi_embeddings = np.array(
    [hi_model.get_word_vector(pair[1]) for pair in en_hi_pairs]
)

# Learn the English -> Hindi mapping and keep both the mapped vectors and the matrix.
aligned_english_embeddings, rotation_matrix = align_embeddings(
    english_embeddings, hindi_embeddings
)
print("Alignment completed.")

Alignment completed.


# Evaluation

In [29]:
def normalize_embeddings(embeddings):
    """
    Normalize word embeddings to unit vectors (for cosine similarity).

    Rows whose L2 norm is zero are returned unchanged (all zeros) rather than
    being divided by zero, which would produce NaN values.

    Args:
        embeddings (np.ndarray): Embedding matrix of shape (n, d).

    Returns:
        np.ndarray: Matrix of the same shape with every non-zero row scaled to
        unit L2 norm.
    """
    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Dividing a zero row by 1 leaves it as zeros instead of NaN.
    norms[norms == 0] = 1
    return embeddings / norms

def get_top_k_neighbors(query_vector, tgt_embeddings, tgt_words, k=5):
    """
    Return the k target words most similar to a query vector.

    The query is scaled to unit length and scored against every row of
    ``tgt_embeddings`` with a dot product, which equals cosine similarity when
    those rows are already unit-normalized.

    Args:
        query_vector (np.ndarray): Aligned embedding vector for a source word.
        tgt_embeddings (np.ndarray): Normalized target embedding matrix (n x d).
        tgt_words (List[str]): Target words, in the same order as the matrix rows.
        k (int): Number of neighbors to return.

    Returns:
        List[str]: Target words ordered from most to least similar.
    """
    unit_query = query_vector / np.linalg.norm(query_vector)
    scores = np.dot(tgt_embeddings, unit_query)
    # Sorting the negated scores ascending ranks the highest similarity first.
    ranking = np.argsort(np.negative(scores))
    best = ranking[:k]
    return [tgt_words[position] for position in best]

def evaluate_translation(
    translation_pairs,
    en_model,
    hi_model,
    rotation_matrix,
    top_k=(1, 5),
    max_vocab_size=50000,
    verbose=True
):
    """
    Evaluates translation accuracy using a dictionary and aligned embeddings.

    Each (English, Hindi) pair is scored independently: the English vector is
    mapped with ``rotation_matrix``, compared by cosine similarity against the
    first ``max_vocab_size`` words of the Hindi model, and counted as correct
    at k when the Hindi word of the pair is among the k nearest candidates.
    A source word that appears in several pairs is therefore scored once per pair.

    Args:
        translation_pairs (List[Tuple[str, str]]): (English, Hindi) word pairs.
        en_model (FastText model): Pre-trained English FastText model.
        hi_model (FastText model): Pre-trained Hindi FastText model.
        rotation_matrix (np.ndarray): Mapping from English to Hindi embedding space.
        top_k (Iterable[int]): Values of 'k' for Precision@k. Duplicates are ignored.
        max_vocab_size (int): Limit the size of the Hindi vocabulary to search.
        verbose (bool): When True, report how many pairs were skipped because
            an error occurred while scoring them (nothing is printed otherwise).

    Returns:
        Dict[int, float]: Precision@k scores. All scores are 0.0 when no pair
        could be evaluated; the dict is empty when ``top_k`` is empty.
    """
    # De-duplicate while keeping order so a repeated k is not counted twice.
    k_values = list(dict.fromkeys(top_k))
    if not k_values:
        return {}
    max_k = max(k_values)

    # Prepare Hindi vocab
    hi_vocab = hi_model.get_words(include_freq=False)[:max_vocab_size]
    hi_embeddings = np.array([hi_model.get_word_vector(w) for w in hi_vocab])
    hi_norms = np.linalg.norm(hi_embeddings, axis=1, keepdims=True)
    # Keep all-zero rows as zeros instead of turning them into NaN.
    hi_norms[hi_norms == 0] = 1
    hi_embeddings_norm = hi_embeddings / hi_norms

    correct_at_k = {k_val: 0 for k_val in k_values}
    total = 0
    skipped = 0

    for en_word, hi_word in translation_pairs:
        try:
            # Align English word into Hindi space
            en_vec = en_model.get_word_vector(en_word)
            aligned_vec = np.dot(en_vec, rotation_matrix)

            aligned_len = np.linalg.norm(aligned_vec)
            if aligned_len == 0:
                # A zero vector has no meaningful neighbors: count the pair as a miss.
                total += 1
                continue

            # Cosine similarities = dot product (because all are normalized)
            sims = np.dot(hi_embeddings_norm, aligned_vec / aligned_len)

            # Top-k predictions
            top_indices = np.argsort(-sims)[:max_k]
            top_predictions = [hi_vocab[i] for i in top_indices]
        except Exception:
            # Best-effort evaluation: skip the pair, but keep track of how many were lost.
            skipped += 1
            continue

        total += 1
        for k_val in k_values:
            if hi_word in top_predictions[:k_val]:
                correct_at_k[k_val] += 1

    if verbose and skipped:
        print(f"Skipped {skipped} word pairs because of errors during evaluation.")

    # Guard against division by zero when nothing could be evaluated.
    return {
        k_val: (correct_at_k[k_val] / total if total else 0.0)
        for k_val in k_values
    }

In [30]:
# Score the learned rotation on the English-Hindi dictionary pairs.
# NOTE(review): en_hi_pairs is the same list that was used to fit rotation_matrix,
# so these numbers are not measured on held-out data.
scores = evaluate_translation(
    en_hi_pairs, en_model, hi_model, rotation_matrix, top_k=[1, 5]
)

print(f"Precision@1: {scores[1]:.4f}\nPrecision@5: {scores[5]:.4f}")

Precision@1: 0.1659
Precision@5: 0.3073


**This means that**:

In about 16.6% of test cases, the top predicted Hindi word was the correct translation.

In about 30.7% of cases, the correct Hindi translation appeared within the top 5 predictions.

These values may appear modest at first glance, but they are consistent with results reported in the literature for distant language pairs like English–Hindi using unsupervised monolingual embeddings.

The relatively low Precision@1 reflects the inherent difficulty of aligning independently trained word embeddings from these two typologically different languages using a linear transformation method like Procrustes.

Precision@5 shows that the correct translation is often among the top candidates, indicating that the alignment is *semantically meaningful*, even if exact matches are not always ranked first.

In [32]:
def run_ablation_study(pair_list, en_model, hi_model, test_pairs, sizes=(5000, 10000, 20000)):
    """
    Runs an ablation study on bilingual lexicon sizes for supervised alignment.

    For every requested size, the first ``size`` entries of ``pair_list`` are
    used to fit a mapping with ``align_embeddings``; the mapping is then scored
    on ``test_pairs`` with ``evaluate_translation`` at k = 1 and k = 5.

    Args:
        pair_list (List[Tuple[str, str]]): Full bilingual lexicon (training pairs).
        en_model (FastText model): Pretrained English FastText model.
        hi_model (FastText model): Pretrained Hindi FastText model.
        test_pairs (List[Tuple[str, str]]): Test pairs for evaluation.
        sizes (Iterable[int]): Lexicon sizes to try.

    Returns:
        Dict[int, Dict[int, float]]: Dictionary mapping each requested lexicon
        size to its precision scores. Sizes that had to be skipped are absent.
    """
    results = {}

    for size in sizes:
        print(f"\n🔧 Running alignment with {size} training pairs...")

        # Limit training data
        train_subset = pair_list[:size]
        if not train_subset:
            # Procrustes cannot be solved without at least one training pair.
            print(f"Skipping size {size}: no training pairs available.")
            continue
        if len(train_subset) < size:
            # Slicing silently truncates; make the real training size visible.
            print(f"Only {len(train_subset)} pairs available for requested size {size}.")

        # Extract embeddings
        try:
            en_train_embeds = np.array([en_model.get_word_vector(e) for e, h in train_subset])
            hi_train_embeds = np.array([hi_model.get_word_vector(h) for e, h in train_subset])
        except Exception as e:
            print(f"Skipping size {size} due to embedding error: {e}")
            continue

        # Align (only the mapping matrix is needed for evaluation)
        _, rotation = align_embeddings(en_train_embeds, hi_train_embeds)

        # Evaluate
        precision_scores = evaluate_translation(
            translation_pairs=test_pairs,
            en_model=en_model,
            hi_model=hi_model,
            rotation_matrix=rotation,
            top_k=[1, 5]
        )

        print(f"Lexicon size {size}: P@1 = {precision_scores[1]:.4f}, P@5 = {precision_scores[5]:.4f}")
        results[size] = precision_scores

    return results

def plot_ablation_results(results_dict, save_path="ablation_study.jpg"):
    """
    Plots Precision@1 and Precision@5 against lexicon size.

    Args:
        results_dict (Dict[int, Dict[int, float]]): Mapping from lexicon size to
            a precision dict that contains the keys 1 and 5.
        save_path (str or None): File the figure is written to before it is
            shown. Pass None (or an empty string) to skip saving.

    Returns:
        None. The figure is saved (optionally) and displayed.
    """
    sizes = sorted(results_dict.keys())
    p1_scores = [results_dict[s][1] for s in sizes]
    p5_scores = [results_dict[s][5] for s in sizes]

    # Explicit figure/axes objects avoid relying on pyplot's implicit current figure.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(sizes, p1_scores, marker='o', label='Precision@1')
    ax.plot(sizes, p5_scores, marker='s', label='Precision@5')
    ax.set_title("Ablation Study: Impact of Lexicon Size on Alignment Quality")
    ax.set_xlabel("Lexicon Size")
    ax.set_ylabel("Precision")
    ax.legend()
    ax.grid(True)

    # Save before showing, while the figure is still guaranteed to hold the plot.
    if save_path:
        fig.savefig(save_path)
    plt.show()

In [None]:
# Fit and score the alignment for several training-lexicon sizes.
ablation_results = run_ablation_study(
    pair_list=en_hi_pairs,
    en_model=en_model,
    hi_model=hi_model,
    test_pairs=en_hi_pairs,  # or use a dedicated test set
    sizes=[5000, 10000, 20000]
)

# Print results clearly
# The loop variable is named size_scores so it does not overwrite the earlier `scores` dict.
for size, size_scores in ablation_results.items():
    print(f"Lexicon size {size} → P@1: {size_scores[1]:.4f}, P@5: {size_scores[5]:.4f}")


🔧 Running alignment with 5000 training pairs...
Lexicon size 5000: P@1 = 0.1259, P@5 = 0.2534

🔧 Running alignment with 10000 training pairs...
Lexicon size 10000: P@1 = 0.1483, P@5 = 0.2867

🔧 Running alignment with 20000 training pairs...


In [None]:
# Run and store results
# NOTE(review): this repeats the ablation run of the previous cell with identical
# arguments; keeping only one of the two cells would halve the run time.
ablation_results = run_ablation_study(
    pair_list=en_hi_pairs,
    en_model=en_model,
    hi_model=hi_model,
    test_pairs=en_hi_pairs,  # You can also use a separate test set if needed
    sizes=[5000, 10000, 20000]
)

# The loop variable is named size_scores so it does not overwrite the earlier `scores` dict.
for size, size_scores in ablation_results.items():
    print(f"Lexicon size {size} → Precision@1: {size_scores[1]:.4f}, Precision@5: {size_scores[5]:.4f}")

In [None]:
# Plot the results
# plot_ablation_results already writes the figure to "ablation_study.jpg" before
# calling plt.show(). The former trailing plt.savefig("ablation_study.png") ran
# after that show() call, when pyplot no longer had the plotted figure as its
# current figure in a notebook, so it would typically write an empty image.
plot_ablation_results(ablation_results)

Increasing the size of the training lexicon consistently improves both Precision@1 and Precision@5.

The most significant gain occurs when moving from 5k to 10k pairs, suggesting that a moderately sized bilingual dictionary already captures much of the necessary structure for alignment.

The improvements begin to plateau between 10k and 20k, indicating diminishing returns for larger dictionaries.

This trend highlights the practical trade-off between annotation cost (or lexicon availability) and alignment quality — and motivates the use of unsupervised methods when large dictionaries aren't available.

# Unsupervised Alignment (extra credit) 

In [None]:
# Fix the PyTorch RNG seed so that repeated runs draw the same random numbers
torch.manual_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Mapping W: learnable, bias-free linear map whose weight starts as the identity.
# Nothing in this class keeps W orthogonal once training updates it.
class Generator(nn.Module):
    """Linear mapping applied to source embeddings."""

    def __init__(self, dim):
        """Create a (dim x dim) linear layer without bias, initialized to the identity."""
        super().__init__()
        self.W = nn.Linear(dim, dim, bias=False)
        nn.init.eye_(self.W.weight)  # Start as identity

    def forward(self, x):
        """Apply the linear map to a batch of embeddings."""
        return self.W(x)

# Discriminator D: feed-forward binary classifier that outputs a value in (0, 1)
class Discriminator(nn.Module):
    """Two hidden layers of width 2048 with LeakyReLU(0.2), followed by a sigmoid output."""

    def __init__(self, dim):
        """Build the classifier for input vectors of size ``dim``."""
        super().__init__()
        hidden_width = 2048
        # Layers are created in input-to-output order and wrapped in one Sequential.
        stack = [
            nn.Linear(dim, hidden_width),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_width, hidden_width),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_width, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return one score per input row."""
        return self.model(x)

In [None]:
def load_and_normalize_embeddings(ft_model, max_vocab=50000):
    """
    Loads and normalizes FastText embeddings from a pretrained model.

    The first ``max_vocab`` entries returned by ``ft_model.get_words`` are looked
    up and each vector is scaled to unit L2 norm. Vectors with zero norm are
    left as all zeros instead of being divided by zero, so no NaN values end
    up in the returned tensor.

    Args:
        ft_model: A loaded fastText model (English or Hindi).
        max_vocab (int): Maximum number of words to load.

    Returns:
        embeddings (torch.Tensor): Normalized float32 embeddings (n_words x dim),
            moved to the module-level ``device``.
        vocab (List[str]): Corresponding word list.
    """
    vocab = ft_model.get_words(include_freq=False)[:max_vocab]
    embeddings = np.array([ft_model.get_word_vector(w) for w in vocab])
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Replace zero norms by 1 so that zero rows stay zero rather than becoming NaN.
    norms[norms == 0] = 1
    normalized = embeddings / norms
    return torch.tensor(normalized, dtype=torch.float32).to(device), vocab

In [None]:
# Build unit-normalized embedding tensors (plus word lists) from the first
# 50k vocabulary entries of each model
en_embeddings, en_vocab = load_and_normalize_embeddings(en_model, 50000)
hi_embeddings, hi_vocab = load_and_normalize_embeddings(hi_model, 50000)

# Report the resulting tensor shapes
print("English embedding shape:", en_embeddings.shape)
print("Hindi embedding shape:", hi_embeddings.shape)

In [None]:
def adversarial_train_loop(
    src_embeddings, tgt_embeddings,
    generator, discriminator,
    num_epochs=10, batch_size=128,
    d_steps=5, lr_g=0.1, lr_d=0.1
):
    """
    Runs adversarial training to learn a mapping from source to target space.

    Every epoch visits the source embeddings once in random order. For each
    source batch, an equally sized batch of target rows is drawn at random
    (with replacement) from ``tgt_embeddings``, so the two matrices may have
    different numbers of rows and no row-wise pairing between them is assumed.
    The discriminator is updated ``d_steps`` times on that batch, then the
    generator is updated once.

    Args:
        src_embeddings (torch.Tensor): English embeddings (n_src x d).
        tgt_embeddings (torch.Tensor): Hindi embeddings (n_tgt x d).
        generator (nn.Module): The mapping (W).
        discriminator (nn.Module): The binary classifier (D).
        num_epochs (int): Number of training epochs.
        batch_size (int): Batch size for both D and G.
        d_steps (int): Number of discriminator steps per generator step.
        lr_g (float): Learning rate for generator.
        lr_d (float): Learning rate for discriminator.

    Returns:
        generator (nn.Module): Trained generator (mapping W).

    Raises:
        ValueError: If either embedding matrix has no rows.
    """
    n_src = src_embeddings.shape[0]
    n_tgt = tgt_embeddings.shape[0]
    if n_src == 0 or n_tgt == 0:
        # Without data no batch would run and the loss report below would fail.
        raise ValueError("src_embeddings and tgt_embeddings must each contain at least one row")

    g_opt = optim.SGD(generator.parameters(), lr=lr_g)
    d_opt = optim.SGD(discriminator.parameters(), lr=lr_d)
    loss_fn = nn.BCELoss()

    for epoch in range(num_epochs):
        generator.train()
        discriminator.train()

        perm = torch.randperm(n_src)

        for i in range(0, n_src, batch_size):
            # Source batch: next slice of this epoch's shuffled order
            src_indices = perm[i:i+batch_size]
            real_src = src_embeddings[src_indices]

            # Target batch: independent random rows, bounded by the target's own size
            tgt_indices = torch.randint(0, n_tgt, (src_indices.shape[0],))
            real_tgt = tgt_embeddings[tgt_indices]

            # Mapped source vectors; detached so the D update does not touch the generator
            fake_tgt = generator(real_src).detach()

            # === Step 1: Train Discriminator ===
            for _ in range(d_steps):
                d_real_preds = discriminator(real_tgt)
                d_fake_preds = discriminator(fake_tgt)

                # Real target vectors are labelled 1, mapped source vectors 0
                d_loss_real = loss_fn(d_real_preds, torch.ones_like(d_real_preds))
                d_loss_fake = loss_fn(d_fake_preds, torch.zeros_like(d_fake_preds))
                d_loss = d_loss_real + d_loss_fake

                d_opt.zero_grad()
                d_loss.backward()
                d_opt.step()

            # === Step 2: Train Generator (to fool D) ===
            fake_tgt = generator(real_src)
            preds = discriminator(fake_tgt)
            g_loss = loss_fn(preds, torch.ones_like(preds))  # want D to think fake is real

            g_opt.zero_grad()
            g_loss.backward()
            g_opt.step()

        # Losses reported here are those of the last batch of the epoch
        print(f"Epoch {epoch+1}/{num_epochs} | D Loss: {d_loss.item():.4f} | G Loss: {g_loss.item():.4f}")

    return generator