In [None]:
# Install necessary libraries
!pip install gensim numpy scikit-learn

# Download pre-trained embeddings for English and Hindi from FastText
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz

# Load pre-trained FastText embeddings using gensim
from gensim.models import KeyedVectors

# Cap the number of vectors read per language. Loading both complete files at once
# is the likely cause of the out-of-memory crashes mentioned at the end of this
# notebook; `limit` makes gensim stop after the first MAX_VOCAB entries of each file.
# NOTE(review): fastText .vec files are presumably ordered by descending word
# frequency, so this keeps the most common words -- confirm before relying on it.
MAX_VOCAB = 200_000

en_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz', limit=MAX_VOCAB)
hi_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz', limit=MAX_VOCAB)

# Check the size of the vocabulary for both languages
print(f'English vocab size: {len(en_embeddings.index_to_key)}')
print(f'Hindi vocab size: {len(hi_embeddings.index_to_key)}')

# Download the English-Hindi bilingual dictionary from MUSE
!wget https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt

# Load the bilingual dictionary (English-Hindi)
bilingual_lexicon = []
with open('en-hi.txt', 'r') as f:
    for line in f:
        en_word, hi_word = line.strip().split()
        # Ensure both words exist in the embeddings' vocabularies
        if en_word in en_embeddings.key_to_index and hi_word in hi_embeddings.key_to_index:
            bilingual_lexicon.append((en_word, hi_word))

# Check the size of the bilingual lexicon
print(f'Total bilingual pairs: {len(bilingual_lexicon)}')

import numpy as np

# Extract the embeddings for the bilingual lexicon pairs used to fit the alignment.
# The first 1000 lexicon pairs serve as the evaluation set further down
# (muse_test_dict), so they are kept out of the training matrices. Later pairs that
# share an English word with that evaluation set are dropped too, otherwise the
# alignment would still be fitted on the very words it is scored on.
NUM_HELD_OUT_PAIRS = 1000
held_out_en_words = {word_en for word_en, _ in bilingual_lexicon[:NUM_HELD_OUT_PAIRS]}
train_lexicon = [
    (word_en, word_hi)
    for word_en, word_hi in bilingual_lexicon[NUM_HELD_OUT_PAIRS:]
    if word_en not in held_out_en_words
]
if not train_lexicon:
    raise ValueError(
        'No training pairs left after holding out the evaluation pairs; '
        'the bilingual lexicon is too small.'
    )

# Row i of X (English) is paired with row i of Y (Hindi).
X = np.array([en_embeddings[word_en] for word_en, _ in train_lexicon])
Y = np.array([hi_embeddings[word_hi] for _, word_hi in train_lexicon])

# Procrustes alignment method
def procrustes(X, Y):
    """Return the orthogonal matrix R that best maps the rows of X onto the rows of Y.

    X and Y are 2-D arrays with one embedding per row; row i of X is paired with
    row i of Y. R is built from the SVD of X^T Y as U @ V^T, the solution of the
    orthogonal Procrustes problem min ||X @ R - Y||_F.
    """
    cross_covariance = X.T @ Y
    left_vectors, _singular_values, right_vectors_t = np.linalg.svd(cross_covariance)
    return left_vectors @ right_vectors_t

# Fit the orthogonal map on the paired lexicon matrices
R = procrustes(X, Y)

# Rotate every English vector into the Hindi space with that map
aligned_en_embeddings = en_embeddings.vectors @ R

from sklearn.metrics.pairwise import cosine_similarity

# Function to translate words from English to Hindi using aligned embeddings
def translate_word(word, aligned_en_embeddings, hi_embeddings):
    """Return the Hindi word whose vector is most cosine-similar to the aligned vector of `word`.

    The row of `word` in `aligned_en_embeddings` is found through the module-level
    `en_embeddings.key_to_index`, so a word missing from that mapping fails at
    the lookup.
    """
    row = en_embeddings.key_to_index[word]
    query = aligned_en_embeddings[row].reshape(1, -1)
    scores = cosine_similarity(query, hi_embeddings.vectors)
    # `scores` has a single row, so its flat argmax is the index of the best Hindi word.
    best_index = scores.argmax()
    return hi_embeddings.index_to_key[best_index]

# Translate one sample English word and show the result
word = 'duck'
translation = translate_word(word, aligned_en_embeddings, hi_embeddings)
print(f'Translation of "{word}" is "{translation}"')

# Function to evaluate translation quality using Precision@1 and Precision@k (k=5 by default)
def evaluate_translation(test_dict, aligned_en_embeddings, hi_embeddings, k=5):
    """Score nearest-neighbour translation on (English, Hindi) word pairs.

    Args:
        test_dict: sequence of (en_word, hi_word) pairs. English words are located
            through the module-level `en_embeddings.key_to_index`.
        aligned_en_embeddings: array indexed by those English row numbers.
        hi_embeddings: object exposing `.vectors` and `.index_to_key` for Hindi.
        k: number of nearest Hindi words considered for the second score.

    Returns:
        (precision_at_1, precision_at_k): the fraction of pairs whose Hindi word is
        the top neighbour / among the top k neighbours. An empty `test_dict`
        yields (0.0, 0.0) instead of dividing by zero.
    """
    total_pairs = len(test_dict)
    if total_pairs == 0:
        return 0.0, 0.0

    # An English word can occur in several pairs; its neighbour list does not depend
    # on the Hindi side, so the full-vocabulary search runs once per distinct word.
    neighbours_by_word = {}
    correct_at_1 = 0
    correct_at_5 = 0
    for en_word, hi_word in test_dict:
        top_k_words = neighbours_by_word.get(en_word)
        if top_k_words is None:
            en_vector = aligned_en_embeddings[en_embeddings.key_to_index[en_word]]
            similarities = cosine_similarity(en_vector.reshape(1, -1), hi_embeddings.vectors)
            # Ascending argsort: take the last k indices and reverse them (best first).
            top_k_indices = similarities[0].argsort()[-k:][::-1]
            top_k_words = [hi_embeddings.index_to_key[i] for i in top_k_indices]
            neighbours_by_word[en_word] = top_k_words

        if hi_word == top_k_words[0]:
            correct_at_1 += 1
        if hi_word in top_k_words:
            correct_at_5 += 1

    precision_at_1 = correct_at_1 / total_pairs
    precision_at_5 = correct_at_5 / total_pairs
    return precision_at_1, precision_at_5

# Evaluation set: the first 1000 pairs of the bilingual lexicon
muse_test_dict = bilingual_lexicon[:1000]

# Precision@1 / Precision@5 of nearest-neighbour translation on that set
precision_at_1, precision_at_5 = evaluate_translation(
    muse_test_dict,
    aligned_en_embeddings,
    hi_embeddings,
)
print(f'Precision@1: {precision_at_1}')
print(f'Precision@5: {precision_at_5}')

# Step d: Compute and analyze cosine similarities between word pairs
def analyze_cosine_similarity(bilingual_lexicon, aligned_en_embeddings, hi_embeddings, num_pairs=10):
    """Compute the cosine similarity of the first `num_pairs` lexicon entries.

    Args:
        bilingual_lexicon: iterable of (en_word, hi_word) pairs.
        aligned_en_embeddings: array indexed by the English row numbers taken from
            the module-level `en_embeddings.key_to_index`.
        hi_embeddings: mapping from a Hindi word to its vector.
        num_pairs: number of leading lexicon entries to examine.

    Returns:
        List of (en_word, hi_word, similarity) tuples. Entries whose words are
        missing from either embedding set are skipped but still count towards
        `num_pairs`, so the list can be shorter than `num_pairs`.
    """
    similarities = []

    for i, (en_word, hi_word) in enumerate(bilingual_lexicon):
        # Check the limit before doing any work: checking after the append, as this
        # function used to do, examined num_pairs + 1 entries.
        if i >= num_pairs:
            break

        if en_word in en_embeddings and hi_word in hi_embeddings:
            # Get the aligned English embedding and the corresponding Hindi embedding
            en_vector = aligned_en_embeddings[en_embeddings.key_to_index[en_word]]
            hi_vector = hi_embeddings[hi_word]

            # cosine_similarity returns a 1x1 matrix for two single-row inputs
            similarity = cosine_similarity(en_vector.reshape(1, -1), hi_vector.reshape(1, -1))[0][0]
            similarities.append((en_word, hi_word, similarity))

    return similarities

# Run the similarity analysis on the leading lexicon pairs and print it as a table
cosine_similarities = analyze_cosine_similarity(
    bilingual_lexicon,
    aligned_en_embeddings,
    hi_embeddings,
    num_pairs=10,
)

# Header row, then one fixed-width row per (English, Hindi, similarity) triple
print(f"{'English Word':<15}{'Hindi Word':<15}{'Cosine Similarity'}")
for en_word, hi_word, sim in cosine_similarities:
    print(f"{en_word:<15}{hi_word:<15}{sim:.4f}")


# Step e: Conduct an ablation study to assess the impact of bilingual lexicon size
def ablation_study(bilingual_lexicon, en_embeddings, hi_embeddings, sizes=(5000, 10000, 20000), test_dict=None):
    """Fit a Procrustes alignment on growing lexicon prefixes and score each one.

    Args:
        bilingual_lexicon: list of (en_word, hi_word) pairs; the first `size`
            pairs are used for training at each step.
        en_embeddings: English embeddings (supports `in`, indexing and `.vectors`).
        hi_embeddings: Hindi embeddings (same access pattern).
        sizes: lexicon prefix lengths to try. The default is a tuple so that no
            mutable object is shared between calls.
        test_dict: optional fixed list of (en_word, hi_word) pairs to score on.
            When omitted, each alignment is scored on its own training prefix
            (the original behaviour), which measures fit rather than
            generalisation and uses a different test set for every size.

    Returns:
        List of (size, precision_at_1, precision_at_5) tuples, one per entry of `sizes`.

    Note:
        Scoring is delegated to `evaluate_translation`, which finds English rows
        through the module-level `en_embeddings`, not through this function's
        argument.
    """
    results = []

    for size in sizes:
        # Limit the lexicon to the current size
        current_lexicon = bilingual_lexicon[:size]

        # Filter once so that X and Y are guaranteed to stay row-aligned
        usable_pairs = [
            (word_en, word_hi)
            for word_en, word_hi in current_lexicon
            if word_en in en_embeddings and word_hi in hi_embeddings
        ]
        X = np.array([en_embeddings[word_en] for word_en, _ in usable_pairs])
        Y = np.array([hi_embeddings[word_hi] for _, word_hi in usable_pairs])

        # Perform Procrustes alignment
        R = procrustes(X, Y)
        aligned_en_embeddings = np.dot(en_embeddings.vectors, R)

        # Evaluate using Precision@1 and Precision@5
        evaluation_pairs = current_lexicon if test_dict is None else test_dict
        precision_at_1, precision_at_5 = evaluate_translation(evaluation_pairs, aligned_en_embeddings, hi_embeddings)

        # Store results for analysis
        results.append((size, precision_at_1, precision_at_5))

    return results

# Fit and score the alignment for three lexicon sizes
ablation_results = ablation_study(
    bilingual_lexicon,
    en_embeddings,
    hi_embeddings,
    sizes=[5000, 10000, 20000],
)

# Tabulate: header first, then one row per lexicon size
print(f"{'Lexicon Size':<15}{'Precision@1':<15}{'Precision@5'}")
for size, p1, p5 in ablation_results:
    print(f"{size:<15}{p1:<15}{p5}")



--2024-09-26 13:21:42--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.81, 18.164.78.72, 18.164.78.128, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz.3’


2024-09-26 13:22:23 (31.0 MB/s) - ‘cc.en.300.vec.gz.3’ saved [1325960915/1325960915]

--2024-09-26 13:22:23--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.162.125.57, 3.162.125.58, 3.162.125.66, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.162.125.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1118942272 (1.0G) [binary/octet-stream]
Saving to: ‘cc.hi.300.vec.gz.3’


2024-09-26 13:23:12 (22.2 MB/s) - ‘cc.hi.300.vec.gz.3’ saved 

# CSLS method

In [None]:
!pip install torch faiss-cpu gensim numpy scikit-learn

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz

from gensim.models import KeyedVectors

# Cap the number of vectors read per language. Loading both complete files at once
# is the likely cause of the out-of-memory crashes mentioned at the end of this
# notebook; `limit` makes gensim stop after the first MAX_VOCAB entries of each file.
# NOTE(review): fastText .vec files are presumably ordered by descending word
# frequency, so this keeps the most common words -- confirm before relying on it.
MAX_VOCAB = 200_000

en_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz', limit=MAX_VOCAB)
hi_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz', limit=MAX_VOCAB)

# vocabulary size of both embeddings
print(f'English vocab size: {len(en_embeddings.index_to_key)}')
print(f'Hindi vocab size: {len(hi_embeddings.index_to_key)}')


# English-Hindi bilingual dictionary from MUSE
!wget https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt

# Load the bilingual dictionary
bilingual_lexicon = []
with open('en-hi.txt', 'r') as f:
    for line in f:
        en_word, hi_word = line.strip().split()
        if en_word in en_embeddings.key_to_index and hi_word in hi_embeddings.key_to_index:
            bilingual_lexicon.append((en_word, hi_word))

print(f'Total bilingual pairs: {len(bilingual_lexicon)}')


# CSLS function (Unsupervised Alignment)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_csls(src_emb, tgt_emb, k=10):
    """Return the CSLS score matrix between source rows and target rows.

    Entry (i, j) of the result is
        2 * cos(src_i, tgt_j) - r_src[i] - r_tgt[j]
    where r_src[i] is the mean cosine similarity of source i to its k most similar
    targets and r_tgt[j] is the mean similarity of target j to its k most similar
    sources. If k exceeds the number of candidates, all of them are averaged.
    """
    cosine = cosine_similarity(src_emb, tgt_emb)  # rows: sources, columns: targets

    # Neighbourhood means: sort ascending, keep the k largest values, average them.
    k_best_per_source = np.sort(cosine, axis=1)[:, -k:]
    r_src = k_best_per_source.mean(axis=1)
    k_best_per_target = np.sort(cosine.T, axis=1)[:, -k:]
    r_tgt = k_best_per_target.mean(axis=1)

    # Work in (target, source) layout so r_src broadcasts along the last axis and
    # r_tgt along the first; transpose back to (source, target) on return.
    corrected = 2 * cosine.T
    corrected = corrected - r_src
    corrected = corrected - r_tgt[:, np.newaxis]
    return corrected.T


# the Mapping Network and Discriminator for Adversarial Training
import torch
import torch.nn as nn
import torch.optim as optim

# the mapping network: a single square linear transformation
class MappingNetwork(nn.Module):
    """Bias-free linear map from the embedding space onto itself.

    The weight is an ordinary `nn.Linear` parameter; nothing in this class
    constrains it to be orthogonal.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.mapping = nn.Linear(
            in_features=embedding_dim,
            out_features=embedding_dim,
            bias=False,
        )

    def forward(self, x):
        """Apply the linear map to `x` (last dimension = embedding_dim)."""
        projected = self.mapping(x)
        return projected

# the discriminator network: a small binary classifier over single embeddings
class Discriminator(nn.Module):
    """Two-layer perceptron producing one sigmoid score per input vector.

    Architecture: Linear(embedding_dim, 128) -> ReLU -> Linear(128, 1) -> Sigmoid.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(embedding_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return a flat tensor with one score in [0, 1] per row of `x`."""
        scores = self.model(x)
        # Collapse the trailing singleton dimension so the output matches 1-D labels.
        return scores.view(-1)

# adversarial training of the Mapping Network and Discriminator
def adversarial_training(src_emb, tgt_emb, mapping_net, discriminator, num_epochs=10, batch_size=128, lr=0.001):
    """Train `mapping_net` adversarially so mapped source vectors resemble target vectors.

    Every step has two phases:
      1. the discriminator learns to output 0 for *mapped* source vectors and 1 for
         target vectors;
      2. the mapping is updated so that the discriminator outputs 1 for mapped
         source vectors.

    Args:
        src_emb: 2-D array-like of source embeddings, one per row.
        tgt_emb: 2-D array-like of target embeddings, one per row. It may have a
            different number of rows than `src_emb`.
        mapping_net: module applied to source batches; updated in place.
        discriminator: module returning one probability per input row; updated in place.
        num_epochs: number of passes over the source embeddings.
        batch_size: number of source (and target) vectors per step.
        lr: learning rate of both Adam optimizers.

    Returns:
        `mapping_net`, the same object that was passed in.

    Raises:
        ValueError: if `src_emb` or `tgt_emb` contains no rows.
    """
    # Optimizers
    mapping_optimizer = optim.Adam(mapping_net.parameters(), lr=lr)
    discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=lr)

    # Loss function
    adversarial_loss = nn.BCELoss()

    # Convert embeddings to PyTorch tensors (as_tensor also accepts existing tensors)
    src_emb = torch.as_tensor(src_emb, dtype=torch.float32)
    tgt_emb = torch.as_tensor(tgt_emb, dtype=torch.float32)

    num_src = len(src_emb)
    num_tgt = len(tgt_emb)
    if num_src == 0 or num_tgt == 0:
        # Without this check the epoch summary below would reference losses that
        # were never computed.
        raise ValueError("adversarial_training needs at least one source and one target embedding")

    for epoch in range(num_epochs):
        # Visit the source vectors in a fresh random order every epoch.
        order = torch.randperm(num_src)
        for start in range(0, num_src, batch_size):
            src_batch = src_emb[order[start:start + batch_size]]
            # Targets are drawn independently of the sources: the two sets need not
            # be row-aligned or equally long.
            tgt_rows = torch.randint(0, num_tgt, (src_batch.size(0),))
            tgt_batch = tgt_emb[tgt_rows]

            # Create labels for discriminator training
            src_labels = torch.zeros(src_batch.size(0))
            tgt_labels = torch.ones(tgt_batch.size(0))

            # Train discriminator on mapped source vs. target. The mapped batch is
            # computed without gradients so this phase updates the discriminator only.
            discriminator_optimizer.zero_grad()
            with torch.no_grad():
                mapped_for_discriminator = mapping_net(src_batch)
            src_pred = discriminator(mapped_for_discriminator)
            tgt_pred = discriminator(tgt_batch)
            loss_d = adversarial_loss(src_pred, src_labels) + adversarial_loss(tgt_pred, tgt_labels)
            loss_d.backward()
            discriminator_optimizer.step()

            # Train mapping to fool the discriminator
            mapping_optimizer.zero_grad()
            mapped_src_batch = mapping_net(src_batch)
            src_pred = discriminator(mapped_src_batch)
            loss_g = adversarial_loss(src_pred, tgt_labels)  # Fool the discriminator
            loss_g.backward()
            mapping_optimizer.step()

        # Losses reported are those of the last batch of the epoch.
        print(f"Epoch {epoch+1}/{num_epochs}, Discriminator Loss: {loss_d.item()}, Generator Loss: {loss_g.item()}")

    # Return the learned mapping
    return mapping_net


# align the source embeddings with the learned mapping and evaluate CSLS
# set up the mapping network and discriminator
embedding_dim = en_embeddings.vector_size
mapping_net = MappingNetwork(embedding_dim)
discriminator = Discriminator(embedding_dim)

# convert the source and target embeddings to numpy arrays for adversarial training;
# row i of each matrix belongs to bilingual_lexicon[i]
src_emb = np.array([en_embeddings[word_en] for word_en, _ in bilingual_lexicon])
tgt_emb = np.array([hi_embeddings[word_hi] for _, word_hi in bilingual_lexicon])

# train the mapping using adversarial training
mapping_net = adversarial_training(src_emb, tgt_emb, mapping_net, discriminator)

# align the source embeddings with the learned mapping.
# This is inference only: no_grad keeps PyTorch from recording an autograd graph
# for the whole matrix, and as_tensor avoids an extra copy of the numpy data.
with torch.no_grad():
    aligned_src_emb = mapping_net(torch.as_tensor(src_emb, dtype=torch.float32)).numpy()

# Evaluate using CSLS
def evaluate_with_csls(src_emb, tgt_emb, k=5):
    """Score CSLS retrieval of lexicon translations with Precision@1 and Precision@k.

    Row i of `src_emb` and row i of `tgt_emb` must hold the vectors of the
    module-level `bilingual_lexicon[i]`: the Hindi candidates are exactly the rows
    of `tgt_emb`, and their words are read from that lexicon.

    Args:
        src_emb: mapped source embeddings, one row per lexicon pair.
        tgt_emb: target embeddings, one row per lexicon pair.
        k: number of best-scoring candidates considered for the second score.

    Returns:
        (precision_at_1, precision_at_k) as fractions of the source rows;
        (0.0, 0.0) when `src_emb` is empty.
    """
    num_queries = len(src_emb)
    if num_queries == 0:
        return 0.0, 0.0

    csls_similarities = compute_csls(src_emb, tgt_emb)

    # Column j of the score matrix is row j of tgt_emb, i.e. the Hindi word of lexicon
    # pair j. It is NOT entry j of the hi_embeddings vocabulary, so the candidate
    # words must come from the lexicon.
    candidate_words = [hi_word for _, hi_word in bilingual_lexicon]

    # Evaluate Precision@1 and Precision@k
    hits_at_1 = 0
    hits_at_k = 0

    for i, similarities in enumerate(csls_similarities):
        # Ascending argsort: take the last k indices and reverse them (best first).
        top_k_indices = similarities.argsort()[-k:][::-1]
        top_k_words = [candidate_words[j] for j in top_k_indices]
        _, hi_word = bilingual_lexicon[i]

        if hi_word == top_k_words[0]:
            hits_at_1 += 1
        if hi_word in top_k_words:
            hits_at_k += 1

    # Normalise by the number of rows actually scored.
    precision_at_1 = hits_at_1 / num_queries
    precision_at_5 = hits_at_k / num_queries

    return precision_at_1, precision_at_5

# Score the adversarially learned mapping with CSLS retrieval
precision_at_1_csls, precision_at_5_csls = evaluate_with_csls(
    aligned_src_emb,
    tgt_emb,
)
print(f"Unsupervised CSLS - Precision@1: {precision_at_1_csls}, Precision@5: {precision_at_5_csls}")

# Supervised Procrustes baseline.
# evaluate_translation, muse_test_dict and aligned_en_embeddings are not defined in
# this cell; they come from the Procrustes section earlier in the notebook, which
# therefore has to be run first.
precision_at_1_procrustes, precision_at_5_procrustes = evaluate_translation(
    muse_test_dict,
    aligned_en_embeddings,
    hi_embeddings,
)
print(f"Supervised Procrustes - Precision@1: {precision_at_1_procrustes}, Precision@5: {precision_at_5_procrustes}")

# Side-by-side summary of both methods
print(f"Supervised Procrustes: Precision@1 = {precision_at_1_procrustes}, Precision@5 = {precision_at_5_procrustes}")
print(f"Unsupervised CSLS + Adversarial: Precision@1 = {precision_at_1_csls}, Precision@5 = {precision_at_5_csls}")

--2024-09-26 16:49:11--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.81, 18.164.78.128, 18.164.78.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz.7’


2024-09-26 16:49:25 (97.7 MB/s) - ‘cc.en.300.vec.gz.7’ saved [1325960915/1325960915]

--2024-09-26 16:49:25--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.78.81, 18.164.78.128, 18.164.78.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.78.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1118942272 (1.0G) [binary/octet-stream]
Saving to: ‘cc.hi.300.vec.gz.7’


2024-09-26 16:49:44 (56.8 MB/s) - ‘cc.hi.300.vec.gz.7’ sav

I don't have enough RAM, so the free Colab runtime keeps crashing, but this code should work.