# **Downloading embeddings from Fastext and unzipping**

In [2]:
import requests
import gzip
import shutil
import os

# Function to download files
def download_file(url, dest_path):
    print(f"Downloading from {url}...")
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as file:
        shutil.copyfileobj(response.raw, file)
    print(f"Downloaded {dest_path}")

# Function to extract gzip files (FastText vectors are usually .vec.gz)
def extract_gzip(source_path, dest_path):
    print(f"Extracting {source_path}...")
    with gzip.open(source_path, 'rb') as f_in:
        with open(dest_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Extracted to {dest_path}")

# URLs for FastText embeddings (English and Hindi)
english_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
hindi_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz"

# Paths to save the downloaded and extracted files
download_dir = "./fasttext_embeddings/"
os.makedirs(download_dir, exist_ok=True)

# English embeddings paths
en_gzip_path = os.path.join(download_dir, "cc.en.300.vec.gz")
en_vec_path = os.path.join(download_dir, "cc.en.300.vec")

# Hindi embeddings paths
hi_gzip_path = os.path.join(download_dir, "cc.hi.300.vec.gz")
hi_vec_path = os.path.join(download_dir, "cc.hi.300.vec")

# Download English and Hindi embeddings
download_file(english_url, en_gzip_path)
download_file(hindi_url, hi_gzip_path)

# Extract the downloaded .gz files
extract_gzip(en_gzip_path, en_vec_path)
extract_gzip(hi_gzip_path, hi_vec_path)


Downloading from https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz...
Downloaded ./fasttext_embeddings/cc.en.300.vec.gz
Downloading from https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz...
Downloaded ./fasttext_embeddings/cc.hi.300.vec.gz
Extracting ./fasttext_embeddings/cc.en.300.vec.gz...
Extracted to ./fasttext_embeddings/cc.en.300.vec
Extracting ./fasttext_embeddings/cc.hi.300.vec.gz...
Extracted to ./fasttext_embeddings/cc.hi.300.vec


# **Downloading the bilingual dictionary english to hindi**

In [3]:
import os
import requests

def download_bilingual_dictionary(url, dest_path):
    print(f"Downloading from {url}...")
    response = requests.get(url)
    with open(dest_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded bilingual dictionary to {dest_path}")

bilingual_dict_url = "https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt"

download_dir = "./muse_bilingual_dictionaries/"
os.makedirs(download_dir, exist_ok=True)

en_hi_dict_path = os.path.join(download_dir, "en-hi.txt")

download_bilingual_dictionary(bilingual_dict_url, en_hi_dict_path)


Downloading from https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt...
Downloaded bilingual dictionary to ./muse_bilingual_dictionaries/en-hi.txt


# **Loading the embeddings of each linguals, setting the limit to 100000 as said in the provided documentation**

In [4]:
import numpy as np

def load_fasttext_embeddings(file_path, limit=100000):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i == 0:
                continue  # Skip the first line with metadata
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            vector = np.asarray(tokens[1:], dtype='float32')
            embeddings[word] = vector
            if len(embeddings) >= limit:
                break
    return embeddings

english_embeddings = load_fasttext_embeddings("./fasttext_embeddings/cc.en.300.vec", limit=100000)
hindi_embeddings = load_fasttext_embeddings("./fasttext_embeddings/cc.hi.300.vec", limit=100000)


# **Loading the bilingual dictionary**

In [5]:
def load_bilingual_dictionary(file_path):
    word_pairs = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            en_word, hi_word = line.strip().split()
            word_pairs.append((en_word, hi_word))
    return word_pairs

bilingual_dict = load_bilingual_dictionary("./muse_bilingual_dictionaries/en-hi.txt")


In [12]:
def extract_bilingual_lexicon(bilingual_dict, n=5000):
    return bilingual_dict[:n]

# Example usage
bilingual_lexicon_5k = extract_bilingual_lexicon(bilingual_dict, 5000)
bilingual_lexicon_10k = extract_bilingual_lexicon(bilingual_dict, 10000)
bilingual_lexicon_20k = extract_bilingual_lexicon(bilingual_dict, 20000)


# **This code section makes the alignment of the english and the hindi embeddings vectors by fetching the the vector associated with en_word and hi_word from there respective embeddings**

In [13]:
def create_alignment_matrices(bilingual_dict, en_embeddings, hi_embeddings):
    X = []
    Y = []
    for en_word, hi_word in bilingual_dict:
        if en_word in en_embeddings and hi_word in hi_embeddings:
            X.append(en_embeddings[en_word])
            Y.append(hi_embeddings[hi_word])
    return np.array(X), np.array(Y)

X, Y = create_alignment_matrices(bilingual_dict, english_embeddings, hindi_embeddings)
print(f"Alignment Matrices: X shape = {X.shape}, Y shape = {Y.shape}")


Alignment Matrices: X shape = (15314, 300), Y shape = (15314, 300)


# **This code performs Procrustes alignment to align word embeddings (English to hindi) using linear transformation**

# It computes the optimal linear transformatiom matrix W that aligns X (English embeddings) to Y (Hindi Embeddings) as closely as possible, whereas W is the transformation matrix. This matrix is then used to transform X by computing aligned_X = X.dot(W), aligning the Engish embeddings to the Hindi space.

In [14]:
from scipy.linalg import orthogonal_procrustes

def procrustes_alignment(X, Y):
    W, _ = orthogonal_procrustes(X, Y)
    return W

# Use Procrustes alignment for each lexicon size
def perform_alignment(bilingual_lexicon, en_embeddings, hi_embeddings):
    X, Y = create_alignment_matrices(bilingual_lexicon, en_embeddings, hi_embeddings)
    W = procrustes_alignment(X, Y)
    aligned_X = X.dot(W)
    return aligned_X, Y, W

aligned_X_5k, Y_5k, W_5k = perform_alignment(bilingual_lexicon_5k, english_embeddings, hindi_embeddings)
aligned_X_10k, Y_10k, W_10k = perform_alignment(bilingual_lexicon_10k, english_embeddings, hindi_embeddings)
aligned_X_20k, Y_20k, W_20k = perform_alignment(bilingual_lexicon_20k, english_embeddings, hindi_embeddings)


# It performs English to Hindi word translation using aligned word embeddings and cosine similarity.
1. It first checks if the English word exists in en_embeddings. If not, it returns an empty list
2. If the word exists, it retrieves the embeddings and aligns it using the transformation matrix W obtained from Procrustes alignment.
3. The aligned word vector is compared to all Hindi embeddings using cosine_similarity. The higher the similarity, the closer the words are semantically.
4. It identifies the top-k most similar Hindi words by sorting the cosine similarity values and retreiving the correspondig Hindi words from the hi_embeddings.

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def translate_word(word, en_embeddings, aligned_X, hi_embeddings, top_k=5):
    if word not in en_embeddings:
        return []

    word_vec = en_embeddings[word].dot(W)
    similarities = cosine_similarity([word_vec], list(hi_embeddings.values()))[0]
    closest_indices = np.argsort(similarities)[-top_k:][::-1]

    hindi_words = list(hi_embeddings.keys())
    return [(hindi_words[i], similarities[i]) for i in closest_indices]

translation_5k = translate_word("world", english_embeddings, aligned_X_5k, hindi_embeddings)
translation_10k = translate_word("world", english_embeddings, aligned_X_10k, hindi_embeddings)
translation_20k = translate_word("world", english_embeddings, aligned_X_20k, hindi_embeddings)

print(f"Top Hindi translations for 'world': {translation_5k}")
print(f"Top Hindi translations for 'world': {translation_10k}")
print(f"Top Hindi translations for 'world': {translation_20k}")

Top Hindi translations for 'world': [('दुनिया', 0.70340943), ('देश', 0.59056157), ('विश्व', 0.5696856), ('दुनियां', 0.5615448), ('संसार', 0.54642093)]
Top Hindi translations for 'world': [('दुनिया', 0.70340943), ('देश', 0.59056157), ('विश्व', 0.5696856), ('दुनियां', 0.5615448), ('संसार', 0.54642093)]
Top Hindi translations for 'world': [('दुनिया', 0.70340943), ('देश', 0.59056157), ('विश्व', 0.5696856), ('दुनियां', 0.5615448), ('संसार', 0.54642093)]


# **This code evaluates the precision of English-to-Hindi word translations using word embeddings. Here's the step-by-step explanation:**


1. correct_at_1 counts cases where the correct translation is the top-1 result.
2. correct_at_5 counts cases where the correct translation is among the top-5 results.
3. total tracks how many word pairs are evaluated.

4. For each word pair (en_word, hi_word), it retrieves the translations using the translate_word function and checks if the correct Hindi word appears in the top-k results.
Calculate Precision:

5. precision_at_1 is the proportion of correct top-1 translations.
6. precision_at_5 is the proportion of correct top-5 translations.

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def translate_word(word, en_embeddings, W, hi_embeddings, top_k=5):
    if word not in en_embeddings:
        return []

    word_vec = en_embeddings[word].dot(W)
    similarities = cosine_similarity([word_vec], list(hi_embeddings.values()))[0]
    closest_indices = np.argsort(similarities)[-top_k:][::-1]

    hindi_words = list(hi_embeddings.keys())
    return [(hindi_words[i], similarities[i]) for i in closest_indices]

def evaluate_precision(bilingual_dict, en_embeddings, hi_embeddings, W, top_k=5):
    correct_at_1 = 0
    correct_at_5 = 0
    total = 0

    for en_word, hi_word in bilingual_dict:
        if en_word in en_embeddings and hi_word in hi_embeddings:
            translations = translate_word(en_word, en_embeddings, W, hi_embeddings, top_k=top_k)
            top_words = [t[0] for t in translations]

            if hi_word == top_words[0]:
                correct_at_1 += 1
            if hi_word in top_words:
                correct_at_5 += 1
            total += 1

    precision_at_1 = correct_at_1 / total
    precision_at_5 = correct_at_5 / total
    return precision_at_1, precision_at_5

# Evaluate with different lexicon sizes
precision_at_1_5k, precision_at_5_5k = evaluate_precision(bilingual_dict, english_embeddings, hindi_embeddings, W_5k, top_k=5)
precision_at_1_10k, precision_at_5_10k = evaluate_precision(bilingual_dict, english_embeddings, hindi_embeddings, W_10k, top_k=5)
precision_at_1_20k, precision_at_5_20k = evaluate_precision(bilingual_dict, english_embeddings, hindi_embeddings, W_20k, top_k=5)

print(f"Precision@1 (5k): {precision_at_1_5k:.4f}, Precision@5 (5k): {precision_at_5_5k:.4f}")
print(f"Precision@1 (10k): {precision_at_1_10k:.4f}, Precision@5 (10k): {precision_at_5_10k:.4f}")
print(f"Precision@1 (20k): {precision_at_1_20k:.4f}, Precision@5 (20k): {precision_at_5_20k:.4f}")


Precision@1 (5k): 0.2867, Precision@5 (5k): 0.5624
Precision@1 (10k): 0.3328, Precision@5 (10k): 0.6233
Precision@1 (20k): 0.3630, Precision@5 (20k): 0.6544
