# full code for unsupevised with adversarial training

In [5]:
import os
import gzip
import shutil
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.neighbors import NearestNeighbors

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Function to download files
def download_file(url, dest_path):
    if not os.path.exists(dest_path):
        print(f"Downloading from {url}...")
        response = requests.get(url, stream=True)
        with open(dest_path, 'wb') as file:
            shutil.copyfileobj(response.raw, file)
        print(f"Downloaded {dest_path}")
    else:
        print(f"{dest_path} already exists.")

# Function to extract gzip files
def extract_gzip(source_path, dest_path):
    if not os.path.exists(dest_path):
        print(f"Extracting {source_path}...")
        with gzip.open(source_path, 'rb') as f_in:
            with open(dest_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        print(f"Extracted to {dest_path}")
    else:
        print(f"{dest_path} already exists.")

# URLs for FastText embeddings (English and Hindi)
english_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
hindi_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz"

# Paths to save the downloaded and extracted files
download_dir = "./fasttext_embeddings/"
os.makedirs(download_dir, exist_ok=True)

# English embeddings paths
en_gzip_path = os.path.join(download_dir, "cc.en.300.vec.gz")
en_vec_path = os.path.join(download_dir, "cc.en.300.vec")

# Hindi embeddings paths
hi_gzip_path = os.path.join(download_dir, "cc.hi.300.vec.gz")
hi_vec_path = os.path.join(download_dir, "cc.hi.300.vec")

# Download English and Hindi embeddings
download_file(english_url, en_gzip_path)
download_file(hindi_url, hi_gzip_path)

# Extract the downloaded .gz files
extract_gzip(en_gzip_path, en_vec_path)
extract_gzip(hi_gzip_path, hi_vec_path)

# Function to load FastText embeddings from a .vec file
def load_fasttext_embeddings(file_path, limit=200000):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        next(f)  # Skip the header
        for i, line in enumerate(f):
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            vector = np.asarray(tokens[1:], dtype='float32')
            embeddings[word] = vector
            if len(embeddings) >= limit:
                break
    return embeddings

# Load English and Hindi embeddings
print("Loading English embeddings...")
english_embeddings = load_fasttext_embeddings(en_vec_path)
print("Loading Hindi embeddings...")
hindi_embeddings = load_fasttext_embeddings(hi_vec_path)

# Filter embeddings to common words only
common_words = set(english_embeddings.keys()).intersection(set(hindi_embeddings.keys()))
print(f"Number of common words: {len(common_words)}")

# Convert embeddings to matrices
en_embeddings = np.array([english_embeddings[word] for word in common_words])
hi_embeddings = np.array([hindi_embeddings[word] for word in common_words])

# Normalize embeddings
def normalize_embeddings(embeddings):
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / norms

en_embeddings = normalize_embeddings(en_embeddings)
hi_embeddings = normalize_embeddings(hi_embeddings)

# Adversarial Training Setup
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_dim, 2048)
        self.fc2 = nn.Linear(2048, 2048)
        self.fc3 = nn.Linear(2048, 1)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, x):
        x = self.leaky_relu(self.fc1(x))
        x = self.leaky_relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))

class MappingNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MappingNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        self.fc2 = nn.Linear(output_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        return self.fc2(x)

# Hyperparameters
input_dim = en_embeddings.shape[1]
output_dim = hi_embeddings.shape[1]
num_epochs = 100
batch_size = 64
lr = 0.0002

# Create instances of the Discriminator and Mapping Network
discriminator = Discriminator(input_dim).to(device)
mapping_network = MappingNetwork(input_dim, output_dim).to(device)

# Optimizers
d_optimizer = optim.Adam(discriminator.parameters(), lr=lr)
m_optimizer = optim.Adam(mapping_network.parameters(), lr=lr)

# Adversarial Training
# Adversarial Training
def adversarial_training(src_emb, tgt_emb, mapping_net, discriminator, num_epochs=10, batch_size=128, lr=0.001):
    # Optimizers
    mapping_optimizer = optim.Adam(mapping_net.parameters(), lr=lr)
    discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=lr)

    # Loss function
    adversarial_loss = nn.BCELoss()

    # Convert embeddings to PyTorch tensors
    src_emb = torch.tensor(src_emb, dtype=torch.float32)
    tgt_emb = torch.tensor(tgt_emb, dtype=torch.float32)

    for epoch in range(num_epochs):
        for i in range(0, len(src_emb), batch_size):
            # Sample a batch of source and target embeddings
            src_batch = src_emb[i:i+batch_size]
            tgt_batch = tgt_emb[i:i+batch_size]

            # Create labels for discriminator training
            src_labels = torch.zeros(src_batch.size(0))
            tgt_labels = torch.ones(tgt_batch.size(0))

            # Train discriminator
            discriminator_optimizer.zero_grad()
            src_pred = discriminator(src_batch)
            tgt_pred = discriminator(tgt_batch)
            loss_d = adversarial_loss(src_pred, src_labels) + adversarial_loss(tgt_pred, tgt_labels)
            loss_d.backward()
            discriminator_optimizer.step()

            # Train mapping to fool the discriminator
            mapping_optimizer.zero_grad()
            mapped_src_batch = mapping_net(src_batch)
            src_pred = discriminator(mapped_src_batch)
            loss_g = adversarial_loss(src_pred, tgt_labels)  # Fool the discriminator
            loss_g.backward()
            mapping_optimizer.step()

        print(f"Epoch {epoch+1}/{num_epochs}, Discriminator Loss: {loss_d.item()}, Generator Loss: {loss_g.item()}")

    # Return the learned mapping
    return mapping_net


# Procrustes Refinement
def procrustes_alignment(source_embeddings, target_embeddings):
    """Align source embeddings to target embeddings using Procrustes analysis."""
    source_centered = source_embeddings - np.mean(source_embeddings, axis=0)
    target_centered = target_embeddings - np.mean(target_embeddings, axis=0)
    covariance = np.dot(source_centered.T, target_centered)
    U, _, Vt = np.linalg.svd(covariance)
    rotation_matrix = np.dot(U, Vt)
    refined_source = np.dot(source_embeddings, rotation_matrix)
    return refined_source

# Refine the mapping using Procrustes
refined_en_embeddings = procrustes_alignment(en_embeddings, hi_embeddings)

# Function to calculate Cross-Domain Similarity Local Scaling (CSLS)
def calculate_csls(source_embeddings, target_embeddings, k=5):
    nbrs = NearestNeighbors(n_neighbors=k, metric='cosine').fit(target_embeddings)

    source_csls = np.mean(nbrs.kneighbors(source_embeddings, return_distance=False), axis=1)
    target_csls = np.mean(nbrs.kneighbors(target_embeddings, return_distance=False), axis=1)

    return source_csls, target_csls

# Calculate CSLS
source_csls, target_csls = calculate_csls(refined_en_embeddings, hi_embeddings)
print("CSLS Calculated.")

# Function to calculate Precision at 1 (P@1) and Precision at 5 (P@5)
def calculate_precision(source_embeddings, target_embeddings, common_words, k=5):
    # Create a mapping from word to its index for both languages
    word_list = list(common_words)
    en_word_to_index = {word: idx for idx, word in enumerate(word_list)}
    hi_word_to_index = {word: idx for idx, word in enumerate(word_list)}

    # Fit Nearest Neighbors on target embeddings
    nbrs = NearestNeighbors(n_neighbors=k, metric='cosine').fit(target_embeddings)

    # Initialize counters for correct predictions
    correct_at_1 = 0
    correct_at_5 = 0

    for idx, en_word in enumerate(word_list):
        hi_word = word_list[idx]  # Assuming a one-to-one mapping in the same index
        # Get the English word vector
        en_vector = source_embeddings[en_word_to_index[en_word]]
        # Find the k nearest neighbors in the target embeddings
        neighbors = nbrs.kneighbors([en_vector], return_distance=False)[0]

        # Check if the true translation is in the nearest neighbors
        if hi_word_to_index[hi_word] in neighbors:
            correct_at_1 += 1
        if any(hi_word_to_index[hi_word] == neighbor for neighbor in neighbors):
            correct_at_5 += 1

    precision_at_1 = correct_at_1 / len(common_words)
    precision_at_5 = correct_at_5 / len(common_words)
    return precision_at_1, precision_at_5

# Generate ground truth pairs from common words
ground_truth_pairs = [(word, word) for word in common_words]

# Calculate precision
precision_at_1, precision_at_5 = calculate_precision(refined_en_embeddings, hi_embeddings, ground_truth_pairs)
print(f"Precision at 1: {precision_at_1:.4f}")
print(f"Precision at 5: {precision_at_5:.4f}")



Using device: cpu
./fasttext_embeddings/cc.en.300.vec.gz already exists.
./fasttext_embeddings/cc.hi.300.vec.gz already exists.
./fasttext_embeddings/cc.en.300.vec already exists.
./fasttext_embeddings/cc.hi.300.vec already exists.
Loading English embeddings...
Loading Hindi embeddings...
Number of common words: 35732
CSLS Calculated.
Precision at 1: 0.5493
Precision at 5: 0.5493
