### Objective : English to french translation


## Algorithm 

    1. Suppose X and Y are vector space of english and french respectively.
    2. If we multiply R with X we will get Y. i.e. X * R = Y
    3. Once you are in french vector space, you will have to find the nearest neighbors.

In [1]:
from argparse import Namespace
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import itertools
from collections import defaultdict

In [2]:
## english to frech dictionaries
config = Namespace(
    train_file_path = './data/en-fr.train.txt',
    test_file_path = './data/en-fr.test.txt',
    
    english_embedding_path = './subset_embedding/en_embeddings.p',
    french_embedding_path = './subset_embedding/fr_embeddings.p'
)

In [3]:
## Subset of embedding
en_embedding  = pickle.load(open(config.english_embedding_path, "rb"))
fr_embedding  = pickle.load(open(config.french_embedding_path, "rb"))


len(en_embedding), len(fr_embedding)

(6370, 5766)

## Load Data

In [4]:
train_pairs = []

with open(config.train_file_path) as f:
    for line in f.readlines():
        line = line.strip().split(' ')
        if len(line)==2:
            train_pairs.append([line[0], line[1]])
            
            
test_pairs = []

with open(config.test_file_path) as f:
    for line in f.readlines():
        line = line.strip().split(' ')
        if len(line)==2:
            test_pairs.append([line[0], line[1]])

In [5]:
len(train_pairs), len(test_pairs)

(10872, 2943)

## Utilities

In [11]:
def compute_gradient(X,y,R):
    
    m = X.shape[0]
    gradient = np.dot(X.T, np.dot(X,R) - y) * (2/m)
    return gradient


def compute_loss(X,y,R):
    
    m = X.shape[0]
    diff = np.dot(X,R) - y
    diff_squared = diff**2
    sum_diff_suared = np.sum(diff_squared)
    
    loss = sum_diff_suared/m
    
    return loss

def get_embedding(word, embedding_mat):
    if word in embedding_mat:
        return embedding_mat[word].reshape(1,300)
    
    return None

In [12]:
def get_matrics(pairs, en_embedding, fr_embedding):
    x_list = []
    y_list = []

    for i,(en_word, fr_word) in enumerate(train_pairs):
        en_word_embedd = get_embedding(en_word,en_embedding)
        fr_word_embedd = get_embedding(fr_word, fr_embedding)

        if en_word_embedd is not None and fr_word_embedd is not None:
            x_list.append(en_word_embedd)
            y_list.append(fr_word_embedd)

    X = np.vstack(x_list)
    y= np.vstack(y_list)

    return X, y

In [13]:
X_train, y_train = get_matrics(train_pairs, en_embedding, fr_embedding)
X_val, y_val = get_matrics(test_pairs, en_embedding, fr_embedding)

In [14]:
## Optimizer R
def train(epochs , lr):
    
    np.random.seed(129)
    
    R = np.random.rand(300, 300)

    for epoch in range(epochs):

        loss = compute_loss(X_train,y_train,R)
        gradient = compute_gradient(X_train,y_train,R)
        R = R - (lr * gradient)

        if (epoch+1)%25 == 0:
            print(f"Epoch {epoch} Loss : {loss}")
            
    return R


R = train(epochs = 400, lr = 0.8)

Epoch 24 Loss : 103.75080712125232
Epoch 49 Loss : 28.581241208690365
Epoch 74 Loss : 10.52838409224959
Epoch 99 Loss : 4.740614270409274
Epoch 124 Loss : 2.525009208180008
Epoch 149 Loss : 1.5630449512086788
Epoch 174 Loss : 1.1048770444767242
Epoch 199 Loss : 0.8710846200438042
Epoch 224 Loss : 0.745354168029209
Epoch 249 Loss : 0.6748854542077934
Epoch 274 Loss : 0.6340361932715131
Epoch 299 Loss : 0.6096758141723325
Epoch 324 Loss : 0.5947892102152811
Epoch 349 Loss : 0.5854952917369657
Epoch 374 Loss : 0.5795823131084801
Epoch 399 Loss : 0.5757569140300391


## Nearest Neighbors

In [15]:
def forward(x):
    fr_pred_emb = np.dot(x,R)
    return fr_pred_emb


def cosine_similarity(A, B):
    cos = -10
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)
    return cos


def nearest_neighbor(v, candidates, k = 1):
    
    similarity_l = []

    for row in candidates:
        cos_similarity = cosine_similarity(v,row)
        similarity_l.append(cos_similarity)
        
    sorted_ids = np.argsort(similarity_l)

    k_idx = sorted_ids[-k:]
    return k_idx

In [16]:
# UNQ_C10 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def test_vocabulary(X, Y, R):
   
    pred = np.dot(X,R)

    num_correct = 0

    for i in range(len(pred)):
        pred_idx = nearest_neighbor(pred[i],Y)

        if pred_idx == i:
            num_correct += 1

    accuracy = num_correct / len(pred)

    return accuracy

In [17]:
acc = test_vocabulary(X_val, y_val, R)  # this might take a minute or two
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.450


You managed to translate words from one language to another language without ever seing them with almost 45% accuracy by using some basic linear algebra and learning a mapping of words from one language to another!