In [1]:
from gensim.models import KeyedVectors
import numpy as np
import fasttext
import torch
from torch import nn
import io


# Train Transformation

In [2]:
orignal_egy = fasttext.load_model("embedding_EGY.bin")
mapped_egy = KeyedVectors.load_word2vec_format("vectors-EGY.txt")
assert orignal_egy.get_words() == mapped_egy.index_to_key
word_list = orignal_egy.get_words()



In [3]:
project_from = len(orignal_egy["تست"])
project_to = mapped_egy.vector_size

In [4]:
X = np.array([orignal_egy.get_word_vector(word) for word in word_list])
Y = np.array([mapped_egy.get_vector(word) for word in word_list])

In [5]:
class VectorDataSet(torch.utils.data.Dataset):

  def __init__(self, X, y):
    if not torch.is_tensor(X) and not torch.is_tensor(y):
      self.X = torch.from_numpy(X)
      self.y = torch.from_numpy(y)

  def __len__(self):
      return len(self.X)

  def __getitem__(self, i):
      return self.X[i], self.y[i]
      
class NeuralNet(nn.Module):
 
  def __init__(self,):
    super(NeuralNet, self).__init__()
    self.nn = nn.Linear(project_from, project_to)
   
  def forward(self, inputs):
    return self.nn(inputs)

dataset = VectorDataSet(X, Y)
trainloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)
  
model = NeuralNet()

loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(2): 
  
  print(f'Starting epoch {epoch+1}')
  
  running_loss = 0.0
  for i, data in enumerate(trainloader, 0):
      inputs, actual = data

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = loss_function(outputs, actual)
      loss.backward()
      optimizer.step()

      # print statistics
      running_loss += loss.item()
      if i % 2000 == 1999:    # print every 2000 mini-batches
          print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
          running_loss = 0.0

print('Training process has finished.')

Starting epoch 1
[1,  2000] loss: 0.196
[1,  4000] loss: 0.141
[1,  6000] loss: 0.112
[1,  8000] loss: 0.094
[1, 10000] loss: 0.083
[1, 12000] loss: 0.076
[1, 14000] loss: 0.071
[1, 16000] loss: 0.069
[1, 18000] loss: 0.067
[1, 20000] loss: 0.066
[1, 22000] loss: 0.064
[1, 24000] loss: 0.064
[1, 26000] loss: 0.064
[1, 28000] loss: 0.062
[1, 30000] loss: 0.062
[1, 32000] loss: 0.061
[1, 34000] loss: 0.061
Starting epoch 2
[2,  2000] loss: 0.061
[2,  4000] loss: 0.061
[2,  6000] loss: 0.061
[2,  8000] loss: 0.060
[2, 10000] loss: 0.060
[2, 12000] loss: 0.060
[2, 14000] loss: 0.060
[2, 16000] loss: 0.060
[2, 18000] loss: 0.060
[2, 20000] loss: 0.060
[2, 22000] loss: 0.060
[2, 24000] loss: 0.060
[2, 26000] loss: 0.060
[2, 28000] loss: 0.060
[2, 30000] loss: 0.060
[2, 32000] loss: 0.059
[2, 34000] loss: 0.059
Training process has finished.


In [6]:
weights = model.nn.weight
weights

Parameter containing:
tensor([[ 0.0027,  0.0077,  0.1022,  ..., -0.0713, -0.0463, -0.0419],
        [ 0.0306,  0.0149,  0.1105,  ...,  0.0757,  0.0461, -0.0160],
        [-0.0107, -0.0302, -0.0311,  ..., -0.0349,  0.0028,  0.0132],
        ...,
        [ 0.0434, -0.0684,  0.0307,  ...,  0.0212, -0.0505,  0.0130],
        [-0.0511,  0.0018,  0.0839,  ...,  0.0124, -0.0109,  0.0302],
        [-0.0634, -0.0398, -0.0474,  ..., -0.0900,  0.0526, -0.0636]],
       requires_grad=True)

# Transform OOV Vector

In [7]:
vector = torch.from_numpy(orignal_egy["ابرا كادبرا"])
vector

tensor([-7.8678e-03,  2.3735e-03,  1.8241e-01,  2.0889e-02, -6.8731e-02,
         1.3860e-01,  1.0363e-01, -4.5243e-02, -6.7927e-03,  9.1357e-03,
        -9.5829e-02, -1.6048e-03, -1.4016e-01,  6.4791e-02, -8.5503e-02,
         2.9857e-02, -3.8317e-02, -9.7960e-02,  1.7813e-02,  1.9014e-02,
         1.0002e-01,  7.5055e-04, -3.4619e-02,  4.2269e-02,  5.6960e-03,
         3.8909e-02, -1.8595e-02, -1.3510e-01,  2.2122e-02,  8.4148e-02,
         2.3929e-02, -9.6059e-02, -5.5370e-02, -6.0250e-02, -2.8113e-02,
         1.0597e-01,  3.5426e-02, -9.2728e-03, -5.5328e-02, -4.9054e-02,
        -5.9051e-03,  1.1898e-01,  7.2115e-03,  4.0465e-05, -2.2899e-01,
        -1.1009e-01, -3.9917e-02,  5.5506e-02, -9.6712e-05,  1.3041e-01,
        -7.8812e-02, -3.1766e-02, -2.0723e-02, -3.6732e-02,  1.0633e-02,
        -7.6645e-02, -7.9169e-02,  6.6527e-02, -2.2412e-02,  6.6237e-02,
         1.2647e-01,  2.1966e-02, -9.6587e-03, -1.2444e-01, -4.2160e-02,
        -9.9926e-02,  1.0107e-03, -8.3764e-02, -4.0

In [8]:
mapping_egy = torch.nn.Linear(project_from, project_to, bias=False)
#to_reload = torch.from_numpy(torch.load('best_mapping.pth'))
mapping_egy.weight.data.copy_(weights.type_as(mapping_egy.weight.data))
translated_vector = mapping_egy(vector)
translated_vector.data

tensor([ 1.3970e-01,  9.1253e-02,  9.8216e-02,  1.3309e-01, -8.9674e-03,
        -1.7742e-01, -3.7670e-02,  9.4546e-02,  1.9565e-01, -1.1822e-03,
         7.2320e-02,  1.1083e-01,  3.3814e-02, -2.6407e-01, -1.4682e-01,
        -1.1834e-01,  8.4182e-02, -3.3805e-01, -1.7135e-01, -1.3218e-02,
        -8.4594e-02, -8.7626e-02, -6.4661e-02,  2.8481e-01, -2.3850e-01,
        -1.4856e-01,  1.2736e-01,  3.5994e-01, -1.1005e-01,  5.1463e-02,
         1.4001e-01, -3.7012e-02, -2.2374e-01, -8.7048e-03,  1.1310e-01,
        -2.4818e-01, -6.8081e-02,  1.2651e-01, -1.5978e-01, -6.6077e-02,
         7.8119e-02, -7.0779e-02, -1.6016e-01,  6.9317e-02, -3.9864e-02,
         1.1742e-02,  7.5328e-02,  1.9741e-04,  4.5231e-02, -3.4390e-02,
         7.1819e-02,  1.9203e-02,  1.8718e-01, -5.7936e-02,  2.9158e-02,
        -5.6911e-03,  3.4707e-02,  1.8340e-01, -4.3190e-02,  1.2007e-01,
        -1.1225e-01, -6.8057e-02,  1.5107e-01,  1.9450e-01,  1.7726e-01,
         9.0552e-02,  3.0664e-02,  1.4105e-01, -1.8

# Find Similarity

## Vocab Similarity

In [9]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id
    
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

def get_nn_oov(word, word_emb, tgt_emb, tgt_id2word, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [10]:
src_path = 'vectors-EGY.txt'
tgt_path = 'vectors-GLF.txt'
nmax = 50000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [11]:
# EGY -> EGY
get_nn("باشا", src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "باشا":
1.0000 - باشا
0.7421 - يباشا
0.7217 - اباشا
0.6005 - ياباشا
0.5972 - حاشا


In [12]:
# EGY -> GLF
get_nn("باشا", src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "باشا":
0.4100 - رح
0.3553 - جحفلي
0.3519 - دماغك
0.3513 - وبكدا
0.3499 - وعشان


In [13]:
# GLF -> EGY
get_nn("تسذا", tgt_embeddings, tgt_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "تسذا":
0.5657 - تاعبني
0.5567 - فأملأي
0.5515 - بيضحكني
0.5455 - تهزقني
0.5424 - ستاموني


In [14]:
# GLF -> EGY
get_nn("وينك", tgt_embeddings, tgt_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "وينك":
0.6053 - سلامتك
0.4998 - توحشتك
0.4944 - هفاجئك
0.4943 - بصيرتك
0.4902 - ينورلك


In [24]:
# GLF -> EGY
get_nn("والصلاة", tgt_embeddings, tgt_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "والصلاة":
0.5644 - اشقائكم
0.5584 - للإخوة
0.5441 - للأخوة
0.5376 - للاخوة
0.5351 - للہ


In [15]:
# GLF -> GLF
get_nn("تسذا", tgt_embeddings, tgt_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "تسذا":
1.0000 - تسذا
0.8123 - شذا
0.7703 - كيذا
0.7524 - وشذا
0.7409 - خن


## OOV Similarity

In [31]:
get_nn_oov("طائرة حربية ", translated_vector.data, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "طائرة حربية ":
0.3770 - وعشان
0.3749 - رح
0.3689 - جحفلي
0.3648 - مارضي
0.3643 - ﻻن


# Test Accuracy Of The Projection

In [29]:
get_nn("باشا", src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "باشا":
0.4100 - رح
0.3553 - جحفلي
0.3519 - دماغك
0.3513 - وبكدا
0.3499 - وعشان


In [28]:
vector = torch.from_numpy(orignal_egy["باشا"])
translated_vector = mapping_egy(vector)
get_nn_oov("باشا", translated_vector.data, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "باشا":
0.3770 - وعشان
0.3749 - رح
0.3689 - جحفلي
0.3648 - مارضي
0.3643 - ﻻن
