In [2]:
from gensim.models import KeyedVectors
import numpy as np
import fasttext
import torch
from torch import nn
import io


# Train Transformation

In [3]:
orignal_egy = fasttext.load_model("C:/Users/Abdul/Desktop/Grad Project/code/Embeddings/embeddings-20220313T080929Z-001/embeddings/embedding_EGY.bin")
mapped_egy = KeyedVectors.load_word2vec_format("C:/Users/Abdul/Desktop/Grad Project/code/Embeddings/embeddings-20220313T080929Z-001/embeddings/vectors-EGY.txt")
assert orignal_egy.get_words() == mapped_egy.index_to_key
word_list = orignal_egy.get_words()



In [4]:
project_from = len(orignal_egy["تست"])
project_to = mapped_egy.vector_size

In [5]:
X = np.array([orignal_egy.get_word_vector(word) for word in word_list])
Y = np.array([mapped_egy.get_vector(word) for word in word_list])

In [6]:
class VectorDataSet(torch.utils.data.Dataset):

  def __init__(self, X, y):
    if not torch.is_tensor(X) and not torch.is_tensor(y):
      self.X = torch.from_numpy(X)
      self.y = torch.from_numpy(y)

  def __len__(self):
      return len(self.X)

  def __getitem__(self, i):
      return self.X[i], self.y[i]
      
class NeuralNet(nn.Module):
 
  def __init__(self,):
    super(NeuralNet, self).__init__()
    self.nn = nn.Linear(project_from, project_to)
   
  def forward(self, inputs):
    return self.nn(inputs)

dataset = VectorDataSet(X, Y)
trainloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)
  
model = NeuralNet()

loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(2): 
  
  print(f'Starting epoch {epoch+1}')
  
  running_loss = 0.0
  for i, data in enumerate(trainloader, 0):
      inputs, actual = data

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = loss_function(outputs, actual)
      loss.backward()
      optimizer.step()

      # print statistics
      running_loss += loss.item()
      if i % 2000 == 1999:    # print every 2000 mini-batches
          print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
          running_loss = 0.0

print('Training process has finished.')

Starting epoch 1
[1,  2000] loss: 0.298
[1,  4000] loss: 0.226
[1,  6000] loss: 0.173
[1,  8000] loss: 0.127
[1, 10000] loss: 0.090
[1, 12000] loss: 0.056
[1, 14000] loss: 0.026
[1, 16000] loss: 0.007
[1, 18000] loss: 0.003
[1, 20000] loss: 0.002
[1, 22000] loss: 0.002
[1, 24000] loss: 0.001
[1, 26000] loss: 0.001
[1, 28000] loss: 0.001
[1, 30000] loss: 0.001
[1, 32000] loss: 0.001
[1, 34000] loss: 0.001
Starting epoch 2
[2,  2000] loss: 0.001
[2,  4000] loss: 0.001
[2,  6000] loss: 0.001
[2,  8000] loss: 0.001
[2, 10000] loss: 0.001
[2, 12000] loss: 0.001
[2, 14000] loss: 0.001
[2, 16000] loss: 0.001
[2, 18000] loss: 0.001
[2, 20000] loss: 0.001
[2, 22000] loss: 0.001
[2, 24000] loss: 0.001
[2, 26000] loss: 0.001
[2, 28000] loss: 0.001
[2, 30000] loss: 0.001
[2, 32000] loss: 0.001
[2, 34000] loss: 0.001
Training process has finished.


In [7]:
weights = model.nn.weight
weights

Parameter containing:
tensor([[-0.1961,  0.0173,  0.0794,  ...,  0.0104, -0.0121, -0.0661],
        [-0.0055,  0.1659,  0.1194,  ..., -0.0704, -0.0691,  0.1322],
        [-0.1752,  0.0294, -0.1327,  ..., -0.1015, -0.0077,  0.1027],
        ...,
        [-0.0349, -0.0809,  0.0773,  ..., -0.1100,  0.0220, -0.0740],
        [ 0.0607, -0.0275, -0.0053,  ...,  0.0777,  0.1975,  0.0988],
        [ 0.0288, -0.1798,  0.0052,  ...,  0.1797, -0.1421, -0.0617]],
       requires_grad=True)

# Transform OOV Vector

In [8]:
vector = torch.from_numpy(orignal_egy["ابرا كادبرا"])
vector

tensor([-9.2040e-02,  1.4865e-01, -5.6032e-02,  3.7975e-02,  7.0395e-02,
         1.2474e-01, -4.7408e-02,  9.4362e-02,  3.5227e-01, -3.7987e-02,
        -1.9843e-01,  1.0184e-01, -4.8749e-02,  7.5740e-02,  2.6891e-01,
         6.1159e-02, -1.5976e-02, -1.6184e-02,  2.3527e-01, -5.5144e-02,
         1.9678e-01,  3.2300e-01,  6.2339e-02, -1.0238e-01, -1.2260e-01,
         5.7452e-03,  1.1809e-01, -4.1456e-02, -2.1094e-01,  3.4244e-01,
        -8.4845e-04, -2.8098e-02, -1.7438e-02, -7.1443e-02, -2.5656e-01,
        -5.7791e-02,  4.4908e-02, -2.4819e-01, -1.4585e-01,  5.2531e-02,
        -1.6236e-01,  2.5883e-01,  7.1127e-02,  2.4198e-01, -8.9583e-02,
        -1.9114e-02,  9.8471e-02, -1.0518e-02, -6.6584e-02, -2.2069e-01,
         1.1617e-02, -5.5783e-02, -3.3539e-01,  6.7654e-02,  8.4350e-03,
         9.7909e-02, -3.2134e-01, -2.9755e-01, -1.4918e-03, -9.0681e-02,
         1.8118e-01,  3.5747e-01,  5.1818e-02, -2.9018e-01,  1.7030e-02,
         2.9945e-01, -2.9328e-03, -1.7895e-02,  3.4

In [9]:
mapping_egy = torch.nn.Linear(project_from, project_to, bias=False)
#to_reload = torch.from_numpy(torch.load('best_mapping.pth'))
mapping_egy.weight.data.copy_(weights.type_as(mapping_egy.weight.data))
translated_vector = mapping_egy(vector)
translated_vector.data

tensor([-0.0328,  0.1184,  0.0196,  0.1928, -0.0652, -0.0742, -0.0329,  0.0404,
        -0.1578,  0.3530, -0.0938,  0.2301, -0.3129,  0.2493, -0.1146, -0.0506,
         0.1905, -0.0193,  0.1866,  0.1487,  0.0945,  0.1833, -0.0334, -0.1684,
        -0.3585, -0.1886,  0.1418,  0.0276,  0.2461,  0.2100,  0.1094,  0.0983,
        -0.3934, -0.0558,  0.2301, -0.0235, -0.1950,  0.1035, -0.1355,  0.1216,
        -0.0964,  0.0355,  0.1662,  0.1063, -0.0174,  0.1232,  0.2988,  0.1953,
        -0.0836,  0.1050, -0.0803, -0.0747, -0.0875, -0.1717,  0.0876,  0.1360,
        -0.1636, -0.0818,  0.1198,  0.2259, -0.2240, -0.0136,  0.1107,  0.0980,
        -0.0221, -0.1472,  0.2532, -0.1975, -0.1256, -0.2077,  0.2116,  0.1570,
        -0.0740,  0.0456,  0.0062,  0.1324, -0.1616,  0.0403,  0.3018, -0.0264,
         0.1925,  0.2399,  0.0640,  0.2653, -0.0307, -0.0484, -0.3235,  0.1310,
        -0.0336, -0.1031, -0.1970,  0.1981, -0.0824,  0.0857, -0.0777,  0.1374,
        -0.1464, -0.0430, -0.0428, -0.02

# Find Similarity

## Vocab Similarity

In [10]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id
    
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

def get_nn_oov(word, word_emb, tgt_emb, tgt_id2word, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [11]:
src_path = 'C:/Users/Abdul/Desktop/Grad Project/code/Embeddings/embeddings-20220313T080929Z-001/embeddings/vectors-EGY.txt'
tgt_path = 'C:/Users/Abdul/Desktop/Grad Project/code/Embeddings/embeddings-20220313T080929Z-001/embeddings/vectors-GLF.txt'
nmax = 50000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [12]:
# EGY -> EGY
get_nn("باشا", src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "باشا":
1.0000 - باشا
0.7139 - يباشا
0.7020 - اباشا
0.6532 - الباشا
0.5911 - حاشا


In [13]:
# EGY -> GLF
get_nn("باشا", src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "باشا":
0.5828 - الميداني
0.5297 - يثني
0.5139 - تأليف
0.5138 - وحمل
0.5138 - أفتكر


In [14]:
# GLF -> EGY
get_nn("تسذا", tgt_embeddings, tgt_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "تسذا":
0.6616 - يغضضن
0.6415 - تظن
0.6356 - وليضربن
0.6310 - بحزمة
0.6293 - فريقا


In [15]:
# GLF -> GLF
get_nn("تسذا", tgt_embeddings, tgt_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "تسذا":
1.0000 - تسذا
0.8349 - شذا
0.7860 - كيذا
0.7541 - وشذا
0.7523 - خن


## OOV Similarity

In [16]:
get_nn_oov("ابرا كادبرا", translated_vector.data, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "ابرا كادبرا":
0.6401 - هولاكو
0.6060 - رآي
0.6047 - أردوغان
0.6030 - إدمان
0.5985 - لفتت


# Test Accuracy Of The Projection

In [17]:
get_nn("باشا", src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "باشا":
0.5828 - الميداني
0.5297 - يثني
0.5139 - تأليف
0.5138 - وحمل
0.5138 - أفتكر


In [18]:
vector = torch.from_numpy(orignal_egy["باشا"])
translated_vector = mapping_egy(vector)
get_nn_oov("باشا", translated_vector.data, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "باشا":
0.5826 - الميداني
0.5294 - يثني
0.5141 - تأليف
0.5136 - أفتكر
0.5128 - ㅤ
