# Simple translation using semantic embeddings

Joulin et al. (2018) "[*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)," arXiv

The version of the fasttext embedding being downloaded is *aligned*. As such, the word for 'cat' in the English embedding will be close to the word 'katt' (Swedish) and 'gato' (Spanish) in their respective embeddings. You choose the language to load by giving the loader a language code. After a fasttext embedding has been downloaded, it will be cached on the local file system to reduce reloading time.

In [None]:
import numpy as np
from urllib.request import urlopen
from tqdm import tqdm
import os.path
import time

class AlignedEmbedding(object):
  """Download, cache and query an aligned fasttext embedding for one language.

  The language is chosen with a language code that is inserted into the
  download URL. More information can be found at
  https://fasttext.cc/docs/en/aligned-vectors.html .

  Attributes:
    idx2token_: list of tokens, in the order they appear in the vector file.
    token2idx_: dict mapping each token to its row index in ``vectors_``.
    vectors_: 2-D float array with one row per token.
    n_tokens, n_dim: number of rows and columns of ``vectors_``.
  """
  def __init__(self, lang='en', vocabulary_limit=-1):
    """Load the embedding for `lang` from the local cache or from the web.

    Args:
      lang: language code used to build the download URL and cache name.
      vocabulary_limit: if positive, only the first `vocabulary_limit`
        tokens of the vector file are loaded; otherwise all tokens are.
    """
    self._dataurl = "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.%s.align.vec" % lang
    # A truncated vocabulary gets its own cache file so that it is never
    # mistaken for the full embedding (or the other way around).
    if vocabulary_limit > 0:
      cachefile = "cache_%s_%i.npz" % (lang, vocabulary_limit)
    else:
      cachefile = "cache_%s.npz" % lang
    if os.path.exists(cachefile):
      print("Loading cached data from %s..." % cachefile, end="")
      t = time.time()
      # The context manager closes the underlying npz archive once both
      # arrays have been read into memory.
      with np.load(cachefile) as data:
        self.idx2token_ = list(data['idx2token'])
        self.vectors_ = data['vectors']
      print("done (%isec)" % (time.time()-t), flush=True)
    else:
      with urlopen(self._dataurl) as remote_file:
        get_decoded_line = lambda file: file.readline().decode('utf-8')
        # The first line of the file holds "<number of tokens> <dimensions>".
        n_tokens, n_dim = [int(n) for n in get_decoded_line(remote_file).split()]
        if vocabulary_limit > 0:
          # Never try to read more rows than the file announces.
          n_tokens = min(n_tokens, vocabulary_limit)
        self.idx2token_ = list()
        self.vectors_ = np.zeros((n_tokens, n_dim), dtype=np.float32)
        for n in tqdm(range(n_tokens), desc="Downloading and parsing vectors", unit="words"):
          textline = get_decoded_line(remote_file)
          # Each line is the token followed by its space-separated components.
          linedata = textline.split(' ')
          self.idx2token_.append(linedata[0])
          self.vectors_[n, :] = np.asarray(linedata[1:], dtype=np.float32)
      np.savez(cachefile, vectors=self.vectors_, idx2token=self.idx2token_)
    # Set the size attributes on both code paths (cache and download).
    self.n_tokens, self.n_dim = self.vectors_.shape
    self.token2idx_ = {token:i for i, token in enumerate(self.idx2token_)}

  def __len__(self):
    """Gives the number of tokens in the embedding."""
    return len(self.token2idx_)

  def __getitem__(self, tokens):
    """Returns the vector(s) for a token or a list of tokens.

    A single token gives an array of shape (1, n_dim); a list of tokens
    gives an array with one row per token, in the same order.

    Raises:
      AssertionError: if `tokens` is neither a str nor a list, or if a
        token is not part of the embedding.
    """
    assert isinstance(tokens, (str, list)), "tokens must be list or str."
    if isinstance(tokens, str):
      assert tokens in self, "Could not find token '%s'" % tokens
      return self.vectors_[self.token2idx_[tokens]].reshape(1, -1)
    else:
      ret = np.zeros((len(tokens), self.vectors_.shape[1]))
      for i, token in enumerate(tokens):
        # Recursing validates each element and yields its (1, n_dim) row.
        ret[i, :] = self[token]
      return ret

  def __contains__(self, token):
    """Allows a user to query if a token is in the embedding."""
    return token in self.token2idx_

In [None]:
# Load the English embedding first, then the Spanish one.
embedding1, embedding2 = (AlignedEmbedding(code) for code in ('en', 'es'))

!ls -l

Loading cached data from cache_en.npz...done (143sec)
Loading cached data from cache_es.npz...done (49sec)
total 7009692
-rw-r--r-- 1 root root 5320910082 Oct 18 12:22 cache_en.npz
-rw-r--r-- 1 root root 1856997146 Oct 18 12:24 cache_es.npz
drwxr-xr-x 1 root root       4096 Oct  8 13:45 sample_data


In [None]:
# Vocabulary size and membership checks.
assert len(embedding1) > 10000
for known_word in ('man', 'woman'):
  assert known_word in embedding1
assert 'kdjf343' not in embedding1
assert 'king' in embedding1

# A single token gives one 300-dimensional row, a list gives one row per token.
single = embedding1['man']
assert single.shape[0] == 1
assert single.shape[1] == 300
pair = embedding1[['man', 'woman']]
assert pair.shape[0] == 2
assert pair.shape[1] == 300
assert type(embedding1['man']) is np.ndarray
# Looking up the same token twice must give the same vector.
difference = embedding1['man'] - embedding1['man']
assert np.isclose(np.sum(difference), 0)

## Word distance

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Look up the query word in the source embedding and rank every vector of the
# target embedding by Euclidean distance and by cosine similarity to it.
word = 'blue'
source = embedding1
target = embedding2
k = 20

print("Query word: %s" % word)
print("Euclidean\t\t\tCosine")

query_vector = source[word]
distances = euclidean_distances(query_vector, target.vectors_).ravel()
similarities = cosine_similarity(query_vector, target.vectors_).ravel()

# Smallest distances first; largest similarities first.
closest = np.argsort(distances)[:k]
most_similar = np.argsort(similarities)[::-1][:k]
for near, alike in zip(closest, most_similar):
  row = (distances[near], target.idx2token_[near].ljust(25),
         similarities[alike], target.idx2token_[alike])
  print("%.3f %s %.3f %s" % row)

Query word: blue
Euclidean			Cosine
0.959 azul                      0.540 azul
0.964 blue                      0.535 blue
1.015 amarillo                  0.485 amarillo
1.035 rojo                      0.464 rojo
1.036 azules                    0.463 azules
1.054 azuli                     0.445 azuli
1.054 amarillo/naranja          0.445 amarillo/naranja
1.054 amarillo,                 0.444 amarillo,
1.059 azul/verde                0.439 azul/verde
1.065 azul/                     0.433 azul/
1.068 amarillo/verde            0.430 amarillo/verde
1.070 azul,                     0.428 azul,
1.071 amarillo/blanco           0.426 amarillo/blanco
1.075 verde                     0.422 verde
1.079 azule                     0.418 azule
1.080 amarillos                 0.417 amarillos
1.083 color                     0.414 color
1.084 azulamarillo              0.412 azulamarillo
1.085 amarillo—                 0.412 amarillo—
1.086 amarill                   0.410 amarill


## Word similarity

We should be able to measure word similarity using cosine similarity: a pair of closely related words should score higher than a less related pair.

$cos(\overrightarrow{cat}, \overrightarrow{dog}) \geq cos(\overrightarrow{cat}, \overrightarrow{cow})$

In [None]:
# 'cat' should be more similar to 'dog' than to 'cow'.
cat_vector = embedding1['cat']
cat_vs_dog = cosine_similarity(cat_vector, embedding1['dog'])
cat_vs_cow = cosine_similarity(cat_vector, embedding1['cow'])
assert cat_vs_dog > cat_vs_cow