# Setting Up

## Link notebook to drive

In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive; the working folder
# used by the rest of the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Move to base folder

In [3]:
# Switch the working directory to the embeddings folder on Drive so that the
# relative paths used later (e.g. FastText_Wiki/...) resolve inside it.
%cd /content/gdrive/My Drive/Colab Notebooks/Embeddings/Pretrained_Embeddings

/content/gdrive/My Drive/Colab Notebooks/Embeddings/Pretrained_Embeddings


## Install the fastText module

In [7]:
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 10.1MB/s eta 0:00:01[K     |█████████▌                      | 20kB 1.7MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 2.2MB/s eta 0:00:01[K     |███████████████████             | 40kB 2.5MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 2.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.0MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3023522 sha256=e07d1a3b9d7ec290ad724cd587844ac01f3637d229c0ce69df24bee64f67673d
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154

## Get the embeddings

In [4]:
# get embeddings from 
!mkdir FastText_Wiki
!curl -Lo FastText_Wiki/wiki.es.zip https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5193M  100 5193M    0     0  16.5M      0  0:05:14  0:05:14 --:--:-- 16.5M
gzip: FastText_Wiki/wiki.es.zip: unknown suffix -- ignored


In [6]:
!unzip FastText_Wiki/wiki.es.zip -d FastText_Wiki/

Archive:  FastText_Wiki/wiki.es.zip
  inflating: FastText_Wiki/wiki.es.vec  
  inflating: FastText_Wiki/wiki.es.bin  


# Load FT embeddings

In [8]:
# Load the binary fastText model from the FastText_Wiki folder; `ft` is the
# handle used for every vocabulary and vector lookup in the cells below.
import fasttext
ft = fasttext.load_model('FastText_Wiki/wiki.es.bin')



In [9]:
# The model ships embeddings for ~985K words. Words are sorted by frequency
# (i.e. the most frequent words appear first in the list).
# NOTE(review): on_unicode_error="ignore" presumably makes fastText skip bytes it
# cannot decode in a vocabulary entry instead of raising — confirm in the fasttext docs.
words = ft.get_words(on_unicode_error="ignore")
len(words)

985667

In [10]:
# Get rid of some 'rare'-looking tokens and keep the first 300K survivors.
# (Assumes `words` is still frequency-ordered, so the slice keeps the most
# frequent remaining words.)
import re

# Raw strings: '\d' inside a plain string literal is an invalid escape sequence.
_has_digit = re.compile(r'\d')
# Five or more consecutive characters that are not a plain or acute-accented vowel.
_long_non_vowel_run = re.compile(r'[^aeiouAEIOUáéíóúÁÉÍÓÚ]{5,}')

# One pass over the vocabulary: a word is kept only when neither pattern matches.
words = [
    w for w in words
    if not _has_digit.search(w) and not _long_non_vowel_run.search(w)
]
words = words[:300000]

In [11]:
# Peek at the first ten entries of the filtered vocabulary.
words[:10]

['de', '</s>', ',', '.', 'la', 'en', 'el', 'y', '-', ')']

In [28]:
# Spot check: is the lowercase token 'brasil' present in `words`? (linear scan of the list)
'brasil' in words

True

# Construct a model

In [12]:
# Build the word -> vector lookup table for every entry currently in `words`.
model = dict(zip(words, map(ft.get_word_vector, words)))

# Use cosine similarity to compare vectors

In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Find the 10 most similar words to 'rey'

In [14]:
# Query vector: the fastText embedding of "rey".
v1 = ft.get_word_vector("rey")

In [15]:
# Cosine similarity between the query vector and every vector stored in `model`.
# The result has a single row (one query) with one column per `model` entry.
vocab_vectors = list(model.values())
similarity_measures = cosine_similarity([v1], vocab_vectors)

In [16]:
# Rank every entry by similarity (ascending order), then keep the 10 best
# matches and flip them so the most similar one comes first.
sorted_indexes = np.argsort(similarity_measures[0])
index_most_similar = sorted_indexes[-10:][::-1]

In [17]:
# `index_most_similar` indexes the rows built from `model.values()`, so read the
# labels from the matching `model` keys rather than from `words`: the two only
# line up when `words` has no repeated entries (a dict keeps each key once).
vocab = list(model)
for i in index_most_similar:
    print(vocab[i])

rey
monarca
reina
rey»
reyes
reinar
corregente
trono
reinado
pretendiente


## Use the model to solve analogies

### rey -> hombre   :   ? -> mujer

In [18]:
# Analogy vector rey - hombre + mujer; the entries closest to it are the candidate answers.
v2 = ft.get_word_vector("rey") - ft.get_word_vector("hombre") + ft.get_word_vector("mujer")

In [19]:
# Cosine similarity between the analogy vector and every vector stored in `model`.
# The result has a single row (one query) with one column per `model` entry.
vocab_vectors = list(model.values())
similarity_measures = cosine_similarity([v2], vocab_vectors)

In [20]:
# Rank every entry by similarity (ascending order), then keep the 5 best
# matches and flip them so the most similar one comes first.
sorted_indexes = np.argsort(similarity_measures[0])
index_most_similar = sorted_indexes[-5:][::-1]

In [21]:
# `index_most_similar` indexes the rows built from `model.values()`, so read the
# labels from the matching `model` keys rather than from `words`: the two only
# line up when `words` has no repeated entries (a dict keeps each key once).
vocab = list(model)
for i in index_most_similar:
    print(vocab[i])

rey
reina
consorte
desposa
monarca


### América -> México : ? -> Francia

In [None]:
# Analogy vector América - México + Francia; the entries closest to it are the candidate answers.
v3 = ft.get_word_vector("América") - ft.get_word_vector("México") + ft.get_word_vector("Francia")

In [None]:
# Cosine similarity between the analogy vector and every vector stored in `model`.
# The result has a single row (one query) with one column per `model` entry.
vocab_vectors = list(model.values())
similarity_measures = cosine_similarity([v3], vocab_vectors)

In [None]:
# Rank every entry by similarity (ascending order), then keep the 5 best
# matches and flip them so the most similar one comes first.
sorted_indexes = np.argsort(similarity_measures[0])
index_most_similar = sorted_indexes[-5:][::-1]

In [None]:
# `index_most_similar` indexes the rows built from `model.values()`, so read the
# labels from the matching `model` keys rather than from `words`: the two only
# line up when `words` has no repeated entries (a dict keeps each key once).
vocab = list(model)
for i in index_most_similar:
    print(vocab[i])

Francia
Céltica
Europa
América
Sud
