# Ejemplos con algunos de los principales métodos de embedding

### En particular con estos vectores pre-entrenados se puede ilustrar el ejemplo King - man + woman = Queen 

## Word2Vec - 1st method - Google

https://code.google.com/archive/p/word2vec/

La descarga de los archivos de los ejemplos que veremos puede llevar algo de tiempo.
El archivo comprimido GoogleNews-vectors-negative300.bin.gz de word2vec pesa aproximadamente 1.5 GB (unos 3.6 GB una vez descomprimido).

In [None]:
#Primero bajas el archivo de los embeddings Word2Vec:
#!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [None]:
#Y lo descomprimes para obtener el binario:
#!gunzip GoogleNews-vectors-negative300.bin.gz

In [3]:
# Para este caso usaremos la librería Gensim: https://radimrehurek.com/gensim/ 

#!pip install gensim

In [1]:
import gensim



In [2]:
# Load the Word2Vec vectors pre-trained by Google from the local binary file.
# Loading may take a while.
word2vec_path = '/workspace/micarpeta/misdatos/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [4]:
# Look up the vector stored in the loaded model for the word 'king'.
king = model['king']

In [9]:
# Show the shape of the 'king' vector, then its first five components.
print(king.shape)
print(king[:5])

(300,)
[ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477]


In [11]:
# The famous analogy example:
#                   king - man + woman = queen
# 'king' and 'woman' are passed as positive terms and 'man' as the negative term.
analogy_matches = model.most_similar(positive=['king', 'woman'], negative=['man'])
print(analogy_matches)

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593831062317), ('monarchy', 0.5087411999702454)]


In [12]:
# Print the model's similarity score for each word pair, one per line.
for first_word, second_word in (('woman', 'man'), ('king', 'woman')):
    print(model.similarity(first_word, second_word))

0.76640123
0.12847973


## GloVe - 2nd method - Stanford

https://nlp.stanford.edu/projects/glove/

In [None]:
# Here we use the spaCy NLP library for the GloVe embeddings: https://spacy.io/

In [38]:
#!pip install -U spacy

In [39]:
#!python3 -m spacy download en_core_web_lg

In [20]:
import spacy
import en_core_web_lg
from scipy.spatial import distance
import numpy as np

In [16]:
# Load the en_core_web_lg package and keep the resulting object as `nlp`.
nlp = en_core_web_lg.load()

In [17]:
# All the words involved can also be passed together as a single sentence:

doc = nlp("king queen woman man")

In [18]:
# Similarity between king (doc[0]) and queen (doc[1]):

doc[0].similarity(doc[1])

0.72526103

In [19]:
# Similarity between king (doc[0]) and woman (doc[2]):
doc[0].similarity(doc[2])

0.26556593

En ocasiones hay que multiplicar por escalares algunos vectores para obtener mejores resultados, sobre todo cuando se tienen vectores paralelos entre sí...

In [21]:
# Build the analogy vector king - 2*man + 2*woman from the token vectors
# of `doc`; the 'man' and 'woman' vectors are each scaled by 2.
king_vec = doc[0].vector
woman_vec = doc[2].vector
man_vec = doc[3].vector
v = king_vec - (man_vec * 2) + (woman_vec * 2)

print(v)

[ 0.712754   -0.20624     0.05630401  0.62922996 -0.33405    -0.32917994
  0.35588    -0.63176     0.03896999  1.9828997  -0.7717     -0.036082
 -0.4219134  -0.2288734  -0.39632398 -0.257362   -0.538078    1.04298
 -0.564305    0.35007    -0.12709999  0.89014995 -0.91889    -0.61249
 -0.70478     0.790441   -0.18181998 -0.03685001 -0.83111995  0.34461606
 -0.05571002  0.61666995  0.3444     -0.704682    0.99318     0.37484598
 -0.43377    -0.8651601  -0.54863     0.1090004   1.0698     -0.12551999
 -0.329508   -0.52566     0.40131795 -0.38313898 -0.45103902  0.5648801
  0.67260003 -0.55907404 -1.459715    0.35917002 -0.58002    -0.96379596
  0.19972001  0.5178399  -0.00740319 -0.502858   -0.927108    0.41625696
  0.02304001 -1.0703499   0.411865    0.43028998  0.50001407  0.7369099
  0.93891     0.50217795  0.756953    0.23161    -0.42280996 -0.19461
 -1.087724    0.12665     1.61252     0.04894301 -0.04629004  0.58300996
  0.448102   -0.86739296 -0.68719417 -0.407184   -0.88712007  0.

In [26]:
# Instead of searching the whole set of pre-trained vectors, restrict the
# search to the tokens of the sentence introduced earlier (`doc`):
# stack one vector per token into a single array.
vectors = np.array([token.vector for token in doc])

In [27]:
print(vectors)   # these are the 300-dimensional vectors of our 4 words: king, queen, woman, man, respectively:

[[ 0.31542  -0.35068   0.42923  ...  0.11427   0.58221  -0.10559 ]
 [ 0.4095   -0.22693   0.25362  ... -0.34563   0.12043  -0.36494 ]
 [ 0.025567  0.27885  -0.16992  ... -0.018582 -0.10128  -0.34728 ]
 [-0.1731    0.20663   0.016543 ...  0.16665  -0.38341  -0.073803]]


In [28]:
# Find which of the candidate vectors is closest to v = king - man + woman,
# i.e. the one with the smallest cosine distance to it.
query = v[np.newaxis, :]  # cdist expects 2-D inputs, so add a leading axis
cosine_distances = distance.cdist(query, vectors, metric='cosine')
closest_index = cosine_distances.argmin()
output_word = doc[closest_index].text

print(output_word)

queen


## FastText - 3rd method - Facebook

https://github.com/facebookresearch/fastText

In [22]:
#!pip install Cython --install-option="--no-cython-compile"

In [23]:
#!pip install fasttext

Usaremos los vectores de embedding previamente entrenados para 157 idiomas:

https://fasttext.cc/docs/en/crawl-vectors.html

https://arxiv.org/abs/1802.06893

In [24]:
import fasttext
import fasttext.util

In [40]:
# ... se toma su tiempo... unos 5-10 minutos:

#fasttext.util.download_model('en', if_exists='ignore')  # English


In [28]:
#fasttext.util.download_model('es', if_exists='ignore')   # Español

In [29]:
# Load the fastText model from the local file 'cc.en.300.bin'.
ft = fasttext.load_model('cc.en.300.bin')



In [30]:
# Display the fastText vector for the word 'king'.
ft.get_word_vector('king')

array([-2.63642855e-02, -4.38338369e-02, -5.22461310e-02,  2.49765869e-02,
        1.59946546e-01,  4.98980191e-03,  2.51637166e-03, -1.62712112e-02,
       -6.62135556e-02, -1.67888845e-03, -1.39499649e-01, -5.72493225e-02,
       -1.45975351e-01, -1.56568401e-02,  3.75731173e-03,  8.14326331e-02,
        9.02080238e-02, -6.22668210e-03, -1.21208653e-01,  8.42568502e-02,
        6.83858395e-02,  1.01658493e-01, -5.07243127e-02,  9.16049480e-02,
        5.08386921e-03,  6.28780201e-02,  5.67676872e-02,  1.91132650e-01,
        4.35085818e-02,  1.80901110e-01, -1.74744725e-02,  7.06654340e-02,
       -6.06337450e-02,  3.89074199e-02,  1.44602428e-03, -1.25214964e-01,
        8.63592885e-03, -7.98915625e-02, -1.00960366e-01,  4.66771051e-02,
        5.39167747e-02,  4.82006092e-03, -2.03307956e-01, -1.17739499e-01,
       -1.37199834e-01, -4.92817685e-02, -1.87217459e-01, -7.17959851e-02,
       -1.86646730e-02, -9.93231237e-02, -5.15213236e-02, -1.93316743e-01,
       -8.94939303e-02, -

In [31]:
# Display the nearest neighbors of 'king' as (score, word) pairs.
ft.get_nearest_neighbors('king')

[(0.7550359964370728, 'kings'),
 (0.7068519592285156, 'queen'),
 (0.7060439586639404, 'king-'),
 (0.6811205148696899, 'king.'),
 (0.660710871219635, 'king.The'),
 (0.6591265797615051, 'King'),
 (0.6495252251625061, 'prince'),
 (0.6278106570243835, '-king'),
 (0.6183920502662659, 'monarch'),
 (0.6070184707641602, 'queen-mother')]

In [36]:
# Display the nearest neighbors of the hyphenated word 'spider-man'.
ft.get_nearest_neighbors('spider-man')

[(0.8240686655044556, 'spiderman'),
 (0.7895693778991699, 'Spider-man'),
 (0.7026702165603638, 'Spiderman'),
 (0.6991036534309387, 'Spidey'),
 (0.6901348829269409, 'Spider-Man'),
 (0.670944094657898, 'spidey'),
 (0.6565964221954346, 'x-men'),
 (0.6427714824676514, 'xmen'),
 (0.6398128271102905, 'webslinging'),
 (0.6341536045074463, 'webslinger')]

In [37]:
# Display the nearest neighbors of the hyphenated word 'wonder-woman'.
ft.get_nearest_neighbors('wonder-woman')   

[(0.45614171028137207, 'wonder-ful'),
 (0.3804067373275757, 'wonder-workers'),
 (0.37949901819229126, 'wonderdog'),
 (0.3745538890361786, 'wonderkind'),
 (0.3724863827228546, 'wonder-struck'),
 (0.36367306113243103, 'wonder-worker'),
 (0.3623429238796234, 'wondere'),
 (0.3536047041416168, 'super-woman'),
 (0.35171133279800415, 'argumentEAANECREEethicsevolutionfine-tuning'),
 (0.3515596389770508, 'wonder-working')]