#Word Embeddings, operaciones y jueguitos

In [None]:
#Importamos las librerias necesarias
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [1]:
# Load the text8 dataset through the gensim downloader; both models below
# are trained on this same corpus.
corpus = api.load('text8')

# Hyperparameters shared by the two models (deliberately tiny 5-dimensional
# vectors, context window of 4).
w2v_params = dict(min_count=1, vector_size=5, window=4)

# CBOW model: no `sg` argument is given, so Word2Vec's default is used.
cbow_model = Word2Vec(corpus, **w2v_params)

# Skip-gram model: identical settings, only the `sg` flag is switched on.
skipgram_model = Word2Vec(corpus, sg=True, **w2v_params)



In [2]:
# Print the downloader catalogue entry of the pre-trained model.
model_name = 'word2vec-google-news-300'
model_dict = api.info()['models'][model_name]

description_keys = ('num_records', 'base_dataset', 'description')
for key in description_keys:
    print(f'{key: <12}: {model_dict[key]}')

# ...and fetch the vectors themselves through the gensim downloader.
google_cbow = api.load(model_name)

num_records : 3000000
base_dataset: Google News (about 100 billion words)
description : Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).


In [None]:
# Look up the embedding of the word "tree" in each of the three models.
# The text8 models were built with vector_size=5; the Google News vectors
# are sliced to their first 5 components so the printed rows are comparable.

print('Word Embedding para "tree":\n')

cbow_vector = cbow_model.wv["tree"]
print(f'CBOW:        {cbow_vector}')

skipgram_vector = skipgram_model.wv["tree"]
print(f'Skip-Gram:   {skipgram_vector}')

google_vector_head = google_cbow["tree"][:5]
print(f'Google CBOW: {google_vector_head}\n\n')

In [None]:
# Similarity score between "tree" and "leaf" as reported by each model.

print('Similitud entre "tree" y "leaf":\n')

cbow_similarity = cbow_model.wv.similarity("tree", "leaf")
print(f'CBOW:        {cbow_similarity}')

skipgram_similarity = skipgram_model.wv.similarity("tree", "leaf")
print(f'Skip-Gram:   {skipgram_similarity}')

google_similarity = google_cbow.similarity("tree", "leaf")
print(f'Google CBOW: {google_similarity}\n\n')

In [None]:
# Ask each model for the 3 entries most similar to "tree".

print('Palabras mas similares a "tree":\n')

cbow_neighbours = cbow_model.wv.most_similar("tree", topn=3)
print(f'CBOW:        {cbow_neighbours}')

skipgram_neighbours = skipgram_model.wv.most_similar("tree", topn=3)
print(f'Skip-Gram:   {skipgram_neighbours}')

google_neighbours = google_cbow.most_similar("tree", topn=3)
print(f'Google CBOW: {google_neighbours}\n\n')


## Jueguitos jeje

In [None]:
# Odd-one-out game: let each model pick the word that does not fit the list.
words = ['tree', 'leaf', 'plant', 'bark', 'car']

# Run the same query against the three sets of vectors, in the same order
# as the results are reported below.
cbow_result, skipgram_result, google_result = (
    vectors.doesnt_match(words)
    for vectors in (cbow_model.wv, skipgram_model.wv, google_cbow)
)

print(f"Encontrar palabra desubicada: {words}:\n")
print(f'CBOW:        {cbow_result}')
print(f'Skip-Gram:   {skipgram_result}')
print(f'Google CBOW: {google_result}')

In [4]:
# Classic analogy example: king - man + woman ~ queen

king, man, woman = (google_cbow[word] for word in ('king', 'man', 'woman'))

# Query with the composed vector and keep the second of the two hits.
# NOTE(review): presumably the first hit is "king" itself, because a query
# by raw vector cannot exclude the words it was built from -- confirm.
analogy_vector = king - man + woman
nearest = google_cbow.most_similar(analogy_vector, topn=2)
king_result = nearest[1]
print(king_result)

('queen', 0.7300517559051514)
('berlin', 0.7331712245941162)


In [None]:
# Another analogy: paris - france + germany ~ berlin
# (capital - its country + another country ~ that country's capital,
# mirroring king - man + woman ~ queen). The previous version added the
# "berlin" vector itself, so the query trivially returned "berlin".
# NOTE(review): assumes lowercase "germany" is in the vocabulary, like
# "paris" and "berlin" -- confirm.
paris = google_cbow['paris']
france = google_cbow['france']
germany = google_cbow['germany']

# A query by raw vector can return the very words the vector was built
# from, so request enough neighbours to skip all of them and keep the
# first genuinely new word.
query_words = {'paris', 'france', 'germany'}
neighbours = google_cbow.most_similar(paris - france + germany,
                                      topn=len(query_words) + 1)
paris_result = next(hit for hit in neighbours if hit[0] not in query_words)
print(paris_result)

In [20]:
# Free-form example to experiment with: son - man + woman ~ daughter
# The result variable is named after the first term of the analogy, like
# king_result and paris_result (it used to be called car_truck_result,
# which did not describe this query).
son_analogy = google_cbow['son'] - google_cbow['man'] + google_cbow['woman']
son_result = google_cbow.most_similar(son_analogy, topn=1)[0]
print(son_result)

('daughter', 0.8964177966117859)
