# Exemplo de Word2Vec com modelo pré-treinado
* Exemplo adaptado de https://colab.research.google.com/drive/1zuq1I_FudtB2W4OSOWff8ODqfqK8d9-G



# Download do modelo pré-treinado (~1.5 GB)

In [None]:
import numpy as np

In [None]:
# Fetch the pre-trained GoogleNews word2vec archive into /root/input/ (-P sets the
# target directory; -c resumes a partially downloaded file instead of restarting).
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-08-27 14:50:20--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.233.157
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.233.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/root/input/GoogleNews-vectors-negative300.bin.gz’


2020-08-27 14:52:03 (15.3 MB/s) - ‘/root/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



# Instalando a biblioteca "gensim" para manipular word vectors

In [None]:
!pip install gensim
from gensim.models import KeyedVectors



# Carregando o modelo pré-treinado

In [None]:
# Path of the gzipped vectors file saved by the wget download cell.
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz'

# Load the vectors as a gensim KeyedVectors object; binary=True selects the
# binary word2vec format instead of the plain-text one.
word2vec = KeyedVectors.load_word2vec_format(
    EMBEDDING_FILE,
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Look up the embedding of a single word and show its shape (its dimensionality).
word2vec["smartphone"].shape

(300,)

In [None]:
# Display the raw embedding vector stored for "smartphone".
word2vec["smartphone"]

array([ 1.37695312e-01, -3.14453125e-01, -4.72656250e-01,  7.66601562e-02,
       -3.63281250e-01,  5.98144531e-02,  1.36718750e-01,  9.42382812e-02,
        2.79296875e-01, -1.03027344e-01, -1.91406250e-01,  1.39648438e-01,
        2.16796875e-01, -9.61914062e-02,  2.23632812e-01,  5.46264648e-03,
        2.10937500e-01,  1.25976562e-01, -7.95898438e-02, -2.20703125e-01,
        8.64257812e-02, -1.16210938e-01,  1.26953125e-02, -3.45703125e-01,
        6.73828125e-02,  2.59765625e-01, -1.10351562e-01,  2.98828125e-01,
        2.38281250e-01,  1.78222656e-02, -2.11914062e-01,  1.91406250e-01,
        4.12597656e-02,  2.67578125e-01, -1.11816406e-01, -2.39257812e-01,
       -9.61914062e-02, -5.83496094e-02,  2.51953125e-01,  2.20703125e-01,
       -7.61718750e-02, -7.27539062e-02,  2.12402344e-02,  1.58203125e-01,
        1.83105469e-02, -3.56445312e-02, -8.23974609e-03, -3.83300781e-02,
        1.85546875e-01, -2.59765625e-01, -6.68945312e-02,  3.51562500e-02,
       -2.26562500e-01, -

# Vamos usar a similaridade de cosseno para calcular a proximidade entre palavras

Recall that $\cos(a,b) = \frac{a \cdot b}{\lVert a \rVert \cdot \lVert b \rVert}$

In [None]:
import numpy as np

def cos(x1, x2):
  """Return the cosine similarity between the vectors ``x1`` and ``x2``.

  The value is the dot product of the two inputs divided by the product of
  their Euclidean norms. No guard is applied for zero-norm inputs, so the
  denominator is zero in that case.
  """
  dot_product = np.dot(x1, x2)
  norm_product = np.linalg.norm(x1) * np.linalg.norm(x2)
  return dot_product / norm_product

# Testando similaridades...

In [None]:
# Cosine similarity between the embeddings of "smartphone" and "telephone".
cos(word2vec["smartphone"], word2vec["telephone"])

0.13366508

# Similaridade de Textos com Word Mover's Distance (WMD)

In [None]:
from nltk.corpus import stopwords
from nltk import download
# Fetch the NLTK "stopwords" corpus so that stopwords.words() can read it.
download('stopwords')  # Download stopwords list.
# English stopwords; preprocess() drops any token that appears in this list.
stop_words = stopwords.words('english')

def preprocess(sentence):
    """Tokenize ``sentence`` and remove stopwords.

    The sentence is lowercased and split on whitespace; every token that is
    found in the module-level ``stop_words`` collection is discarded. The
    surviving tokens are returned as a list, in their original order.
    """
    kept_tokens = []
    for token in sentence.lower().split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens


# Two sentences that share no content words; each one is lowercased, split
# into tokens and stripped of stopwords by preprocess().
sentence_obama = preprocess('Obama speaks to the media in Illinois')
sentence_president = preprocess('The president greets the press in Chicago')

# Word Mover's Distance between the two token lists, computed from the
# loaded word2vec embeddings.
distance = word2vec.wmdistance(sentence_obama, sentence_president)
print('distance = %.4f' % distance)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
distance = 3.3741


In [None]:
# Compare against a sentence on an unrelated topic. `sentence_obama` is the
# token list built in the previous cell, so that cell must have run first.
# In the recorded run this distance (4.3802) is larger than the one for the
# Obama/president pair (3.3741).
sentence_orange = preprocess('Oranges are my favorite fruit')
distance = word2vec.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)


distance = 4.3802
