# Loading Word Embeddings
* Download and load word embeddings from Google's pretrained word2vec model
* Extract vector representation of individual terms
* Compute word similarities using the terms' vector representations

In [None]:
import pandas as pd, nltk
from gensim.models import KeyedVectors, Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-01-30 12:46:42--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.78.254
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.78.254|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [None]:
# Load the downloaded vectors; the file is in the binary word2vec format, hence binary=True.
model = KeyedVectors.load_word2vec_format(
    fname='/root/input/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [None]:
# Look up the embedding vector stored for the word 'computer'.
model.get_vector('computer')

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

* Shown above is the vector representation of the word 'computer'

In [None]:
# The vector is one-dimensional, so the length of its first axis is the embedding size.
print('No of dimensions', model['computer'].shape[0])

No of dimensions 300


In [None]:
# vocabulary size: `model.vectors` holds one row per word in the vocabulary.
# `model.vocab` was removed in gensim 4.0 (accessing it raises AttributeError),
# so count the rows instead; this works on both gensim 3.x and 4.x.
len(model.vectors)

3000000

* Not every word/term is present in a given word-embedding model, so out-of-vocabulary terms need to be removed from the tokens before looking up their vectors.

In [None]:
# Check whether the word 'automobile' is in the word embedding model using the 'in' operator.
# The test is done on the model itself rather than on `model.vocab`, which was removed in
# gensim 4.0; KeyedVectors supports `in` directly on both gensim 3.x and 4.x.
'automobile' in model

True

### Similarity between two terms - cosine_similarity
* Ignore the diagonal values of the result, as they represent each term's similarity with itself
* Consider only one of the off-diagonal values, (0,1) or (1,0); both hold the same similarity

In [None]:
# Pairwise cosine similarity between the vectors of the two terms.
cosine_similarity([model[term] for term in ('computer', 'laptop')])

array([[0.9999996, 0.6640494],
       [0.6640494, 1.0000004]], dtype=float32)

A similarity of 0.66 is close to 0.7, which indicates a fairly strong relationship between the two terms.

In [None]:
# Pairwise cosine similarity between the vectors of the two terms.
cosine_similarity([model[term] for term in ('computer', 'automobile')])

array([[0.9999996 , 0.24495603],
       [0.24495603, 0.99999994]], dtype=float32)

A similarity of 0.245 indicates a very weak relationship between the two terms.

# Identify the odd man out using - doesnt_match

In [None]:
# Ask the model which word in the list does not match the others.
model.doesnt_match(words=['cricket', 'ball', 'orange', 'stadium'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'orange'

# Identify Neighbors using - most_similar

In [None]:
# Top 10 words whose vectors are most similar to that of 'hyundai', with their similarity scores.
model.most_similar(positive=['hyundai'], topn=10)

[('mercedes', 0.6871387958526611),
 ('volvo', 0.6870455741882324),
 ('camry', 0.6712770462036133),
 ('vw', 0.6688359379768372),
 ('nissan', 0.6683660745620728),
 ('toyota', 0.6605498790740967),
 ('honda', 0.6544189453125),
 ('subaru', 0.6441575884819031),
 ('bmw', 0.6436272859573364),
 ('saturn', 0.6278748512268066)]

In [None]:
# Analogy query: king - man + woman -> ? (only the single best match is requested)
model.most_similar(
    negative=['man'],
    positive=['king', 'woman'],
    topn=1,
)

[('queen', 0.7118192911148071)]