In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display

## GloVe - Pretrained Word Embedding Dataset
- GloVe: https://nlp.stanford.edu/projects/glove/
- Direct link: https://nlp.stanford.edu/data/glove.6B.zip

In [2]:
# Download dataset (about 822 MB).
# The download is skipped when the 50d vectors are already extracted, so the
# cell can be re-run (Restart & Run All) without fetching the archive again.
! [ -f glove/glove.6B.50d.txt ] || wget https://nlp.stanford.edu/data/glove.6B.zip -O glove.zip
# `unzip -o` overwrites without prompting; the archive is removed only after a
# successful extraction.
! [ ! -f glove.zip ] || (unzip -o glove.zip -d ./glove && rm -f glove.zip)
! ls glove

--2018-11-03 13:34:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-11-03 13:34:24--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.zip’


2018-11-03 13:34:45 (38.9 MB/s) - ‘glove.zip’ saved [862182613/862182613]

Archive:  glove.zip
  inflating: ./glove/glove.6B.50d.txt  
  inflating: ./glove/glove.6B.100d.txt  
  inflating: ./glove/glove.6B.200d.txt  
  inflating: ./glove/glove.6B.300d.txt  
glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt


In [3]:
# Each line of the file is one record laid out as:
#   word vec[0] vec[1] vec[2] ...
# Show only the first 50 characters of the first record.
! head -n 1 glove/glove.6B.50d.txt | cut -c 1-50

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445


## Load dataset - 50D

In [4]:
# Parse the pre-trained GloVe file: every line is a word followed by the
# components of its vector, separated by whitespace.
word2vec = {}    # word -> vector
idx2word = []    # row index -> word (file order)
embedding = []   # vectors in file order; stacked into a matrix below

with open('./glove/glove.6B.50d.txt', encoding='utf-8') as glove_file:
    for record in glove_file:
        fields = record.split()
        token = fields[0]
        vector = np.asarray(fields[1:], dtype='float32')
        idx2word.append(token)
        embedding.append(vector)
        word2vec[token] = vector

# Word Embedding matrix: one row per word, in the same order as idx2word
embedding = np.array(embedding)
display(embedding.shape)

(400000, 50)

In [5]:
# Row 0 of the embedding matrix and the word that row belongs to
display(embedding[0], idx2word[0])

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

'the'

In [6]:
# Look the vector up by word instead of by row index
display(word2vec['the'])

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

## Find Analogies

In [0]:
from sklearn.metrics.pairwise import pairwise_distances

def find_analogies(w1, w2, w3,
        embedding_matrix, word2vec, idx2word,
        metric='euclidean'):
    """Print the word that best completes the analogy ``w1 - w2 + w3``.

    The query vector ``word2vec[w1] - word2vec[w2] + word2vec[w3]`` is compared
    against every row of ``embedding_matrix``; the closest word that is not one
    of the three inputs is printed as ``w1 - w2 + w3 = <word>``.

    Args:
        w1, w2, w3: Words of the analogy. If any of them is not a key of
            ``word2vec`` a message is printed and the function returns early.
        embedding_matrix: 2-D array of shape (V, D), one word vector per row.
        word2vec: Mapping from word to its vector of length D.
        idx2word: Sequence where ``idx2word[i]`` is the word stored in row
            ``i`` of ``embedding_matrix``.
        metric: Metric name forwarded to ``pairwise_distances``.

    Returns:
        None. The result (or an explanatory message) is printed.
    """
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return
    V, D = embedding_matrix.shape

    query = word2vec[w1] - word2vec[w2] + word2vec[w3]

    distances = pairwise_distances(
        query.reshape(1, D),
        embedding_matrix,
        metric=metric).reshape(V)

    # Walk the rows from nearest to farthest and take the first word that is
    # not one of the inputs. Scanning the whole ordering (instead of a fixed
    # top 4) means a tiny vocabulary or duplicate entries cannot leave the
    # result undefined.
    inputs = (w1, w2, w3)
    best_word = next(
        (idx2word[idx] for idx in distances.argsort()
         if idx2word[idx] not in inputs),
        None)
    if best_word is None:
        print("no analogy found for %s - %s + %s" % inputs)
        return

    print(w1, "-", w2, "+", w3, '=', best_word)

In [10]:
# Run the same analogy under both distance metrics
for metric in ('cosine', 'euclidean'):
    find_analogies('king', 'man', 'woman',
                   embedding, word2vec, idx2word,
                   metric=metric)

king - man + woman = queen
king - man + woman = queen


In [11]:
# Run the same analogy under both distance metrics
for metric in ('cosine', 'euclidean'):
    find_analogies('france', 'paris', 'london',
                   embedding, word2vec, idx2word,
                   metric=metric)

france - paris + london = britain
france - paris + london = britain


In [13]:
# Run the same analogy under both distance metrics
for metric in ('cosine', 'euclidean'):
    find_analogies('france', 'paris', 'rome',
                   embedding, word2vec, idx2word,
                   metric=metric)

france - paris + rome = italy
france - paris + rome = italy


In [14]:
# Run the same analogy under both distance metrics
for metric in ('cosine', 'euclidean'):
    find_analogies('december', 'november', 'june',
                   embedding, word2vec, idx2word,
                   metric=metric)

december - november + june = july
december - november + june = july


## Find Nearest Neighbors

In [0]:
from sklearn.metrics.pairwise import pairwise_distances

def nearest_neighbors(word,
        embedding_matrix, word2vec, idx2word,
        metric='euclidean', n=5):
    """Print the ``n`` words whose vectors are closest to that of ``word``.

    Args:
        word: Query word. If it is not a key of ``word2vec`` a message is
            printed and the function returns early.
        embedding_matrix: 2-D array of shape (V, D), one word vector per row.
        word2vec: Mapping from word to its vector of length D.
        idx2word: Sequence where ``idx2word[i]`` is the word stored in row
            ``i`` of ``embedding_matrix``.
        metric: Metric name forwarded to ``pairwise_distances``.
        n: Maximum number of neighbors to print.

    Returns:
        None. The neighbors are printed, nearest first, one per line.
    """
    if word not in word2vec:
        print("%s not in dictionary:" % word)
        return

    V, D = embedding_matrix.shape

    query = word2vec[word]
    distances = pairwise_distances(
        query.reshape(1, D),
        embedding_matrix,
        metric=metric).reshape(V)

    # Exclude the query word by name rather than assuming it is at sorted
    # position 0: when another row ties at the minimum distance the query word
    # may not come first, and it must not be reported as its own neighbor.
    neighbors = []
    for idx in distances.argsort():
        if len(neighbors) >= n:
            break
        candidate = idx2word[idx]
        if candidate != word:
            neighbors.append(candidate)

    print("neighbors of '%s' - metric %s:" % (word, metric))
    for neighbor in neighbors:
        print("\t%s" % neighbor)

In [16]:
# List the neighbors under both distance metrics
for metric in ('euclidean', 'cosine'):
    nearest_neighbors('king',
                      embedding, word2vec, idx2word,
                      metric=metric)

neighbors of 'king' - metric euclidean:
	prince
	queen
	uncle
	ii
	grandson
neighbors of 'king' - metric cosine:
	prince
	queen
	ii
	emperor
	son


In [17]:
# List the neighbors under both distance metrics
for metric in ('euclidean', 'cosine'):
    nearest_neighbors('einstein',
                      embedding, word2vec, idx2word,
                      metric=metric)

neighbors of 'einstein' - metric euclidean:
	bohr
	relativity
	invented
	freud
	gottlieb
neighbors of 'einstein' - metric cosine:
	relativity
	bohr
	physics
	heisenberg
	freud


In [18]:
# List the neighbors under both distance metrics
for metric in ('euclidean', 'cosine'):
    nearest_neighbors('february',
                      embedding, word2vec, idx2word,
                      metric=metric)

neighbors of 'february' - metric euclidean:
	october
	january
	december
	august
	september
neighbors of 'february' - metric cosine:
	october
	december
	january
	august
	september


In [19]:
# List the neighbors under both distance metrics
for metric in ('euclidean', 'cosine'):
    nearest_neighbors('rome',
                      embedding, word2vec, idx2word,
                      metric=metric)

neighbors of 'rome' - metric euclidean:
	naples
	venice
	turin
	florence
	lisbon
neighbors of 'rome' - metric cosine:
	naples
	venice
	italy
	turin
	pope
