In [1]:
import numpy as np

In [63]:
class Word2Vec:
    def __init__(self):
        self.embedding_vector = None
        self.token_to_index = {}
        self.index_to_token = {}
        self.token_to_vector = {}
    
    def set_embedding_vector(self):
        self.embedding_vector = np.stack(self.token_to_vector.values())
    
    def get_vecs_by_tokens(self, tokens):
        vecs = []
        for token in tokens:
            vecs.append(self.token_to_vector[token])
        return vecs

glove_6b50d = Word2Vec()

with open('../data/glove.6B.50d.txt', 'r') as f:
    for i, line in enumerate(f):
        value = line.split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype=np.float32)
        glove_6b50d.token_to_vector[word] = coef
        glove_6b50d.token_to_index[word] = i
        glove_6b50d.index_to_token[i] = [word]
    glove_6b50d.set_embedding_vector()

In [64]:
glove_6b50d.token_to_index['investigation'], glove_6b50d.index_to_token[973]

(972, ['lives'])

## Finding Synonyms

In [67]:
def knn(W, x, k):
    # The added 1e-9 is for numerical stability
    cos = np.dot(W, x.reshape((-1,))) / (
        (np.sqrt(np.sum(W * W, axis=1) + 1e-9)) * np.sqrt(np.sum(x * x)))
    topk = np.argsort(cos)[-k:]
    return topk, [cos[i] for i in topk]

In [68]:
def get_similar_tokens(query_token, k, embed):
    topk, cos = knn(
        embed.embedding_vector,
        embed.token_to_vector[query_token],
        k+1
    )
    for i, c in zip(topk[1:], cos[1:]):  # Remove input words
        print('cosine sim=%.3f: %s' % (c, (embed.index_to_token[i])))

In [69]:
get_similar_tokens('amazon', 3, glove_6b50d)

cosine sim=0.653: ['amazon.com']
cosine sim=0.663: ['unbox']
cosine sim=1.000: ['amazon']


In [70]:
get_similar_tokens('baby', 3, glove_6b50d)

cosine sim=0.800: ['boy']
cosine sim=0.839: ['babies']
cosine sim=1.000: ['baby']


In [71]:
get_similar_tokens('beautiful', 3, glove_6b50d)

cosine sim=0.893: ['gorgeous']
cosine sim=0.921: ['lovely']
cosine sim=1.000: ['beautiful']


## Find Analogies

In [72]:
def get_analogy(token_a, token_b, token_c, embed):
    vecs = embed.get_vecs_by_tokens([token_a, token_b, token_c])
    x = vecs[1] - vecs[0] + vecs[2]
    topk, cos = knn(embed.embedding_vector, x, 1)
    return embed.index_to_token[topk[0]]  

In [73]:
get_analogy('man', 'woman', 'boy', glove_6b50d)

['girl']

In [74]:
get_analogy('china', 'beijing', 'japan', glove_6b50d)

['tokyo']

In [75]:
get_analogy('bad', 'worst', 'nice', glove_6b50d)

['place']

In [76]:
get_analogy('do', 'did', 'go', glove_6b50d)

['went']