In [1]:
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Swap the no-op in for the real function so that code calling
# ``warnings.warn(...)`` after this point emits nothing.
warnings.warn = warn

import gensim
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pickle

In [2]:
### Pretrained word2vec
# Google's pre-trained Word2Vec vectors (about 1.5GB): a vocabulary of 3 million
# words and phrases, trained on roughly 100 billion words of Google News text.
# Source: https://code.google.com/archive/p/word2vec/
model = gensim.models.KeyedVectors.load_word2vec_format(
    './word2vec_pretrained/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [3]:
# Collect every vocabulary key of the loaded model and report how many there are
words = list(model.key_to_index)
print("Vocab size: ", len(words))

Vocab size:  3000000


In [15]:
def find_n_closest_for_analogy(filename, n):
    """Predict the fourth word of every analogy listed in a file.

    Each non-blank line of the file is split on whitespace into the words
    ``a b c d`` of the analogy "a is to b as c is to d". The first three
    words are used to query the global ``model``; the fourth is the
    expected answer.

    Args:
        filename: Path of the text file holding one analogy per line.
        n: Number of nearest candidates to retrieve per analogy.

    Returns:
        A tuple ``(closest_words_list, answer)``. ``closest_words_list[i]``
        is the list of candidate words for the i-th analogy, in the order
        returned by ``model.most_similar``; ``answer[i]`` is the fourth
        word of that line.

    Raises:
        IndexError: If a non-blank line contains fewer than four words.
    """
    # The context manager closes the file handle even if parsing fails
    # (the handle was previously opened and never closed).
    with open(filename) as word_list_file:
        analogies = [line.split() for line in word_list_file if line.strip()]

    answer = [item[3] for item in analogies]

    closest_words_list = []
    for analogy in analogies:
        # b + c - a: the same vector arithmetic as "c + (b - a)"
        similar_words = model.most_similar(positive=[analogy[1], analogy[2]],
                                           negative=[analogy[0]], topn=n)
        # Keep only the words; drop the second element of each pair
        closest_words_list.append([w for (w, _) in similar_words])

    return closest_words_list, answer

## Analyze performance for the 5 word lists

In [16]:
def evaluate_accuracy(closest_found, correct, n=3):
    """Compute top-1 and top-n accuracy of ranked predictions.

    Args:
        closest_found: One sequence of candidate words per example, best
            candidate first. A row may hold fewer than ``n`` candidates
            (or none); missing ranks simply count as misses.
        correct: The expected word for each example, aligned with
            ``closest_found``.
        n: How many leading candidates to consider for the top-n score.
            Values below 1 are treated as 1, so top-n is never stricter
            than top-1.

    Returns:
        A tuple ``(top1_accuracy, topn_accuracy)`` of fractions in [0, 1].
        Both are 0.0 when ``correct`` is empty.

    Raises:
        ValueError: If ``closest_found`` and ``correct`` differ in length.
    """
    total = len(correct)
    if len(closest_found) != total:
        raise ValueError(
            "closest_found has %d rows but correct has %d answers"
            % (len(closest_found), total)
        )
    if total == 0:
        # Avoid a 0/0 division; with nothing to score, report zero accuracy.
        return 0.0, 0.0

    depth = max(n, 1)
    top1_hits = []
    topn_hits = []
    for candidates, expected in zip(closest_found, correct):
        # Slicing (rather than indexing) tolerates rows shorter than `depth`
        ranked = list(candidates[:depth])
        top1_hits.append(bool(ranked) and ranked[0] == expected)
        topn_hits.append(expected in ranked)

    return np.sum(top1_hits) / total, np.sum(topn_hits) / total

In [17]:
def predict_analogies_for_list(filename, n=3):
    """Evaluate the analogies in one file and print top-1 / top-n accuracy.

    Args:
        filename: Path of the analogy file, passed on to
            ``find_n_closest_for_analogy``.
        n: Number of candidates retrieved per analogy and the depth used
            for the top-n score (default 3).

    Returns:
        None. The results are printed as percentages.
    """
    print('Testing file: ', filename)
    closest_found, correct = find_n_closest_for_analogy(filename, n)
    top1_acc, topn_acc = evaluate_accuracy(closest_found, correct, n)
    # Label each result with the file actually evaluated; the labels used
    # to say "word list 1" regardless of which file was being tested.
    print("     Top1 accuracy rate for %s:" % filename, top1_acc * 100, "%")
    print("     Top%d accuracy rate for %s:" % (n, filename), topn_acc * 100, "%")
    print()

In [18]:
# Evaluate word_lists/list1.txt through word_lists/list5.txt in order
for list_number in range(1, 6):
    list_path = 'word_lists/list%d.txt' % list_number
    predict_analogies_for_list(list_path)

Testing file:  word_lists/list1.txt
     Top1 accuracy rate for word list 1: 85.0 %
     Top3 accuracy rate for word list 1: 90.0 %

Testing file:  word_lists/list2.txt
     Top1 accuracy rate for word list 1: 85.0 %
     Top3 accuracy rate for word list 1: 90.0 %

Testing file:  word_lists/list3.txt
     Top1 accuracy rate for word list 1: 35.0 %
     Top3 accuracy rate for word list 1: 45.0 %

Testing file:  word_lists/list4.txt
     Top1 accuracy rate for word list 1: 20.0 %
     Top3 accuracy rate for word list 1: 25.0 %

Testing file:  word_lists/list5.txt
     Top1 accuracy rate for word list 1: 58.108108108108105 %
     Top3 accuracy rate for word list 1: 68.91891891891892 %



In [24]:
# Worked example of the analogy a : b :: c : d  (Tokyo : Japan :: Paris : ?)

# look up embeddings for words a, b, c
vec_a = model["Tokyo"]
vec_b = model["Japan"]
vec_c = model["Paris"]

# estimated embedding for word d
est_vec_d = vec_c + (vec_b - vec_a)

# find top 3 words in vocab
# The helper `get_neighbors` previously called here is not defined in any
# cell of this notebook, so the cell failed with NameError on a fresh kernel.
# gensim's own nearest-neighbour lookup is used instead.
# NOTE(review): the query words themselves are not filtered out of the
# result, so "Paris" may still appear among the neighbours - confirm this
# matches the intent of the original helper.
top3_closest_words = [w for (w, _) in model.similar_by_vector(est_vec_d, topn=3)]
print(top3_closest_words)

HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))


['France', 'Paris', 'Belgium']
