In [2]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Swap the module attribute so later calls made through ``warnings.warn(...)``
# hit the no-op above instead of emitting a warning.
warnings.warn = warn

import gensim
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pickle

## Load the vocabulary

In [3]:
# Pretrained word2vec embeddings released by Google (~1.5GB).
# They provide word vectors for a vocabulary of 3 million words and phrases,
# trained on roughly 100 billion words from a Google News dataset.
# Source: https://code.google.com/archive/p/word2vec/
WORD2VEC_PATH = './word2vec_pretrained/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [11]:
# Keys of the model's key -> index mapping, i.e. every entry the model can look up.
vocab = model.key_to_index.keys()
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)

Vocab size:  3000000


## Define functions for finding the closest words

In [118]:
def get_k_nearest(target_word_vec, k):
    """Return the `k` vocabulary entries closest to `target_word_vec`.

    Args:
        target_word_vec: query vector (or key) accepted by `model.distances`.
        k: number of entries to return.

    Returns:
        List of up to `k` keys from `model.index_to_key`, ordered from the
        smallest distance reported by `model.distances` to the largest.
    """
    # Leave `other_words` at its default so gensim compares against its stored
    # vector matrix directly. Passing the full vocabulary explicitly forces an
    # index lookup per key and a copy of every vector on each call, and the
    # resulting positions only line up with `model.index_to_key` when that
    # list is the complete vocabulary in index order.
    all_distances = model.distances(target_word_vec)
    nearest_indexes = all_distances.argsort()[:k]
    return [model.index_to_key[i] for i in nearest_indexes]

In [119]:
def get_nearest_ten_for_analogy(filename, k=10):
    """Find the words nearest to ``c + (b - a)`` for every analogy in a file.

    Args:
        filename: path to a text file holding one whitespace-separated analogy
            ``a b c d`` per line; blank lines are skipped.
        k: how many nearest words to keep per analogy (defaults to 10).

    Returns:
        A tuple ``(closest_words_list, answer, analogies)`` where
        ``closest_words_list[i]`` is the result of ``get_k_nearest`` for the
        i-th analogy, ``answer[i]`` is its fourth token and ``analogies[i]``
        is the full token list of the line.

    Raises:
        IndexError: if a non-blank line has fewer than four tokens.
    """
    # Context manager guarantees the file handle is closed once parsed.
    with open(filename) as word_list_file:
        analogies = [line.split() for line in word_list_file if line.strip() != '']

    # Extract the expected answers first so malformed lines fail before the
    # (slow) nearest-neighbour searches start.
    answer = [analogy[3] for analogy in analogies]

    closest_words_list = []
    for analogy in tqdm(analogies):
        vec_a = model[analogy[0]]
        vec_b = model[analogy[1]]
        vec_c = model[analogy[2]]
        # Apply the a -> b offset to c to get the query point for d.
        vec_d = vec_c + (vec_b - vec_a)
        closest_words_list.append(get_k_nearest(vec_d, k))

    return closest_words_list, answer, analogies

In [14]:
def get_k_most_similar(filename, k):
    """Query ``model.most_similar`` for every analogy listed in a file.

    Args:
        filename: path to a text file holding one whitespace-separated analogy
            ``a b c d`` per line; blank lines are skipped.
        k: number of results requested per analogy (passed as ``topn``).

    Returns:
        A tuple ``(closest_words_list, answer)`` where ``closest_words_list[i]``
        holds the words returned for the i-th analogy (scores dropped) and
        ``answer[i]`` is the fourth token of that line.

    Raises:
        IndexError: if a non-blank line has fewer than four tokens.
    """
    # Context manager guarantees the file handle is closed once parsed.
    with open(filename) as word_list_file:
        analogies = [line.split() for line in word_list_file if line.strip() != '']

    answer = [analogy[3] for analogy in analogies]

    closest_words_list = []
    for analogy in analogies:
        # b and c pull the query towards the answer, a pushes away from it.
        similar_words = model.most_similar(positive=[analogy[1], analogy[2]],
                                           negative=[analogy[0]], topn=k)
        # Each result is a pair; keep only its first element (the word).
        closest_words_list.append([word for (word, _) in similar_words])

    return closest_words_list, answer

## Analyze and store nearest words for the five lists

In [None]:
# Compute the nearest candidates for each of the five word lists and pickle
# them to 'list<N>_nearest10.data' so they can be reloaded without recomputing.
for list_number in range(1, 6):
    input_list = 'word_lists/list%d.txt' % list_number
    closest_found, correct, analogies = get_nearest_ten_for_analogy(input_list)
    # Bundle the three result sequences into a single object array for storage.
    list_results = np.array([closest_found, correct, analogies], dtype=object)
    output_name = 'list%d_nearest10.data' % list_number
    with open(output_name, 'wb') as data_save:
        pickle.dump(list_results, data_save)  # store the data as binary data stream
    print('Top similar words for file ' + input_list + ' saved!')

## Evaluate analogies for the three different approaches for all five lists

In [15]:
def evaluate_accuracy(closest_found, correct, analogies, repeat_orig = True):
    """Compute top-1 and top-3 accuracy of predicted analogy answers.

    Args:
        closest_found: one sequence of candidate words per analogy, best
            candidate first. It is never modified.
        correct: the expected word for each analogy, in the same order.
        analogies: one token sequence per analogy; only the first three tokens
            of each are read, and only when `repeat_orig` is False.
        repeat_orig: when False, candidates equal to one of the analogy's first
            three tokens are dropped before scoring.

    Returns:
        ``(top1_accuracy, top3_accuracy)``: the fraction of analogies whose
        expected word is the first candidate, and the fraction whose expected
        word is among the first three candidates.
    """
    # Work on copies so the caller's candidate lists are left untouched.
    candidates = [list(found) for found in closest_found]
    if not repeat_orig:
        candidates = [
            [word for word in found if word not in analogy[:3]]
            for found, analogy in zip(candidates, analogies)
        ]

    # Slicing (rather than indexing) tolerates candidate lists that ended up
    # with fewer than three entries after filtering: those simply count as
    # misses for the missing ranks instead of raising IndexError.
    top1 = np.array([found[:1] == [answer]
                     for found, answer in zip(candidates, correct)], dtype=bool)
    top3 = np.array([answer in found[:3]
                     for found, answer in zip(candidates, correct)], dtype=bool)

    return np.sum(top1)/len(correct), np.sum(top3)/len(correct)

In [16]:
# For each word list: reload the cached nearest-word results from disk and
# print top-1 / top-3 accuracy for the three approaches.
for list_number in range(1, 6):
    list_filename = 'word_lists/list%d.txt' % list_number
    print('Testing file: ', list_filename)

    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files produced by this notebook.
    with open('list%d_nearest10.data' % list_number, 'rb') as filehandle:
        stored_results = pickle.load(filehandle)
    closest_found, correct, analogies = np.array(stored_results)

    # Approach 1: cached nearest words, scored as stored.
    top1_acc, top3_acc = evaluate_accuracy(closest_found, correct, analogies)
    print('    Top 1 nearest accuracy: \t\t\t%.2f%%' % (top1_acc * 100))
    print('    Top 3 nearest accuracy: \t\t\t%.2f%%' % (top3_acc * 100))

    # Approach 2: same cached words, ignoring candidates that repeat a, b or c.
    top1_acc, top3_acc = evaluate_accuracy(closest_found, correct, analogies, repeat_orig = False)
    print('    Top 1 nearest accuracy (w/o a,b,c): \t%.2f%%' % (top1_acc * 100))
    print('    Top 3 nearest accuracy (w/o a,b,c): \t%.2f%%' % (top3_acc * 100))

    # Approach 3: ask the model's own most_similar query for the top three.
    closest_found, correct = get_k_most_similar(list_filename, 3)
    top1_acc, top3_acc = evaluate_accuracy(closest_found, correct, analogies)
    print('    Top 1 most similar accuracy: \t\t%.2f%%' % (top1_acc * 100))
    print('    Top 3 most similar accuracy: \t\t%.2f%%' % (top3_acc * 100))
    print()

Testing file:  word_lists/list1.txt
    Top 1 nearest accuracy: 			40.00%
    Top 3 nearest accuracy: 			85.00%
    Top 1 nearest accuracy (w/o a,b,c): 	85.00%
    Top 3 nearest accuracy (w/o a,b,c): 	90.00%
    Top 1 most similar accuracy: 		85.00%
    Top 3 most similar accuracy: 		90.00%

Testing file:  word_lists/list2.txt
    Top 1 nearest accuracy: 			35.00%
    Top 3 nearest accuracy: 			90.00%
    Top 1 nearest accuracy (w/o a,b,c): 	85.00%
    Top 3 nearest accuracy (w/o a,b,c): 	90.00%
    Top 1 most similar accuracy: 		85.00%
    Top 3 most similar accuracy: 		90.00%

Testing file:  word_lists/list3.txt
    Top 1 nearest accuracy: 			0.00%
    Top 3 nearest accuracy: 			40.00%
    Top 1 nearest accuracy (w/o a,b,c): 	30.00%
    Top 3 nearest accuracy (w/o a,b,c): 	50.00%
    Top 1 most similar accuracy: 		35.00%
    Top 3 most similar accuracy: 		45.00%

Testing file:  word_lists/list4.txt
    Top 1 nearest accuracy: 			0.00%
    Top 3 nearest accuracy: 			15.00%
    Top 1 n