In [12]:
import heapq
import pickle

import gensim
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [2]:
### Pretrained word2vec
# Google's pre-trained Word2Vec vectors (~1.5 GB): embeddings for a vocabulary of
# 3 million words and phrases, trained on roughly 100 billion words from a
# Google News dataset.
# Source: https://code.google.com/archive/p/word2vec/
# The file is loaded from a path relative to the notebook's working directory;
# binary=True is passed because the vectors are read from a .bin file.
model = gensim.models.KeyedVectors.load_word2vec_format('./word2vec_pretrained/GoogleNews-vectors-negative300.bin', binary=True)  

In [3]:
# Check vocab size.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`, so
# prefer the new attribute and fall back to `vocab` on older gensim releases.
# Iterating either mapping yields the vocabulary words themselves.
words = list(model.key_to_index if hasattr(model, "key_to_index") else model.vocab)
print("Vocab size: ", len(words))

Vocab size:  3000000


In [4]:
# Get cosine similarity between two vectors
def get_cosine_dist(vec1, vec2):
    '''
    Cosine of the angle between two vectors.

    Despite the "dist" in the name this is a *similarity*: larger values mean
    the vectors point in more similar directions (1 = same direction,
    0 = orthogonal, -1 = opposite).

    Input:
    vec1: first vector (array-like accepted by np.dot / np.linalg.norm)
    vec2: second vector, same length as vec1

    Output:
    dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector has
    zero norm (the cosine is undefined there).
    '''
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Dividing here would give NaN plus a RuntimeWarning, and NaN scores
        # cannot be ordered reliably when callers sort by this value.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [5]:
# Sanity check on the scoring function: a closely related pair of entries
# should score clearly higher than an unrelated pair.
a = model["Yangtze_River"]
b = model["Yangtze"]
c = model["sushi"]
print("Cosine distance (similar) :", get_cosine_dist(a, b))
print("Cosine distance (different) :", get_cosine_dist(a, c))

Cosine distance (similar) : 0.8322346
Cosine distance (different) : 0.14340287


In [6]:
# Get k closest words for an embedding
def get_neighbors(target_word_vec, vocab_words, k):
    '''
    Find the k vocabulary words whose embeddings score highest against a
    target embedding under get_cosine_dist.

    Input:
    target_word_vec: word embedding
    vocab_words: list of words in vocabulary; each one is looked up in the
                 notebook-level `model`
    k: number of neighbors to get
    
    Output:
    k_nearest: k nearest words, best score first (fewer if the vocabulary
               has fewer than k words; empty if k <= 0)
    '''

    # Lazily score every vocabulary word. Nothing is materialised here, so
    # memory use stays proportional to k instead of to the vocabulary size.
    scored = (
        (get_cosine_dist(model[item], target_word_vec), item)
        for item in tqdm(vocab_words)
    )

    # heapq.nlargest(k, it) yields the same items, in the same order, as
    # sorted(it, reverse=True)[:k], without sorting the whole vocabulary.
    # Ties on score are broken by comparing the words, as before.
    top_k = heapq.nlargest(k, scored)

    # keep only the words, dropping their scores
    return [word for _, word in top_k]

In [7]:
def find_closest3_for_analogy_list(filename):
    '''
    For every analogy in a word-list file, estimate the embedding of the
    fourth word as c + (b - a) and find the 3 vocabulary words closest to it.

    Input:
    filename: path to a text file with one whitespace-separated analogy per
              line; the first three tokens are used as a, b, c and the last
              token as the expected answer. Blank lines are skipped.

    Output:
    closest_words_list: for each analogy, the 3 words returned by get_neighbors
    answer: last token of each analogy line
    analogies: the tokenised analogy lines
    '''

    # Read inside a context manager so the file handle is always closed.
    with open(filename) as word_list_file:
        tokenised_lines = (line.split() for line in word_list_file)
        analogies = [tokens for tokens in tokenised_lines if tokens]

    answer = [item[-1] for item in analogies]

    closest_words_list = []
    for analogy in analogies:

        # look up embeddings for words a, b, c
        vec_a = model[analogy[0]]
        vec_b = model[analogy[1]]
        vec_c = model[analogy[2]]

        # estimated embedding for word d
        est_vec_d = vec_c + (vec_b - vec_a)

        # find top 3 words in vocab
        # NOTE(review): the search runs over the full `words` list, so the
        # query words a, b, c themselves are eligible candidates.
        closest_words_list.append(get_neighbors(est_vec_d, words, 3))

    return closest_words_list, answer, analogies

## Analyze performance for the 4 lists

In [19]:
def evaluate_accuracy(closest_found, correct):
    '''
    Score predicted neighbour lists against the expected answers.

    Input:
    closest_found: for each analogy, the list of candidate words, best first
    correct: the expected answer word for each analogy

    Output:
    top1_acc: fraction of analogies whose first candidate is the answer
    top3_acc: fraction of analogies whose candidate list contains the answer
    Both are 0.0 when `correct` is empty.
    '''
    total = len(correct)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, 0.0

    correct_cnt_top1 = 0
    correct_cnt_top3 = 0

    for i, answer_word in enumerate(correct):
        candidates = closest_found[i]
        # An empty candidate list simply counts as a miss for both metrics.
        if len(candidates) > 0 and answer_word == candidates[0]:
            correct_cnt_top1 += 1
        if answer_word in candidates:
            correct_cnt_top3 += 1

    top1_acc = correct_cnt_top1 / total
    top3_acc = correct_cnt_top3 / total

    return top1_acc, top3_acc

In [9]:
# Run the analogy search on word list 1. Every analogy triggers one
# get_neighbors pass over the full `words` list, which is why the output
# below shows one progress bar per analogy.
closest_found_list1, correct_list1, analogies_list1 = find_closest3_for_analogy_list("word_lists/list1.txt")

HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




In [15]:
# Bundle the three list-1 results into a single object array so they can be
# cached on disk and reloaded without repeating the vocabulary scan.
list1_results = np.array([closest_found_list1, correct_list1, analogies_list1], dtype=object)
with open('list1_similar.data', 'wb') as data_save:
    # store the data as binary data stream
    pickle.dump(list1_results, data_save)

In [16]:
# Instead of taking a long time to go over the vocab, we load the pickle file we saved.
# NOTE: unpickling can execute arbitrary code, so only load files that this
# notebook wrote itself.
with open('list1_similar.data', 'rb') as filehandle:
    # read the data as binary data stream; unpacking along the first axis
    # gives back the three entries that were stored in 'list1_similar.data'
    closest_found_1, correct_1, analogies_1 = np.array(pickle.load(filehandle))

In [23]:
# Score the cached list-1 predictions and report both accuracies as percentages.
list1_top1_acc, list1_top3_acc = evaluate_accuracy(
    closest_found=closest_found_1,
    correct=correct_1,
)
print("Top1 accuracy rate for word list 1:", 100 * list1_top1_acc, "%")
print("Top3 accuracy rate for word list 1:", 100 * list1_top3_acc, "%")

Top1 accuracy rate for word list 1: 40.0 %
Top3 accuracy rate for word list 1: 85.0 %


In [10]:
# Run the analogy search on word list 2.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# these three variables may never have been assigned - confirm before use.
closest_found_list2, correct_list2, analogies_list2 = find_closest3_for_analogy_list("word_lists/list2.txt")

HBox(children=(IntProgress(value=0, max=3000000), HTML(value='')))




KeyboardInterrupt: 

In [None]:
# Run the analogy search on word list 3. The third result is bound to
# `analogies_list3` so that all three outputs carry the list-3 suffix
# (it was previously assigned to `analogies_list4`).
closest_found_list3, correct_list3, analogies_list3 = find_closest3_for_analogy_list("word_lists/list3.txt")

In [None]:
# Run the analogy search on word list 4; returns the top-3 candidates, the
# expected answers, and the parsed analogies for that list.
closest_found_list4, correct_list4, analogies_list4 = find_closest3_for_analogy_list("word_lists/list4.txt")