In [1]:
import heapq
import pickle

import gensim
import numpy as np
from tqdm import tqdm_notebook as tqdm



In [2]:
### Pretrained word2vec
# Google's pre-trained Word2Vec (1.5GB): word vectors for a vocabulary of 3 million words
# and phrases, trained on roughly 100 billion words from a Google News dataset.
# https://code.google.com/archive/p/word2vec/
# Loads the binary word2vec file from ./word2vec_pretrained/ into `model`, which every later cell reads.
model = gensim.models.KeyedVectors.load_word2vec_format('./word2vec_pretrained/GoogleNews-vectors-negative300.bin', binary=True) 

In [3]:
# Collect every vocabulary entry of the loaded model and report how many there are.
# `words` is reused by later cells as the candidate pool for nearest-neighbour search.
words = list(model.key_to_index)
print("Vocab size: ", len(words))

Vocab size:  3000000


In [4]:
# Get cosine similarity between two vectors (called "cosine distance" throughout this notebook)
def get_cosine_dist(vec1, vec2):
    '''
    Cosine of the angle between two vectors.

    Despite the name, the returned value is a cosine *similarity*: the dot
    product divided by the product of the two Euclidean norms. Larger values
    mean the vectors point in more similar directions.

    Input:
    vec1, vec2: vectors accepted by np.dot / np.linalg.norm

    Output:
    dot(vec1, vec2) / (||vec1|| * ||vec2||)
    '''
    norm_product = np.linalg.norm(vec1)*np.linalg.norm(vec2)
    return np.dot(vec1, vec2)/norm_product

In [5]:
# Sanity check on the similarity function: a related pair of words should
# score clearly higher than an unrelated pair.
a = model["Yangtze_River"]
b = model["Yangtze"]
c = model["sushi"]
print("Cosine distance (similar) :", get_cosine_dist(a, b))
print("Cosine distance (different) :", get_cosine_dist(a, c))

Cosine distance (similar) : 0.8322346
Cosine distance (different) : 0.14340287


In [6]:
# Get k closest words for an embedding
def get_neighbors(target_word_vec, vocab_words, k, vectors=None):
    '''
    Return the k words whose embeddings have the highest cosine similarity
    to a target embedding.

    Input:
    target_word_vec: word embedding to search around
    vocab_words: list of candidate words; each must be a key of `vectors`
    k: number of neighbors to get
    vectors: optional mapping word -> embedding supporting `vectors[word]`;
             defaults to the notebook-level `model`

    Output:
    k_nearest: up to k words, most similar first. Ties on similarity are
               broken by the word itself (larger string first), the same
               ordering a descending sort of (similarity, word) pairs gives.
               Words whose similarity is undefined (zero-norm embedding or
               zero-norm target) are left out.
    '''
    if vectors is None:
        vectors = model

    # The target norm does not change across candidates, so compute it once.
    target_norm = np.linalg.norm(target_word_vec)

    def scored_words():
        for word in vocab_words:
            word_vec = vectors[word]
            denom = np.linalg.norm(word_vec) * target_norm
            # Norms are non-negative, so this skips zero (and NaN) denominators,
            # which would otherwise produce NaN scores that break the ordering.
            if not denom > 0:
                continue
            yield np.dot(word_vec, target_word_vec) / denom, word

    # nlargest keeps only k candidates at a time instead of sorting the whole vocabulary.
    return [word for _, word in heapq.nlargest(k, scored_words())]

In [7]:
def find_closest10_for_analogy_list(filename):
    '''
    For every analogy "a b c d" in a text file, estimate d as c + (b - a) and
    look up the 10 vocabulary words closest to that estimate.

    Input:
    filename: path to a text file with one whitespace-separated analogy per
              line; blank lines are ignored. The first three tokens are used
              as a, b, c and the last token as the expected answer.

    Output:
    closest_words_list: for each analogy, the 10 nearest words from get_neighbors
    answer: for each analogy, its last token
    analogies: the tokenized lines
    '''
    # Read inside a context manager so the file handle is closed promptly.
    with open(filename) as word_list_file:
        analogies = [line.split() for line in word_list_file if line.strip() != ""]

    answer = [item[-1] for item in analogies]

    closest_words_list = []
    for analogy in tqdm(analogies):
        # look up embeddings for words a, b, c
        vec_a = model[analogy[0]]
        vec_b = model[analogy[1]]
        vec_c = model[analogy[2]]

        # estimated embedding for word d
        est_vec_d = vec_c + (vec_b - vec_a)

        # find top 10 words in vocab
        closest_words_list.append(get_neighbors(est_vec_d, words, 10))

    return closest_words_list, answer, analogies 

## Analyze performance for the 5 word lists

In [9]:
# For word_lists/list1.txt .. list5.txt: find the 10 closest words for every analogy
# and save the results to list<N>_similar10.data, so the slow pass over the vocabulary
# does not have to be repeated when evaluating.
for i in range(5):
    input_list = ('word_lists/list%d.txt' % (i+1))
    closest_found_list, correct_list, analogies_list = find_closest10_for_analogy_list(input_list)
    # Pack the three result lists into one object-dtype array; the evaluation cell
    # unpacks the loaded array back into these three items.
    list_results = np.array([closest_found_list, correct_list, analogies_list], dtype=object)
    with open(('list%d_similar10.data' % (i+1)), 'wb') as data_save:
        # store the data as binary data stream
        pickle.dump(list_results, data_save)
    print("Top similar words for file " + input_list + " saved!")

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Top similar words for file word_lists/list1.txt saved!


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Top similar words for file word_lists/list2.txt saved!


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Top similar words for file word_lists/list3.txt saved!


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Top similar words for file word_lists/list4.txt saved!


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


Top similar words for file word_lists/list5.txt saved!


In [109]:
def evaluate_accuracy(closest_found, correct, analogies, repeat_orig = True):
    '''
    Compute top-1 and top-3 accuracy of predicted analogy answers.

    Input:
    closest_found: for each analogy, its candidate words ordered best first
    correct: for each analogy, the expected answer word
    analogies: for each analogy, its tokens; the first three are the prompt
               words a, b, c. Only read when repeat_orig is False.
    repeat_orig: if True, candidates are scored as given; if False, candidates
                 equal to one of the prompt words a, b, c are dropped before
                 scoring.

    Output:
    (top1_acc, top3_acc): fraction of analogies whose answer is the first
    candidate / among the first three candidates. Returns (0.0, 0.0) when
    `correct` is empty. An analogy left with no candidates counts as a miss.
    '''
    total = len(correct)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0

    correct_cnt_top1 = 0
    correct_cnt_top3 = 0

    for i in range(total):
        candidates = list(closest_found[i])
        if not repeat_orig:
            prompt_words = analogies[i][:3]
            candidates = [el for el in candidates if el not in prompt_words]

        answer_word = correct[i]
        top3 = candidates[:3]
        # Guard on top3 so an empty candidate list is a miss rather than an IndexError.
        if top3 and top3[0] == answer_word:
            correct_cnt_top1 += 1
        if answer_word in top3:
            correct_cnt_top3 += 1

    top1_acc = correct_cnt_top1/total
    top3_acc = correct_cnt_top3/total

    return top1_acc, top3_acc

In [110]:
# Rather than repeating the slow pass over the vocabulary, reload the results
# pickled by the earlier cell and score each word list twice: once as-is and once
# after dropping candidates that repeat one of the prompt words a, b, c.
for i in range(5):
    test_file = 'list%d_similar10.data' % (i+1)
    print("Testing file : ", 'word_lists/list%d.txt' % (i+1))

    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(test_file, 'rb') as filehandle:
        # read the data as binary data stream
        closest_found, correct, analogies = np.array(pickle.load(filehandle))

    print("=== Without removing similar words found that are in words a,b,c ===")
    list_top1_acc, list_top3_acc = evaluate_accuracy(closest_found, correct, analogies)
    print("Top1 accuracy rate for word list %d:" % (i+1), list_top1_acc * 100, "%")
    print("Top3 accuracy rate for word list %d:" % (i+1), list_top3_acc * 100, "%")

    print("=== After removing similar words found that are in words a,b,c ===")
    list_top1_acc, list_top3_acc = evaluate_accuracy(closest_found, correct, analogies, repeat_orig = False)
    print("Top1 accuracy rate for word list %d:" % (i+1), list_top1_acc * 100, "%")
    print("Top3 accuracy rate for word list %d:" % (i+1), list_top3_acc * 100, "%")

    print("=" * 60)

Testing file :  word_lists/list1.txt
=== Without removing similar words found that are in words a,b,c ===
Top1 accuracy rate for word list 1: 40.0 %
Top3 accuracy rate for word list 1: 85.0 %
=== After removing similar words found that are in words a,b,c ===
Top1 accuracy rate for word list 1: 85.0 %
Top3 accuracy rate for word list 1: 90.0 %
Testing file :  word_lists/list2.txt
=== Without removing similar words found that are in words a,b,c ===
Top1 accuracy rate for word list 2: 35.0 %
Top3 accuracy rate for word list 2: 90.0 %
=== After removing similar words found that are in words a,b,c ===
Top1 accuracy rate for word list 2: 85.0 %
Top3 accuracy rate for word list 2: 90.0 %
Testing file :  word_lists/list3.txt
=== Without removing similar words found that are in words a,b,c ===
Top1 accuracy rate for word list 3: 0.0 %
Top3 accuracy rate for word list 3: 40.0 %
=== After removing similar words found that are in words a,b,c ===
Top1 accuracy rate for word list 3: 30.0 %
Top3 accu

### Small example of another type of analogy:

In [None]:
# Embeddings for a, b, c of the analogy "Tokyo : Japan :: Paris : ?"
vec_a = model["Tokyo"]
vec_b = model["Japan"]
vec_c = model["Paris"]

# Estimate the embedding of the missing word d by applying the a -> b offset to c
est_vec_d = vec_c + (vec_b - vec_a)

# Show the 3 vocabulary words closest to the estimate
top3_closest_words = get_neighbors(est_vec_d, words, 3)
print(top3_closest_words)