Author: David Simmons

Description: Computing nearest-neighbour word similarities and word analogies (by cosine similarity) with the pre-trained GloVe vectors in glove.6B.300d.txt: https://nlp.stanford.edu/projects/glove/

In [1]:
import os
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Location and shape of the pre-trained embedding file. The file is expected
# to sit in the current working directory.
cwd = os.getcwd()

glove_path = os.path.join(cwd, 'glove.6B.300d.txt')
embedding_dim = 300
vocab_size = 400000

# words[i] is the token whose embedding is stored in vectors[i].
words = []
vectors = np.empty((vocab_size, embedding_dim), dtype=float)

with open(glove_path, 'r', encoding='UTF-8') as f:
    # Iterate over the file object directly so lines are parsed as they are
    # read, instead of materialising every line in a list first. `total`
    # keeps the progress bar informative without a known length.
    for i, line in enumerate(tqdm(f, total=vocab_size)):
        # Each row is: token followed by embedding_dim whitespace-separated floats.
        parsed_line = line.split()
        values = parsed_line[1:]
        if len(values) != embedding_dim:
            raise ValueError(
                'line {} of {} has {} values, expected {}'.format(
                    i + 1, glove_path, len(values), embedding_dim))
        words.append(parsed_line[0])
        vectors[i, :] = np.array(values, dtype=float)

  0%|          | 0/400000 [00:00<?, ?it/s]

In [3]:
# Euclidean (L2) norm of every embedding row, cached once so the cosine
# similarity queries below do not recompute it. `axis=1` reduces over the
# embedding dimension in a single vectorized call rather than a Python loop
# over all rows.
norms = np.linalg.norm(vectors, axis=1)

  0%|          | 0/400000 [00:00<?, ?it/s]

In [4]:
def most_similar(word, top_n=1):
    """Print the vocabulary words closest to ``word`` by cosine similarity.

    Reads the module-level ``words`` (list of tokens), ``vectors`` (one
    embedding per row, aligned with ``words``) and ``norms`` (the L2 norm of
    each row of ``vectors``).

    Parameters
    ----------
    word : str
        Query word; must be present in ``words``.
    top_n : int, optional
        How many neighbours to print, best match first. Defaults to 1, which
        prints the single nearest neighbour. Values larger than the number of
        other vocabulary entries are clamped.

    Returns
    -------
    None
        Each result is printed as ``<word> <similarity>`` with three decimals.

    Raises
    ------
    ValueError
        If ``word`` is not in ``words`` (raised by ``list.index``).
    """
    index = words.index(word)

    # One matrix-vector product scores the query against every row at once.
    cosine_similarities = vectors.dot(vectors[index]) / (norms[index] * norms)

    # Mask the query by position so it can never be reported as its own
    # neighbour, regardless of rounding in its self-similarity.
    cosine_similarities[index] = -np.inf

    top_n = min(top_n, len(cosine_similarities) - 1)
    if top_n <= 0:
        return

    # argpartition picks out the top_n candidates without sorting the whole
    # vocabulary; only those few candidates are then ordered best-first.
    candidates = np.argpartition(cosine_similarities, -top_n)[-top_n:]
    ranked = candidates[np.argsort(cosine_similarities[candidates])[::-1]]

    for i in ranked:
        print(words[i], '{value:.3f}'.format(value=cosine_similarities[i]))

In [5]:
# Nearest neighbour of a common concrete noun.
most_similar('dog')

dogs 0.789


In [6]:
# Nearest neighbour of a less frequent noun.
most_similar('whale')

whales 0.781


In [7]:
# Nearest neighbour of a temporal preposition.
most_similar('before')

after 0.826


In [8]:
# Nearest neighbour of a discourse connective.
most_similar('however')

although 0.907


In [9]:
# Nearest neighbour of a verb.
most_similar('fabricate')

fabricating 0.565


In [10]:
def analogies(word_A, word_a, word_B, top_n=3):
    """Print the best completions of the analogy ``word_A : word_a :: word_B : ?``.

    The query vector is ``vectors[word_a] - vectors[word_A] + vectors[word_B]``;
    candidates are ranked by cosine similarity to it. The three input words
    are never offered as answers. Reads the module-level ``words``,
    ``vectors`` and ``norms``.

    Parameters
    ----------
    word_A, word_a : str
        The example pair defining the relation (``word_A`` is to ``word_a``).
    word_B : str
        The word the relation is applied to.
    top_n : int, optional
        Number of answers to print, best first. Defaults to 3. Clamped to the
        number of vocabulary entries left after removing the input words.

    Returns
    -------
    None
        A header and one ``<answer> | <similarity>`` row per result are printed.

    Raises
    ------
    ValueError
        If any of the three words is not in ``words`` (raised by ``list.index``).
    """
    query_indices = [words.index(w) for w in (word_A, word_a, word_B)]
    index_A, index_a, index_B = query_indices

    vector = vectors[index_a] - vectors[index_A] + vectors[index_B]
    norm = np.linalg.norm(vector)

    # One matrix-vector product scores the query against every row at once.
    cosine_similarities = vectors.dot(vector) / (norm * norms)

    # Mask the inputs by position so exactly top_n genuine answers can be
    # requested, instead of over-fetching and filtering afterwards.
    cosine_similarities[query_indices] = -np.inf

    available = len(cosine_similarities) - len(set(query_indices))
    top_n = max(0, min(top_n, available))

    if top_n > 0:
        # Select the top_n candidates, then order only those best-first.
        candidates = np.argpartition(cosine_similarities, -top_n)[-top_n:]
        ranked = candidates[np.argsort(cosine_similarities[candidates])[::-1]]
    else:
        ranked = []

    print(word_A + ' : ' + word_a + ' :: ' + word_B + ' : ')
    print('---------------------------')
    print('Answer | Cosine Similarity')
    print('---------------------------')
    for i in ranked:
        print(words[i], '| {value:.3f}'.format(value=cosine_similarities[i]))


In [11]:
# Adult-to-young relation: dog is to puppy as cat is to ...?
analogies('dog', 'puppy', 'cat')

dog : puppy :: cat : 
---------------------------
Answer | Cosine Similarity
---------------------------
puppies | 0.566
kitten | 0.529
kittens | 0.517


In [12]:
# Verb-to-agent-noun relation: speak is to speaker as sing is to ...?
analogies('speak', 'speaker', 'sing')

speak : speaker :: sing : 
---------------------------
Answer | Cosine Similarity
---------------------------
sang | 0.477
chorus | 0.446
singing | 0.440


In [13]:
# Country-to-adjective relation: france is to french as england is to ...?
analogies('france', 'french', 'england')

france : french :: england : 
---------------------------
Answer | Cosine Similarity
---------------------------
english | 0.676
british | 0.519
cricket | 0.478


In [14]:
# Looser associative relation: france is to wine as england is to ...?
analogies('france', 'wine', 'england')

france : wine :: england : 
---------------------------
Answer | Cosine Similarity
---------------------------
wines | 0.513
tea | 0.477
beer | 0.429
