In [1]:
import os
os.chdir("../../glove/")

In [2]:
import numpy as np
def load_glove_vectors(glove_file):
    with open(glove_file, 'r', encoding="utf-8") as file:
        words = set()
        word_to_vec = {}
        for line in file:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec

In [12]:
words, word_to_vec = load_glove_vectors("glove.6B.100d.txt")

In [13]:
def find_cosine_similarity(u, v):
    cos_angle = 0.0
    dot = np.dot(u,v)
    norm_u = np.sqrt(np.sum(u**2))
    norm_v = np.sqrt(np.sum(v**2))
    cos_angle = dot/(norm_u)/norm_v
    return cos_angle

In [50]:
from heapq import heappush, heapify, heappop
class Node:
    def __init__(self, word, base):
        self.word = word
        self.distance = eucli_distance(word_to_vec[base], word_to_vec[word])
    def __lt__(self, other):
        return self.distance > other.distance
        
def eucli_distance(u,v):
    return np.linalg.norm(u-v)

def closest_words(base, num):
    closest = []
    for word in words:
        if word == base:
            continue
        if len(closest) < num:
            closest.append(Node(word, base))
            continue
        heapify(closest)
        heappop(closest)
        heappush(closest, Node(word,base))
    return [w.word for w in closest]

In [28]:
def find_analogy(a,b,c):
    A = word_to_vec[a]
    B = word_to_vec[b]
    C = word_to_vec[c]
    best_word = ""
    max_sim = float("-inf")
    for word in words:
        if word in [a,b,c]:
            continue
        curr_sim = find_cosine_similarity(A-B, C- word_to_vec[word])
        if curr_sim > max_sim:
            max_sim = curr_sim
            best_word = word
    return best_word

In [51]:
find_analogy("sun","day","moon")

'week'

In [52]:
king = closest_words("king",50)

In [53]:
queen = closest_words("queen",50)

In [54]:
len([x for x in king if x in queen])/50

0.22

In [55]:
eucli_distance(word_to_vec["king"],word_to_vec["queen"])

4.281252149113388

In [76]:
steve = closest_words("simon",50)

In [78]:
steve

['mayborn',
 'mann',
 'gould',
 'johnston',
 'murphy',
 'patterson',
 'steve',
 'wilson',
 'michael',
 'elliott',
 'gordon',
 'adam',
 'richard',
 'clarke',
 'andy',
 'arnold',
 'jonathan',
 'cooper',
 'murray',
 'henderson',
 'barker',
 'sullivan',
 'lloyd',
 'bennett',
 'jacobs',
 'matthews',
 'shaw',
 'martin',
 'ellis',
 'moore',
 'harris',
 'spencer',
 'evans',
 'baker',
 'lyons',
 'davies',
 'adams',
 'morrison',
 'barnes',
 'watson',
 'newman',
 'taylor',
 'neil',
 'clark',
 'andrew',
 'anderson',
 'stuart',
 'russell',
 'david',
 'slater']