### Import libraries and functions

In [92]:
import zipfile
from os import chdir, getcwd, listdir, mkdir, replace
from os.path import join

import numpy as np
import requests
import tensorflow as tf
from scipy.linalg import norm
from scipy.spatial.distance import cosine

### Download GloVe word embeddings from Stanford website

In [5]:
URL = 'http://nlp.stanford.edu/data/glove.6B.zip'

# Build explicit paths instead of chdir-ing into the folder: if the download
# fails half-way, the kernel's working directory is left untouched.
current_directory = getcwd()
glove_directory = join(current_directory, 'GloVe')
archive_path = join(glove_directory, 'glove.6B.zip')
partial_path = archive_path + '.part'

if 'GloVe' not in listdir():
    mkdir('GloVe')

# Skip the download when the archive is already present, so that
# "Restart & Run All" does not fetch the file again.
if 'glove.6B.zip' not in listdir(glove_directory):
    # Stream the body to disk in chunks rather than holding the whole archive
    # in memory via `r.content`.
    with requests.get(URL, stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors instead of saving an error page as a .zip.
        r.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    # Rename only after a complete download, so an interrupted run never
    # leaves a truncated 'glove.6B.zip' that the check above would accept.
    replace(partial_path, archive_path)

### Unzip embeddings

In [7]:
current_directory = getcwd()
glove_directory = join(current_directory, 'GloVe')

# Open the archive in a `with` block so the file handle is always closed, and
# extract straight into the GloVe folder instead of chdir-ing there and back
# (a failed extraction would otherwise leave the kernel in the wrong directory).
# The handle is deliberately not named `zip`, which would shadow the builtin
# for every later cell.
with zipfile.ZipFile(join(glove_directory, 'glove.6B.zip')) as archive:
    archive.extractall(glove_directory)

### Prepare a dictionary with embedding vectors

In [107]:
# Map each token to its embedding vector, read from the 300-dimensional GloVe file.
embedding_dict = {}
glove_path = join(getcwd(), 'GloVe', 'glove.6B.300d.txt')

with open(glove_path, encoding='UTF-8') as glove_file:
    for row in glove_file:
        # The first whitespace-separated field is the token; the rest of the
        # line holds the space-separated vector components.
        token, vector = row.split(maxsplit=1)
        components = vector.replace('\n', '').split(' ')
        embedding_dict[token] = np.array(components).astype('float')

In [108]:
# Number of tokens loaded into the dictionary.
len(embedding_dict)

400000

In [109]:
# Row i of `embedding_matrix` is the vector of `tokens[i]`.
tokens = list(embedding_dict)
embedding_matrix = np.array([embedding_dict[token] for token in tokens])

In [110]:
# Show the stored vectors for 'aircraft' and 'airplane' side by side.
tuple(embedding_dict[word] for word in ('aircraft', 'airplane'))

(array([ 3.2027e-01,  2.6571e-01, -2.8039e-01, -6.2729e-01, -6.0355e-01,
         8.6039e-02,  3.5997e-01,  9.0403e-01, -2.9259e-01, -2.3684e+00,
         5.6919e-01, -3.8653e-02,  4.2209e-01, -9.0914e-01, -3.6788e-02,
         1.7256e-01,  6.3583e-01,  2.6582e-01, -2.8997e-01, -5.9928e-01,
         1.3667e-01,  4.4419e-01,  5.4853e-01,  3.0715e-01, -1.3531e-01,
         8.4109e-02, -3.6342e-01, -1.4986e-01, -3.7831e-01,  1.9623e-01,
         2.5002e-01, -2.9345e-01,  7.4447e-02, -1.2212e-01,  1.7075e-01,
        -4.7846e-01, -4.9278e-01, -1.9748e-01,  4.4991e-02,  6.0070e-01,
        -5.0407e-01,  4.5494e-02,  3.2189e-02,  4.7133e-01,  8.7665e-02,
         6.3117e-02,  5.4136e-01,  5.0370e-02,  4.5647e-01,  4.3566e-02,
         3.3416e-01, -6.2111e-01,  3.0040e-02,  4.5272e-01, -1.1423e+00,
        -4.2581e-01,  1.9306e-01,  1.8070e-01,  1.1289e-01,  1.4679e-02,
        -6.7776e-01, -3.7808e-01,  2.0781e-01, -6.8875e-01,  1.4939e-01,
         3.7239e-01, -4.5716e-01,  1.8325e-01,  3.8

In [111]:
# scipy's `cosine` is a distance (1 - cosine similarity).
cosine(
    u=embedding_dict['aviation'],
    v=embedding_dict['aerospace'],
)

0.44656689393858884

In [112]:
e = embedding_dict
masculine = (e['boy'] + e['man'] + e['prince'] + e['father'] + e['male']) / 5
feminine = (e['girl'] + e['woman'] + e['princess'] + e['mother'] + e['female']) / 5

# Analogy query: paris - france + portugal.
# (Alternative query using the averages above: e['actor'] - masculine + feminine.)
vec = e['paris'] - e['france'] + e['portugal']

# Cosine distance from `vec` to every row at once: one matrix-vector product
# replaces a Python-level loop of one scipy call per vocabulary row.
# einsum computes the squared row norms without materialising a second
# matrix-sized temporary.
row_norms = np.sqrt(np.einsum('ij,ij->i', embedding_matrix, embedding_matrix))
distances = 1 - (embedding_matrix @ vec) / (row_norms * np.linalg.norm(vec))

# The result is no longer stored under the name `abs`, which shadowed the
# builtin for every later cell; the absolute value was a no-op anyway because
# cosine distance is non-negative.
index = distances.argmin()

tokens[index]

'lisbon'

In [119]:
# Analogy query: paris - france + turkey.
vec = e['paris'] - e['france'] + e['turkey']

# Cosine distance from `vec` to every row at once: one matrix-vector product
# replaces a Python-level loop of one scipy call per vocabulary row.
# einsum computes the squared row norms without materialising a second
# matrix-sized temporary.
row_norms = np.sqrt(np.einsum('ij,ij->i', embedding_matrix, embedding_matrix))
distances = 1 - (embedding_matrix @ vec) / (row_norms * np.linalg.norm(vec))

# The result is no longer stored under the name `abs`, which shadowed the
# builtin for every later cell; the absolute value was a no-op anyway because
# cosine distance is non-negative.
index = distances.argmin()

tokens[index]

'istanbul'

In [97]:
# scipy's `cosine` is a distance (1 - cosine similarity).
cosine(
    u=e['amazing'],
    v=e['horrendous'],
)

0.5716432756575993