### Import libraries and functions

In [92]:
import zipfile
from os import listdir, getcwd, chdir, mkdir, replace
from os.path import join

import numpy as np
import requests
import tensorflow as tf
from scipy.linalg import norm
from scipy.spatial.distance import cosine

### Download GloVe word embeddings from the Stanford NLP website

In [5]:
URL = 'http://nlp.stanford.edu/data/glove.6B.zip'

# Build every path from the notebook's directory instead of chdir()-ing into
# GloVe/ and back: if the download raised midway, the kernel used to stay
# inside GloVe/, and re-running the cell created a nested GloVe/GloVe folder.
current_directory = getcwd()
glove_directory = join(current_directory, 'GloVe')
archive_path = join(glove_directory, 'glove.6B.zip')
partial_path = archive_path + '.part'

if 'GloVe' not in listdir(current_directory):
    mkdir(glove_directory)

# Only fetch the archive when it is not already on disk, so that
# "Restart Kernel -> Run All" does not re-download it every time.
if 'glove.6B.zip' not in listdir(glove_directory):
    # stream=True keeps the archive out of memory; it is written chunk by chunk.
    with requests.get(URL, stream=True, timeout=60) as r:
        # Fail loudly on an HTTP error instead of saving an error page as a zip.
        r.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    # Rename only after a complete download, so an interrupted transfer is
    # never mistaken for a finished archive on the next run.
    replace(partial_path, archive_path)

### Unzip embeddings

In [7]:
current_directory = getcwd()
glove_directory = join(current_directory, 'GloVe')

# Extract next to the archive without changing the working directory, so a
# failed extraction cannot leave the kernel stranded inside GloVe/.
# The context manager closes the archive handle, and the name `archive`
# avoids shadowing the builtin `zip` for the rest of the notebook.
with zipfile.ZipFile(join(glove_directory, 'glove.6B.zip')) as archive:
    archive.extractall(glove_directory)

### Prepare a dictionary with embedding vectors

In [12]:
# Map each token to its embedding vector (a float NumPy array).
embedding_dict = {}

glove_path = join(getcwd(), 'GloVe', 'glove.6B.50d.txt')
with open(glove_path, encoding='UTF-8') as glove_file:
    for row in glove_file:
        # Each row is "<token> <v1> <v2> ...": split the token off once,
        # then parse the remaining space-separated components.
        word, values = row.split(maxsplit=1)
        components = values.replace('\n', '').split(' ')
        embedding_dict[word] = np.array(components).astype('float')

In [13]:
# Number of tokens loaded into the embedding dictionary.
len(embedding_dict)

400000

In [39]:
# Keep tokens and matrix rows in the same (dict insertion) order, so that
# row i of `embedding_matrix` is the vector of `tokens[i]`.
tokens = list(embedding_dict)
embedding_matrix = np.array([embedding_dict[token] for token in tokens])

In [16]:
# Display the raw embedding vectors of two closely related words.
tuple(embedding_dict[word] for word in ('aircraft', 'airplane'))

(array([ 1.7714   , -0.75714  ,  1.0217   , -0.26717  , -0.36311  ,
         0.29269  , -0.79656  , -0.49746  ,  0.41422  , -1.0602   ,
         1.2215   ,  0.41672  , -0.40249  ,  0.70013  , -1.0695   ,
        -0.19489  , -1.0886   ,  1.2409   , -2.1505   , -1.1609   ,
         0.10969  ,  0.1729   , -0.82806  , -0.97654  , -0.14616  ,
        -1.2641   , -0.13635  , -0.041624 ,  1.0939   ,  0.7116   ,
         2.474    , -0.16225  , -0.26348  ,  0.15532  ,  1.1995   ,
         0.0076471,  0.76388  , -0.071138 , -1.3869   ,  0.88787  ,
         0.36175  , -0.33419  ,  1.6512   , -0.52295  , -0.30657  ,
         0.17399  , -0.55383  ,  0.46204  , -0.59634  ,  0.41802  ]),
 array([ 1.2977   , -0.29922  ,  0.66154  , -0.20133  , -0.02502  ,
         0.28644  , -1.0811   , -0.13045  ,  0.64917  , -0.33634  ,
         0.53352  ,  0.32792  , -0.43206  ,  1.4613   ,  0.022957 ,
        -0.26019  , -1.1061   ,  1.077    , -0.99877  , -1.3468   ,
         0.39016  ,  0.43799  , -1.0403   , -0

In [29]:
# Cosine distance (1 - cosine similarity) between two related words.
aviation_vec = embedding_dict['aviation']
aerospace_vec = embedding_dict['aerospace']
cosine(aviation_vec, aerospace_vec)

0.24487051091762535

In [88]:
e = embedding_dict

# Average several gendered words per side so the "gender direction" does not
# hinge on any single word pair.
masculine_words = ('boy', 'man', 'prince', 'father', 'male')
feminine_words = ('girl', 'woman', 'princess', 'mother', 'female')
masculine = sum(e[word] for word in masculine_words) / len(masculine_words)
feminine = sum(e[word] for word in feminine_words) / len(feminine_words)

# Analogy query: actor - masculine + feminine ~= ?
vec = e['actor'] - masculine + feminine

# Cosine distance from `vec` to every row at once:
#     1 - (M @ v) / (||M_i|| * ||v||)
# One matrix-vector product replaces a Python loop that called scipy's
# `cosine` once per vocabulary row.
# Assumes no embedding row is all zeros (would divide by zero) -- TODO confirm.
row_norms = norm(embedding_matrix, axis=1)
distances = 1.0 - (embedding_matrix @ vec) / (row_norms * norm(vec))

# np.abs is kept from the original (it only affects tiny negative rounding
# errors) but is applied inline, so the builtin `abs` is no longer shadowed.
index = np.abs(distances).argmin()

tokens[index]

'actress'

array([-0.25953  , -0.15348  ,  0.15036  ,  0.27429  , -0.0071011,
        1.155    , -0.70017  ,  0.10858  , -0.43335  , -1.0701   ,
       -1.0207   , -0.78398  ,  0.63734  ,  0.44579  , -0.11649  ,
       -0.65589  , -0.61099  , -0.65956  ,  1.7138   ,  0.4675   ,
       -1.2396   ,  0.78073  ,  0.1146   ,  0.51444  ,  0.32173  ,
       -0.88167  ,  0.36809  ,  0.17611  , -1.1154   , -0.13632  ,
       -0.63477  , -0.36219  , -0.72977  , -0.43869  , -0.26495  ,
       -0.92195  , -1.1215   ,  0.11131  ,  0.41741  , -1.1479   ,
       -0.73792  ,  1.0466   ,  0.021259 , -0.85729  , -0.34326  ,
       -0.36334  , -0.3667   , -1.085    ,  0.33442  ,  1.0549   ])

In [68]:
# Cosine distance between 'man' and 'woman'.
man_vec, woman_vec = e['man'], e['woman']
cosine(man_vec, woman_vec)

0.1139662281504179

In [97]:
# Cosine distance between two words of opposite sentiment.
amazing_vec, horrendous_vec = e['amazing'], e['horrendous']
cosine(amazing_vec, horrendous_vec)

0.5716432756575993

-5.4593