In [1]:
import pickle
import numpy as np
from sklearn.neighbors import NearestNeighbors

## Read in Embeddings and Word Dictionary

In [2]:
# Arrays written by the 'fifth-pass' run. The file is called sigma.npy, but its
# contents are exponentiated before use further down, hence the name logsigmas.
mus = np.load('fifth-pass/mu.npy')
logsigmas = np.load('fifth-pass/sigma.npy')

# SECURITY NOTE: pickle.load can execute arbitrary code, so only load this file
# from a trusted source. The context manager guarantees the handle is closed;
# passing a bare open(...) to pickle.load left it open.
with open('fifth-pass/word2id.pkl', 'rb') as vocab_file:
    word_dictionary = pickle.load(vocab_file, encoding='latin1')

# Decode from bytes (keys that are already str are kept as they are)
word_dictionary = {
    (key.decode('utf-8') if isinstance(key, bytes) else key): value
    for key, value in word_dictionary.items()
}
# Inverse mapping: id -> word
reversed_word_dictionary = {value: key for key, value in word_dictionary.items()}

In [3]:
# Summarise the loaded arrays and the vocabulary in a single write.
print('\n'.join((
    'Shape of mus: {}'.format(mus.shape),
    'Shape of logsigmas: {}'.format(logsigmas.shape),
    'Vocabulary size: {}'.format(len(word_dictionary)),
)))

Shape of mus: (71291, 4, 50)
Shape of logsigmas: (71291, 4, 1)
Vocabulary size: 71291


In [4]:
# Sanity check: neither loaded array may contain a NaN
assert not np.isnan(mus).any() and not np.isnan(logsigmas).any()

## Check similarity of words

In [5]:
def el_similarity(mu1, mu2, sigma1, sigma2):
    """Log-density of a zero-mean spherical Gaussian evaluated at ``mu1 - mu2``.

    With ``s = sigma1 + sigma2`` and ``d`` the length of the mean vectors,
    the value returned is::

        -0.5 * (d * log(s) + d * log(2 * pi) + sum((mu1 - mu2) ** 2) / s)

    NOTE(review): presumably the "expected likelihood" similarity between two
    spherical Gaussian embeddings (inferred from the function name) -- confirm.

    Parameters
    ----------
    mu1, mu2 : numpy.ndarray
        1-D mean vectors of equal length.
    sigma1, sigma2 : float
        Scalar spread of each Gaussian; only their sum is used, in the role
        of a variance.

    Returns
    -------
    float
        The log-density described above (larger means more similar).
    """
    mu1 = np.asarray(mu1)
    mu2 = np.asarray(mu2)
    # Take the dimensionality from the data instead of hard-coding 50.
    dim = mu1.shape[-1] if mu1.ndim else 1
    total_sigma = sigma1 + sigma2
    # d * log(s) rather than log(s ** d): raising to the d-th power first
    # underflows to 0 (or overflows) for small (or large) s, giving +/-inf.
    log_det = dim * np.log(total_sigma)
    log_norm = dim * np.log(2 * np.pi)
    scaled_sq_dist = np.sum((mu1 - mu2) ** 2 / total_sigma)
    return -0.5 * (log_det + log_norm + scaled_sq_dist)

In [6]:
# Word whose per-component embeddings are inspected in the following cells
word = 'bank'
idx = word_dictionary[word]  # vocabulary id of `word`; used below to index mus / logsigmas

In [7]:
# Euclidean norm of each component mean of `word` (one value per row of mus[idx])
np.linalg.norm(mus[idx], axis=1)

array([0.8555219 , 0.8770162 , 0.85493964, 0.80638415], dtype=float32)

In [8]:
# Euclidean distance between every pair (i < j) of component means of `word`.
# The component count is read from the array, so no literal 4 has to be kept in sync.
[(i, j, np.linalg.norm(mus[idx][i] - mus[idx][j]))
 for j in range(mus.shape[1]) for i in range(j)]

[(0, 1, 1.0508848),
 (0, 2, 0.80992067),
 (1, 2, 0.94959265),
 (0, 3, 1.1729044),
 (1, 3, 0.9836421),
 (2, 3, 1.0825567)]

In [9]:
# Component means of `word`
mus_ = mus[idx]
# exp() undoes the log stored in logsigmas; flatten() makes the result 1-D so that
# sigmas_[i] is a scalar
sigmas_ = np.exp(logsigmas[idx]).flatten()

In [10]:
# el_similarity for every pair (i < j) of components of `word`.
# range(len(mus_)) replaces the hard-coded 4 so the cell follows the data.
[(i, j, el_similarity(mus_[i], mus_[j], sigmas_[i], sigmas_[j]))
 for j in range(len(mus_)) for i in range(j)]

[(0, 1, 5.13389699746493),
 (0, 2, 7.116908117614983),
 (1, 2, 6.18103301426391),
 (0, 3, 3.739083946438754),
 (1, 3, 5.894169712642487),
 (2, 3, 4.780821007637385)]

## kNN Analysis

In [11]:
# One 10-neighbour index per component (axis 1 of mus), each fitted on that
# component's means for the whole vocabulary.
knn = [NearestNeighbors(n_neighbors=10).fit(mus[:, component, :]) for component in range(4)]

# Keep the individual names bound to the same fitted objects.
knn0, knn1, knn2, knn3 = knn

In [12]:
words = ['rock', 'bank', 'apple', 'star', 'cell', 'left']

# NOTE: this loop rebinds the notebook-level names `word` and `idx`.
for word in words:
    idx = word_dictionary[word]
    embedding = mus[idx]
    # Pair each fitted index with the matching component mean rather than
    # indexing both with a hard-coded range(4).
    for component_knn, component_mu in zip(knn, embedding):
        # Only the neighbour ids are used, so do not ask for the distances.
        indices = component_knn.kneighbors(component_mu.reshape(1, -1), return_distance=False)
        lst = [reversed_word_dictionary[j] for j in indices.flatten()]
        # Nearest hit first (the query word itself in the recorded output), then the rest.
        print(lst[0] + '\t' + ' '.join(lst[1:]))

rock	sand and igneous pindus by shaped on sands waters
rock	vicious seed or groups rees metal ingres serving castings
rock	artists performers big vocals kid performances dance genre recordings
rock	sound album puppets music hall singing harrison davis jazz
bank	official monaco overseen tariffs abbreviated members each epa rt
bank	dismantlement continental peninsula gwangju wwi rebranded macau burma altos
bank	monetary finance telecommunications investment substantial purchases funds country investments
bank	cambodian landmines control s to banking jamaicans begun declares
apple	somewhat easily crossover credited qasr metered and remey older
apple	hfs unix custom computers hardware microcomputer rom xt platform
apple	anything fol your metasyntactic this almon above actual flash
apple	mac video microsoft ibm sgi windows macromedia feature license
star	lyr dm inside edgeworth conventions series pole spherical sideline
star	galaxy stars planets maser at galactic knock moon huygens
star	in 