In [1]:
import pickle
import numpy as np
from sklearn.neighbors import NearestNeighbors

## Read in Embeddings and Word Dictionary

In [2]:
# Embedding parameter arrays saved by the training run.
mus = np.load('sixth-pass/mu.npy')
logsigmas = np.load('sixth-pass/sigma.npy')

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The `with` block guarantees the file handle is closed once the dictionary is read.
with open('sixth-pass/word2id.pkl', 'rb') as word2id_file:
    word_dictionary = pickle.load(word2id_file, encoding='latin1')

# Decode from bytes
word_dictionary = {key.decode('utf-8'): value for key, value in word_dictionary.items()}
# Inverse mapping: id -> word.
reversed_word_dictionary = {value: key for key, value in word_dictionary.items()}

In [3]:
# Report the array shapes and the vocabulary size, one item per line.
print(
    'Shape of mus: {}'.format(mus.shape),
    'Shape of logsigmas: {}'.format(logsigmas.shape),
    'Vocabulary size: {}'.format(len(word_dictionary)),
    sep='\n',
)

Shape of mus: (71291, 4, 50)
Shape of logsigmas: (71291, 4, 1)
Vocabulary size: 71291


In [4]:
# Average of the exponentiated log-sigma values over every entry of the array.
np.mean(np.exp(logsigmas))

0.049712293

In [5]:
# Sanity check: neither parameter array may contain a NaN entry.
assert not np.isnan(mus).any() and not np.isnan(logsigmas).any()

## Check similarity of words

In [6]:
def el_similarity(mu1, mu2, sigma1, sigma2, dim=50):
    """Log-density style similarity between two spherical Gaussians.

    Computes

        -0.5 * (dim * log(sigma1 + sigma2)
                + dim * log(2 * pi)
                + sum((mu1 - mu2) ** 2 / (sigma1 + sigma2)))

    i.e. the log of a `dim`-dimensional isotropic Gaussian density with
    per-dimension variance ``sigma1 + sigma2``, evaluated at ``mu1 - mu2``.

    Parameters
    ----------
    mu1, mu2 : array_like
        Mean vectors (or scalars); they must broadcast against each other.
    sigma1, sigma2 : float
        Scalar spreads; their sum is used as the variance in the formula.
    dim : int, optional
        Dimensionality used in the two normalisation terms. Defaults to 50,
        the value that was previously hard-coded.

    Returns
    -------
    float
        The similarity score; larger means more similar.
    """
    combined = sigma1 + sigma2
    # log(combined ** dim) is computed as dim * log(combined): raising a small
    # value to the 50th power can underflow to 0 (giving log(0) = -inf),
    # especially for float32 inputs, whereas the product form stays finite.
    log_det = dim * np.log(combined)
    norm_const = dim * np.log(2 * np.pi)
    sq_dist = np.sum((mu1 - mu2) ** 2 / combined)
    return -0.5 * (log_det + norm_const + sq_dist)

In [7]:
# Query word whose embeddings are inspected in the following cells.
word = 'bank'
# Look up the word's id; it is used below to index into mus / logsigmas.
# A word missing from the dictionary raises KeyError.
idx = word_dictionary[word]

In [8]:
# Pull out the parameters stored for the selected word.
mus_ = mus[idx]
# Exponentiate the stored log-values and collapse them to a 1-D array.
sigmas_ = np.exp(logsigmas[idx]).ravel()

In [9]:
# Similarity for every unordered pair (i, j), i < j, of this word's embedding rows.
# The row count is taken from mus_ rather than hard-coded, so the cell keeps
# working if the number of components per word changes.
[
    (i, j, el_similarity(mus_[i], mus_[j], sigmas_[i], sigmas_[j]))
    for j in range(len(mus_))
    for i in range(j)
]

[(0, 1, 7.060481599650757),
 (0, 2, 11.232229939534555),
 (1, 2, 8.329426207521657),
 (0, 3, 4.416653179179988),
 (1, 3, 4.852052224665741),
 (2, 3, 5.542474565920934)]

In [None]:
# Baseline: similarity scores between randomly drawn means, for comparison
# with the scores obtained from the learned embeddings above.
np.random.seed(0)  # fixed seed so the summary statistics below are reproducible
N = 1000
mu_rand1 = np.random.uniform(-1, 1, N)
mu_rand2 = np.random.uniform(-1, 1, N)
sigma_rand1 = 0.05
sigma_rand2 = 0.05

# NOTE(review): each mean here is a scalar, while el_similarity uses 50 dimensions
# in its normalisation terms - confirm this is the intended baseline.
# Evaluates all N * N (= 1,000,000) pairs.
x = [el_similarity(mu_rand1[i], mu_rand2[j], sigma_rand1, sigma_rand2) for i in range(N) for j in range(N)]

In [None]:
# Mean similarity over all random pairs.
np.asarray(x).mean()

In [None]:
# Largest similarity over all random pairs.
np.asarray(x).max()

In [None]:
# Smallest similarity over all random pairs.
np.asarray(x).min()

## kNN Analysis

In [77]:
# Fit a single nearest-neighbour index over every embedding row of every word.
# It's important that order='F'! With Fortran order the (V, S, D) array is
# flattened so that row r of the 2-D matrix holds component r // V of word r % V
# (V = vocabulary size). The lookup cell below relies on this layout when it
# maps a neighbour row back to a word with `% V`.
# The embedding width is read from the array instead of being hard-coded.
knn = NearestNeighbors(n_neighbors=10).fit(mus.reshape(-1, mus.shape[-1], order='F'))

# Alternative (not used): fit one NearestNeighbors index per component on
# mus[:, s, :] and keep the indexes in a list.

In [79]:
words = ['rock', 'bank', 'apple', 'star', 'cell', 'left']

# Loop-invariant: number of distinct words, used to fold a row index of the
# stacked kNN matrix back into a word id.
vocab_size = len(reversed_word_dictionary)

# NOTE: this loop rebinds `word` and `idx`, which were also set in an earlier cell.
for word in words:
    idx = word_dictionary[word]
    embedding = mus[idx]
    # One query per embedding row of the word (row count taken from the data).
    for i in range(len(embedding)):
        # Only the neighbour indices are needed, so skip returning distances.
        indices = knn.kneighbors(embedding[i].reshape(1, -1), return_distance=False)
        lst = [reversed_word_dictionary[j % vocab_size] for j in indices.flatten()]
        # First hit is printed as the row label, the remaining hits as its neighbours.
        print(lst[0] + '\t' + ' '.join(lst[1:]))

rock	oldies crimson indie dj rap mainstream hardcore disco videos
rock	extreme rocks oat rim is moose pulverized protected axillary
rock	deep rolling indie rock band from churning concert bands
rock	pop beatles blues concert clapton musical jazz funk music
bank	bank hay cairo egyptian thaler finance luxembourg banking fund
bank	jordan gulf disengagement embargo kuwaiti gaza palestinian between capital
bank	bank banking monetary account fund asset iban capital finance
bank	to pay employer supply aid cash remain been all
apple	tree pan stewed crab apples kipper sweet moxie manna
apple	apple desktop macintosh lotus marketed motorola microsoft intel product
apple	macintosh desktop microcomputer microsoft mac intel os pc desktop
apple	suit case overturned bail dm heflin judge nne legality
star	lyr fred jolly trailed willie catcher bo man closer
star	daily big airs crimson looking nightly from friday telecaster
star	series movie tv jones simpsons lucas trek futurama nausica
star	stars conste