In [1]:
import numpy as np
import mpu
from numpy.linalg import norm

## 1. Load the vocabulary 

In [2]:
# Read the serialized vocabulary bundle from disk.
# NOTE(review): this is a pickle file, and unpickling can execute arbitrary
# code -- only load 'vocab.pickle' from a trusted source.
vocab_data = mpu.io.read('vocab.pickle')

# Lookup table stored under 'inverse_vocab_dict'
# (presumably word -> index; TODO confirm against the code that wrote it).
inverse_vocab_dict = vocab_data['inverse_vocab_dict']

# Keep the word list as a NumPy array so it can be indexed with integer
# index arrays (used later to turn word ids back into strings).
vocab_list = np.array(vocab_data['vocab_list'])

## 2. Load and normalize the embeddings

In [3]:
# Load the two embedding matrices to compare: one from the full model and one
# from the online model (presumably one row per vocabulary word -- TODO confirm).
full_embedding = np.load('word2vec_embedding.npy')
online_embedding = np.load('word2vec_embedding_onlinefull.npy')


def _l2_normalize_rows(embedding):
    """Return `embedding` with every row scaled to unit L2 norm.

    Rows whose norm is exactly zero are left as all-zero rows instead of being
    divided by zero; a plain division would turn them into NaN rows, and NaN
    values sort ahead of every real similarity once an ascending argsort is
    reversed.
    """
    row_norms = norm(embedding, axis=-1, keepdims=True)
    # Substitute 1 for zero norms so the division is a no-op on zero rows.
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    return embedding / safe_norms


norm_full_embedding = _l2_normalize_rows(full_embedding)
norm_online_embedding = _l2_normalize_rows(online_embedding)

## 3. Calculate the cosine similarity

In [4]:
# The rows of the normalized embeddings have unit length, so the matrix of
# pairwise dot products is the matrix of pairwise cosine similarities.
full_sim = norm_full_embedding @ norm_full_embedding.T
online_sim = norm_online_embedding @ norm_online_embedding.T

# Exclude each word's similarity with itself from any later ranking.
# Cosine similarity lies in [-1, 1], so 0 is not a safe mask value: a word
# whose similarities to all other words are negative would still rank itself
# first. -inf can never be selected by a max / top-k over a row.
np.fill_diagonal(full_sim, -np.inf)
np.fill_diagonal(online_sim, -np.inf)

## 4. Compare the context words found by the full model and the online model

In [8]:
# Pick the target words that have the most strongly linked context words.
# Only words with index >= base_vocab_size are candidates
# (presumably the words outside the base vocabulary -- TODO confirm).
base_vocab_size = 3800
num_top_target_words = 3
num_top_context_words = 10


def _top_context_words(sim, targets, k):
    """Return, for each row index in `targets`, the `k` column indices of `sim`
    with the highest values, ordered from highest to lowest."""
    ascending = np.argsort(sim[targets], axis=-1)
    return ascending[:, ::-1][:, :k]


# Rank the candidate words by their single highest similarity in the full
# model, then shift the positions back into whole-vocabulary indices.
best_similarity = np.max(full_sim[base_vocab_size:], axis=-1)
strongest_first = np.argsort(best_similarity)[::-1]
target_words = strongest_first[:num_top_target_words] + base_vocab_size

# Find the top context words for each target word under both models.
context_words_full = _top_context_words(full_sim, target_words, num_top_context_words)
context_words_online = _top_context_words(online_sim, target_words, num_top_context_words)

# Print the two models' context words one above the other for each target.
separator = '**********************************************************************************************'
full_template = 'Full   model target to context: {} --> {}'
online_template = 'Online model target to context: {} --> {}'

for idx, target_word in enumerate(target_words):
    target_text = vocab_list[target_word]
    full_text = ','.join(vocab_list[context_words_full[idx]])
    online_text = ','.join(vocab_list[context_words_online[idx]])
    print(separator)
    print(full_template.format(target_text, full_text))
    print(online_template.format(target_text, online_text))

**********************************************************************************************
Full   model target to context: tellus --> sigeia,disguised,thereon,replied,hic,simois,i,sceptres,[UNK],esteem
Online model target to context: tellus --> sigeia,disguised,degrees,tongueless,thus,trencher,speeches,remove,greece,ballad
**********************************************************************************************
Full   model target to context: sigeia --> tellus,hic,simois,disguised,steterat,replied,thereon,[UNK],musty,changes
Online model target to context: sigeia --> tellus,steterat,disguised,hic,leaden,wales,sovereignty,sailors,aedile,instant
**********************************************************************************************
Full   model target to context: stinking --> pour,pitch,down,beg,caps,weapons,their,seems,bound,tents
Online model target to context: stinking --> pour,weapons,threw,their,caps,hurl,your,shock,graces,throw
