In [32]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

check_dir = '135813_01082019_checkpoints'

In [33]:
# Load saved data
model = t.load(f'{check_dir}/checkpoint_23.pth', map_location='cpu')
dataset = t.load(f'{check_dir}/dataset.pth')

In [34]:
def get_proportions(doc_weights):
    """
    Softmax document weights to get proportions
    """
    return F.softmax(doc_weights, dim=1)

def get_doc_vectors(doc_weights, topic_embeds):
    """
    Multiply by proportions by topic embeddings to get document vectors
    """
    proportions = get_proportions(doc_weights)
    doc_vecs = t.matmul(topic_embeds, t.t(proportions))

    return t.t(doc_vecs)

In [35]:
topic_embeds = model["model_state_dict"]["topic_embeds"]
word_embeds = model["model_state_dict"]["word_embeds.weight"]
doc_weights = model["model_state_dict"]["doc_weights.weight"]
vocab = list(dataset['term_freq_dict'].keys())
doc_embeds = get_doc_vectors(doc_weights, topic_embeds)

print(doc_embeds.size())

torch.Size([11314, 128])


In [36]:
def wordvec2idx(word_vec):
    return np.where(word_embeds.numpy() == word_vec.numpy())[0][0]

def vec2word(word_vec):
    idx = wordvec2idx(word_vec)
    return vocab[idx]

In [37]:
def get_n_closest_word_vecs(topic_vec, n=20):
    dist = F.cosine_similarity(word_embeds, topic_vec.unsqueeze(dim=1).transpose(0, 1))
    index_sorted = dist.argsort()
    return index_sorted[:n]

In [38]:

for i, topic in enumerate(topic_embeds.transpose(0, 1)):
    # Get 10 closest word_embeds
    # Print word_embeds words
    top_10 = get_n_closest_word_vecs(topic)
    
    print(f'\nTOPIC: {i}')
    for word_vec in top_10:
        print(vocab[word_vec])


TOPIC: 0
liberated
xbdbefihaiplh
hizbullah
myxgyxcsebcuacxdzcsyucgyxcxsmscucgcgshgysgyucgtz
jackwertzen
mulch
gwttttttxgaokzokxuna
benefit
decapods
brant
michaelzimmers
mfteagalfm
roqkskvmtbtctfznrhjbhj
mafxr
atrocities
superpro
cecilfielder
badpixmap
hitujrfz
columnlayout

TOPIC: 1
associatedpress
trnc
moalmsfzxwklruukby
compk
srlisrggkwwrlk
woundnt
sov
amwltnpkluknrsu
xe
cahill
mckrbzrmzrchzdhormbvzrbzrlkvhzvkv
convince
cbdfxxjtgcbn
aldine
georgeferguson
stephencybermantoz
dontchaknow
congresmen
xewqz
stategy

TOPIC: 2
baffled
justaminute
outstanding
slipcover
motorcyclists
lakatos
bobmccormack
sbv
nknkrlg
godeveryday
whatever
rdcable
buflength
markpundurs
inconvienent
listach
xxewaxaxtxw
sbhflvggquu
mycolor
fermin

TOPIC: 3
uofw
lqlaaa
acoupleofdays
severalmillion
gptqqrbfbfbf
thecaliforniagoldenseals
eti
orbitalsciencescorporation
rboc
acquisition
mdfplpleqfeznlj
onlyto
creativelabs
mvbrcop
worldseries
merrily
foci
printfs
rugermarkiibullbarrel
physicalmodelling

TOPIC: 4
mcykkqhk

## 