In [1]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

check_dir = '115239_17082019_checkpoints'

In [2]:
# Load saved data
model = t.load(f'{check_dir}/checkpoint_60.pth', map_location='cpu')
dataset = t.load(f'saved_datasets/20_news_groups_dataset/metadata.pth')

In [3]:
print(model['epoch'])

60


In [4]:
def get_proportions(doc_weights):
    """
    Softmax document weights to get proportions
    """
    return F.softmax(doc_weights, dim=1).unsqueeze(dim=2)

def get_doc_vectors(doc_weights, topic_embeds):
    """
    Multiply by proportions by topic embeddings to get document vectors
    """
    proportions = get_proportions(doc_weights)
    doc_vecs = (proportions * topic_embeds.unsqueeze(0)).sum(dim=1)

    return doc_vecs

In [5]:
topic_embeds = model["model_state_dict"]["topic_embeds"]
word_embeds = model["model_state_dict"]["word_embeds.weight"]
doc_weights = model["model_state_dict"]["doc_weights.weight"]

vocab = list(dataset['term_freq_dict'].keys())
term_freq = list(dataset['term_freq_dict'].values())
doc_lens = dataset['doc_lengths']

doc_embeds = get_doc_vectors(doc_weights, topic_embeds)

print(doc_embeds.size())

torch.Size([17652, 300])


In [6]:
def wordvec2idx(word_vec):
    return np.where(word_embeds.numpy() == word_vec.numpy())[0][0]

def vec2word(word_vec):
    idx = wordvec2idx(word_vec)
    return vocab[idx]
  

In [7]:
def get_n_closest_word_vecs(topic_vec, n=10):
    dist = F.cosine_similarity(word_embeds, topic_vec.unsqueeze(dim=1).transpose(0, 1))
    index_sorted = dist.argsort()
    return index_sorted[:n]

In [8]:
for i, topic in enumerate(topic_embeds):
    # Get 10 closest word_embeds
    top_10 = get_n_closest_word_vecs(topic)
    print(f'TOPIC {i}: {" ".join([vocab[vec] for vec in top_10])}')
    

TOPIC 0: duct fatal equivalent TEMPEST obedience performance page Guard Anybody heater
TOPIC 1: underestimate Silicon parallel Land salvation assert influence performance panic config
TOPIC 2: Panama Anybody cellular equivalent session pertinent formatting ceremonial electric Analyst
TOPIC 3: duct ver Willis moral Old obo print assert prospect Baerga
TOPIC 4: Silicon ver law parallel gate ram sexual config invoke wheel
TOPIC 5: Baerga ver les genesis withhold allow russian Block turkish room
TOPIC 6: Bay regional Baerga sure remark Pens person pad program arc
TOPIC 7: remark secret inconvenient well sure invoke russian presumably Pens performance
TOPIC 8: remark lack heater russian professional Azerbaijan turkish formatting Davidian end
TOPIC 9: ver Watt Baerga fun regional allow Azerbaijan confused formatting datum
TOPIC 10: muffler Judge launcher equivalent LIFE hill sort een cheat rape
TOPIC 11: Jerome ver pad Mountain wit formatting Block tcp heater Johansson
TOPIC 12: heater progr

In [9]:
def _softmax(x):
    e_x = np.exp(x - np.max(x))
    out = e_x / e_x.sum()
    
    return out


def _softmax_2d(x):
    y = x - x.max(axis=1, keepdims=True)
    np.exp(y, out=y)
    y /= y.sum(axis=1, keepdims=True)
    return y


def prob_words(context, vocab, temperature=1.0):
    """ This calculates a softmax over the vocabulary as a function
    of the dot product of context and word.
    """
    dot = np.dot(vocab, context)
    prob = _softmax(dot / temperature)
    
    return prob


def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
                   doc_lengths=None, term_frequency=None, normalize=False):
    """ Collects a dictionary of word, document and topic distributions.
    https://github.com/cemoody/lda2vec/blob/b7f4642b750c6e792c07d177bd57ad36e65bb35c/lda2vec/topics.py
    Arguments
    ---------
    weights : float array
        This must be an array of unnormalized log-odds of document-to-topic
        weights. Shape should be [n_documents, n_topics]
    factors : float array
        Should be an array of topic vectors. These topic vectors live in the
        same space as word vectors and will be used to find the most similar
        words to each topic. Shape should be [n_topics, n_dim].
    word_vectors : float array
        This must be a matrix of word vectors. Should be of shape
        [n_words, n_dim]
    vocab : list of str
        These must be the strings for words corresponding to
        indices [0, n_words]
    temperature : float
        Used to calculate the log probability of a word. Higher
        temperatures make more rare words more likely.
    doc_lengths : int array
        An array indicating the number of words in the nth document.
        Must be of shape [n_documents]. Required by pyLDAvis.
    term_frequency : int array
        An array indicating the overall number of times each token appears
        in the corpus. Must be of shape [n_words]. Required by pyLDAvis.
    Returns
    -------
    data : dict
        This dictionary is readily consumed by pyLDAVis for topic
        visualization.
    """
    # Map each factor vector to a word
    topic_to_word = []
    msg = "Vocabulary size did not match size of word vectors"
    assert len(vocab) == word_vectors.shape[0], msg
    if normalize:
        word_vectors /= np.linalg.norm(word_vectors, axis=1)[:, None]
    # factors = factors / np.linalg.norm(factors, axis=1)[:, None]
    for factor_vector in factors:
        factor_to_word = prob_words(factor_vector, word_vectors,
                                    temperature=temperature)
        topic_to_word.append(np.ravel(factor_to_word))
    topic_to_word = np.array(topic_to_word)
    msg = "Not all rows in topic_to_word sum to 1"
    assert np.allclose(np.sum(topic_to_word, axis=1), 1), msg
    # Collect document-to-topic distributions, e.g. theta
    doc_to_topic = _softmax_2d(weights)
    msg = "Not all rows in doc_to_topic sum to 1"
    assert np.allclose(np.sum(doc_to_topic, axis=1), 1), msg
    data = {'topic_term_dists': topic_to_word,
            'doc_topic_dists': doc_to_topic,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_frequency}
    return data

In [10]:
print(get_proportions(doc_weights).size())
print(t.transpose(topic_embeds, 0, 1).size())
print(word_embeds.size())
print(len(vocab))
print(len(doc_lens))
print(np.max(term_freq))

torch.Size([17652, 20, 1])
torch.Size([300, 20])
torch.Size([9006, 300])
9653
17577
16679


In [11]:
checkpoint_pyldavis = prepare_topics(
    doc_weights.numpy(),
    topic_embeds.numpy(),
    word_embeds.numpy(),
    np.array(vocab),
    doc_lengths=np.array(doc_lens),
    term_frequency=np.array(term_freq)
)

AssertionError: Vocabulary size did not match size of word vectors

In [None]:
import pyLDAvis
vis_data = pyLDAvis.prepare(**checkpoint_pyldavis)

In [None]:
pyLDAvis.display(vis_data)