In [275]:
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.corpora.bleicorpus import BleiCorpus
import pyLDAvis
from gensim.matutils import corpus2dense
from gensim.models.ldamodel import LdaModel

In [None]:
# Load the bag-of-words corpus and the gensim Dictionary that the rest of the
# notebook passes to the HDP post-processing and to LdaModel (as id2word).
# NOTE(review): the corpus file has a ".mm" extension but is read with the
# BleiCorpus reader -- confirm the file really is in Blei's LDA-C format.
corpus = BleiCorpus("../data/corpus.mm")
dictionary = Dictionary.load("../data/dictionary.dict")

In [263]:
# Topic/word count table written by the HDP run: one row of whitespace-separated
# integers per line. Blank lines (e.g. a trailing empty line) are skipped so
# that every row has the same length and np.array builds a proper 2-D array.
with open("hdp/hdp_results/mode-topics.dat", "r") as f:
    topics = np.array([[int(word) for word in line.split()] for line in f if line.strip()])

# d w z t, table with 4 columns, the last is not relevant, skip first row
with open("hdp/hdp_results/mode-word-assignments.dat", "r") as f:
    f.readline()  # skip first line (header)
    # Blank lines are ignored here as well, so each entry has all its columns.
    word_assig = [[int(elem) for elem in line.split()] for line in f if line.strip()]

In [393]:
def get_ldavis_data(corpus, dictionary, topics, word_assig):
    """
    Build the keyword arguments for ``pyLDAvis.prepare`` from HDP results.

    Inputs
        corpus: gensim.corpora.bleicorpus.BleiCorpus, bag of word representation of a collection
            of documents. Any iterable of documents works, each document being an iterable of
            (word_id, count) pairs with word ids in ``range(len(dictionary))``.
        dictionary: gensim.corpora.dictionary.Dictionary (any mapping word_id -> token works).
        topics: numpy.array, shape = (topics, vocabulary), frequency of each word in each topic.
        word_assig: list[list], [document_id, word_id, topic_id_assignment, irrelevant].
    Output
        ldavis_data: dict, input data for LDAvis, with keys "topic_term_dists",
            "doc_topic_dists", "doc_lengths", "vocab" and "term_frequency".
    Raises
        ValueError: if ``word_assig`` references a topic id outside ``range(topics.shape[0])``.
    """
    topics = np.asarray(topics)
    n_topics = topics.shape[0]

    # Vocabulary ordered explicitly by word id, so that position i matches
    # column i of `topics` regardless of the mapping's own iteration order.
    vocabulary = [dictionary[word_id] for word_id in sorted(dictionary.keys())]

    # Topic-term distributions: normalise every topic row to sum to 1.
    topics_term_dists = topics / topics.sum(axis=1, keepdims=True)

    # Document-topic distributions: count the word assignments per (document, topic).
    df_word_assig = pd.DataFrame(word_assig, columns=["d", "w", "z", "t"])
    if len(df_word_assig) and not df_word_assig["z"].between(0, n_topics - 1).all():
        raise ValueError("word_assig references topic ids outside range(topics.shape[0])")
    doc_topic = pd.crosstab(df_word_assig["d"], df_word_assig["z"])
    # Topics that never received an assignment are absent from the cross-tab;
    # add them back as zero columns so the result has one column per topic,
    # matching the number of rows of topic_term_dists.
    doc_topic = doc_topic.reindex(columns=range(n_topics), fill_value=0)
    doc_topic_counts = doc_topic.values
    doc_topic_dists = doc_topic_counts / doc_topic_counts.sum(axis=1, keepdims=True)

    # Length of each document and corpus-wide frequency of each word, streamed
    # from the corpus instead of materialising a dense vocabulary x documents matrix.
    doc_lengths = []
    term_frequency = np.zeros(len(vocabulary))
    for document in corpus:
        length = 0
        for word_id, count in document:
            term_frequency[word_id] += count
            length += count
        doc_lengths.append(length)
    doc_lengths = np.array(doc_lengths, dtype=float)

    # dictionary with the data for pyLDAvis
    ldavis_data = {"topic_term_dists": topics_term_dists, "doc_topic_dists": doc_topic_dists,
                   "doc_lengths": doc_lengths, "vocab": vocabulary, "term_frequency": term_frequency}
    return ldavis_data

In [332]:
# Assemble the pyLDAvis inputs from the HDP results, then let pyLDAvis prepare
# the visualisation data using the "tsne" multidimensional-scaling option.
ldavis_data = get_ldavis_data(
    corpus=corpus,
    dictionary=dictionary,
    topics=topics,
    word_assig=word_assig,
)
hdp_ldavis_data = pyLDAvis.prepare(
    topic_term_dists=ldavis_data["topic_term_dists"],
    doc_topic_dists=ldavis_data["doc_topic_dists"],
    doc_lengths=ldavis_data["doc_lengths"],
    vocab=ldavis_data["vocab"],
    term_frequency=ldavis_data["term_frequency"],
    mds="tsne",
)

In [333]:
# Render the prepared HDP visualisation inline in the notebook.
pyLDAvis.display(hdp_ldavis_data)

## LDA

In [387]:
# Train a 12-topic LDA model on the same corpus and dictionary used above.
# - The previous `callbacks=[l]` argument referenced a name `l` that is not
#   defined anywhere in this notebook, so the cell raised NameError on a fresh
#   kernel. Metric callbacks only log/visualise training metrics; pass a list
#   of gensim metric objects via `callbacks=` again if that logging is wanted.
# - random_state is fixed so that Restart & Run All reproduces the same model.
lda = LdaModel(corpus, id2word=dictionary, num_topics=12, random_state=0)

In [388]:
# log_perplexity returns a per-word likelihood bound; scale it by the total
# number of tokens in the corpus to get the bound for the whole corpus.
# The token count is computed directly from the corpus: the previous
# `dense_corpus` only exists as a local variable inside get_ldavis_data, so
# this cell failed with NameError on a fresh kernel.
n_tokens = sum(count for document in corpus for _, count in document)
likelihood = lda.log_perplexity(corpus) * n_tokens
likelihood

-4529626.448345808

In [407]:
# Apply the trained LDA model to the corpus. The next cell iterates the result
# document by document and reads each entry as a (topic_id, weight) pair.
doc_topic_dists = lda[corpus]

In [411]:
# One {topic_id: weight} mapping per document, built from the (topic_id, weight)
# pairs of each document's topic distribution.
data = [dict(doc_dist) for doc_dist in doc_topic_dists]
# Documents x topics table; topics missing from a document get weight 0.
df = pd.DataFrame(data).fillna(0)

In [404]:
# `import pyLDAvis` alone does not load the gensim adapter submodule, so it is
# imported explicitly here. The import is kept local to this cell (rather than
# in the top import cell) so that a missing adapter cannot break the HDP part
# of the notebook. pyLDAvis >= 3.3 renamed the module from `pyLDAvis.gensim`
# to `pyLDAvis.gensim_models`; try the new name first and fall back to the old.
try:
    import pyLDAvis.gensim_models as pyldavis_gensim
except ImportError:
    import pyLDAvis.gensim as pyldavis_gensim

# Build the visualisation data straight from the trained gensim LDA model.
lda_data = pyldavis_gensim.prepare(lda, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [287]:
# Render the prepared LDA visualisation inline in the notebook.
pyLDAvis.display(lda_data)