In [8]:
import os
import json
import numpy as np
import pandas as pd
import pyLDAvis
from dotenv import load_dotenv
from gensim.corpora import Dictionary

In [7]:
# Environment setup (run once in a shell, not in this notebook):
#   conda install -c conda-forge pyldavis=2.1.2

In [2]:
# Load environment variables (python-dotenv's load_dotenv) before the next cell
# reads CORPUS and MODEL_PATH through os.getenv().
load_dotenv()

True

# Load Model Output

In [3]:
# Input locations come from the environment (see the load_dotenv() cell above).
corpus = os.getenv("CORPUS")
model_path = os.getenv("MODEL_PATH")
# os.listdir(None) silently lists the current directory, so fail loudly when unset.
if not corpus or not model_path:
    raise RuntimeError("CORPUS and MODEL_PATH must both be set (e.g. in the .env file)")

# Epoch i pairs the i-th dictionary file with the i-th model directory (both name-sorted).
dict_files = sorted([file for file in os.listdir(corpus) if ".dict" in file])
models_dir = sorted(os.listdir(model_path))
if len(dict_files) < len(models_dir):
    raise ValueError(
        f"found {len(dict_files)} dictionary files for {len(models_dir)} model directories"
    )

epochs = range(0, len(models_dir))
data = {}
for epoch in epochs:
    epoch_dir = os.path.join(model_path, models_dir[epoch])

    # load dictionary {word->id}; list the vocabulary in id order so that entry i
    # lines up with term_frequency[i], which is indexed by word id below
    # (assumes the word ids in the model files are these dictionary ids -- TODO confirm)
    dict_path = os.path.join(corpus, dict_files[epoch])
    token2id = Dictionary.load(dict_path).token2id
    vocabulary = sorted(token2id, key=token2id.get)

    # load topics: one row of integer word counts per topic, normalised by its row sum
    topics_path = os.path.join(epoch_dir, "mode-topics.dat")
    with open(topics_path, "r") as f:
        topics = np.array([[int(word) for word in line.split()] for line in f if line.strip()])
    n_topics, n_terms = topics.shape
    topics_dists = topics / topics.sum(axis=1, keepdims=True)

    # load word assignments: the first line is skipped (header) and each remaining
    # row starts with (doc_id, word_id, topic_id); any further fields are ignored
    word_assignments_path = os.path.join(epoch_dir, "mode-word-assignments.dat")
    with open(word_assignments_path, "r") as f:
        next(f, None)
        rows = [[int(field) for field in line.split()[:3]] for line in f if line.strip()]
    word_assignments = pd.DataFrame(
        np.array(rows, dtype=int).reshape(-1, 3), columns = ["doc_id", "word_id", "topic_id"]
    )

    # an assignment to a topic without a row in the topic-term matrix cannot be
    # represented; stop here rather than produce misaligned matrices
    if len(word_assignments) and word_assignments["topic_id"].max() >= n_topics:
        raise ValueError(
            f"epoch {epoch}: word assignments reference topic id "
            f"{word_assignments['topic_id'].max()} but only {n_topics} topics were loaded"
        )

    # number of assignments per word id; minlength keeps a zero entry for words
    # that were never assigned so positions stay aligned with the vocabulary
    term_frequency = np.bincount(word_assignments["word_id"].to_numpy(), minlength = n_terms)

    # for each document get the number of words that each topic has; reindex so
    # topics that received no assignments still get an (all-zero) column
    doc_topic_counts = pd.crosstab(word_assignments["doc_id"], word_assignments["topic_id"])
    doc_topic_counts = doc_topic_counts.reindex(columns = range(n_topics), fill_value = 0)

    # for each document get the number of words
    doc_length = doc_topic_counts.sum(axis = 1)

    # for each document get the topic proportions that draw the document (\pi_d)
    doc_topic_dists = doc_topic_counts.divide(doc_length, axis = 0).values

    # save data in a dict, keyed the way pyLDAvis.prepare() expects its keyword arguments
    data[epoch] = {"topic_term_dists": topics_dists, "doc_topic_dists": doc_topic_dists,
                   "term_frequency": term_frequency, "doc_lengths": doc_length.values, "vocab": vocabulary}

# Visualizations

In [None]:
# Build the pyLDAvis payload for every epoch and write it out as <epoch+1>.json.
for epoch in epochs:
    hdp_ldavis_data = pyLDAvis.prepare(mds = "tsne", **data[epoch]).to_dict()
    term_info = hdp_ldavis_data["tinfo"]
    # Swap -inf for a large finite negative number in both log columns
    # (presumably so the written file has no -Infinity tokens -- confirm with the consumer).
    for column in ("logprob", "loglift"):
        term_info[column] = [
            -1e16 if value == float("-inf") else value
            for value in term_info[column]
        ]
    output_path = f"../../vis/data/{epoch+1}.json"
    with open(output_path, "w") as f:
        json.dump(hdp_ldavis_data, f)

In [None]:
# ldavis_data for last slice: `data` is keyed by the integers in `epochs`, so the
# final element of that range selects the most recent epoch. The result is kept
# as returned by prepare() (no .to_dict()).
hdp_ldavis_data = pyLDAvis.prepare(mds="tsne", **data[epochs[-1]])

In [None]:
# Display the visualization held in hdp_ldavis_data. This is meant to be the
# object returned by pyLDAvis.prepare() in the previous cell, not the dict
# produced with .to_dict() in the export loop.
pyLDAvis.display(hdp_ldavis_data)