In [None]:
import numpy as np
import pandas as pd
from numpy.random import dirichlet, multinomial, normal, negative_binomial
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import logging
# Notebook-wide logger, used to report failures from the worker pool below.
log = logging.getLogger("slda")
# Attach the stream handler only once: re-running this cell would otherwise
# stack duplicate handlers and print every message several times.
if not log.handlers:
    log.addHandler(logging.StreamHandler())

In [None]:
# Seed the global NumPy generator so that Restart & Run All reproduces the same
# synthetic corpus (the samplers imported from numpy.random draw from it).
SEED = 42
np.random.seed(SEED)

D = 250  # documents
W = 1000  # words (dimension of each topic's word distribution)
Wk = 100  # keywords
K = 10  # topics
alpha = 1. / K  # Dirichlet concentration used when drawing phi (words per topic)
beta = 1. / K  # Dirichlet concentration used when drawing theta (topics per document)
eta = 100 / K  # multiplies the mean topic assignment to give the response mean
sigma2 = 0.01  # noise parameter of the response

Nd = [100] * D  # number of tokens in each document

In [None]:
# One word distribution per topic: K draws from a symmetric W-dimensional Dirichlet.
phi = dirichlet(np.full(W, alpha), size=K)
assert phi.shape == (K, W)

In [None]:
# Word distribution of the first planted topic.
fig, ax = plt.subplots()
ax.plot(phi[0])

ax.tick_params(labelsize=20)
ax.set_ylabel("$\\phi_K$", fontsize=25)
ax.set_xlabel("W", fontsize=25)



In [None]:
# One topic mixture per document: D draws from a symmetric K-dimensional Dirichlet.
theta = dirichlet(np.full(K, beta), size=D)
assert theta.shape == (D, K)

In [None]:
# Topic mixture of the first document.
fig, ax = plt.subplots()
ax.plot(theta[0])

ax.tick_params(labelsize=20)
ax.set_ylabel("$\\theta_d$", fontsize=25)
ax.set_xlabel("K", fontsize=25)

plt.show()

In [None]:
# Topic assignment of every token: one categorical draw from the document's
# mixture per token (the argmax of a single-trial multinomial is the drawn topic).
z = np.array([
    [multinomial(1, doc_mixture).argmax() for _ in range(n_tokens)]
    for doc_mixture, n_tokens in zip(theta, Nd)
])

In [None]:
# Per-document response: Gaussian centred on eta times the mean topic index.
z_bar = np.average(z, axis=1)
# `normal(loc, scale)` expects a standard deviation, while `sigma2` is named as
# a variance, so take the square root before sampling.
# NOTE(review): confirm sigma2 is indeed meant as a variance.
sigma = np.sqrt(sigma2)
Y = [normal(eta * z_bar_d, sigma) for z_bar_d in z_bar]
assert len(Y) == D

In [None]:
# Word id of every token: one categorical draw from the word distribution of
# the token's topic.
# NOTE: this rebinds `W`, which held the vocabulary size until here, to the
# document x token matrix; the cell that draws `phi` cannot be re-run afterwards
# without first re-running the configuration cell.
W = np.array([
    [multinomial(1, phi[topic]).argmax() for topic in doc_topics[:n_tokens]]
    for doc_topics, n_tokens in zip(z, Nd)
])

In [None]:
# Sanity check on the token matrix built above: one row per document, one column per token.
W.shape

In [None]:
# Bag-of-words table: one row per observed word id, one column per document.
doc_counts = []
for doc in range(D):
    # Distinct word ids in the document and how many times each one occurs.
    doc_dict, doc_ab = np.unique(W[doc], return_counts=True)
    doc_counts.append(pd.Series(index=doc_dict, data=doc_ab, name="doc_{}".format(doc)))

# Align all documents in a single outer concat instead of joining them one by
# one (which re-copies the growing frame on every iteration). Sorting the index
# keeps the rows ordered by word id, as the repeated outer join did.
if doc_counts:
    df = pd.concat(doc_counts, axis=1).sort_index()
else:
    df = pd.DataFrame()

# Words absent from a document get a count of zero.
df = df.fillna(0).astype(int)

In [None]:
#gamma = np.array([negative_binomial(y/(y-1), 1./y, size=Wk) for y in Y])
# Metadata signal per document: for now simply the response value itself.
gamma = np.array(list(Y))

In [None]:
# Metadata table: one row per distinct metadata value, one column per document.
meta_counts = []
for doc in range(D):
    # Distinct metadata values of the document and their multiplicities.
    doc_dict, doc_ab = np.unique(gamma[doc], return_counts=True)
    meta_counts.append(pd.Series(index=doc_dict, data=doc_ab, name="doc_{}".format(doc)))

# Align all documents in a single outer concat instead of joining them one by
# one (which re-copies the growing frame on every iteration). Sorting the index
# reproduces the row order of the repeated outer join.
if meta_counts:
    df_meta = pd.concat(meta_counts, axis=1).sort_index()
else:
    df_meta = pd.DataFrame()

# Values absent from a document get a count of zero.
df_meta = df_meta.fillna(0).astype(int)

In [None]:
# Rank-frequency curve of the synthetic corpus on log-log axes.
fig, ax = plt.subplots()

# Corpus-wide count of every word, most frequent first.
freq = df.sum(axis=1).sort_values(ascending=False).to_numpy()
n_ranks = len(freq)

ax.plot(freq / freq.sum())
# Reference line from (1, 0.1) to (n_ranks, 0.1 / n_ranks).
ax.plot([1, n_ranks], [.1, .1 / n_ranks])

ax.set_xscale("log")
ax.set_yscale("log")

ax.tick_params(labelsize=20)
ax.set_ylabel("$f_i$", fontsize=25)
ax.set_xlabel("rank", fontsize=25)

In [None]:
# Response of each document against its dominant planted topic
# (index of the largest entry of theta_d).
fig, ax = plt.subplots()

dominant_topic = np.argmax(theta, axis=1)
ax.scatter(Y, dominant_topic)

ax.tick_params(labelsize=20)
ax.set_ylabel("$\\theta_d$", fontsize=25)
ax.set_xlabel("$Y_d$", fontsize=25)

# Models
## LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import kl_div

In [None]:
# LDA model with as many components as there are planted topics (K).
lda = LatentDirichletAllocation(n_components=K)

In [None]:
# df is word x document, so transpose it to get one row per document before fitting.
# The result holds one row of topic weights per document.
topic_dist = lda.fit_transform(df.T.to_numpy())

In [None]:
fig, ax = plt.subplots()

# Per-document divergence between the fitted LDA mixture and the planted one.
# NOTE(review): topics are compared position by position; presumably the fitted
# topic order has no fixed correspondence with the planted one — confirm.
doc_kl = [np.sum(kl_div(fitted, planted)) for fitted, planted in zip(topic_dist, theta)]
ax.hist(doc_kl, density=True)

ax.tick_params(labelsize=20)
ax.set_ylabel("pdf", fontsize=25)
ax.set_xlabel("KL_divergence", fontsize=25)
plt.show()

## sLDA

In [None]:
# Export the corpus, one line per document:
#   <number of distinct words> <row position>:<count> <row position>:<count> ...
# Words are identified by their row position in df, not by their index label.
with open("train-data.txt", "w") as out:
    for doc in df.columns:
        entries = [
            f"{pos}:{cnt}"
            for pos, cnt in enumerate(df[doc].to_numpy())
            if cnt > 0
        ]
        out.write(" ".join([str(len(entries))] + entries) + "\n")

In [None]:
# One label per document, in document order: the response rounded to the nearest integer.
with open("train-labels.txt", "w") as out:
    out.writelines(f"{int(round(y, 0))}\n" for y in Y)

### Postprocess

In [None]:
import multiprocessing as mp

In [None]:
# Topic weights read from the sLDA output file final.gamma
# (presumably one row per document — confirm against the sLDA run).
slda_topic_dist = pd.read_csv("sldasyntetic/final.gamma", sep=" ", header=None)
n_slda_topics = slda_topic_dist.shape[1]
slda_topic_dist.columns = ["Topic %d" % (t + 1) for t in range(n_slda_topics)]
# Normalise every row so that the weights sum to one.
slda_topic_dist = slda_topic_dist.divide(slda_topic_dist.sum(axis=1), axis=0)
print(slda_topic_dist.shape)
slda_topic_dist.head(2)

In [None]:
# Word x topic table of zeros; it is filled further down with the number of
# times each word was assigned to each sLDA topic.
# The number of topic columns follows K instead of a hard-coded 10, and the
# zeros are created directly as integers rather than via an all-NaN object
# frame followed by fillna.
slda_word_dist = pd.DataFrame(
    0,
    index=df.index,
    columns=["Topic %d" % (t + 1) for t in range(K)],
)
print(slda_word_dist.shape)
slda_word_dist.head(2)

In [None]:
def assign_word(line, sample):
    """Turn the tokens of one line into a Series indexed like ``slda_word_dist``.

    Parameters
    ----------
    line : iterable of str
        Tokens of the form ``"<position>:<value>"``.
    sample : hashable
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Object-dtype Series over ``slda_word_dist.index``; the entry at row
        position ``<position>`` holds ``int(<value>)``, all other entries stay NaN.
    """
    word_index = slda_word_dist.index
    new_sample = pd.Series(name=sample, index=word_index, dtype=object)
    for idx, cnt in (token.split(":") for token in line):
        # The token addresses a word by row position; translate it to its label.
        new_sample.at[word_index[int(idx)]] = int(cnt)
    return new_sample
    
def assign_doc(sample):
    """Add ``sample`` as a new column of the global ``df_word_dist_temp``.

    The column is attached with an outer join on the index, and the global
    name is rebound to the joined frame.
    """
    global df_word_dist_temp
    df_word_dist_temp = df_word_dist_temp.join(sample, how="outer")

In [None]:
# Parse the sLDA word assignments in parallel: one task per line of the file,
# each producing one column of df_word_dist_temp (collected by assign_doc).
df_word_dist_temp = pd.DataFrame()
with open("sldasyntetic/word-assignments.dat") as file:
    lines = file.read().splitlines()

# Each line is paired with a document name. Pairing with
# slda_word_dist.columns (the topic names) made zip stop after as many lines as
# there are topics, silently dropping every other document.
# NOTE(review): assumes the file has one line per document, in the order of
# df.columns — confirm against the sLDA output.
samples = df.columns

pool = mp.Pool(6)
try:
    w = [
        pool.apply_async(
            assign_word,
            # The first field of each line is skipped; the rest are "position:value" tokens.
            args=(line.split(" ")[1:], sample),
            callback=assign_doc,
            # A failing line is logged instead of aborting the whole load.
            error_callback=lambda err: log.error(err),
        )
        for line, sample in zip(lines, samples)
    ]
    pool.close()
    pool.join()
finally:
    # Make sure no worker process outlives the cell if something above raised.
    pool.terminate()

# Callbacks fire in completion order; restore a deterministic row/column layout.
df_word_dist_temp = df_word_dist_temp.reindex(index=slda_word_dist.index, columns=samples)

In [None]:
# For every word (row), count how often each value occurs among its non-missing
# entries and store the count in the topic column selected by that value.
for word, row in df_word_dist_temp.iterrows():
    topics, counts = np.unique(row[~row.isna()], return_counts=True)
    for t, c in zip(topics, counts):
        slda_word_dist.at[word, slda_word_dist.columns[t]] = c

In [None]:
# Normalise every topic column to sum to one; a topic with no assigned word
# yields 0/0 = NaN, which is reset to zero.
topic_totals = slda_word_dist.sum(axis=0)
slda_word_dist = slda_word_dist.divide(topic_totals, axis=1).fillna(0)
slda_word_dist.head(2)

## hSBM

In [None]:
# Make the local hSBM_Topicmodel checkout importable before importing from it.
# NOTE(review): absolute, machine-specific path; the Benchmark section adds
# "../hSBM_Topicmodel/" instead — consider a single configurable location.
import sys
sys.path.append("/home/jovyan/work/phd/hSBM_Topicmodel/")
from sbmtm import sbmtm
import graph_tool.all as gt

In [None]:
# Reload the sbmtm module so that edits to it are picked up without restarting
# the kernel. After the last line the name `sbmtm` again refers to the object
# exported by the module rather than to the module itself.
import importlib, sbmtm
importlib.reload(sbmtm)
from sbmtm import sbmtm

In [None]:
# Fresh, unfitted hSBM topic model.
hsbm = sbmtm()

In [None]:
# Build the model's graph from the word x document count table.
hsbm.make_graph_from_BoW_df(df)

In [None]:
# Display the graph object held by the model.
hsbm.g

In [None]:
# Fit the hSBM.
# NOTE(review): n_init / B_min / B_max are defined by sbmtm.fit — presumably the
# number of restarts and bounds on the number of blocks; confirm in that module.
hsbm.fit(n_init=3, B_min=8, B_max=15, parallel=True, verbose=False)

In [None]:
# Draw the fitted hierarchy with a bipartite layout; the graph's "kind" vertex
# property is passed as vertex_kind.
gt.draw_hierarchy(hsbm.state,
                 layout="bipartite",
                 hedge_pen_width=8, 
                 hvertex_size=25, 
                 vertex_kind=hsbm.g.vertex_properties["kind"])

In [None]:
import os
import cloudpickle as pickle

# Export the fitted hSBM into ./syntetic. save_data / save_graph are called
# from inside that directory, so the working directory is switched temporarily.
os.makedirs("syntetic", exist_ok=True)  # portable replacement for `mkdir -p`
previous_dir = os.getcwd()
os.chdir("syntetic")
try:
    hsbm.save_data()
    hsbm.save_graph()
finally:
    # Always return to the notebook directory, even if saving fails; otherwise
    # every later relative path in the notebook would resolve inside ./syntetic.
    os.chdir(previous_dir)

# Keep a pickle of the whole model so the Benchmark section can reload it.
with open("sbmtm.pkl", "wb") as file:
    pickle.dump(hsbm, file)

## triSBM

In [None]:
# Make the local trisbm checkout importable before importing from it.
# NOTE(review): absolute, machine-specific path; the Benchmark section adds
# "../trisbm/" instead — consider a single configurable location.
import sys
sys.path.append("/home/jovyan/work/phd/trisbm/")
from trisbm import trisbm

In [None]:
# Fresh, unfitted triSBM model.
model = trisbm()

In [None]:
# Prefix metadata rows with "#" so they can be told apart from word rows (the
# graph-building call below checks for that marker). Labels that already carry
# the prefix are left untouched, which keeps this cell safe to re-run.
df_meta.index = [
    w if isinstance(w, str) and w.startswith("#") else "#{}".format(w)
    for w in df_meta.index
]

In [None]:
# Stack the word rows and the "#"-prefixed metadata rows into one table; the
# callable returns 2 for metadata rows and 1 for word rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
model.make_graph(pd.concat([df, df_meta]), lambda w: 2 if "#" in str(w) else 1)

In [None]:
# Fit the triSBM.
# NOTE(review): n_init / B_min / B_max are defined by trisbm.fit — presumably the
# number of restarts and bounds on the number of blocks; confirm in that module.
model.fit(n_init=3, B_min=12, B_max=20, parallel=True, verbose=False)

In [None]:
# Draw the fitted hierarchy; the graph's "kind" vertex property is passed as vertex_kind.
gt.draw_hierarchy(model.state,
                 hedge_pen_width=8, 
                 hvertex_size=25, 
                 vertex_kind=model.g.vertex_properties["kind"])

In [None]:
import os
import cloudpickle as pickle

# Export the fitted triSBM into ./syntetic_key. save_data / save_graph are
# called from inside that directory, so the working directory is switched
# temporarily.
os.makedirs("syntetic_key", exist_ok=True)  # portable replacement for `mkdir -p`
previous_dir = os.getcwd()
os.chdir("syntetic_key")
try:
    model.save_data()
    model.save_graph()
finally:
    # Always return to the notebook directory, even if saving fails; otherwise
    # every later relative path in the notebook would resolve inside ./syntetic_key.
    os.chdir(previous_dir)

# Keep a pickle of the whole model so the Benchmark section can reload it.
with open("trisbm.pkl", "wb") as file:
    pickle.dump(model, file)

# Benchmark

### Entropy

In [None]:
# Make the trisbm and hSBM_Topicmodel checkouts importable (paths are relative
# to the current working directory) so the pickled models can be loaded.
import sys
sys.path.append("../trisbm/")
sys.path.append("../hSBM_Topicmodel/")

In [None]:
import cloudpickle as pickle

# Restore the two fitted models from the pickles written earlier in this notebook.
# Unpickling executes code: only load files you produced yourself.
with open("sbmtm.pkl", "rb") as hsbm_file:
    hsbm = pickle.load(hsbm_file)

with open("trisbm.pkl", "rb") as trisbm_file:
    model = pickle.load(trisbm_file)

In [None]:
# Keep the first tab-separated field of the last line of the sLDA likelihood file.
with open("sldasyntetic/likelihood.dat") as fh:
    last_line = fh.readlines()[-1]
slda_likelihood = float(last_line.split("\t")[0])

In [None]:
# One bar per model: description length per edge for the two SBMs, log
# perplexity for LDA, and the log of the negated sLDA likelihood.
# NOTE(review): these quantities are normalised differently (per edge, per
# word, un-normalised) — confirm they are meant to share one axis.
fig = go.Figure()

fig.add_traces([
    go.Bar(x=[0], y=[hsbm.get_mdl()/hsbm.g.num_edges()], name="hsbm"),
    go.Bar(x=[1], y=[model.get_mdl()/model.g.num_edges()], name="trisbm"),
    go.Bar(x=[2], y=[np.log(lda.perplexity(df.transpose().values))], name="lda"),
    go.Bar(x=[3], y=[np.log(-slda_likelihood)], name="slda")
])

# The axis title size is set through title.font: the older `titlefont`
# attribute is deprecated and rejected by recent plotly releases.
fig.update_layout(
{
    "xaxis":{
        "tickfont":{
            "size":25
        },
        "tickmode": "array",
        "tickvals": [0,1,2,3],
        "ticktext": ["hsbm", "trisbm", "lda", "slda"],
    },
     "yaxis":{
        "title":{
            "text":"-Log(likelihood)",
            "font":{
                "size":25
            }
        },
        "tickfont":{
            "size":25
        }
    }
}
)
fig.show()
fig.write_image("Sigma_topics.pdf", engine="kaleido")

### Topic dist

In [None]:
# Level-0 document-topic table exported by the hSBM; the second CSV column is
# used as index and the "i_doc" column is discarded.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
hsbm_dist = pd.read_csv("syntetic/topsbm_level_0_topic-dist.csv", index_col=1).drop(columns="i_doc")

In [None]:
# Level-0 document-topic table exported by the triSBM; the second CSV column is
# used as index and the "i_doc" column is discarded.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
trisbm_dist = pd.read_csv("syntetic_key/trisbm_level_0_topic-dist.csv", index_col=1).drop(columns="i_doc")

In [None]:
import seaborn as sns

In [None]:
def padding(a, b, epsilon = 0):
    """Bring two 1-D arrays to a common length and add ``epsilon`` to both.

    The shorter array is extended with trailing zeros up to the length of the
    longer one; arrays of equal length are left as they are.

    Parameters
    ----------
    a, b : numpy.ndarray
        One-dimensional arrays, possibly of different lengths.
    epsilon : float, optional
        Constant added to every element of both results (e.g. to avoid exact
        zeros before computing a divergence). Defaults to 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a_padded + epsilon, b_padded + epsilon)``, both of length
        ``max(len(a), len(b))``.
    """
    size = max(len(a), len(b))

    def _extend(values):
        # Append zeros only when needed so same-length inputs keep their dtype.
        missing = size - len(values)
        if missing == 0:
            return values
        return np.concatenate((values, np.zeros(missing)))

    return _extend(a) + epsilon, _extend(b) + epsilon

In [None]:
# Per-document divergence between the planted mixture theta[d] and each model's
# inferred mixture. Vectors are zero-padded to a common length and shifted by
# 1e-10 so that no entry is exactly zero.
# NOTE(review): rows are matched by position; this assumes every table lists the
# documents in the same order as theta — confirm.
hsbm_rows = hsbm_dist.values
trisbm_rows = trisbm_dist.values
slda_rows = slda_topic_dist.values

real_hsbm_kl = [np.sum(kl_div(*padding(theta[d], hsbm_rows[d], 1e-10))) for d in range(D)]

real_tri_kl = [np.sum(kl_div(*padding(theta[d], trisbm_rows[d], 1e-10))) for d in range(D)]

real_lda_kl = [np.sum(kl_div(*padding(theta[d], topic_dist[d], 1e-10))) for d in range(D)]

real_slda_kl = [np.sum(kl_div(*padding(theta[d], slda_rows[d], 1e-10))) for d in range(D)]

In [None]:
fig = go.Figure()

def get_scatter(data, name):
    """Return a step-line trace of the density histogram of ``data``.

    Parameters
    ----------
    data : array-like
        Values to histogram (default numpy binning, normalised as a density).
    name : str
        Legend name of the trace.

    Returns
    -------
    plotly.graph_objects.Scatter
        Trace with the bin centres on x and the density on y.
    """
    hist = np.histogram(data, density=True)

    # hist[1] holds the bin edges; average neighbouring edges to get bin centres.
    return go.Scatter(x=(hist[1][1:]+hist[1][:-1])/2, 
                      y=hist[0], 
                      mode="lines", 
                      line={"shape":"hvh", "width":8}, 
                      name=name)

# One density curve per model.
fig.add_traces([
    get_scatter(data, name)

    for data, name in zip([real_hsbm_kl, real_tri_kl, real_lda_kl, real_slda_kl],
                         ["hsbm", "trisbm", "lda", "slda"])
])

# Axis title sizes are set through title.font: the older `titlefont` attribute
# is deprecated and rejected by recent plotly releases.
fig.update_layout(
{
    "xaxis":{
        "title":{
            "text":"KL (P(topic|sample),P(planted latent variable|sample))",
            "font":{
                "size":25
            }
        }
    },
     "yaxis":{
        "title":{
            "text":"documents",
            "font":{
                "size":25
            }
        }
    },
    "barmode":'overlay'
}
)
fig.update_traces(opacity=0.5)
fig.show()
fig.write_image("KL_topics.pdf", engine="kaleido")

### Word-dist

In [None]:
# Level-0 word-topic table exported by the hSBM; the first CSV column is the index.
hsbm_word_dist = pd.read_csv("syntetic/topsbm_level_0_word-dist.csv", index_col=0)

In [None]:
# Level-0 word-topic table exported by the triSBM; the first CSV column is the index.
trisbm_word_dist = pd.read_csv("syntetic_key/trisbm_level_0_word-dist.csv", index_col=0)

In [None]:
# Word x topic matrix of the fitted LDA, with every topic column scaled to sum to one.
lda_word_dist = lda.components_.T
lda_word_dist = lda_word_dist / lda_word_dist.sum(axis=0)

In [None]:
# Divergence between every planted word distribution (rows of phi) and every
# inferred topic. Each result is a nested list with one row per inferred topic
# and one entry per planted topic.
# NOTE(review): `padding` appends zeros at the end of the shorter vector. The
# inferred tables only contain words that occur in the corpus, so if any
# vocabulary word was never sampled the two vectors are not aligned word by
# word — confirm whether a reindex on the full vocabulary is needed.
hsbm_topics = hsbm_word_dist.values.T
trisbm_topics = trisbm_word_dist.values.T
lda_topics = lda_word_dist.T
slda_topics = slda_word_dist.values.T

real_hsbm_kl_words = [
    [np.sum(kl_div(*padding(planted, inferred, 1e-10))) for planted in phi[:K]]
    for inferred in hsbm_topics
]

real_tri_kl_words = [
    [np.sum(kl_div(*padding(planted, inferred, 1e-10))) for planted in phi[:K]]
    for inferred in trisbm_topics
]

real_lda_kl_words = [
    [np.sum(kl_div(*padding(planted, inferred, 1e-10))) for planted in phi[:K]]
    for inferred in lda_topics
]

real_slda_kl_words = [
    [np.sum(kl_div(*padding(planted, inferred, 1e-10))) for planted in phi[:K]]
    for inferred in slda_topics
]

In [None]:
# LDA: KL matrix (inferred topic x planted topic) with every row sorted in
# ascending order, so a column no longer corresponds to a fixed planted topic.
sns.heatmap(np.sort(real_lda_kl_words, axis=1))

In [None]:
# triSBM: KL matrix (inferred topic x planted topic) with every row sorted in
# ascending order, so a column no longer corresponds to a fixed planted topic.
sns.heatmap(np.sort(real_tri_kl_words, axis=1))

In [None]:
# Overlaid histograms of all (inferred topic, planted topic) divergences, one per model.
fig = go.Figure()

fig.add_traces([
    go.Histogram(x=np.ravel(real_hsbm_kl_words), name="hsbm"),
    go.Histogram(x=np.ravel(real_tri_kl_words), name="trisbm"),
    go.Histogram(x=np.ravel(real_lda_kl_words), name="lda"),
    go.Histogram(x=np.ravel(real_slda_kl_words), name="slda")
])

# Axis title sizes are set through title.font: the older `titlefont` attribute
# is deprecated and rejected by recent plotly releases. The y-axis label also
# had a typo ("pait" instead of "pair").
fig.update_layout(
{
    "xaxis":{
        "title":{
            "text":"KL (P(word|topic),P(word|planted variable))",
            "font":{
                "size":25
            }
        }
    },
     "yaxis":{
        "title":{
            "text":"pair topic-latent variable",
            "font":{
                "size":25
            }
        }
    },
    "barmode":'overlay'
}
)
fig.update_traces(opacity=0.3)
fig.show()
fig.write_image("KL_words.pdf", engine="kaleido")