# Graphical Models for Textual Data
This notebook shows how graphical models can be used to infer relationships among the terms and topics of a text corpus.

In [None]:
import pandas as pd
from regain.utils import flatten
import numpy as np

# Tab-separated training file: the first field becomes the index and the
# single remaining field holds the document text.
filename = "../regain/data/text/webkb-train-stemmed.txt"
raw = pd.read_csv(filename, sep='\t', header=None, index_col=0)
train = raw.dropna()  # discard rows with missing text
train.columns = ['words']

In [None]:
# Keep only the rows whose text really is a string; everything below iterates
# over this one filtered sequence so vocabulary, counts and index stay aligned.
text_rows = [row for row in train.itertuples() if isinstance(row.words, str)]

# Sorted array of the distinct tokens found in the corpus.
words = np.unique(flatten([row.words.split(' ') for row in text_rows]))

# One {token: count} mapping per document.
ld = [
    dict(zip(*np.unique(row.words.split(' '), return_counts=True)))
    for row in text_rows
]

# Document-term count table; a token absent from a document gets a count of 0.
X = pd.DataFrame(ld, index=[row.Index for row in text_rows]).fillna(0)

# The row index (first field of the input file) is used as the target.
y = X.index

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [None]:
# Document strings fed to the vectorizers; any missing entry becomes "".
documents = train['words'].fillna("")

In [None]:
# LDA is a probabilistic model over word counts, so it is given raw term
# counts rather than tf-idf weights.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(documents)

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name only on older releases.
_feature_names = (getattr(tf_vectorizer, "get_feature_names_out", None)
                  or tf_vectorizer.get_feature_names)

# Dense term-count table: one row per document, one column per retained term.
# `toarray()` yields a plain ndarray instead of the discouraged np.matrix.
df_tf = pd.DataFrame(tf.toarray(), index=train.index, columns=_feature_names())

In [None]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the top documents of every topic and return one label per topic.

    Parameters
    ----------
    H : 2-D array
        One row per topic, one weight per feature.
    W : 2-D array
        One row per document, one column per topic.
    feature_names : sequence of str
        Feature names, indexable by the column positions of ``H``.
    documents : sequence or pandas.Series
        Documents, in the same order as the rows of ``W``.
    no_top_words : int
        Number of highest-weighted features joined into each topic label.
    no_top_documents : int
        Number of highest-weighted documents printed for each topic.

    Returns
    -------
    list of str
        For each topic, its top feature names (highest weight first) joined
        by single spaces.
    """
    # argsort produces *positions*. A pandas Series indexed by labels would
    # resolve ``documents[pos]`` as a label lookup (KeyError or wrong row), so
    # go through ``.iloc`` whenever the container offers it.
    by_position = getattr(documents, "iloc", documents)

    topics = []
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        # Reverse slice: indices of the `no_top_words` largest weights, descending.
        top_terms = topic.argsort()[:-no_top_words - 1:-1]
        topics.append(" ".join([feature_names[i] for i in top_terms]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(by_position[doc_index])
    return topics
# # NMF is able to use tf-idf
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
# tfidf = tfidf_vectorizer.fit_transform(documents)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# # Run NMF
# nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# nmf_W = nmf_model.transform(tfidf)
# nmf_H = nmf_model.components_

# print("NMF Topics")
# display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)

In [None]:
# Settings for the topic-model experiments in this notebook.
n_top_documents = 3  # documents shown per topic
n_top_words = 3      # terms shown per topic
n_topics = 50        # number of topics

In [None]:
# # Run LDA
# lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online',
#                                       learning_offset=50.,random_state=0).fit(tf)
# lda_W = lda_model.transform(tf)
# lda_H = lda_model.components_

# print("LDA Topics")
# topics = display_topics(lda_H, lda_W, tf_vectorizer.get_feature_names(), documents, n_top_words, n_top_documents)

In [None]:
# df = pd.DataFrame(lda_W, index=train.index, columns=topics)
# X = lda_W
# y = documents.index

In [None]:
# X = pd.DataFrame([], columns=words)
# y = []
# for row in train.itertuples():
#     if isinstance(row.words, str):
#         series = pd.Series(dict(zip(*np.unique(row.words.split(' '), return_counts=True))), name=row.Index)
#         X = X.append(series)
#         y.append(row.Index)

# X = X.fillna(0)
# X.index = y

# y = np.asarray(y)

# A term is discarded as soon as there is one class (distinct value of y) in
# which it never occurs.
# The zero-count mask is aligned with X's columns, so it has to select from
# `X.columns`: pandas orders the columns of a frame built from a list of dicts
# by insertion, which need not coincide with the sorted `words` array.
words_to_discard = []
for yy in np.unique(y):
    absent_in_class = (X[y == yy].sum(axis=0) == 0).values
    words_to_discard += list(X.columns[absent_in_class])

# Keep X's own column order so the selection is reproducible between runs
# (the iteration order of a set of strings is not).
discarded = set(words_to_discard)
words_to_keep = [word for word in X.columns if word not in discarded]

def logentropy_normalize(X):
    """Apply log-entropy weighting to the columns of a count table.

    Parameters
    ----------
    X : pandas.DataFrame
        Non-negative counts; sums and entropies are taken down each column
        (presumably rows are documents and columns are terms).

    Returns
    -------
    pandas.DataFrame
        ``log(1 + X)`` with every column scaled by
        ``1 + sum(p * log(p)) / log(n_rows + 1)``, where ``p`` is the column
        divided by its total.
    """
    n_rows = X.shape[0]
    column_totals = X.values.sum(axis=0, keepdims=True)
    P = X / column_totals
    # 0 * log(0) (and 0 / 0 for all-zero columns) yields NaN; count it as 0.
    p_log_p = (P * np.log(P)).fillna(0)
    entropy_weight = 1 + p_log_p.values.sum(axis=0, keepdims=True) / np.log(n_rows + 1)
    local_weight = np.log(1 + X)
    return entropy_weight * local_weight

# Weight every term, then restrict the table to the retained vocabulary.
X_new = logentropy_normalize(X).loc[:, words_to_keep]

In [None]:
from gensim.models import LogEntropyModel
from gensim.test.utils import common_texts
from gensim.corpora import Dictionary

# Tokenised documents: one list of tokens per string entry of the training set.
corp = [doc.split(' ') for doc in train.words.tolist() if isinstance(doc, str)]
text = corp  # swap in gensim's `common_texts` here for a quick experiment
dct = Dictionary(text)  # fit dictionary

# Cap the vocabulary size.
num_terms = 50  # or words.size
dct.filter_extremes(keep_n=num_terms)

# Bag-of-words vectors for the first 10 documents only.
# NOTE(review): the 10-document cut looks like a debugging leftover — confirm.
corpus = [dct.doc2bow(tokens) for tokens in text[:10]]
model = LogEntropyModel(corpus, normalize=True)  # fit model

In [None]:
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
try:
    from importlib import reload
except ImportError:  # Python 2: keep using the builtin
    pass

import ctmmodel
reload(ctmmodel)  # pick up edits to the module without restarting the kernel

# Build a CtmModel with 15 topics over the bag-of-words corpus.
ctm_model = ctmmodel.CtmModel(corpus, id2word=dct, num_topics=15)

In [None]:
# Turn every bag-of-words vector back into a space-separated string in which
# each token is repeated as many times as its count.
all_words = [
    ' '.join(
        dct[token_id]
        for token_id, count in bow
        for _ in range(count)
    )
    for bow in corpus
]

In [None]:
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
try:
    from importlib import reload
except ImportError:  # Python 2: keep using the builtin
    pass

import pyctm
reload(pyctm)
from pyctm import variational_bayes, inferencer
reload(variational_bayes)
reload(inferencer)

# parameter set 3
alpha_mu = 0.
alpha_sigma = 1
alpha_beta = 0

# Set up the variational inferencer on the reconstructed documents, using the
# dictionary's tokens as vocabulary and 15 topics.
ctm_inferencer = variational_bayes.VariationalBayes()
ctm_inferencer._initialize(all_words, list(dct.values()), number_of_topics=15,
                           alpha_mu=alpha_mu, alpha_sigma=alpha_sigma, alpha_beta=alpha_beta)

# Run 50 learning passes.
for iteration in range(50):
    ctm_inferencer.learning(-1)

In [None]:
# Run inference on the same documents the inferencer was initialised with;
# the call returns three values.
logl, lamda, nu = ctm_inferencer.inference(all_words)

# NOTE(review): `utils` is bound by `from regain import utils` and by
# `from pyctm import utils` in later cells, so which module provides
# `topic_beta` depends on cell execution order — confirm the intended one.
ll = utils.topic_beta(ctm_inferencer)

# Tabular view of the `topic_beta` result.
topic_words = pd.DataFrame(ll)

In [None]:
# For each row of `topic_words`, show its three largest entries and remember
# their labels (joined by spaces) as a short textual name for that row.
topic_str_repr = []
for label, weights in topic_words.iterrows():
    top_three = weights.sort_values(ascending=False)[:3]
    print(pd.DataFrame(top_three).T)
    topic_str_repr.append(' '.join(top_three.index))

In [None]:
# Values of the gensim dictionary materialised as a list (used as DataFrame
# column labels in the next cell).
word_dct_values = list(dct.values())

In [None]:
# Table of `ctm_model.beta` with one column per dictionary token, columns
# rearranged into sorted order.
beta_by_word = pd.DataFrame(ctm_model.beta, columns=word_dct_values)
dff = beta_by_word.loc[:, sorted(beta_by_word.columns)]

In [None]:
# from gensim import models
# model = models.LdaModel(corpus, id2word=dct, num_topics=num_terms)

In [None]:
import gensim
# Dense matrix of the log-entropy-weighted corpus, transposed so that rows are
# documents and columns are terms.
XX = gensim.matutils.corpus2dense(model[corpus], num_terms=num_terms).T
cols = list(dct.values())
# `corpus` can hold fewer documents than `y` has labels (it is truncated where
# it is built), and pandas raises a ValueError when the index length differs
# from the number of rows. Documents and labels are produced in the same order,
# so keep the labels of the rows that are actually present.
df = pd.DataFrame(XX, columns=cols, index=y[:XX.shape[0]])
# df[words_to_keep].T.sort_index().T

# X = df[words_to_keep].values
X = df.values
y = df.index

In [None]:
from regain.covariance import kernel_time_graphical_lasso_
from regain.model_selection import stability_optimization
from sklearn.model_selection import StratifiedShuffleSplit

# All-ones kernel with one row/column per distinct value of y.
n_classes = np.unique(y).size
mdl = kernel_time_graphical_lasso_.KernelTimeGraphicalLasso(
    alpha=0.45,
    psi='l1',
    kernel=np.ones((n_classes, n_classes)),
    max_iter=1000,
    verbose=0,
)
mdl = mdl.fit(X, y)

In [None]:
# Search alpha over a log-spaced grid from 1e2 down to 1e-2, evaluating each
# candidate on 100 stratified shuffle splits.
alpha_grid = np.logspace(2, -2)
splitter = StratifiedShuffleSplit(100)
socv = stability_optimization.GraphicalModelStabilitySelection(
    mdl, param_grid=dict(alpha=alpha_grid), cv=splitter)
socv = socv.fit(X, y)

In [None]:
n_dim = X.shape[1]           # number of columns of X
n_times = len(np.unique(y))  # number of distinct values in y

In [None]:
# Index pairs of the strictly upper triangle (k=1 leaves the diagonal out).
idx = np.triu_indices(n_dim, k=1)
# Number of such entries summed over all n_times matrices.
dof = len(idx[0]) * n_times

In [None]:
from regain import utils
# utils.save_pickle(socv, "socv")

# Load a stored selection result from disk; this replaces any `socv` object
# fitted earlier in the session.
# NOTE(review): presumably wraps pickle — only load files from a trusted source.
socv = utils.load_pickle("socv.pkl")

In [None]:
# From here on, work with the estimator picked by the selection procedure.
mdl = socv.best_estimator_

In [None]:
# Fraction of non-zero strictly-upper-triangular entries over all precision
# matrices (printed as a ratio in [0, 1], not multiplied by 100).
nonzero_per_matrix = [np.count_nonzero(P[idx]) for P in mdl.precision_]
nonzero_ratio = np.sum(nonzero_per_matrix) / dof
print("Nonzero percentage: %.4f" % nonzero_ratio)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from kdge import plot_plotly
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools

# Set up plotly's offline mode for rendering inside the notebook.
py.init_notebook_mode()
import plotly.io as pio

In [None]:
# First of the model's precision matrices.
p = mdl.precision_[0]

In [None]:
from regain.utils import retain_top_n

In [None]:
k = -1  # position, within each figure's data, of the trace that is re-anchored below
trace = []
graphs = []

# NetworkX 3.0 removed `from_numpy_matrix`; `from_numpy_array` replaces it.
# The old name is looked up only when the new one is missing.
_graph_from_adjacency = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix

# NOTE(review): `top_n` and `pl` are not defined in any visible cell (the
# imports bring in `plot_plotly` from kdge, not `pl`) — confirm both exist in
# the session before running this cell.
# for i, p in enumerate(ltgl.precision_ - ltgl.latent_):
for p in mdl.precision_:
    # Absolute values of the off-diagonal entries, filtered by retain_top_n.
    A = np.abs(p - np.diag(np.diag(p)))
    A = retain_top_n(A, top_n)
    G = _graph_from_adjacency(A * 0.00001)
    graphs.append(G)
    # Optional: color_nodes=plt.rcParams['axes.prop_cycle'].by_key()['color'][:5]
    trace.append(pl.plot_circular(G, df.columns, 1.4, cmap='Blues'))

# One subplot column per precision matrix.
fig = tools.make_subplots(
    rows=1, cols=len(mdl.precision_), horizontal_spacing=.1, print_grid=False)

# Point the k-th trace of every figure at the axes of its own subplot.
for j, tr in enumerate(trace):
    tr['data'][k]['xaxis'] = 'x' + str(j+1)
    tr['data'][k]['yaxis'] = 'y' + str(j+1)

# Copy every trace of figure j into subplot column j+1, hiding legends.
for j, tr in enumerate(trace):
    col = j + 1
    for x in tr['data']:
        x['legendgroup'] = 'group' + str(j+1)
        x['showlegend'] = False
        fig.append_trace(x, 1, col)

# Carry the annotations over to the matching subplot axes and hide the axis
# decorations of each subplot.
for j, tr in enumerate(trace):
    fig.layout.annotations += tuple([pl._set_ref(
        x, 'x' + str(j+1), 'y' + str(j+1)) for x in tr['layout']['annotations']])

    fig['layout']['xaxis'+str(j+1)].update(showgrid=False, zeroline=False, showticklabels=False)
    fig['layout']['yaxis'+str(j+1)].update(showgrid=False, zeroline=False, showticklabels=False)

py.init_notebook_mode()

# Transparent backgrounds; wide canvas for the side-by-side graphs.
fig['layout'].update(height=900, width=4000, hovermode='closest',
                     paper_bgcolor='rgba(0,0,0,0)',
                     plot_bgcolor='rgba(0,0,0,0)')
# fig.layout.annotations += tuple([dict(
#     text="Python code: <a href='https://plot.ly/ipython-notebooks/network-graphs/'> https://plot.ly/ipython-notebooks/network-graphs/</a>",
#     showarrow=False, xref="paper", yref="paper", x=0.005, y=-0.2)])
# fig['layout'].update(scene=dict(aspectmode="data"))
py.iplot(fig)
# py.offline.iplot(fig, filename='figure_factory_subplot', image='svg')

In [None]:
# Export the current figure to a PDF file in the working directory.
pio.write_image(fig, "graphs.pdf")

In [None]:
import pyctm

from pyctm import variational_bayes, inferencer

# parameter set 3
alpha_mu, alpha_sigma, alpha_beta = 0., .1, -.1

# Second run: raw training documents, the full `words` vocabulary, and 20 as
# the third positional argument.
ctm_inferencer = variational_bayes.VariationalBayes()
ctm_inferencer._initialize(
    train.words.tolist(), words, 20, alpha_mu, alpha_sigma, alpha_beta)

# Run 50 learning passes.
for iteration in range(50):
    ctm_inferencer.learning(-1)

logl, lamda, nu = ctm_inferencer.inference(train.words.tolist())

# NOTE(review): `utils` must already be bound to the module that provides
# `topic_beta` when this cell runs — confirm against the import cells.
ll = utils.topic_beta(ctm_inferencer)
topic_words = pd.DataFrame(ll)

In [None]:
# Sum of the absolute values of `lamda` (displayed as the cell output).
np.abs(lamda).sum()

In [None]:
from pyctm import utils

In [None]:
import numpy as np
from sklearn.covariance import GraphicalLassoCV

from regain import utils

In [None]:
# Write `lamda` to disk and immediately read it back.
# NOTE(review): presumably wraps pickle — only load files from a trusted source.
utils.save_pickle(lamda, "lambda_ctm.pkl")
lamda = utils.load_pickle('lambda_ctm.pkl')

In [None]:
# Dimensions of the `dff` table (displayed as the cell output).
dff.values.shape

In [None]:
# Graphical lasso on `lamda`, with the regularisation strength chosen by
# cross-validation.
gl = GraphicalLassoCV()
gl = gl.fit(lamda)

In [None]:
p = gl.precision_

# NetworkX 3.0 removed `from_numpy_matrix`; `from_numpy_array` replaces it.
# The old name is looked up only when the new one is missing.
_graph_from_adjacency = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix

# Absolute values of the off-diagonal precision entries, filtered by
# retain_top_n with n=20.
A = np.abs(p - np.diag(np.diag(p)))
A = retain_top_n(A, 20)
G = _graph_from_adjacency(A * 3)
# NOTE(review): `pl` is not defined in any visible cell — confirm it exists in
# the session. Also check that `topic_str_repr` has one label per column of
# `lamda`; it is built from a different inferencer run.
fig = pl.plot_circular(G, topic_str_repr, 2, cmap='Blues')

# Square canvas with transparent backgrounds.
fig['layout'].update(height=800, width=800, hovermode='closest',
                     paper_bgcolor='rgba(0,0,0,0)',
                     plot_bgcolor='rgba(0,0,0,0)')
# fig.layout.annotations += tuple([dict(
#     text="Python code: <a href='https://plot.ly/ipython-notebooks/network-graphs/'> https://plot.ly/ipython-notebooks/network-graphs/</a>",
#     showarrow=False, xref="paper", yref="paper", x=0.005, y=-0.2)])
# fig['layout'].update(scene=dict(aspectmode="data"))
py.iplot(fig)
# py.offline.iplot(fig, filename='figure_factory_subplot', image='svg')