# TopSBM: Topic Modeling with Stochastic Block Models

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to local modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pylab as plt
%matplotlib inline

from sbmtm import sbmtm
import graph_tool.all as gt

from matplotlib import pyplot as plt

# Fitting the model

In [None]:
## Create an sbmtm instance; the word-document graph is attached to it in a following cell.
model = sbmtm()

## Alternative to loading a saved graph: build the word-document network directly from the corpus.
## Left disabled: `texts` and `titles` are not defined in the cells shown here.
#model.make_graph(texts,documents=titles)

In [None]:
## Load a previously saved word-document graph instead of rebuilding it from the corpus.
## The path is relative, so 'graph.xml.gz' must be reachable from the working directory.
## (The commented line is the matching call that saves the graph under the same filename.)
#model.save_graph(filename = 'graph.xml.gz')
model.load_graph(filename = 'graph.xml.gz')

In [None]:
## Fit the model.
## The seed is set before fitting so that repeated runs give the same result.
gt.seed_rng(32) ## fixed seed for graph-tool's random number generator --> same results
model.fit()

# Plotting the result

The output shows the (hierarchical) community structure in the word-document network as inferred by the stochastic block model:

- document-nodes are on the left
- word-nodes are on the right
- different colors correspond to the different groups

The result is a grouping of nodes into groups on multiple levels in the hierarchy:

- on the uppermost level, all nodes belong to the same group (square in the middle)
- on the next-lower level, we split the network into two groups: the document-nodes and the word-nodes (blue squares to the left and right, respectively). This is a trivial structure due to the bipartite character of the network.
- only the levels below that constitute a non-trivial structure: we now further divide nodes into smaller groups (document-nodes into document-groups on the left and word-nodes into word-groups on the right)

In [None]:
## Draw the inferred hierarchical structure of the word-document network.
## NOTE(review): nedges presumably caps the number of edges drawn -- confirm against sbmtm.plot.
model.plot(nedges=1000)

In [None]:
## Same plot call, this time with a filename as the first positional argument.
## NOTE(review): presumably this writes the figure to "bipartitehSBM.pdf" -- confirm against sbmtm.plot.
model.plot("bipartitehSBM.pdf", nedges=1000)

# The basics

## Topics
For each word-group on a given level in the hierarchy, we retrieve the $n$ most common words in each group -- these are the topics!


In [None]:
## Topics on hierarchy level l=1: the n=20 most common words of each word-group.
model.topics(l=1,n=20)

## Topic-distribution in each document
Which topics contribute to each document?

In [None]:
## Pick a document by its index and print its entry in model.documents.
i_doc = 0
print(model.documents[i_doc])
## Topic distribution of that document on level l=1:
## a list of (topic-index, probability) tuples.
model.topicdist(i_doc,l=1)

# Extra: Clustering of documents - for free.
The stochastic block model clusters the documents into groups.
We do not need to run an additional clustering to obtain this grouping.


In [None]:
## Document clusters on hierarchy level l=1.
## NOTE(review): n=5 presumably limits how many documents are listed per cluster -- confirm.
model.clusters(l=1,n=5)

Application -- Finding similar articles:

For a query-article, we return all articles from the same group

In [None]:
## Query document, chosen by index; echo the index next to its entry in model.documents.
i_doc = 2
print(i_doc, model.documents[i_doc])
## List every article that shares the query's group on level l=1,
## shown as (doc-index, doc-title).
model.clusters_query(i_doc, l=1)

# More technical: Group membership
In the stochastic block model, word-nodes and document-nodes are clustered into different groups.

The group membership can be represented by the conditional probability $P(\text{group}\, |\, \text{node})$. Since words and documents belong to different groups (the word-document network is bipartite) we can show separately:

- $P(bd \,|\, d)$, the probability that document $d$ belongs to document group $bd$
- $P(bw \,|\, w)$, the probability that word $w$ belongs to word group $bw$.

In [None]:
## Group-membership matrices on hierarchy level l=7: documents first, words second.
p_td_d, p_tw_w = model.group_membership(l=7)

## One panel per node type, side by side. `fig` is kept as a notebook variable
## because a later cell saves this figure.
fig, (ax_docs, ax_words) = plt.subplots(1, 2, figsize=(15, 4))

## Left panel: document-group membership.
im_docs = ax_docs.imshow(p_td_d, origin='lower', aspect='auto', interpolation='none')
ax_docs.set_title(r'Document group membership $P(bd | d)$')
ax_docs.set_xlabel('Document d (index)')
ax_docs.set_ylabel('Document group, bd')
fig.colorbar(im_docs, ax=ax_docs)

## Right panel: word-group membership.
im_words = ax_words.imshow(p_tw_w, origin='lower', aspect='auto', interpolation='none')
ax_words.set_title(r'Word group membership $P(bw | w)$')
ax_words.set_xlabel('Word w (index)')
ax_words.set_ylabel('Word group, bw')
fig.colorbar(im_words, ax=ax_words)

plt.show()

In [None]:
## Save the figure currently bound to `fig` (the group-membership plot from the previous cell) as a PDF.
fig.savefig("group_membership.pdf")

# state analysis

In [None]:
## Keep a handle on the model's state object for the inspection cells that follow.
state = model.state

In [None]:
## Take entry 0 of the levels returned by state.get_levels().
## NOTE(review): presumably index 0 is the lowest (finest) level of the hierarchy -- confirm.
level = state.get_levels()[0]

In [None]:
## Show entry 4 of level.get_blocks().
## NOTE(review): presumably the block label assigned to node 4 on this level -- confirm.
level.get_blocks()[4]

In [None]:
## Plot the matrix returned by level.get_matrix() as an image.
## It is converted with .todense() first, so it is presumably a sparse matrix
## (likely edge counts between blocks -- TODO confirm).
e=level.get_matrix()
plt.matshow(e.todense())

In [None]:
## Print the topics for hierarchy level l=6.
model.print_topics(l=6)

In [None]:
## Print the state's own summary.
state.print_summary()

In [None]:
## Select entry 3 of model.groups for inspection.
## NOTE(review): presumably keyed by hierarchy level and only filled for levels that an
## earlier call has already computed -- confirm that key 3 exists after a fresh Run All.
groups = model.groups[3]

In [None]:
## Sum P(tw | d) over axis 0 of the selected groups entry.
## Requires numpy as `np`, which is imported in the notebook's import cell.
## NOTE(review): if axis 0 indexes topics, every value should be 1
## (one normalised topic distribution per document) -- confirm.
np.sum(groups['p_tw_d'], axis=0)

In [None]:
## Display the 'p_tw_d' entry of the selected groups.
groups['p_tw_d']

In [None]:
## Display the whole selected groups entry to see which quantities it holds.
groups

In [None]:
## Rebind `groups` to entry 6 of model.groups; the plots below read from this entry.
## NOTE(review): presumably keyed by hierarchy level -- confirm that key 6 exists after a fresh Run All.
groups = model.groups[6]

### topicdist

In [None]:
## Word distribution of each topic, P(w | tw), taken from the selected groups entry.
p_w_tw = groups['p_w_tw']

## Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots()
im = ax.imshow(p_w_tw, origin='lower', aspect='auto', interpolation='none')
## The title names the plotted quantity. The previous "Word group membership" label
## was copied from the P(bw | w) plot; membership is P(group | node), which this is not.
ax.set_title(r'Word distribution per topic $P(w | tw)$')
ax.set_xlabel('Topic, tw')
ax.set_ylabel('Word w (index)')
fig.colorbar(im, ax=ax)
fig.savefig("p_w_tw.png")

In [None]:
## Topic distribution of each document, P(tw | d), taken from the selected groups entry.
p_tw_d = groups['p_tw_d']

## Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots()
im = ax.imshow(p_tw_d, origin='lower', aspect='auto', interpolation='none')
## The title names the plotted quantity. The previous "Word group membership" label
## was copied from the P(bw | w) plot and did not describe a per-document topic distribution.
ax.set_title(r'Topic distribution per document $P(tw | d)$')
ax.set_xlabel('Document (index)')
ax.set_ylabel('Topic, tw')
fig.colorbar(im, ax=ax)
fig.savefig("p_tw_d.png")