In [1]:
%load_ext watermark
%watermark  -a Filippo_Valle -v -m -g -r -v -p pandas,numpy,graph_tool,cloudpickle,regex,topicpy,matplotlib,plotly,requests

Author: Filippo_Valle

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

pandas     : 1.2.3
numpy      : 1.19.0
graph_tool : 2.37 (commit afba9459, )
cloudpickle: 1.6.0
regex      : 2021.4.4
topicpy    : 0.2.1
matplotlib : 3.4.1
plotly     : 4.14.3
requests   : 2.25.1

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 5.8.0-50-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

Git hash: aec0cb27d23e921cf53771b154b07fadbbd6854a

Git repo: git@github.com:fvalle1/epj.git



In [2]:
import requests
import pandas as pd

In [3]:
# Article titles are the first space-separated column of titles.txt
# (the file has no header row).
titles = (
    pd.read_table("../../hSBM_Topicmodel/titles.txt", sep=" ", header=None)
    .loc[:, 0]
    .to_numpy()
)
titles[:3]

array(['Nuclear_Overhauser_effect', 'Quantum_solvent',
       'Rovibrational_coupling'], dtype=object)

In [4]:
# Same file as `titles`, this time as a frame indexed by its first column
# with the remaining column named "Topic".
df_files = pd.read_csv(
    "../../hSBM_Topicmodel/titles.txt",
    sep=" ",
    names=["Topic"],
    index_col=0,
)

In [None]:
url = "https://en.wikipedia.org/w/api.php"

def get_categories(title):
    """Return the Wikipedia categories of the article ``title``.

    Queries the MediaWiki ``action=query&prop=categories`` endpoint at ``url``
    and follows the API's ``continue`` tokens so that the full category list
    is collected, not only the first (default-sized) batch.

    Parameters
    ----------
    title : str
        Article title as passed to the API ``titles`` parameter.

    Returns
    -------
    list of str
        Category names with the namespace prefix (the text up to the first
        ``":"``) removed. Empty when the page has no categories or the first
        request does not return HTTP 200; if a later continuation request
        fails, the categories collected so far are returned.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        "titles": title,
        # Ask for the largest batch the API allows instead of the small default.
        "cllimit": "max",
    }
    to_ret = []
    while True:
        with requests.get(url=url, params=params) as req:
            if req.status_code != 200:
                break
            data = req.json()
        # An API-level error payload has no "query" key; treat it as "no pages".
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            for cat in page.get("categories", []):
                # Split on the first colon only, so names that themselves
                # contain ":" are kept whole.
                to_ret.append(cat["title"].split(":", 1)[-1])
        if "continue" not in data:
            break
        # Feed the continuation tokens back into the next request.
        params.update(data["continue"])
    return to_ret

In [None]:
# Empty category x article table; columns are joined onto it one article at a time.
df_meta = pd.DataFrame()

In [None]:
# One column per article: 1 for each category returned for it; the outer join
# leaves NaN where an article does not have a category.
for title in titles:
    membership = pd.Series(data=1, index=get_categories(title), name=title)
    df_meta = df_meta.join(membership, how="outer")

In [None]:
# Missing (category, article) pairs become 0, the table is cast to integers,
# and rows that are exact duplicates of an earlier row are dropped.
df_meta = (
    df_meta
    .fillna(0)
    .astype(int)
    .drop_duplicates()
)

In [None]:
# Keep only categories whose row sum exceeds 1, i.e. categories attached to
# more than one article.
df_meta = df_meta.reindex(
    index=df_meta.index[df_meta.sum(axis=1) > 1].drop_duplicates()
)

In [None]:
# Tokenise the corpus: every line after the first is split on whitespace
# into a list of words.
with open("../../hSBM_Topicmodel/corpus.txt", "r") as file:
    next(file, None)  # the first line is skipped
    texts = [line.split() for line in file]

# Make hSBM graph

In [None]:
import sys
import os
import graph_tool.all as gt
sys.path.append("../../hSBM_Topicmodel/")

In [None]:
from sbmtm import sbmtm

In [None]:
# The name `sbmtm` is rebound below from the imported class to the model
# instance, so a plain `sbmtm()` fails when this cell is run a second time.
# Re-importing the class under a private alias keeps the cell re-runnable
# while leaving `sbmtm` bound to the instance for the cells that follow.
from sbmtm import sbmtm as _SbmtmModel

sbmtm = _SbmtmModel()
# Build the graph from the tokenised texts, using the article titles as
# document names.
sbmtm.make_graph(texts, list(titles))

In [None]:
# Handle on the model's graph; used further down to extract the
# word x document count matrix (presumably the graph built by make_graph).
g = sbmtm.g

In [None]:
# Fit the hSBM topic model on the graph.
# NOTE(review): no random seed is set in this notebook, so the fit is
# presumably not reproducible run-to-run — confirm whether one should be set.
sbmtm.fit(n_init=5, verbose=False, B_min=6)

In [None]:
import shutil

# Start from an empty "wikipedia" output directory. Pure-Python calls replace
# the shell `rm -r` / `mkdir -p`, so a missing directory is not an error and
# no external shell is needed.
shutil.rmtree("wikipedia", ignore_errors=True)
os.makedirs("wikipedia", exist_ok=True)

# The save methods are called without a path, so run them from inside the
# output directory; always return to the starting directory, even if saving
# fails, so later relative paths keep working.
_start_dir = os.getcwd()
os.chdir("wikipedia")
try:
    sbmtm.save_data()
    sbmtm.save_graph()
    df_files.to_csv("files.dat")
finally:
    os.chdir(_start_dir)

In [None]:
# Draw the fitted hSBM hierarchy with a bipartite layout.
gt.draw_hierarchy(
    sbmtm.state,
    layout="bipartite",
    hedge_pen_width=8,
    hvertex_size=25,
)

# Add keywords

In [None]:
import graph_tool.all as gt

In [None]:
# Dense adjacency matrix weighted by the "count" edge property.
counts = gt.adjacency(g, weight=g.ep["count"]).toarray()
# The first get_D() vertices are taken as documents and the rest as words
# (presumably the vertex ordering used by make_graph — TODO confirm), so this
# slice is the words x documents block.
n_docs = sbmtm.get_D()
df_corpus = pd.DataFrame(
    data=counts[n_docs:, :n_docs],
    index=sbmtm.words,
    columns=sbmtm.documents,
)

In [None]:
# Stack the category rows under the word rows. `pd.concat` replaces
# `DataFrame.append`, which is deprecated and removed in pandas 2; for two
# frames it gives the same result. Articles without any category get NaN in
# the category rows, which is filled with 0 before the integer cast.
df = pd.concat([df_corpus, df_meta]).fillna(0).astype(int)

### triSBM

In [None]:
sys.path.append("../../trisbm/")

In [None]:
%load_ext autoreload
%autoreload 2
from trisbm import trisbm

In [None]:
# The name `trisbm` is rebound below from the imported class to the model
# instance, so a plain `trisbm()` fails when this cell is run a second time.
# Re-importing the class under a private alias keeps the cell re-runnable
# while leaving `trisbm` bound to the instance for the cells that follow.
from trisbm import trisbm as _TrisbmModel

trisbm = _TrisbmModel()
# Rows of `df` whose label is in the category table get kind 2, all other
# rows (words) get kind 1.
trisbm.make_graph(df, lambda word_keyword: 2 if word_keyword in df_meta.index else 1)

In [None]:
# Save the graph, before fitting, under an explicit file name.
trisbm.save_graph("wikipedia_keyword.xml.gz")

In [None]:
# Fit the triSBM model on the document/word/keyword graph.
# NOTE(review): no random seed is set in this notebook, so the fit is
# presumably not reproducible run-to-run — confirm whether one should be set.
trisbm.fit(n_init=5, B_min=9, verbose=False)

In [None]:
import os

In [None]:
import shutil

# Start from an empty "wikipedia_key" output directory. Pure-Python calls
# replace the shell `rm -r` / `mkdir -p`, so a missing directory is not an
# error and no external shell is needed.
shutil.rmtree("wikipedia_key", ignore_errors=True)
os.makedirs("wikipedia_key", exist_ok=True)

# The save methods are called without a path, so run them from inside the
# output directory; always return to the starting directory, even if saving
# fails, so later relative paths keep working.
_start_dir = os.getcwd()
os.chdir("wikipedia_key")
try:
    trisbm.save_data()
    trisbm.save_graph()
    df_files.to_csv("files.dat")
finally:
    os.chdir(_start_dir)

In [None]:
# Draw the fitted triSBM hierarchy with the default layout.
gt.draw_hierarchy(trisbm.state, hedge_pen_width=8, hvertex_size=25)

## Benchmark

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
# Score both models against the "Topic" label, add a shuffled baseline, and
# normalise every score by that baseline (x / y).
labels = ["Topic"]
label = labels[0]

scores = get_scores("wikipedia_key", labels, algorithm="trisbm", verbose=False)
scores["trisbm"] = scores[label]

hsbm_scores = get_scores("wikipedia", labels, algorithm="topsbm", verbose=False)
scores["hsbm"] = hsbm_scores[label]

files_reference = pd.read_csv("wikipedia/files.dat", sep=",", index_col=0)
scores["shuffle"] = get_scores_shuffled(
    "wikipedia_key", files_reference, label=label, algorithm="trisbm"
)

normalise_score(scores, base_algorithm="shuffle", operation=lambda x, y: x / y)

In [None]:
# Normalised score curves for both models and the shuffled baseline.
fig, ax = plt.subplots(1, figsize=(18, 15))
add_score_lines(ax, scores, labels=["hsbm", "trisbm", "shuffle"], V="norm_V", alpha=1)
ax.set_xscale("log")
# Leave 10% headroom above the highest normalised score.
ax.set_ylim(0, max(max(s["norm_V"]) for s in scores.values()) * 1.1)
# A log axis cannot start at 0: matplotlib ignores such a lower bound and
# emits a warning. Only the upper bound is set, which gives the same view.
ax.set_xlim(right=10)

plt.show()
fig.savefig("metric_scores.pdf")

In [None]:
import plotly.graph_objects as go

In [None]:
# Bar chart comparing the two models by description length per edge.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=[sbmtm.get_mdl() / sbmtm.g.num_edges()], name="hSBM"),
    go.Bar(y=[trisbm.get_mdl() / trisbm.g.num_edges()], name="triSBM"),
])

titlefont = {"size": 30}
tickfont = {"size": 25}

# Axis titles use the nested `title.text` / `title.font` form; the flat
# `titlefont` axis attribute is deprecated in plotly.
layout = {
    "title": "Wikipedia dataset",
    "xaxis": {
        "title": {"text": "Resolution", "font": titlefont},
        "tickfont": tickfont,
    },
    "yaxis": {
        "title": {"text": "∑/E", "font": titlefont},
        "type": "log",
        "tickfont": tickfont,
    },
    "legend": {"font": {"size": 35}},
}

fig.update_layout(layout)
#fig.write_image("metric_entropies_bar.pdf")

In [None]:
# Bar chart of the normalised scores of both models.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=scores["hsbm"]["norm_V"], name="hSBM"),
    go.Bar(y=scores["trisbm"]["norm_V"], name="triSBM"),
])

titlefont = {"size": 30}
tickfont = {"size": 25}

# Axis titles use the nested `title.text` / `title.font` form; the flat
# `titlefont` axis attribute is deprecated in plotly.
layout = {
    "xaxis": {
        "title": {"text": "Resolution", "font": titlefont},
        "tickfont": tickfont,
    },
    "yaxis": {
        "title": {"text": "NMI/NMI*", "font": titlefont},
        "tickfont": tickfont,
    },
    "legend": {"font": {"size": 35}},
}

fig.update_layout(layout)
#fig.write_image("metric_scores_bar.pdf")

In [None]:
import cloudpickle as pickle

# Persist both fitted models, hSBM first and then triSBM.
for pkl_path, model in (("sbmtm.pkl", sbmtm), ("trisbm.pkl", trisbm)):
    with open(pkl_path, "wb") as file:
        pickle.dump(model, file)