In [None]:
import json
import logging
import os
import shutil
import sys

import cloudpickle as pickle
import graph_tool.all as gt
import numpy as np
import pandas as pd
import regex as re

log = logging.getLogger("aps")
log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)

sys.path.append("../")
from nlp import singularize, process_phrase

In [None]:
# List the entries of the dataset directory (shown as the cell output).
os.listdir("aps-dataset-metadata-2020/")

In [None]:
def parse_article(filename):
    """Extract the metadata fields used by this notebook from one article JSON file.

    Parameters
    ----------
    filename : str
        Path to an article metadata ``.json`` file.

    Returns
    -------
    tuple
        ``(doi, title, labels, journal, authors, nations)`` where

        * ``doi`` is ``article["id"]``;
        * ``title`` is ``article["title"]["value"]``;
        * ``labels`` is the list of PhySH discipline labels, or ``[]`` when the
          classification block is missing or malformed;
        * ``journal`` is ``article["journal"]["id"]``;
        * ``authors`` is the list of author surnames (entries without a
          ``"surname"`` field are skipped);
        * ``nations`` holds the sorted unique digit runs (4 to 11 digits) found
          in the affiliation names -- presumably postal codes, TODO confirm.
          It is a numpy array when at least one code is found, else ``[]``.

    Raises
    ------
    KeyError
        If the mandatory ``id``, ``title`` or ``journal`` fields are missing.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        article = json.load(file)

    doi = article["id"]
    title = article["title"]["value"]
    journal = article["journal"]["id"]

    # Optional block: only a missing/None level is tolerated, real errors still surface.
    try:
        disciplines = article["classificationSchemes"]["physh"]["disciplines"]
        labels = [discipline["label"] for discipline in disciplines]
    except (KeyError, TypeError):
        labels = []

    # Some author entries may carry no surname; skip them instead of failing the parse.
    authors = [
        author["surname"]
        for author in article.get("authors") or []
        if "surname" in author
    ]

    pattern = "[0-9]{4,11}"
    codes = []
    for affiliation in article.get("affiliations") or []:
        name = affiliation.get("name")
        if not isinstance(name, str):
            # An affiliation without a usable name must not discard the other ones.
            continue
        match = re.search(pattern, name)  # searched once per affiliation
        if match is not None:
            codes.append(match.group())
    nations = np.unique(codes) if codes else []

    return doi, title, labels, journal, authors, nations
    
# Try the parser on a single article file; the returned tuple is the cell output.
parse_article("aps-dataset-metadata-2020/PRX/8/PhysRevX.8.021023.json")
# Alternative sample article, currently disabled:
#parse_article("aps-dataset-metadata-2020/PRD/102/PhysRevD.102.014505.json")

In [None]:
#10.1103/PhysRevD.102.014505

In [None]:
def get_journal_dfs(journal, max_articles=500):
    """Build the word-count and metadata tables for one journal volume.

    Article files are read from ``aps-dataset-metadata-2020/<journal>/<volume>/``,
    where the volume is ``"100"`` when the journal code contains ``"E"`` and
    ``"1"`` otherwise.

    Parameters
    ----------
    journal : str
        Journal directory name, e.g. ``"PRA"``.
    max_articles : int, optional
        Maximum number of article files to read (default 500).

    Returns
    -------
    df : pandas.DataFrame
        Title words (rows) x DOIs (columns); word counts, NaN where absent.
    df_meta : pandas.DataFrame
        Labels (rows) x DOIs (columns); 1 where the label is assigned.
    df_files : pandas.DataFrame
        Indexed by DOI with a single ``"journal"`` column.
    df_areas : pandas.DataFrame
        Affiliation codes (rows) x DOIs (columns); 1 where present.
    """
    issue = "aps-dataset-metadata-2020/{}/{}/".format(journal, "1" if "E" not in journal else "100")
    # os.listdir returns entries in arbitrary order; sort so the same subset
    # is selected on every run.
    articles = sorted(os.listdir(issue))[:max_articles]

    dois, journals = [], []
    word_columns, label_columns, area_columns = [], [], []
    for article in articles:
        # `article_journal` is kept separate so the `journal` argument is not shadowed.
        doi, title, labels, article_journal, _authors, areas = parse_article(issue + article)
        words = process_phrase(title).split(" ")
        unique_words, counts = np.unique(words, return_counts=True)

        dois.append(doi)
        journals.append(article_journal)
        word_columns.append(pd.Series(counts, index=unique_words, name=doi))
        # Repeated labels would make the column index non-unique; keep the first occurrence.
        label_columns.append(pd.Series(1, index=list(dict.fromkeys(labels)), name=doi))
        area_columns.append(pd.Series(1, index=areas, name=doi))

    def _combine(columns):
        # One outer-aligned concat (sorted index, like an outer join) instead of
        # one join per article.
        return pd.concat(columns, axis=1, sort=True) if columns else pd.DataFrame()

    # DataFrame.append no longer exists in pandas >= 2.0; build the table in one go.
    df_files = pd.DataFrame({"journal": journals}, index=dois, columns=["journal"])

    return _combine(word_columns), _combine(label_columns), df_files, _combine(area_columns)

In [None]:
import multiprocessing as mp

In [None]:
# Parse each journal in its own worker process; failures raised in a worker
# are reported on the "aps" logger.
journal_codes = ["PRA", "PRB", "PRC", "PRD", "PRE"]

pool = mp.Pool(12)
work = pool.map_async(
    get_journal_dfs,
    journal_codes,
    error_callback=lambda err: log.debug(err),
)

# Nothing else will be submitted: close the pool and wait for the workers.
pool.close()
pool.join()

In [None]:
# Merge the per-journal tables produced by the worker pool into corpus-wide tables.
df = pd.DataFrame()
df_meta = pd.DataFrame()
df_authors = pd.DataFrame()
files_parts = []

# NOTE(review): the fourth table returned by get_journal_dfs is its `df_areas`
# (affiliation codes); it is collected under the name `df_authors` here.
for (df_j, df_meta_j, df_files_j, df_authors_j) in work.get():
    df = df.join(df_j, how="outer")  # add this journal's articles as new columns
    df_meta = df_meta.join(df_meta_j, how="outer")  # same for the label table
    df_authors = df_authors.join(df_authors_j, how="outer")  # same for the fourth table
    files_parts.append(df_files_j)  # DOI -> journal rows, stacked below

# DataFrame.append was removed in pandas 2.0; stack all parts with one concat.
df_files = pd.concat(files_parts) if files_parts else pd.DataFrame(columns=["journal"])

In [None]:
# Clean and filter the merged tables. Every statement overwrites its input,
# so this cell is order-dependent and is not idempotent.

# Missing counts become 0; drop_duplicates then removes rows (words) whose
# count vector across all articles is identical to an earlier row.
df = df.fillna(0).astype(int).drop_duplicates()

# Drop the empty-string "word" (titles are split on single spaces upstream).
df = df.reindex(index = list(filter(lambda w:len(w)>0,df.index)))

'''
do reindex in two steps to avoid undefined behaviour
the sum is made on the new index
'''
# O: for each word, the number of titles that contain it at least once.
O = df.apply(lambda x: (x>0).sum(), axis=1)
df = df.reindex(index = df.index[O>5]) # keep words that appear in more than 5 titles
df = df.reindex(columns = df.columns[df.sum(0) > 5]) # keep titles with more than 5 remaining word occurrences (repetitions counted)

df_meta = df_meta.fillna(0).astype(int).drop_duplicates()

df_meta = df_meta.reindex(index=df_meta.index[df_meta.sum(1)>1]) # keep labels attached to more than one paper
df_meta = df_meta.reindex(columns=df.columns) # align columns with the articles kept in df

# Same alignment for the fourth table; note it is not NaN-filled like the others.
df_authors = df_authors.reindex(columns=df.columns)
df_authors = df_authors.drop_duplicates()
df_authors = df_authors.reindex(index=df_authors.index[~df_authors.index.duplicated(keep="first")])

df_authors = df_authors.reindex(index=df_authors.index[df_authors.sum(1)>1]) # keep rows present in more than one paper

# Keep only the DOI -> journal rows of the articles that survived the filtering.
df_files=df_files.reindex(index=df.columns)

In [None]:
# Distinct journal identifiers among the retained articles.
df_files["journal"].unique()

In [None]:
# Smallest column total of the word-count table, i.e. the minimum number of
# counted word occurrences in any retained title.
df.sum(0).min()

In [None]:
# Print the (rows, columns) shape of each table, in the same order as before.
for table in (df, df_meta, df_files, df_authors):
    print(table.shape)

In [None]:
# Row totals of the label table: number of retained papers carrying each label.
df_meta.sum(1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rank-frequency (Zipf) plot of the title vocabulary.
# Each column (title) is normalised by its total, then averaged per word.
word_freq = df.divide(df.sum(0),1).mean(1).sort_values(ascending=False)

# Plot against explicit ranks 1..N. Letting pandas plot the string-indexed
# Series would place the top-ranked word at x=0, which a log axis cannot show.
ranks = np.arange(1, len(word_freq) + 1)

fig, ax = plt.subplots()
ax.plot(ranks, word_freq.values, label="title words")

# Reference power law for visual comparison.
x = np.linspace(1,1e3)
ax.plot(x,1e-1*x**(-0.9), label=r"$0.1\,r^{-0.9}$")

ax.set_yscale("log")
ax.set_xscale("log")
ax.set_xlabel("rank")
ax.set_ylabel("mean relative frequency")
ax.legend()
fig.savefig("zipf.pdf")

# Make hSBM graph

In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../../hSBM_Topicmodel/")
from sbmtm import sbmtm

In [None]:
# Build the hSBM model graph from the bag-of-words table and save it.
# The class is reached through the module object so that rebinding the name
# `sbmtm` to the instance (which later cells rely on) does not make this cell
# fail when it is run a second time.
import sbmtm as sbmtm_module

sbmtm = sbmtm_module.sbmtm()
sbmtm.make_graph_from_BoW_df(df)
sbmtm.save_graph("aps.xml.gz")

In [None]:
# Keep a handle on the model's graph and show its repr as the cell output.
g = sbmtm.g
g

In [None]:
# Fit the hSBM model with the given options.
# NOTE(review): no random seed is set in the visible notebook; if the fit is
# stochastic, results will differ between runs -- confirm and seed if needed.
sbmtm.fit(n_init=5, verbose=False, B_min=10, parallel=True)

In [None]:
# Write the hSBM outputs under ./aps: files.dat (DOI -> journal) at the top
# level and the model data/graph under ./aps/topsbm.
start_dir = os.getcwd()

# Start from a clean output directory (portable replacement for `rm -rf` / `mkdir -p`).
shutil.rmtree("aps", ignore_errors=True)
os.makedirs("aps/topsbm", exist_ok=True)
df_files.to_csv("aps/files.dat")

# save_data/save_graph are called without a path, so they are run from inside
# the target directory. Always restore the working directory, even if saving
# fails, so that later cells keep resolving relative paths correctly.
os.chdir("aps/topsbm/")
try:
    sbmtm.save_data()
    sbmtm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the fitted hierarchy with the "bipartite" layout, colouring vertices by
# the graph's "kind" vertex property.
gt.draw_hierarchy(sbmtm.state, layout="bipartite", hedge_pen_width=8, hvertex_size=25, vertex_color=sbmtm.g.vertex_properties["kind"])

In [None]:
# Serialise the fitted hSBM model to sbmtm.pkl; the "Read" section at the end
# of the notebook loads it back.
with open("sbmtm.pkl", "wb") as file:
    pickle.dump(sbmtm, file)

## triSBM

In [None]:
%load_ext autoreload
%autoreload 2
sys.path.append("../../trisbm/")
from trisbm import trisbm

In [None]:
# Build the triSBM graph from title words plus labels.
# The class is reached through the module object so that rebinding the name
# `trisbm` to the instance (which later cells rely on) does not make this cell
# fail when it is run a second time.
import trisbm as trisbm_module

trisbm = trisbm_module.trisbm()
# Rows of df (words) and df_meta (labels) are stacked into one table; the
# callback tags label rows with 2 and word rows with 1.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
trisbm.make_graph(pd.concat([df, df_meta]), lambda word_keyword: 2 if word_keyword in df_meta.index else 1)

In [None]:
# Save the words + labels graph to aps_keyword.xml.gz.
trisbm.save_graph("aps_keyword.xml.gz")

In [None]:
# Fit the triSBM model with the given options.
# NOTE(review): no random seed is set in the visible notebook -- confirm
# whether the fit is stochastic.
trisbm.fit(n_init=5, verbose=False, B_min=15)

In [None]:
import os

In [None]:
# Write the triSBM outputs under ./aps_key: files.dat (DOI -> journal) at the
# top level and the model data/graph under ./aps_key/trisbm.
start_dir = os.getcwd()

# Portable replacement for the shell `mkdir -p` calls; existing content is kept.
os.makedirs("aps_key/trisbm", exist_ok=True)
df_files.to_csv("aps_key/files.dat")

# save_data/save_graph are called without a path, so they are run from inside
# the target directory. Always restore the working directory, even if saving fails.
os.chdir("aps_key/trisbm/")
try:
    trisbm.save_data()
    trisbm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the fitted triSBM hierarchy, colouring vertices by the graph's "kind"
# vertex property. The explicit layout below is currently disabled.
gt.draw_hierarchy(trisbm.state, 
                  #pos=gt.sfdp_layout(model.g),
                  hedge_pen_width=8, 
                  hvertex_size=25, 
                  vertex_color=trisbm.g.vertex_properties["kind"]
                 )

In [None]:
# Serialise the fitted triSBM model to trisbm.pkl; the "Read" section at the
# end of the notebook loads it back.
with open("trisbm.pkl", "wb") as file:
    pickle.dump(trisbm, file)

## nSBM

In [None]:
%load_ext autoreload
%autoreload 2
sys.path.append("../../trisbm/")
from trisbm import trisbm

In [None]:
nsbm = trisbm()
nsbm.make_graph_multiple_df(df, [df_meta, df_authors])

In [None]:
# Save the nSBM graph to aps_authors.xml.gz.
# NOTE(review): despite the file name, `df_authors` is filled from the
# affiliation-code table returned by get_journal_dfs -- confirm the intended name.
nsbm.save_graph("aps_authors.xml.gz")

In [None]:
# Fit the nSBM model with the given options.
# NOTE(review): no random seed is set in the visible notebook -- confirm
# whether the fit is stochastic.
nsbm.fit(n_init=5, verbose=False, B_min=20)

In [None]:
import os

In [None]:
# Write the nSBM outputs under ./aps_zip: files.dat (DOI -> journal) at the
# top level and the model data/graph under ./aps_zip/trisbm.
start_dir = os.getcwd()

# Portable replacement for the shell `mkdir -p` calls; existing content is kept.
os.makedirs("aps_zip/trisbm", exist_ok=True)
df_files.to_csv("aps_zip/files.dat")

# save_data/save_graph are called without a path, so they are run from inside
# the target directory. Always restore the working directory, even if saving fails.
os.chdir("aps_zip/trisbm/")
try:
    nsbm.save_data()
    nsbm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the fitted nSBM hierarchy. Vertex outline and fill both follow the
# graph's "kind" vertex property; edge width follows the "count" edge property.
gt.draw_hierarchy(nsbm.state, 
                  subsample_edges=20000,
                  hedge_pen_width=8, 
                  hvertex_size=25, 
                  vertex_size=8,
                  vertex_color=nsbm.g.vertex_properties["kind"],
                  vertex_fill_color=nsbm.g.vertex_properties["kind"],
                  edge_color="red",
                  edge_pen_width=nsbm.g.edge_properties["count"]
                 )

In [None]:
# Serialise the fitted nSBM model to nsbm.pkl; the "Read" section at the end
# of the notebook loads it back.
with open("nsbm.pkl", "wb") as file:
    pickle.dump(nsbm, file)

## Benchmark

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
# Collect the scores of every model against the "journal" label, one entry
# per model, keyed by the names used in the plotting cells below.
labels = ["journal"]
scores = get_scores("aps_key", labels, algorithm="trisbm", verbose=False)
scores['trisbm'] = scores[labels[0]]
scores["hsbm"]=get_scores("aps", labels, algorithm="topsbm", verbose=False)[labels[0]]
# NOTE(review): no cell in this notebook writes an "aps_auth" directory (the
# nSBM results are saved under "aps_zip") -- confirm it exists from another run.
scores["nsbm_state"]=get_scores("aps_auth", labels, algorithm="trisbm", verbose=False)[labels[0]]
scores["nsbm_zip"]=get_scores("aps_zip", labels, algorithm="trisbm", verbose=False)[labels[0]]
# Baseline computed by get_scores_shuffled from the DOI -> journal table.
scores['shuffle'] = get_scores_shuffled("aps_key", pd.read_csv("aps/files.dat", sep=",", index_col=0), label=labels[0], algorithm='trisbm')
# Normalise against the shuffled baseline using x/y; presumably this adds the
# "norm_V" entries read by the plotting cells -- TODO confirm.
normalise_score(scores, base_algorithm="shuffle", operation=lambda x,y: x/y)

In [None]:
# Plot the normalised scores ("norm_V") of the selected models.
fig=plt.figure(figsize=(18,15))
ax = fig.subplots(1)
add_score_lines(ax,scores,labels=["hsbm", "trisbm", "nsbm_zip", "shuffle"], V="norm_V", alpha=1)
ax.set_xscale('log')
# Leave 10% headroom above the highest normalised score of any entry.
ax.set_ylim(0,max(map(lambda s: max(s["norm_V"]), scores.values()))*1.1)
# Only the upper x limit is set: a left limit of 0 is invalid on a log axis
# (matplotlib ignores it with a warning), so the lower bound is left as is.
ax.set_xlim(right=10)

plt.show()
fig.savefig("metric_scores.pdf")

In [None]:
import plotly.graph_objects as go

In [None]:
# Bar chart of get_mdl() divided by the number of graph edges, one bar per model.
# NOTE(review): `sbmtm`, `trisbm` and `nsbm` must be the fitted model
# instances here, not the imported classes of the same name.
fig = go.Figure()
fig.add_traces([
    go.Bar(y = [sbmtm.get_mdl()/sbmtm.g.num_edges()], name="hSBM"),
    go.Bar(y = [trisbm.get_mdl()/trisbm.g.num_edges()], name="triSBM"),
    go.Bar(y = [nsbm.get_mdl()/nsbm.g.num_edges()], name="nSBM")
])

# Font settings shared by both axes.
titlefont = {
    "size": 30 
}

tickfont = {
    "size":25
}

layout = {
    "title":"APS dataset",
    "xaxis":{
        "title": "Resolution",
        "titlefont": titlefont,
        "tickfont": tickfont
    },
    "yaxis":{
        "title": "∑/E",
        "type":"log",
        #"range": [10e3,20e3],
        "titlefont": titlefont,
        "tickfont": tickfont
    },
    "legend":{
        "font_size":35
    }
}

fig.update_layout(layout)
# Export to PDF is currently disabled.
#fig.write_image("metric_entropies_bar.pdf")

In [None]:
# Bar chart of the normalised scores ("norm_V") for hSBM and triSBM.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=scores["hsbm"]["norm_V"], name="hSBM"),
    go.Bar(y=scores["trisbm"]["norm_V"], name="triSBM")
])

# Font settings shared by both axes (same values as in the previous chart).
titlefont = {
    "size": 30 
}

tickfont = {
    "size":25
}

layout = {
    "title":"APS dataset",
    "xaxis":{
        "title": "Resolution",
        "titlefont": titlefont,
        "tickfont": tickfont
    },
    "yaxis":{
        "title": "NMI/NMI*",
        "titlefont": titlefont,
        "tickfont": tickfont
    },
    "legend":{
        "font_size":35
    }
}

fig.update_layout(layout)
# Export to PDF is currently disabled.
#fig.write_image("metric_scores_bar.pdf")

In [None]:
from topicpy.hsbmpy import clusteranalysis

In [None]:
# Run the cluster analysis against the "journal" label on the saved hSBM
# ("aps") and triSBM ("aps_key") output directories, given as absolute paths.
clusteranalysis(os.getcwd()+"/aps/", ["journal"], algorithm="topsbm")
clusteranalysis(os.getcwd()+"/aps_key/", ["journal"], algorithm="trisbm")

# Read

In [None]:
# Reload the three fitted models from the pickle files written earlier,
# rebinding `sbmtm`, `trisbm` and `nsbm` to the loaded instances.
# SECURITY: unpickling executes arbitrary code; only load files produced by
# this notebook or another trusted source.
import cloudpickle as pickle

with open("sbmtm.pkl", "rb") as file:
    sbmtm = pickle.load(file)
    
with open("trisbm.pkl", "rb") as file:
    trisbm = pickle.load(file)
    
with open("nsbm.pkl", "rb") as file:
    nsbm = pickle.load(file)