In [1]:
%load_ext watermark
%watermark  -a Filippo_Valle -v -m -g -r -v -p pandas,numpy,requests,xml,graph_tool,cloudpickle,regex,topicpy,matplotlib,plotly

Author: Filippo_Valle

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

pandas     : 1.2.3
numpy      : 1.19.0
requests   : 2.25.1
xml        : unknown
graph_tool : 2.37 (commit afba9459, )
cloudpickle: 1.6.0
regex      : 2021.4.4
topicpy    : 0.2.1
matplotlib : 3.4.1
plotly     : 4.14.3

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 5.8.0-50-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

Git hash: aec0cb27d23e921cf53771b154b07fadbbd6854a

Git repo: git@github.com:fvalle1/epj.git



In [None]:
import pandas as pd
import numpy as np
import os,sys
sys.path.append("../")
from nlp import process_phrase
import requests as req
import datetime as dt
import xml
import xml.etree.ElementTree as ET 
import logging
# Notebook-wide logger used by the download and parsing helpers.
log = logging.getLogger("plos")
# logging.getLogger returns the same object for the same name, so re-running
# this cell would stack one more handler each time and print every message
# several times. Attach the stream handler only when none is present yet.
if not log.handlers:
    log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)

In [None]:
def get_papers_from(date: dt.datetime):
    """Ask the PMC Open Access web service for its record list from `date` on.

    Parameters
    ----------
    date : datetime.datetime
        Only the calendar date is used: it is formatted as ``YYYY-MM-DD`` and
        sent as the ``from`` query parameter.

    Returns
    -------
    list of xml.etree.ElementTree.Element
        Every ``records/record`` element of the response (an empty list when
        the response contains none).

    Raises
    ------
    requests.HTTPError
        When the service answers with an HTTP error status.
    requests.Timeout
        When the service does not answer within the timeout.
    """
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
    # Without a timeout a stalled connection would block the notebook forever.
    r = req.get(url, params={"from": date.strftime("%Y-%m-%d")}, timeout=60)
    # Fail with the HTTP error itself instead of an XML parse error on an error page.
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared by the
    # document, not the charset requests guessed when building `r.text`.
    # NOTE(review): only this single response is read; confirm whether the
    # service splits long result lists over several pages.
    return ET.fromstring(r.content).findall("./records/record")

# Record list starting from 1 January 2021. get_papers_from only sends the
# calendar date, so the 08:00:00 time component has no effect on the query.
papers = get_papers_from(dt.datetime(2021, 1, 1, 8, 0, 0))

In [None]:
print(len(papers))
papers[0].attrib

In [None]:
def get_info(pmid:str)->xml.etree.ElementTree.Element:
    """Fetch the Open Access record of one article.

    Parameters
    ----------
    pmid : str
        Identifier sent to the service as the ``id`` query parameter.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The first ``records/record`` element of the response. ``None`` when
        the HTTP status is not 200 or when the response has no such element.

    Raises
    ------
    requests.Timeout
        When the service does not answer within the timeout.
    """
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
    # Without a timeout a stalled connection would block the caller forever.
    r = req.get(url, params={"id":pmid}, timeout=60)
    if r.status_code != 200:
        return None
    # Parse the raw bytes so the XML parser applies the encoding declared by the
    # document, not the charset requests guessed when building `r.text`.
    return ET.fromstring(r.content).find("./records/record")
    
def get_paper(paper: xml.etree.ElementTree.Element)->xml.etree.ElementTree.Element:
    """Download and unpack the article package of `paper` under ``./data``.

    The record is looked up again with `get_info`, using the ``id`` attribute
    of `paper`. Its ``tgz`` link is downloaded with ``wget`` and unpacked with
    ``tar`` inside ``./data``; the archive is deleted afterwards. Nothing is
    downloaded when ``./data`` already contains an entry named after the id.

    Parameters
    ----------
    paper : xml.etree.ElementTree.Element
        Record element carrying an ``id`` attribute.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The record returned by `get_info`. ``None`` when the record cannot be
        fetched, lists no ``tgz`` package, or the download/unpack step fails.
    """
    import subprocess  # local import: only this helper spawns external tools

    paper_name = paper.attrib["id"]
    paper = get_info(paper_name)
    if paper is None:
        return None

    os.makedirs("data", exist_ok=True)
    # Already unpacked by an earlier run: skip the download.
    if paper_name in os.listdir("./data"):
        return paper

    tgz_links = [link.attrib["href"] for link in paper.findall("link") if link.get("format") == "tgz"]
    if not tgz_links:
        log.debug("no tgz package listed for %s", paper_name)
        return None
    url = tgz_links[0].replace("ftp://", "http://")

    archive = "{}.tar.gz".format(paper.attrib["id"])
    archive_path = os.path.join("data", archive)
    try:
        # Argument lists (no shell) keep the remote-supplied URL from being
        # interpreted as shell syntax; cwd= replaces the os.chdir round trip, so
        # the process working directory is never changed.
        subprocess.run(["wget", "-O", archive, url], cwd="data", check=True)
        subprocess.run(["tar", "-xzf", archive], cwd="data", check=True)
    except (OSError, subprocess.CalledProcessError) as err:
        log.debug("could not fetch %s: %r", paper_name, err)
        return None
    finally:
        # Never leave the archive (complete or partial) behind.
        if os.path.exists(archive_path):
            os.remove(archive_path)
    return paper

def parse_paper(paper: xml.etree.ElementTree.Element)->tuple:
    """Extract journal, keywords and abstract text from an unpacked article.

    Reads the first file whose name contains ``.nxml`` in
    ``data/<id>/``, where ``<id>`` is the ``id`` attribute of `paper`.

    Parameters
    ----------
    paper : xml.etree.ElementTree.Element
        Record element carrying an ``id`` attribute.

    Returns
    -------
    tuple
        ``(journal, keywords, abstract)``:

        * ``journal`` - list with the text of every ``journal-id`` element
          whose ``journal-id-type`` is ``nlm-ta``;
        * ``keywords`` - list of non-empty keywords, each passed through
          `process_phrase`, with spaces turned into ``_`` and prefixed by ``#``;
        * ``abstract`` - `process_phrase` applied to the abstract text.

    Raises
    ------
    FileNotFoundError
        When the article directory holds no ``.nxml`` file.
    ValueError
        When the article has no ``kwd-group`` or no ``abstract`` element.
    """
    paper_name = paper.attrib["id"]
    paper_dir = f"data/{paper_name}/"
    nxml_files = [file for file in os.listdir(paper_dir) if ".nxml" in file]
    if not nxml_files:
        raise FileNotFoundError(f"no .nxml file found in {paper_dir}")
    article = ET.parse(paper_dir + nxml_files[0]).getroot()

    journal = [meta.text for meta in article.findall("front/journal-meta/journal-id") if meta.get('journal-id-type')=='nlm-ta']

    kwd_group = article.find("./front/article-meta/kwd-group")
    if kwd_group is None:
        raise ValueError(f"{paper_name}: article has no kwd-group")
    # Only kwd children are keywords; a title child of the group is not.
    # itertext() also covers keywords whose text sits inside nested markup,
    # where .text alone would be None.
    keywords = ["".join(kwd.itertext()) for kwd in kwd_group.findall("kwd")]
    keywords = [process_phrase(keyword).replace(" ","_") for keyword in keywords]
    keywords = ["#"+keyword for keyword in keywords if len(keyword)>0] #remove ''

    # A path ending in "/" makes ElementTree select the *children* of the
    # abstract, so find() used to return only its first child, and tostring()
    # then leaked XML markup into the text. Collect the text of the whole
    # abstract instead; the space keeps adjacent paragraphs from fusing.
    abstract_node = article.find("./front/article-meta/abstract")
    if abstract_node is None:
        raise ValueError(f"{paper_name}: article has no abstract")
    abstract = " ".join(abstract_node.itertext())
    return journal, keywords, process_phrase(abstract)

def get_and_parse(paper: xml.etree.ElementTree.Element)->tuple:
    """Download and parse `paper`, never raising on a per-paper failure.

    Parameters
    ----------
    paper : xml.etree.ElementTree.Element
        Record element carrying an ``id`` attribute.

    Returns
    -------
    tuple
        The ``(journal, keywords, abstract)`` triple from `parse_paper`, or
        ``(None, None, None)`` when the paper could not be fetched or parsed.
    """
    try:
        fetched = get_paper(paper)
        if fetched is None:
            return None, None, None
        return parse_paper(fetched)
    # `Exception` (not a bare except) keeps the best-effort behaviour for bad
    # papers while letting KeyboardInterrupt / SystemExit stop the run.
    except Exception as err:
        # Record why the paper was skipped instead of dropping it silently.
        log.debug("skipping %s: %r", paper.attrib.get("id"), err)
        return None, None, None

In [None]:
# Smoke test on one record (index 5 of the list): fetch its package, or reuse
# the copy already present under data/.
paper = get_paper(papers[5])

In [None]:
paper.attrib

In [None]:
# Parse the same record. parse_paper reads from data/<id>/, so the package of
# papers[5] must already have been unpacked there by get_paper.
parse_paper(papers[5])

In [None]:
def get_paper_dfs(paper):
    """Build the three single-paper tables for `paper`.

    Parameters
    ----------
    paper : xml.etree.ElementTree.Element
        Record element carrying an ``id`` attribute; the id labels the paper
        in every returned table.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``(df, df_meta, df_files)``:

        * ``df`` - one column named after the paper id, indexed by the distinct
          space-separated tokens of the abstract, holding their counts;
        * ``df_meta`` - one column named after the paper id, indexed by the
          distinct keywords, filled with 1;
        * ``df_files`` - one row named after the paper id with a ``journal``
          column.

        ``None`` when the paper could not be fetched/parsed or has no journal.
    """
    journal, labels, text = get_and_parse(paper)
    paper_id = paper.attrib["id"]

    # Without a journal the paper cannot be labelled; the one-element Series
    # built here before failed with a length error in that case.
    if text is None or not journal:
        return None

    tokens, counts = np.unique(text.split(" "), return_counts=True)
    df = pd.DataFrame({paper_id: counts}, index=tokens)

    # Two raw keywords can normalise to the same label; a repeated index entry
    # would produce duplicated rows in every later outer join.
    unique_labels = list(dict.fromkeys(labels))
    df_meta = pd.DataFrame({paper_id: 1}, index=unique_labels)

    # Built directly rather than with DataFrame.append, which is deprecated
    # and absent from pandas 2.
    df_files = pd.DataFrame({"journal": [journal[0]]}, index=[paper_id])

    return df, df_meta, df_files

def append_callback(x):
    """Merge one worker result into the global ``df``, ``df_meta``, ``df_files``.

    Parameters
    ----------
    x : tuple of pandas.DataFrame or None
        The ``(df, df_meta, df_files)`` triple for one paper, or ``None`` for
        a paper that produced nothing (ignored).

    Returns
    -------
    None
        The three module-level tables are rebound in place of a return value.
    """
    global df
    global df_meta
    global df_files
    if x is None:
        return None

    df_j, df_meta_j, df_files_j = x
    try:
        # Build all three merged tables first, so that a failure part-way
        # cannot leave the globals describing different sets of papers.
        merged_df = df.join(df_j, how="outer") # join new articles
        merged_meta = df_meta.join(df_meta_j, how="outer") #join new articles
        # pd.concat replaces DataFrame.append (deprecated, absent from pandas 2).
        merged_files = pd.concat([df_files, df_files_j])
    except Exception as err:
        # Still best-effort (e.g. a paper id that was already merged), but the
        # reason is logged instead of being dropped silently.
        log.debug("could not merge result: %r", err)
        return None

    df, df_meta, df_files = merged_df, merged_meta, merged_files

In [None]:
import multiprocessing as mp

In [None]:
# Fresh accumulators; append_callback rebinds these globals as results arrive.
df, df_meta = pd.DataFrame(), pd.DataFrame()
df_files = pd.DataFrame(columns=["journal"])

pool = mp.Pool(processes=12)

# One asynchronous task per record. Results are merged by append_callback;
# an exception raised inside a worker is only written to the debug log.
work = [
    pool.apply_async(
        get_paper_dfs,
        (paper,),
        callback=append_callback,
        error_callback=lambda err: log.debug(err),
    )
    for paper in papers
]

# No further tasks are submitted; wait here until every task has finished.
pool.close()
pool.join()

In [None]:
# Names of the ten journals with the most papers: count rows per journal, sort
# ascending by that count and keep the last ten. Then keep only their papers.
common_journals = df_files.reset_index().groupby(["journal"]).count().sort_values("index")[-10:].index.values
df_files = df_files[df_files["journal"].isin(common_journals)] 

# Missing counts become 0 and the table becomes integer.
# NOTE(review): drop_duplicates() compares row *values*, not index labels, so
# two different words with identical counts in every paper collapse into one
# row - confirm this is intended.
df = df.fillna(0).astype(int).drop_duplicates()

'''
do reindex in two steps to avoid undefined behaviour
the sum is made on the new index
'''

df = df.reindex(columns=df.columns[df.columns.isin(df_files.index.dropna())]) # keep only papers that still have a row in df_files
df = df.reindex(index=list(filter(lambda x:len(x)>0,df.index))) # remove '' from words

# O[word] = number of papers in which the word occurs at least once.
O = df.apply(lambda x: (x>0).sum(), axis=1)
df = df.reindex(index = df.index[O>5]) # words that appear in more than 5 papers
df = df.reindex(columns = df.columns[df.sum(0) > 5]) # papers whose total word count (with repetition) is above 5

# NOTE(review): as above, drop_duplicates() also merges distinct keywords that
# are attached to exactly the same papers - confirm this is intended.
df_meta = df_meta.fillna(0).astype(int).drop_duplicates()

df_meta = df_meta.reindex(columns=df.columns) # same papers, in the same order, as df
df_meta = df_meta.reindex(index=df_meta.index[df_meta.sum(1)>1]) # keywords attached to more than one paper

In [None]:
common_journals

In [None]:
df.sum(0).min()

In [None]:
print(df.shape)
print(df_meta.shape)
print(df_files.shape)

In [None]:
df_meta.sum(1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots()

# Relative frequency of each word inside each paper (column divided by its
# sum), averaged over papers and sorted from most to least frequent.
word_freq = df.divide(df.sum(0),1).mean(1).sort_values(ascending=False)

# Plot against explicit 1-based ranks. Plotting the word-indexed Series through
# pandas places the points at positions 0..n-1, so on the logarithmic x axis the
# top word (position 0) cannot be drawn and every rank is shifted by one with
# respect to the reference line below.
ranks = np.arange(1, len(word_freq) + 1)
ax.plot(ranks, word_freq.values, label="words")

# Reference power law.
x = np.linspace(1,1e3)
ax.plot(x,1e-1*x**(-0.9), label=r"$0.1\,x^{-0.9}$")

ax.set_yscale("log")
ax.set_xscale("log")
ax.set_xlabel("rank")
ax.set_ylabel("mean relative frequency")
ax.legend()
fig.savefig("zipf.pdf")

# Make hSBM graph

In [2]:
import sys
sys.path.append("../../hSBM_Topicmodel/")

In [None]:
import graph_tool.all as gt
from sbmtm import sbmtm

In [None]:
# `sbmtm = sbmtm()` replaced the imported class with the instance, so running
# this cell a second time failed because the instance is not callable. Reach the
# class through the module instead; the name `sbmtm` still ends up bound to the
# model instance, which is what the following cells use.
import sbmtm as sbmtm_module

sbmtm = sbmtm_module.sbmtm()
# Build the graph from the word-by-paper count table and store it on disk.
sbmtm.make_graph_from_BoW_df(df)
sbmtm.save_graph("plos.xml.gz")

In [None]:
g = sbmtm.g
g

In [None]:
# Fit the hSBM model on the graph built above.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# repeated runs may not give the same partition - confirm whether one should be fixed.
sbmtm.fit(n_init=5, verbose=False, B_min=20, parallel=True)

In [None]:
# Store the get_groups() result of hierarchy levels 0 and 1 in sbmtm.groups
# (presumably a per-level cache that save_data reads - TODO confirm).
sbmtm.groups[0]=sbmtm.get_groups(0)
sbmtm.groups[1]=sbmtm.get_groups(1)

In [None]:
import shutil

# Start from an empty plos/ tree holding the paper table and a topsbm/ folder.
shutil.rmtree("plos", ignore_errors=True)
os.makedirs(os.path.join("plos", "topsbm"))
df_files.to_csv(os.path.join("plos", "files.dat"))

# save_data() / save_graph() are called without a path, so presumably they write
# into the current directory - hence the temporary chdir. The finally block
# restores the working directory even when a save call raises; otherwise every
# later cell would silently run from inside plos/topsbm.
start_dir = os.getcwd()
os.chdir(os.path.join("plos", "topsbm"))
try:
    sbmtm.save_data()
    sbmtm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the hierarchy stored in sbmtm.state with graph-tool, bipartite layout.
gt.draw_hierarchy(sbmtm.state, layout="bipartite", hedge_pen_width=8, hvertex_size=25)

## triSBM

In [3]:
sys.path.append("../../trisbm/")
from trisbm import trisbm

In [None]:
# `trisbm = trisbm()` replaced the imported class with the instance, so running
# this cell a second time failed because the instance is not callable. Reach the
# class through the module instead; the name `trisbm` still ends up bound to the
# model instance, which is what the following cells use.
import trisbm as trisbm_module

trisbm = trisbm_module.trisbm()
# Stack the word rows and the keyword rows into one table (pd.concat replaces
# DataFrame.append, which is deprecated and absent from pandas 2). The second
# argument gives each row label a kind: 2 for keywords, 1 for words.
trisbm.make_graph(pd.concat([df, df_meta]), lambda word_keyword: 2 if word_keyword in df_meta.index else 1)

In [None]:
trisbm.save_graph("plos_keyword.xml.gz")

In [None]:
# Fit the triSBM model on the word + keyword graph.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# repeated runs may not give the same partition - confirm whether one should be fixed.
trisbm.fit(n_init=5, verbose=False, B_min=10)

In [None]:
import os

In [None]:
import shutil

# Start from an empty plos_key/ tree holding the paper table and a trisbm/ folder.
shutil.rmtree("plos_key", ignore_errors=True)
os.makedirs(os.path.join("plos_key", "trisbm"))
df_files.to_csv(os.path.join("plos_key", "files.dat"))

# save_data() / save_graph() are called without a path, so presumably they write
# into the current directory - hence the temporary chdir. The finally block
# restores the working directory even when a save call raises; otherwise every
# later cell would silently run from inside plos_key/trisbm.
start_dir = os.getcwd()
os.chdir(os.path.join("plos_key", "trisbm"))
try:
    trisbm.save_data()
    trisbm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the hierarchy stored in trisbm.state with graph-tool (default layout;
# an explicit-position variant is left commented out below).
gt.draw_hierarchy(trisbm.state, 
                  #pos=gt.sfdp_layout(model.g),
                  hedge_pen_width=8, 
                  hvertex_size=25
                 )

## Benchmark

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
# Label(s) the partitions are scored against.
labels = ["journal"]
# Scores of the triSBM output saved under plos_key/; the result is indexed by label.
scores = get_scores("plos_key", labels, algorithm="trisbm", verbose=False)
# Re-key each entry by algorithm name, which is how the plots below look them up.
scores['trisbm'] = scores[labels[0]]
scores["hsbm"]=get_scores("plos", labels, algorithm="topsbm", verbose=False)[labels[0]]
# Baseline computed by get_scores_shuffled from the hSBM output and the saved paper
# table (presumably with the labels shuffled - TODO confirm against topicpy).
scores['shuffle'] = get_scores_shuffled("plos", pd.read_csv("plos/files.dat", sep=",", index_col=0), label=labels[0], algorithm='topsbm')
# Combine every entry with the 'shuffle' baseline through x/y (presumably this is
# what adds the "norm_V" values read by the plotting cells - TODO confirm).
normalise_score(scores, base_algorithm="shuffle", operation=lambda x,y: x/y)

In [None]:
fig=plt.figure(figsize=(18,15))
ax = fig.subplots(1)
# Normalised score curves of both models and of the shuffled baseline.
add_score_lines(ax,scores,labels=["hsbm","trisbm", "shuffle"], V="norm_V", alpha=1)
ax.set_xscale('log')
ax.set_ylim(0,max(map(lambda s: max(s["norm_V"]), scores.values()))*1.1)
# Only the upper x limit is set: 0 is not a valid bound on a logarithmic axis
# (matplotlib warns and ignores it), so the lower bound stays automatic.
ax.set_xlim(right=max(map(lambda s: max(s["xl"]), scores.values()))*1.1)

# Save before showing, so the file does not depend on how the active backend
# treats the figure once it has been displayed.
fig.savefig("metric_scores.pdf")
plt.show()

In [None]:
import plotly.graph_objects as go

In [None]:
# One bar per model: get_mdl() divided by the number of edges of its graph.
bars = [
    go.Bar(name="hSBM", y=[sbmtm.get_mdl()/sbmtm.g.num_edges()]),
    go.Bar(name="triSBM", y=[trisbm.get_mdl()/trisbm.g.num_edges()]),
]
fig = go.Figure()
fig.add_traces(bars)

# Font settings shared by both axes.
titlefont = dict(size=30)
tickfont = dict(size=25)

layout = dict(
    title="Plos dataset",
    xaxis=dict(
        title="Resolution",
        titlefont=titlefont,
        tickfont=tickfont,
    ),
    yaxis=dict(
        title="∑/E",
        type="log",
        #"range": [10e3,20e3],
        titlefont=titlefont,
        tickfont=tickfont,
    ),
    legend=dict(font_size=35),
)

fig.update_layout(layout)
#fig.write_image("metric_entropies_bar.pdf")

In [None]:
# Normalised scores of the two models, drawn as two bar series.
bars = [
    go.Bar(name="hSBM", y=scores["hsbm"]["norm_V"]),
    go.Bar(name="triSBM", y=scores["trisbm"]["norm_V"]),
]
fig = go.Figure()
fig.add_traces(bars)

# Font settings shared by both axes.
titlefont = dict(size=30)
tickfont = dict(size=25)

layout = dict(
    title="Plos dataset",
    xaxis=dict(
        title="Resolution",
        titlefont=titlefont,
        tickfont=tickfont,
    ),
    yaxis=dict(
        title="NMI/NMI*",
        titlefont=titlefont,
        tickfont=tickfont,
    ),
    legend=dict(font_size=35),
)

fig.update_layout(layout)
#fig.write_image("metric_scores_bar.pdf")

In [None]:
from topicpy.hsbmpy import clusteranalysis

In [None]:
# Run the cluster analysis against the journal label on each saved model
# output, addressed by absolute path from the current working directory.
clusteranalysis(f"{os.getcwd()}/plos/", ["journal"], algorithm="topsbm")
clusteranalysis(f"{os.getcwd()}/plos_key/", ["journal"], algorithm="trisbm")

In [None]:
import cloudpickle as pickle

# Persist both fitted models, hSBM first, each in its own pickle file.
for pickle_path, model in (("sbmtm.pkl", sbmtm), ("trisbm.pkl", trisbm)):
    with open(pickle_path, "wb") as file:
        pickle.dump(model, file)