In [None]:
%load_ext watermark
%watermark  -a Filippo_Valle -v -m -g -r -v -p pandas,numpy,graph_tool,cloudpickle,regex,topicpy,matplotlib,plotly

In [None]:
import pandas as pd
import json
import os
import regex as re
import numpy as np
import graph_tool.all as gt
import logging
import cloudpickle as pickle
# Notebook-wide "aps" logger that echoes DEBUG-and-above messages through a stream handler.
log = logging.getLogger("aps")
# Attach the handler only once: re-running this cell would otherwise stack a new
# StreamHandler on the same logger and print every message several times.
if not log.handlers:
    log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)
import sys
sys.path.append("../")
from nlp import singularize, process_phrase

In [None]:
# Show the top-level entries of the APS metadata dump; the cells below read
# per-journal sub-folders from this same directory.
os.listdir("aps-dataset-metadata-2020/")

In [None]:
def parse_article(filename):
    """Extract the fields used in this notebook from one APS article JSON file.

    Parameters
    ----------
    filename : str
        Path of the article metadata file (JSON).

    Returns
    -------
    tuple
        ``(doi, title, labels, journal, authors, nations)`` where

        * ``doi`` is the article ``"id"``;
        * ``title`` is ``article["title"]["value"]``;
        * ``labels`` is the list of PhySH discipline labels, or ``[]`` when the
          article carries no such classification;
        * ``journal`` is ``article["journal"]["id"]``;
        * ``authors`` is the list of author surnames;
        * ``nations`` holds the unique 4-11 digit runs found in the affiliation
          names (the first run of each affiliation; treated as ZIP codes). It is
          a sorted NumPy array, or ``[]`` when the article has no affiliation list.

    Raises
    ------
    KeyError
        If a mandatory field (id, title, journal, authors, surname) is missing.
    """
    # Read explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        article = json.load(file)
    doi = article["id"]
    title = article["title"]["value"]
    try:
        labels = [label["label"] for label in article["classificationSchemes"]["physh"]["disciplines"]]
    except (KeyError, TypeError):
        # Only "classification missing or malformed" is tolerated; anything else propagates.
        labels = []

    journal = article["journal"]["id"]
    authors = [author["surname"] for author in article["authors"]]

    pattern = "[0-9]{4,11}" #ZIPCODE
    affiliations = article.get("affiliations")
    if not isinstance(affiliations, list):
        nations = []
    else:
        codes = []
        for affiliation in affiliations:
            name = affiliation.get("name") if isinstance(affiliation, dict) else None
            if not isinstance(name, str):
                # A single affiliation without a usable name must not discard
                # the ZIP codes of all the other affiliations.
                continue
            match = re.search(pattern, name)  # searched once per affiliation
            if match is not None:
                codes.append(match.group())
        nations = np.unique(codes)
    return doi, title, labels, journal, authors, nations
    
# Smoke-test the parser on a single PRX article; the commented-out line below is
# an alternative PRD input.
parse_article("aps-dataset-metadata-2020/PRX/8/PhysRevX.8.021023.json")
#parse_article("aps-dataset-metadata-2020/PRD/102/PhysRevD.102.014505.json")

In [None]:
#10.1103/PhysRevD.102.014505

In [None]:
def get_journal_dfs(journal, max_articles=2500):
    """Build the labelled records of one journal volume.

    Despite the name, the result is a list of plain dicts (one per article),
    not a DataFrame.

    Parameters
    ----------
    journal : str
        Journal folder name inside ``aps-dataset-metadata-2020/`` (e.g. ``"PRA"``).
    max_articles : int or None, optional
        Maximum number of article files read from the volume (default 2500, the
        value previously hard-coded). ``None`` reads the whole volume.

    Returns
    -------
    list of dict
        Records with keys ``"text"`` (the processed title), ``"label"`` (the
        journal id stored in the article itself) and ``"metadata"`` (comma-joined
        ``labels``, ``authors`` and ``areas`` plus the ``doi``).
    """
    # Volume choice kept as in the original: "100" when the journal code contains an "E", "1" otherwise.
    volume = "1" if "E" not in journal else "100"
    issue = "aps-dataset-metadata-2020/{}/{}/".format(journal, volume)
    # os.listdir returns entries in arbitrary order; sorting makes the selected
    # subset the same on every run and every machine.
    articles = sorted(os.listdir(issue))[:max_articles]
    labelled = []
    for article in articles:
        # A distinct name for the parsed journal id avoids shadowing the `journal` parameter.
        doi, title, labels, article_journal, authors, areas = parse_article(issue+article)
        labelled.append(
        {
            "text": process_phrase(title),
            "label": article_journal,
            "metadata":{
                "labels": ",".join(labels),
                "authors": ",".join(authors),
                "doi": doi,
                "areas": ",".join(areas)
            }
        })
    return labelled

In [None]:
import multiprocessing as mp

In [None]:
# Parse the five journals in parallel, one get_journal_dfs call per journal code.
# The context manager tears the worker processes down even when the cell is
# interrupted or map_async raises, so no orphaned workers are left behind.
with mp.Pool(12) as pool:
    work = pool.map_async(
        get_journal_dfs,
        ["PRA", "PRB", "PRC", "PRD", "PRE"],
        # A failed worker is an error, not debug chatter.
        error_callback=lambda err: log.error(err),
    )

    # No more tasks will be submitted; block until every journal has been processed.
    pool.close()
    pool.join()

In [None]:
# Collect the per-journal record lists (get() re-raises any worker exception)
# and flatten them into a single array of records.
journal_batches = work.get()
data = np.concatenate(journal_batches)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 500 articles for evaluation. The fixed random_state makes the split,
# and therefore every score computed below, reproducible across kernel restarts.
train, test = train_test_split(data, test_size=500, random_state=42)

In [None]:
# Sanity check: should match the test_size requested in the split (500).
len(test)

In [None]:
import json
# Dump the training records as JSON Lines: one JSON object per line.
with open("train.jsonl", "w") as file:
    for article in train:
        print(json.dumps(article), file=file)

# Make OpenAI file

In [None]:
import openai
import os
# The API key is read from the OPENAI_KEY environment variable, so it never appears
# in the notebook; the lookup raises KeyError when the variable is not set.
openai.api_key = os.environ["OPENAI_KEY"]

In [None]:
# Upload the training examples for the classifications endpoint. The context
# manager closes the file handle once the upload call returns (the original
# left it open for the lifetime of the kernel).
with open("train.jsonl") as train_file:
    response = openai.File.create(file=train_file, purpose="classifications")
response

In [None]:
def get_prediction(text, file_id):
    """Classify one record with the OpenAI classifications endpoint.

    Parameters
    ----------
    text : dict
        Record with a ``"text"`` entry (the query) and a ``"metadata"`` dict
        holding the ``"doi"``.
    file_id : str
        Identifier of the uploaded examples file.

    Returns
    -------
    tuple
        ``(doi, label)``; ``label`` is ``None`` when the request failed.

    Raises
    ------
    KeyError
        If the record has no ``metadata``/``doi`` entry.
    """
    doi = text["metadata"]["doi"]
    try:
        response = openai.Classification.create(
            file=file_id,
            query=text["text"],
            search_model="ada",
            model="curie",
            max_examples=0
        )
        return doi, response.get("label")
    except Exception as err:
        # Still best-effort (a failed request yields no label), but the failure is
        # reported instead of silently dropped, and KeyboardInterrupt/SystemExit
        # are no longer swallowed as a bare `except:` would.
        logging.getLogger("aps").warning("classification failed for %s: %s", doi, err)
        return doi, None
    
# Try the classifier on a single held-out article before looping over the whole test set.
get_prediction(test[0], response.get("id"))

In [None]:
# Classify every held-out article and keep (doi, true journal, predicted journal).
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
file_id = response.get("id")
records = []
for article in test:
    doi, predicted = get_prediction(article, file_id)
    records.append({"doi": doi, "class": article["label"], "predicted": predicted})
results = pd.DataFrame(records, columns=["doi", "class", "predicted"])

In [None]:
# Discard the rows with a missing value (articles whose prediction failed).
results = results.dropna(axis=0, how="any")
results.head()

In [None]:
from sklearn.metrics import v_measure_score

In [None]:
# V-measure between the true journals and the journals predicted by OpenAI.
openai_score = v_measure_score(
    labels_true=results["class"],
    labels_pred=results["predicted"],
)

## Benchmark

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
# Collect, under one dict, the score curves of the topic-model runs that the OpenAI
# classifier is compared with. Every get_scores call is indexed by the "journal"
# label and stored under a short run name.
# NOTE(review): the first argument looks like the directory of each run — confirm against topicpy.
labels = ["journal"]
scores = get_scores("aps_key", labels, algorithm="trisbm", verbose=False)
# Alias the "journal" entry of the first run as 'trisbm'; the "journal" key itself stays in `scores`.
scores['trisbm'] = scores[labels[0]]
scores["hsbm"]=get_scores("aps", labels, algorithm="topsbm", verbose=False)[labels[0]]
scores["trisbm_zip"] = get_scores("aps_zip", labels, algorithm="trisbm", verbose=False)[labels[0]]
scores["trisbm_nauth"] = get_scores("aps_auth", labels, algorithm="trisbm", verbose=False)[labels[0]]
# NOTE(review): stored as "nsbm_nauth" but loaded with algorithm="trisbm" — confirm this is intended.
scores["nsbm_nauth"]=get_scores("aps_authors_count", labels, algorithm="trisbm", verbose=False)[labels[0]]
#scores["nsbm_zip"]=get_scores("aps_zip", labels, algorithm="trisbm", verbose=False)[labels[0]]
# Shuffled baseline, computed from the file table in aps/files.dat.
scores['shuffle'] = get_scores_shuffled("aps_key", pd.read_csv("aps/files.dat", sep=",", index_col=0), label=labels[0], algorithm='trisbm')
# Normalise against the shuffled baseline with a ratio (x/y); the plotting cell reads
# the "norm_V" entries, presumably written in place by this call — TODO confirm.
normalise_score(scores, base_algorithm="shuffle", operation=lambda x,y: x/y)

In [None]:
# x coordinate of the OpenAI point: the number of distinct predicted labels.
openai_x = len(results["predicted"].unique())

# Interpolate the shuffled-baseline V at that x (np.interp needs increasing x
# values, hence the common sort order) and divide the OpenAI score by it.
shuffle_xl = np.asarray(scores["shuffle"]["xl"])
shuffle_v = np.asarray(scores["shuffle"]["V"])
order = np.argsort(shuffle_xl)
norm_fact = np.interp(openai_x, shuffle_xl[order], shuffle_v[order])
openai_score_norm = openai_score/norm_fact

In [None]:
# Normalised score curves of the topic-model runs, with the OpenAI classifier
# drawn as a single large red marker.
fig, ax = plt.subplots(figsize=(18,15))
ax.scatter(openai_x, openai_score_norm, s = 5000, c="red", marker=".", label="openai")

add_score_lines(ax,scores,labels=["hsbm", "trisbm", "trisbm_zip", "trisbm_nauth", "nsbm_nauth", "shuffle"], V="norm_V", alpha=1)
# y range: 10% headroom above the best normalised score over every entry of `scores`.
best_norm_v = max(max(s["norm_V"]) for s in scores.values())
ax.set_ylim(0, best_norm_v*1.1)
ax.set_xlim(1,10)
# The x axis is linear. The original switched it to 'log' first and back to
# "linear" before drawing, so the log setting never reached the figure.
ax.set_xscale("linear")

plt.show()
#fig.savefig("metric_scores.pdf")

In [None]:
import plotly.graph_objects as go

In [None]:
# Bar chart of the best (un-normalised) V of each setting next to the OpenAI score.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=[max(scores["hsbm"]["V"])], name="hSBM"),
    go.Bar(y=[max(scores["trisbm"]["V"])], name="keywords"),
    #go.Bar(y=[max(scores["trisbm_zip"]["norm_V"])], name="zip codes"),
    #go.Bar(y=[max(scores["trisbm_nauth"]["norm_V"])], name="n authors"),
    #go.Bar(y=[max(scores["nsbm_nauth"]["V"])], name="keywords<br>+nauthors"),
    go.Bar(y=[openai_score], name="openAI")
])

titlefont = {
    "size": 30
}

tickfont = {
    "size":25
}

# Axis titles use the nested title.text / title.font form: the flat `titlefont`
# axis attribute is deprecated and has been dropped from recent Plotly releases.
layout = {
    "title":"APS dataset",
    "xaxis":{
        "title": {
            "text": "Setting",
            "font": titlefont
        },
        "tickfont": tickfont
    },
    "yaxis":{
        "title": {
            "text": "NMI",
            "font": titlefont
        },
        "tickfont": tickfont
    },
    "legend":{
        "font": {
            "size": 25
        }
    }
}

fig.update_layout(layout)
fig.show()
fig.write_image("metric_scores_kinds_openai.pdf")