In [2]:
import os

# Connection settings for the Neo4j instance the citation graph is exported
# from. Every value can be overridden through an environment variable so that
# real credentials never have to be written into the notebook; the fallbacks
# are the previous hard-coded local-development values.
dbconnection = {
    "username": os.environ.get("NEO4J_USERNAME", "neo4j"),
    "password": os.environ.get("NEO4J_PASSWORD", "password"),
    "url": os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
}
import json
from collections import deque
from pathlib import Path

from neo4j import GraphDatabase
# JSON file the citation graph is exported to and that every analysis cell
# below reads back in.
datapath = Path(
    "../master-database-files/master-experimental/neo4j_citing_data_conversion/papers.json"
)
# Stop immediately when the containing directory is missing (for example when
# the notebook is started from a different working directory).
assert datapath.parent.exists()

In [22]:
# Export every Paper node, together with its citation links and dataset tags,
# from Neo4j into the JSON file at `datapath`.
#
# One Cypher query with pattern comprehensions collects the neighbour ids and
# tag names for each paper. This replaces three additional queries per paper
# that were issued on the same session while the outer result was still being
# iterated.
PAPER_EXPORT_QUERY = """
MATCH (n:Paper)
RETURN n,
       id(n) AS id,
       [(n)-[:CITES]->(m) | id(m)] AS cited,
       [(n)<-[:CITES]-(m) | id(m)] AS citing,
       [(n)-[:TAGGED]->(t) | t.name] AS tags
"""

# Node properties copied verbatim into each exported record, in output order.
PAPER_PROPERTIES = ("arxiv", "doi", "semscholar", "pdf", "status", "title")

# Reuse the shared connection settings instead of repeating the literals.
with GraphDatabase.driver(
    dbconnection["url"],
    auth=(dbconnection["username"], dbconnection["password"]),
) as driver:
    with driver.session() as session:
        jsonarrayy = []
        for record in session.run(PAPER_EXPORT_QUERY):
            node = record["n"]
            tagNames = record["tags"]
            # Collapse the database tag names into the short dataset markers
            # used by the analysis cells: "liv" for HEPML, "ins" for inspire.
            tags = (["liv"] if "HEPML" in tagNames else []) + (["ins"] if "inspire" in tagNames else [])
            paper = {name: node[name] for name in PAPER_PROPERTIES}
            paper["id"] = record["id"]
            paper["cited"] = record["cited"]    # ids of papers this paper cites
            paper["citing"] = record["citing"]  # ids of papers citing this paper
            paper["tags"] = tags
            jsonarrayy.append(paper)

# Write only after the session is closed so the database connection is not
# held open during file I/O.
with datapath.open("w") as f:
    json.dump(jsonarrayy, f, indent=2)

In [18]:
def countCitations(semscholarId, path=None):
    """Count the status-2 neighbours of one paper in the exported graph.

    Args:
        semscholarId: value of the "semscholar" field of the paper to look up.
        path: optional location of the exported JSON file; defaults to the
            notebook-wide `datapath`.

    Returns:
        A tuple (totalCited, totalCiting): how many entries of the paper's
        "cited" list and of its "citing" list refer to papers whose "status"
        equals 2. Returns None when no paper has the given id.

    Raises:
        KeyError: if a "cited"/"citing" id does not occur in the file.
    """
    # The context manager closes the file handle as soon as parsing is done.
    with open(datapath if path is None else path) as graphFile:
        citegraph = json.load(graphFile)

    # Index the status by id once instead of scanning the whole list for each
    # citation. setdefault keeps the first occurrence of an id, matching the
    # previous "first match" lookup.
    statusById = {}
    for paper in citegraph:
        statusById.setdefault(paper["id"], paper["status"])

    for paper in citegraph:
        if paper["semscholar"] == semscholarId:
            totalCited = sum(1 for cited in paper["cited"] if statusById[cited] == 2)
            totalCiting = sum(1 for citing in paper["citing"] if statusById[citing] == 2)
            return (totalCited, totalCiting)
    return None

In [4]:
# Load the exported citation graph once and index it by node id; the helper
# predicates below look papers up through this index. read_text() opens and
# closes the file itself, so no file handle is left dangling.
papers = json.loads(datapath.read_text())
papersById = {paper["id"]: paper for paper in papers}
def isValid(paperID):
    """Tell whether a paper has status 2 and carries at least one tag.

    `paperID` must be a key of `papersById`; an unknown id raises KeyError.
    """
    entry = papersById[paperID]
    return entry["status"] == 2 and len(entry["tags"]) > 0
def isInDataset(paperID):
    """Tell whether the paper stored under `paperID` has at least one tag.

    `paperID` must be a key of `papersById`; an unknown id raises KeyError.
    """
    tagList = papersById[paperID]["tags"]
    return len(tagList) > 0

In [7]:
def printPaperStatistics():
    """Print summary counts for the exported citation graph.

    Three groups are printed, separated by blank lines: valid papers (per
    `isValid`) split by dataset tag, tagged papers split by tag and by
    status, and totals for papers whose status is not 2 or that have no tags.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    # Sub-lists shared by several of the counts below.
    validPapers = [p for p in citegraph if isValid(p["id"])]
    taggedPapers = [p for p in citegraph if len(p["tags"]) > 0]
    errorPapers = [p for p in citegraph if p["status"] != 2]

    def countWithTags(candidates, *tags):
        """Count the papers in `candidates` that carry every tag in `tags`."""
        return sum(1 for p in candidates if all(tag in p["tags"] for tag in tags))

    print("Valid papers:", len(validPapers))
    print("Valid papers from the living review:", countWithTags(validPapers, "liv"))
    print("Valid papers from the inspirehep search:", countWithTags(validPapers, "ins"))
    print("Valid papers from the living review and inspirehep search:", countWithTags(validPapers, "liv", "ins"))

    print("")
    print("Tagged papers:", len(taggedPapers))
    print("Tagged papers with errors:", sum(1 for p in taggedPapers if p["status"] != 2))
    print("Papers from the living review:", countWithTags(citegraph, "liv"))
    print("Papers from the inspirehep search:", countWithTags(citegraph, "ins"))
    print("Overlapping papers:", countWithTags(citegraph, "liv", "ins"))

    print("")
    print("Papers with errors:", len(errorPapers))
    print("Papers with error and more then zero citation or references:", sum(1 for p in errorPapers if len(p["cited"]) > 0 or len(p["citing"]) > 0))
    print("Papers without tags:", sum(1 for p in citegraph if len(p["tags"]) == 0))
    print("Total papers:", len(citegraph))
printPaperStatistics()

Valid papers: 2409
Valid papers from the living review: 912
Valid papers from the inspirehep search: 2223
Valid papers from the living review and inspirehep search: 726

Tagged papers: 2462
Tagged papers with errors: 53
Papers from the living review: 924
Papers from the inspirehep search: 2266
Overlapping papers: 728

Papers with errors: 58
Papers with error and more then zero citation or references: 0
Papers without tags: 28
Total papers: 2490


In [22]:
def getListOfPapersByCitedCount():
    """Rank the valid papers by how many valid papers they cite.

    Returns:
        A list of (count, title, semscholar id) tuples, one per valid paper,
        where count is the number of valid ids in the paper's "cited" list.
        The list is sorted by count in descending order; papers with equal
        counts keep the order they have in the file.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    ranking = []
    for paper in citegraph:
        if not isValid(paper["id"]):
            continue
        validCitedCount = sum(1 for citedId in paper["cited"] if isValid(citedId))
        ranking.append((validCitedCount, paper["title"], paper["semscholar"]))
    # Sort on the count only; the sort is stable, so ties keep file order.
    ranking.sort(key=lambda entry: entry[0], reverse=True)
    return ranking

In [23]:
def printMostCitingPapers(nr):
    """Print the first `nr` entries of the ranking by cited count.

    Each entry of `getListOfPapersByCitedCount()` is printed as its count
    followed by the title and Semantic Scholar id in braces. Fewer entries
    are printed when the ranking holds fewer than `nr` papers.
    """
    ranking = getListOfPapersByCitedCount()
    for position in range(nr):
        if position >= len(ranking):
            # Ranking exhausted; nothing further can be printed.
            break
        entry = ranking[position]
        print(f'''
{entry[0]}: {{
    '{entry[1]}'
    '{entry[2]}'
}}''')

In [24]:
# Show the three valid papers that cite the most other valid papers.
printMostCitingPapers(3)


385: {
    'A Living Review of Machine Learning for Particle Physics'
    '87420b5957e7c28b6170f5ef09b49cc83cf6a2a9'
}

145: {
    'Machine learning in the search for new fundamental physics'
    'f755718a3f6a08c3659357bc20897eb4e124cce1'
}

118: {
    'A guide for deploying Deep Learning in LHC searches: How to achieve optimality and account for uncertainty'
    'afea68ae8c74e035656c4ad424707f5342127bc4'
}


In [38]:
def getListOfPapersByCitingCount():
    """Rank the valid papers by how many valid papers cite them.

    Returns:
        A list of (count, title, semscholar id) tuples, one per valid paper,
        where count is the number of valid ids in the paper's "citing" list.
        The list is sorted by count in descending order; papers with equal
        counts keep the order they have in the file.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    def validCitingCount(paper):
        """Number of valid papers listed as citing `paper`."""
        return sum(1 for citingId in paper["citing"] if isValid(citingId))

    ranking = [
        (validCitingCount(paper), paper["title"], paper["semscholar"])
        for paper in citegraph
        if isValid(paper["id"])
    ]
    # Sort on the count only; sorted() is stable, so ties keep file order.
    return sorted(ranking, key=lambda entry: entry[0], reverse=True)

In [33]:
def printMostCitedPapers(nr):
    """Print the first `nr` entries of the ranking by citing count.

    Each entry of `getListOfPapersByCitingCount()` is printed as its count
    followed by the title and Semantic Scholar id in braces. Fewer entries
    are printed when the ranking holds fewer than `nr` papers.
    """
    ranking = getListOfPapersByCitingCount()
    # zip stops at the shorter of the two, which caps the output at `nr`
    # entries or at the end of the ranking, whichever comes first.
    for _, entry in zip(range(nr), ranking):
        print(f'''
{entry[0]}: {{
    '{entry[1]}'
    '{entry[2]}'
}}''')

In [39]:
# Show the three valid papers that are cited by the most other valid papers.
printMostCitedPapers(3)


175: {
    'Jet-images — deep learning edition'
    '5460c1cde49a0d89575f9b4e59d51c06139f436b'
}

140: {
    'Jet substructure at the Large Hadron Collider: A review of recent advances in theory and machine learning'
    '702269ca81e00177943247c855764c764d93e193'
}

133: {
    'Deep learning in color: towards automated quark/gluon jet discrimination'
    '155260612d9de2b0f219a58720dbf2a78cab623c'
}


In [44]:
def countAverageCitationsOfValidPapers():
    """Average the number of valid neighbours over all valid papers.

    Returns:
        A tuple (averageCited, averageCiting): the mean number of valid ids
        in the "cited" lists and in the "citing" lists of the valid papers.

    Raises:
        ZeroDivisionError: if the file contains no valid paper.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    validPapers = [p for p in citegraph if isValid(p["id"])]
    totalCited = sum(
        sum(1 for c in p["cited"] if isValid(c)) for p in validPapers
    )
    totalCiting = sum(
        sum(1 for c in p["citing"] if isValid(c)) for p in validPapers
    )
    paperCount = len(validPapers)
    return totalCited / paperCount, totalCiting / paperCount

In [45]:
def printAverageCitationsOfValidPapers():
    """Print the average citations and references per valid paper.

    `countAverageCitationsOfValidPapers` returns the "cited" average first
    and the "citing" average second. A paper's "cited" list holds the papers
    it cites (its references) and its "citing" list the papers that cite it
    (its citations), so the labels are matched accordingly. They were swapped
    before; the two numbers coincide whenever every link is recorded on both
    of its endpoints.
    """
    averageReferences, averageCitations = countAverageCitationsOfValidPapers()
    print(f"Average citations of valid papers: {averageCitations}")
    print(f"Average references of valid papers: {averageReferences}")
printAverageCitationsOfValidPapers()

Average citations of valid papers: 6.219178082191781
Average references of valid papers: 6.219178082191781


In [56]:
def GetAveragePathLengthOfConnectedPapers():
    """Return the mean shortest-path length between connected valid papers.

    Citation links are followed in both directions (the "citing" and "cited"
    lists in `papersById`) and only through valid papers. For every valid
    start paper a breadth-first search yields the hop count to each valid
    paper reachable from it; the result is the mean over all such
    (start, reachable) pairs, the start paper itself excluded.

    Breadth-first search settles every paper once per start paper. The
    previous recursive relaxation re-expanded a paper each time a shorter
    route was found and could run into the interpreter's recursion limit.

    Side effect:
        Overwrites "progress.txt" in the working directory with
        "<index>/<total>" before each start paper is processed.

    Raises:
        ZeroDivisionError: if no valid paper is connected to another one.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)
    validPapers = [p for p in citegraph if isValid(p["id"])]
    validIds = {p["id"] for p in validPapers}

    # Undirected adjacency restricted to valid papers, built once up front
    # rather than being re-derived for every start paper.
    neighbours = {
        paperId: [
            other
            for other in papersById[paperId]["citing"] + papersById[paperId]["cited"]
            if other in validIds
        ]
        for paperId in validIds
    }

    totalPathLength = 0
    totalPathCount = 0
    for i, paper in enumerate(validPapers):
        with Path("progress.txt").open("w") as f:
            f.write(f"{i}/{len(validPapers)}")

        startId = paper["id"]
        distance = {startId: 0}
        queue = deque([startId])
        while queue:
            current = queue.popleft()
            for other in neighbours[current]:
                if other not in distance:
                    distance[other] = distance[current] + 1
                    queue.append(other)

        # The start paper contributes length 0 and is not counted as a path.
        totalPathLength += sum(distance.values())
        totalPathCount += len(distance) - 1
    return totalPathLength / totalPathCount

In [58]:
# Runs one traversal per valid paper; progress is written to progress.txt
# while the computation is running.
print("Path length:", GetAveragePathLengthOfConnectedPapers())

Path length: 3.6433048553098697


In [19]:
# Look up one paper by its Semantic Scholar id. countCitations returns a
# (totalCited, totalCiting) tuple, or None when the id is not in the export.
# NOTE(review): the output saved with this cell is a sentence, which the
# current countCitations cannot produce - re-run the cell to refresh it.
countCitations("702269ca81e00177943247c855764c764d93e193")

Paper 702269ca81e00177943247c855764c764d93e193 has 57 citations and is cited by 140 papers


In [13]:
# Number of papers in the export. read_text() opens and closes the file
# itself, so no file handle is left dangling.
citegraph = json.loads(datapath.read_text())
print(len(citegraph))

2490


In [27]:
def printListOfPapersWithoutTags(length):
    """Print title, Semantic Scholar id and node id of untagged papers.

    Papers are taken in file order and at most `length` of them are printed;
    nothing is printed when `length` is zero or negative.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    printed = 0
    for paper in citegraph:
        if printed >= length:
            break
        if len(paper["tags"]) != 0:
            continue
        print(f'''
{paper["title"]}
{paper["semscholar"]}
{paper["id"]}
''')
        printed += 1
printListOfPapersWithoutTags(1)


None
None
2431



In [None]:
# NOTE(review): this cell has no execution count and looks like a leftover
# scratch cell - consider deleting it.
print("test")

In [59]:
def checkForCitationErrors():
    """Report whether a citation between valid papers lacks its back-link.

    For every valid paper and every valid id in its "cited" list, the cited
    paper's "citing" list in `papersById` must contain the citing paper.

    Returns:
        True as soon as one missing back-link is found, otherwise False.
    """
    # NOTE(review): only the cited -> citing direction is verified; a
    # "citing" entry without a matching "cited" entry goes unnoticed.
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    for paper in citegraph:
        if not isValid(paper["id"]):
            continue
        for cited in paper["cited"]:
            if isValid(cited) and paper["id"] not in papersById[cited]["citing"]:
                return True
    return False
print("Has Error: ", checkForCitationErrors())

Has Error:  False


In [5]:
def countValidCitations():
    """Count the citation links that lead from a valid paper to a valid paper.

    Returns:
        The total number of valid ids found in the "cited" lists of all
        valid papers.
    """
    # Context manager so the file handle is closed right after parsing.
    with open(datapath) as graphFile:
        citegraph = json.load(graphFile)

    return sum(
        1
        for paper in citegraph
        if isValid(paper["id"])
        for cited in paper["cited"]
        if isValid(cited)
    )
print("Valid citations: ", countValidCitations())

Valid citations:  14982
