In [143]:
%%bash

# Install the packages this notebook depends on (pandas and gql are imported in the next cell).
pip install pandas numpy gql



You should consider upgrading via the 'pip install --upgrade pip' command.


In [144]:
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import pandas as pd

In [156]:
# URL of the GraphQL endpoint; it is passed as `url` when the HTTP transport is created.
GRAPHQL_ENDPOINT = "http://localhost:8080/v1/graphql"

In [157]:
# HTTP transport that sends the GraphQL requests as JSON to GRAPHQL_ENDPOINT.
# NOTE(review): verify=False presumably turns off TLS certificate verification; that does not
# matter for a plain-http URL, but confirm before pointing this at an https endpoint.
transport = RequestsHTTPTransport(
    url=GRAPHQL_ENDPOINT,
    headers={"Content-type": "application/json"},
    use_json=True,
    verify=False,
)

# Client used to run every query in this notebook; it is asked to fetch the schema
# from the transport and is configured with retries=3.
client = Client(
    transport=transport,
    fetch_schema_from_transport=True,
    retries=3,
)

# Query the database

Let's get some data. The following code queries the GraphQL endpoint and gets the 2-level citation graph for a specific paper, i.e. every paper that is at most 2 hops away from it. Assuming that each paper cites 30 other papers on average, the result would be 1 + 30 + 30 * 30 = 931, i.e. roughly 900 records. Note that the database currently only contains papers labeled as "Computer Science", so cited papers from other fields will not show up.

In [145]:
# The paper title is matched via SQL LIKE (`_like` in the query), so this can be a substring of
# the title: the leading and trailing % are LIKE wildcards that match any surrounding text.
PAPER_TITLE = "%Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model%"

In [146]:
# GraphQL document describing the citation subgraph to fetch:
# - the `paper_fields` fragment selects id, title, year and num_citations of a paper;
# - the `papers` query takes a required `$title` string, matches it against the title with
#   `_like`, and asks for at most one paper (limit: 1, offset: 0);
# - `cites` is nested twice, so the result holds the matched paper, the papers it cites and
#   the papers those cite. Each level passes `limit_: 100`
#   (presumably a cap on the cited papers returned per paper; confirm against the schema).
query = gql("""
fragment paper_fields on papers {
  id
  title
  year
  num_citations
}

query papers($title: String!) {
  papers(limit: 1, where: {title: {_like: $title}}, offset: 0) {
    ...paper_fields
    cites(args: {limit_: 100}) {
      ...paper_fields
      cites(args: {limit_: 100}) {
        ...paper_fields
      }
    }
  }
}
""")

In [147]:
# Run the query with the title pattern bound to the `$title` variable.
papers = client.execute(query, variable_values={"title": PAPER_TITLE})

# The query asks for at most one paper, so the list under "papers" is empty when nothing
# matches. Raise the same exception type plain indexing would (IndexError), but with a
# message that names the pattern instead of "list index out of range".
if not papers["papers"]:
    raise IndexError(f"No paper with a title matching {PAPER_TITLE!r} was found")

# Root of the citation graph: the single matched paper, with its nested "cites" lists.
root = papers["papers"][0]

In [137]:
def process_recursively(paper, fn):
    """Run ``fn`` on ``paper`` and on every paper nested below it.

    The graph is a nested structure of dicts in which the optional "cites" key
    holds the list of cited papers, each of which may have its own "cites" list.
    Papers are visited depth-first: a paper is processed before the papers it
    cites, and cited papers are processed in list order. A paper that occurs
    several times in the structure is processed once per occurrence.

    Args:
        paper: dict describing one paper, optionally with a "cites" list.
        fn: callable taking a single paper dict; its return value is ignored.
    """
    fn(paper)
    # A missing "cites" key, an explicit None (a null in the response) and an empty
    # list all mean "nothing to descend into".
    for cited_paper in paper.get("cites") or ():
        process_recursively(cited_paper, fn)

# Create DataFrame

Next, let's create a dataframe of all papers in the graph. Here, duplicates are eliminated.

In [153]:
# Create a DataFrame of all papers that appear in this subgraph
paper_map = {}
def add_paper(paper):
    fields = { k : paper[k] for k in paper if k != "cites" }
    paper_map[paper["id"]] = fields
process_recursively(root, add_paper)
papers_df = pd.DataFrame.from_records(list(paper_map.values()), index="id")

print(f"{len(papers_df)} unique papers")
papers_df.head()

837 unique papers


Unnamed: 0_level_0,title,year,num_citations
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c39fb7a46335c23f7529dd6f9f980462fd38653a,"Mastering Atari, Go, Chess and Shogi by Planni...",2019,25
0ab3f7ecbdc5a33565a234215604a6ca9d155a33,Rainbow: Combining Improvements in Deep Reinfo...,2018,397
049c6e5736313374c6e594c34b9be89a3a09dced,FeUdal Networks for Hierarchical Reinforcement...,2017,269
10a4992ece5baea79326a8878a6244eeacbc6af5,Deep Successor Reinforcement Learning,2016,77
2319a491378867c7049b3da055c5df60e1671158,Playing Atari with Deep Reinforcement Learning,2013,2793


## Most Popular Papers

The following shows the papers in this subgraph with the most citations overall. These are popular papers that are at least somewhat relevant to the root paper.

In [154]:
# Order the papers by their overall citation count, highest first, and show the first 25.
(
    papers_df
    .sort_values(by="num_citations", ascending=False)
    .head(n=25)
)

Unnamed: 0_level_0,title,year,num_citations
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abd1c342495432171beb7ca8fd9551ef13cbd0ff,ImageNet Classification with Deep Convolutiona...,2012,40246
a6cb366736791bcccc5c8639de5a8f9636bf87e8,Adam: A Method for Stochastic Optimization,2015,33621
2c03df8b48bf3fa39054345bafabfeff15bfd11d,Deep Residual Learning for Image Recognition,2016,32655
eb42cf88027de515750f230b23b1a057dc782108,Very Deep Convolutional Networks for Large-Sca...,2014,27495
13d4c2f76a7c1a4d0a71204e1d5d263a3f5a7986,Random Forests,2004,24563
44d2abe2175df8153f465f6c39b68b76a0d40ab9,Long Short-Term Memory,1997,20985
97efafdb4a3942ab3efba53ded7413199f79c054,Reinforcement Learning: An Introduction,2005,18170
4f607f03272e4d62708f5b2441355f9e005cb452,Convex Optimization,2006,16074
2e62d1345b340d5fda3b092c460264b9543bc4b5,Genetic Algorithms in Search Optimization and ...,1989,16067
4b4279db68b16e20fbc56f9d41980a950191d30a,Adaptation in natural and artificial systems,1975,15985


## Most cited papers within the subgraph

The following shows papers that are cited most often **within this subgraph**. These papers tend to be more relevant to the root paper.

In [155]:
from collections import Counter

# Tally how many times each paper id is visited while walking the graph from the root.
c = Counter()


def count_paper(paper):
    """Add one to the tally kept for this paper's id."""
    c[paper["id"]] += 1


process_recursively(root, count_paper)

# One row per paper id, in most_common() order, holding its tally
subgraph_counts_df = pd.DataFrame.from_records(
    c.most_common(),
    columns=["id", "subgraph_citation_count"],
    index="id",
)

# Put the tallies next to the paper metadata; both frames are indexed by paper id.
paper_counts_df = pd.concat([papers_df, subgraph_counts_df], axis=1)
paper_counts_df.sort_values(by="subgraph_citation_count", ascending=False).head(n=20)

Unnamed: 0,title,year,num_citations,subgraph_citation_count
e0e9a94c4a6ba219e768b4e59f72c18f0a22e23d,Human-level control through deep reinforcement...,2015,6647,18
f82e4ff4f003581330338aaae71f60316e58dd26,The Arcade Learning Environment: An Evaluation...,2013,1003,15
97efafdb4a3942ab3efba53ded7413199f79c054,Reinforcement Learning: An Introduction,2005,18170,14
69e76e16740ed69f4dc55361a3d319ac2f1293dd,Asynchronous Methods for Deep Reinforcement Le...,2016,2365,14
e4257bc131c36504a04382290cbc27ca8bb27813,Action-Conditional Video Prediction using Deep...,2015,444,9
a6cb366736791bcccc5c8639de5a8f9636bf87e8,Adam: A Method for Stochastic Optimization,2015,33621,8
e635d81a617d1239232a9c9a11a196c53dab8240,Bandit Based Monte-Carlo Planning,2006,1593,8
6ce57ab17fcd507b856a79874063b59555c76b3a,Learning to Predict by the Methods of Temporal...,2005,2365,8
c6170fa90d3b2efede5a2e1660cb23e1c824f2ca,Prioritized Experience Replay,2015,864,8
0ab3f7ecbdc5a33565a234215604a6ca9d155a33,Rainbow: Combining Improvements in Deep Reinfo...,2018,397,8
