In [6]:
!pip install neo4j -q


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import json

# Read JSON data
data = []
with open('train.json') as f:
    for line in f:
        try:
            json_object = json.loads(line)
            data.append(json_object)
        except json.JSONDecodeError:
            print("Error decoding JSON on line:", line)


In [10]:
print(len(data))

564340


In [11]:
from neo4j import GraphDatabase
from tqdm import tqdm

# Neo4j Connection
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "Garv@neo4j10")

# Define a function to create nodes and edges in Neo4j
def create_citation_graph(driver, data):
    with driver.session() as session:
        # Create constraints to ensure uniqueness for 'paper' nodes
        session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (p:Paper) REQUIRE p.id IS UNIQUE")

        # Process each paper entry with progress bar
        for entry in tqdm(data, desc="Processing citations", unit="paper"):
            paper_id = entry["paper"]
            references = entry["reference"]

            # Create the citing paper node
            session.run("MERGE (p:Paper {id: $paper_id})", paper_id=paper_id)

            # Create citation relationships
            for ref_id in references:
                session.run("""
                    MERGE (ref:Paper {id: $ref_id})
                    MERGE (p:Paper {id: $paper_id})
                    MERGE (p)-[:CITES]->(ref)
                """, paper_id=paper_id, ref_id=ref_id)

In [12]:
# Connect to Neo4j and create the graph
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    create_citation_graph(driver, data)

print("Citation graph created successfully!")

Processing citations: 100%|██████████| 564340/564340 [1:58:25<00:00, 79.42paper/s]  


Citation graph created successfully!
