In [4]:
# Cell 1: Connect to Neo4j and Create Constraints
import os
from dotenv import load_dotenv
from py2neo import Graph # Using py2neo, which was installed
import time # To potentially add pauses if needed

print("Libraries imported.")
# Load .env so connection settings can be supplied without editing the notebook.
# override=True lets values from the .env file replace variables already present
# in the process environment.
load_dotenv(override=True)
# kg_git_dir = os.environ.get('KG_GIT') # Not needed for this script

# --- Neo4j Connection Details ---
# Each setting is read from the environment (typically populated from .env above):
#   NEO4J_BOLT_URL, NEO4J_USER, NEO4J_PASSWORD
# The fallbacks are the values this notebook previously hard-coded, so it still
# runs unchanged against a default local Neo4j Desktop setup. Prefer setting
# NEO4J_PASSWORD in .env rather than relying on the fallback, so the real
# credential is not stored in the notebook itself.
NEO4J_BOLT_URL = os.environ.get("NEO4J_BOLT_URL", "bolt://localhost:7687")  # Default Bolt port
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo4jdemo")

# `graph` is the handle every later cell checks. It stays None unless the
# connection has been verified with a real query, so a handle that was
# constructed but cannot actually reach the database is never published.
graph = None
NEO4J_DATABASE = "spoke-genelab-v0.0.3"  # Target database inside the Neo4j instance

# --- Connect to Neo4j ---
try:
    print(f"Connecting to Neo4j at {NEO4J_BOLT_URL} as user '{NEO4J_USER}'...")
    # Establish the connection using py2neo
    candidate_graph = Graph(
        NEO4J_BOLT_URL,
        auth=(NEO4J_USER, NEO4J_PASSWORD),
        name=NEO4J_DATABASE,
    )
    # Run a simple query to confirm the connection (and database name) work
    candidate_graph.run("MATCH (n) RETURN count(n) LIMIT 1").data()
    # Only now expose the handle to the rest of the notebook
    graph = candidate_graph
    print(f"Successfully connected to Neo4j database '{NEO4J_DATABASE}'!")
except Exception as e:
    # Deliberately broad: the notebook should report the problem and let the
    # later cells skip their work instead of stopping with a traceback.
    print("\nERROR: Could not connect to Neo4j.")
    print("Please ensure the 'spoke-genelab-db' database instance is ACTIVE in Neo4j Desktop.")
    print(f"Error details: {e}")

# --- Define and Run Constraints (only if connection successful) ---
# Creates one uniqueness constraint per node label used by the import. Every
# statement uses IF NOT EXISTS, so re-running this cell is harmless.
if graph is not None:
    print("\nAttempting to create constraints...")
    # List of the 4 constraint Cypher queries
    constraint_queries = [
        "CREATE CONSTRAINT constraint_subject_id IF NOT EXISTS FOR (n:Subject) REQUIRE n.subjectID IS UNIQUE;",
        "CREATE CONSTRAINT constraint_anatomy_id IF NOT EXISTS FOR (n:Anatomy) REQUIRE n.identifier IS UNIQUE;",
        "CREATE CONSTRAINT constraint_assay_id IF NOT EXISTS FOR (n:Assay) REQUIRE n.identifier IS UNIQUE;",
        "CREATE CONSTRAINT constraint_measurement_id IF NOT EXISTS FOR (n:MeasurementValue) REQUIRE n.measurementID IS UNIQUE;"
    ]
    # Loop through and execute each query; a failure is reported and the
    # remaining constraints are still attempted.
    for i, query in enumerate(constraint_queries):
        try:
            print(f"Running constraint {i+1}: {query[:60]}...") # Show start of query
            graph.run(query) # Execute the Cypher query
            print(f"  Constraint {i+1} created or already exists.")
            time.sleep(0.5) # Add a tiny pause between commands
        except Exception as e:
            # A single handler covers both database-reported problems and
            # transport problems (e.g. connection lost). The previous
            # `except QueryError` referred to a name that is never imported,
            # so any failure here surfaced as a NameError instead of this
            # message. The exception type is printed to keep the two kinds
            # of failure distinguishable.
            print(f"  ERROR running constraint {i+1}: {type(e).__name__}: {e}")
    print("Constraint creation process finished.")
else:
    print("Skipping constraint creation due to connection failure.")

Libraries imported.
Connecting to Neo4j at bolt://localhost:7687 as user 'neo4j'...
Successfully connected to Neo4j database 'spoke-genelab-v0.0.3'!

Attempting to create constraints...
Running constraint 1: CREATE CONSTRAINT constraint_subject_id IF NOT EXISTS FOR (n...
  Constraint 1 created or already exists.
Running constraint 2: CREATE CONSTRAINT constraint_anatomy_id IF NOT EXISTS FOR (n...
  Constraint 2 created or already exists.
Running constraint 3: CREATE CONSTRAINT constraint_assay_id IF NOT EXISTS FOR (n:A...
  Constraint 3 created or already exists.
Running constraint 4: CREATE CONSTRAINT constraint_measurement_id IF NOT EXISTS FO...
  Constraint 4 created or already exists.
Constraint creation process finished.


In [5]:
# Cell 2: Load Node Data from CSV Files
# Runs one LOAD CSV statement per node file. Each statement MERGEs on the
# node's unique ID and then (re)applies its properties, so re-running the cell
# updates existing nodes instead of duplicating them.
print("Loading node data...")
# Ensure graph connection exists from Cell 1
if globals().get('graph') is not None:
    # Define the LOAD CSV commands for nodes as multi-line strings.
    # A plain SET after MERGE runs for both newly created and matched nodes,
    # which is what the previous identical ON CREATE SET / ON MATCH SET pairs did.
    node_load_queries = [
        # Load Subjects
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/Subject_nodes.csv' AS row
        MERGE (s:Subject {subjectID: row.`subjectID:ID`})
        SET s.sex = row.`sex:string`;
        """,
        # Load Anatomy (Proxy Nodes)
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/Anatomy_nodes.csv' AS row
        MERGE (a:Anatomy {identifier: row.`identifier:ID`});
        """,
        # Load Assays
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/Assay_nodes.csv' AS row
        MERGE (a:Assay {identifier: row.`identifier:ID`})
        SET a.name = row.`name:string`,
            a.technology = row.`technology:string`,
            a.measurement = row.`measurement:string`,
            a.timepoint = row.`timepoint:string`,
            a.dataset = 'OSD-679',
            a.material_id_1 = row.`material_id_1:string`,
            a.material_name_1 = row.`material_name_1:string`;
        """,
        # Load Measurement Values
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/MeasurementValue_nodes.csv' AS row
        MERGE (m:MeasurementValue {measurementID: row.`measurementID:ID`})
        SET m.value = toFloat(row.`value:float`),
            m.unit = row.`unit:string`,
            m.type = row.`type:string`;
        """
    ]

    # Execute each node loading query using graph.run(), stopping at the first failure
    node_load_succeeded = True
    for i, query in enumerate(node_load_queries):
        try:
            # The file URL is the first single-quoted token in every query above
            file_path = query.split("'")[1] # Extract file path for message
            print(f"\nRunning node load query {i+1} for {file_path}...")
            # Execute the query and get statistics
            results = graph.run(query).stats()
            print(f"  Finished query {i+1}. Stats: {results}")
            time.sleep(0.5) # Small pause between loads
        except Exception as e:
            print(f"  ERROR running node load query {i+1}: {e}")
            print("  Stopping node loading due to error.")
            node_load_succeeded = False
            break # Stop if one command fails

    # Report the real outcome; previously "finished" was printed even after an abort
    if node_load_succeeded:
        print("\nNode loading process finished.")
    else:
        print("\nNode loading process aborted; resolve the error above before loading relationships.")

else:
    print("Cannot load nodes, Neo4j graph connection ('graph' variable) not established in Cell 1.")

Loading node data...

Running node load query 1 for file:///output_csvs/Subject_nodes.csv...
  Finished query 1. Stats: {}

Running node load query 2 for file:///output_csvs/Anatomy_nodes.csv...
  Finished query 2. Stats: {}

Running node load query 3 for file:///output_csvs/Assay_nodes.csv...
  Finished query 3. Stats: {'properties_set': 4781}

Running node load query 4 for file:///output_csvs/MeasurementValue_nodes.csv...
  Finished query 4. Stats: {'properties_set': 11757}

Node loading process finished.


In [6]:
# Cell 3: Load Relationship Data from CSV Files
# Runs one LOAD CSV statement per relationship file. Each statement MATCHes the
# two endpoint nodes by their unique IDs and MERGEs the relationship, so rows
# whose endpoints are missing are skipped without raising an error.
print("Loading relationship data...")
# Ensure graph connection exists from Cell 1
if globals().get('graph') is not None:
    # Define the LOAD CSV commands for relationships as multi-line strings
    relationship_load_queries = [
        # Link Assay to Subject
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/Assay_PERFORMED_ON_Subject_rels.csv' AS row
        MATCH (start:Assay {identifier: row.`:START_ID`})      // Find the start Assay node using its unique ID
        MATCH (end:Subject {subjectID: row.`:END_ID`})      // Find the end Subject node using its unique ID
        MERGE (start)-[:PERFORMED_ON]->(end);               // Create the relationship if it doesn't exist
        """,
        # Link Assay to MeasurementValue
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/Assay_HAS_OUTPUT_MeasurementValue_rels.csv' AS row
        MATCH (start:Assay {identifier: row.`:START_ID`})          // Find the start Assay node
        MATCH (end:MeasurementValue {measurementID: row.`:END_ID`}) // Find the end MeasurementValue node
        MERGE (start)-[:HAS_OUTPUT]->(end);                      // Create the relationship
        """,
        # Link MeasurementValue to Anatomy
        """
        LOAD CSV WITH HEADERS FROM 'file:///output_csvs/MeasurementValue_MEASURES_ANATOMY_Anatomy_rels.csv' AS row
        MATCH (start:MeasurementValue {measurementID: row.`:START_ID`}) // Find the start MeasurementValue node
        MATCH (end:Anatomy {identifier: row.`:END_ID`})             // Find the end Anatomy node (using UBERON ID)
        MERGE (start)-[:MEASURES_ANATOMY]->(end);                  // Create the relationship
        """
        # Add other relationship LOAD CSV commands here if you created more CSVs
        # e.g., for linking Subject/Assay to Study, or Anatomy hierarchy (PART_OF)
    ]

    # Execute each relationship loading query, stopping at the first failure
    relationship_load_succeeded = True
    for i, query in enumerate(relationship_load_queries):
        try:
            # The file URL is the first single-quoted token in every query above
            file_path = query.split("'")[1] # Extract file path for message
            print(f"\nRunning relationship load query {i+1} for {file_path}...")
            # Execute the query and get statistics
            results = graph.run(query).stats()
            print(f"  Finished query {i+1}. Stats: {results}")
            if not results:
                # Empty stats mean the statement changed nothing. That is
                # expected on a re-run, but it also happens when the MATCH
                # clauses find no endpoint nodes, so point it out.
                print("  Note: no changes reported - relationships already exist, "
                      "or no rows matched existing start/end nodes.")
            time.sleep(0.5) # Small pause
        except Exception as e:
            print(f"  ERROR running relationship load query {i+1}: {e}")
            print("  Stopping relationship loading due to error.")
            relationship_load_succeeded = False
            break # Stop if one fails

    # Only claim completion when every query ran; previously the success
    # message was printed even after the loop aborted on an error.
    if relationship_load_succeeded:
        print("\nRelationship loading process finished.")
        print("\nImport of OCT data from OSD-679 should now be complete!")
    else:
        print("\nRelationship loading process aborted; the import is incomplete.")

else:
    print("Cannot load relationships, Neo4j graph connection ('graph' variable) not established in Cell 1.")

Loading relationship data...

Running relationship load query 1 for file:///output_csvs/Assay_PERFORMED_ON_Subject_rels.csv...
  Finished query 1. Stats: {}

Running relationship load query 2 for file:///output_csvs/Assay_HAS_OUTPUT_MeasurementValue_rels.csv...
  Finished query 2. Stats: {}

Running relationship load query 3 for file:///output_csvs/MeasurementValue_MEASURES_ANATOMY_Anatomy_rels.csv...
  Finished query 3. Stats: {}

Relationship loading process finished.

Import of OCT data from OSD-679 should now be complete!
