In [1]:
# -----------------------------------------------------------------------------
# 📘 Notebook: 03_neo4j_ingestion.ipynb
#
# Purpose:
#   Import cleaned per-run summaries into a Neo4j graph for interactive
#   exploration and quality auditing. Each run becomes a node linked to
#   its date and metric values.
#
# Steps:
#   1. Load cleaned dataset (run_summary_cleaned.parquet)
#   2. Connect securely to Neo4j (via .env)
#   3. Create constraints and indexes
#   4. Import runs + date relationships
#   5. Optionally attach metric nodes
#   6. Validate import with diagnostic Cypher queries
#
# Input : ../data/strava/processed/run_summary_cleaned.parquet
# Output: Populated Neo4j graph
# Next  : Stage 4 – Feature Engineering & Clustering
# -----------------------------------------------------------------------------


In [2]:
# 📘 03_neo4j_ingestion.ipynb
# Import cleaned running data into Neo4j for interactive exploration.

from neo4j import GraphDatabase
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import os
from tqdm import tqdm

# Connection settings come from a local .env file so that credentials never
# appear in the notebook itself.
load_dotenv()
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASS = os.getenv("NEO4J_PASS")

# os.getenv returns None for unset variables; catch that here with a readable
# message instead of an opaque driver error later on. Only the variable names
# are reported, never their values.
_missing_settings = [
    name
    for name, value in (
        ("NEO4J_URI", NEO4J_URI),
        ("NEO4J_USER", NEO4J_USER),
        ("NEO4J_PASS", NEO4J_PASS),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing Neo4j settings (expected in the environment or .env): "
        + ", ".join(_missing_settings)
    )

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
# Creating a driver does not by itself prove the server is reachable; check now
# so a wrong URI or password fails in this cell rather than mid-import.
driver.verify_connectivity()


In [3]:
# Cleaned per-run summaries (one row per run) to be loaded into the graph.
RUN_SUMMARY_PATH = Path("../data/strava/processed/run_summary_cleaned.parquet")

df = pd.read_parquet(RUN_SUMMARY_PATH)
print(f"Loaded {len(df):,} runs for graph import")

# Columns read by the import step. Checking them before touching the database
# avoids a half-finished import when the upstream schema has changed.
REQUIRED_COLUMNS = {
    "run_id",
    "date",
    "total_distance_km",
    "avg_pace",
    "avg_speed",
    "avg_cadence",
    "avg_hr",
    "elevation_gain",
    "duration_min",
    "missing_pct",
}
_missing_columns = sorted(REQUIRED_COLUMNS - set(df.columns))
if _missing_columns:
    raise KeyError(f"run summary is missing required columns: {_missing_columns}")

# Runs are MERGEd on run_id, so repeated ids would be collapsed into a single
# node. Surface that instead of letting it happen silently.
_duplicate_ids = int(df["run_id"].duplicated().sum())
if _duplicate_ids:
    print(f"⚠️ {_duplicate_ids} rows share a run_id with an earlier row; they will be merged into the same node.")

# Uniqueness constraints keep MERGE from creating duplicate Run / Date nodes;
# IF NOT EXISTS makes this cell safe to re-run.
with driver.session() as s:
    s.run("CREATE CONSTRAINT IF NOT EXISTS FOR (r:Run) REQUIRE r.run_id IS UNIQUE")
    s.run("CREATE CONSTRAINT IF NOT EXISTS FOR (d:Date) REQUIRE d.date IS UNIQUE")
print("✅ Constraints ensured.")


Loaded 697 runs for graph import
✅ Constraints ensured.


In [4]:
def _null_if_missing(value):
    """Return ``None`` for a missing scalar (None/NaN/NA/NaT), else the value unchanged."""
    return None if pd.isna(value) else value


def _iso_date(value):
    """Render ``value`` as text for Cypher's ``date()``.

    datetime-like objects (``datetime.datetime``, ``pd.Timestamp``) expose a
    ``date()`` method; calling it drops the time-of-day part. Plain ``date``
    objects and strings have no such method and are passed through ``str``
    unchanged.
    """
    to_date = getattr(value, "date", None)
    if callable(to_date):
        value = to_date()
    return str(value)


def import_run(tx, row):
    """Upsert one run, its date node, and the ON_DATE relationship between them.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        row: One run summary with attribute access to ``run_id``, ``date``,
            ``total_distance_km``, ``avg_pace``, ``avg_speed``, ``avg_cadence``,
            ``avg_hr``, ``elevation_gain``, ``duration_min`` and ``missing_pct``
            (e.g. a pandas Series or a named tuple).

    Missing metric values are sent as null rather than NaN, so the
    corresponding property is left unset instead of storing a NaN float.
    The statement uses MERGE throughout, so re-importing the same run
    updates it in place.
    """
    metrics = {
        "distance_km": row.total_distance_km,
        "avg_pace": row.avg_pace,
        "avg_speed": row.avg_speed,
        "avg_cadence": row.avg_cadence,
        "avg_hr": row.avg_hr,
        "elevation_gain": row.elevation_gain,
        "duration_min": row.duration_min,
        "missing_pct": row.missing_pct,
    }
    parameters = {key: _null_if_missing(value) for key, value in metrics.items()}

    tx.run("""
        MERGE (r:Run {run_id: $run_id})
        SET r.total_distance_km = $distance_km,
            r.avg_pace = $avg_pace,
            r.avg_speed = $avg_speed,
            r.avg_cadence = $avg_cadence,
            r.avg_hr = $avg_hr,
            r.elevation_gain = $elevation_gain,
            r.duration_min = $duration_min,
            r.missing_pct = $missing_pct
        MERGE (d:Date {date: date($date)})
        MERGE (r)-[:ON_DATE]->(d)
        """,
        run_id=row.run_id,
        date=_iso_date(row.date),
        **parameters
    )


In [5]:
# Bulk import — write every run and its date relationship to Neo4j (next cell)

In [6]:
# One managed write transaction per run (execute_write handles commit and
# retries). itertuples is used instead of iterrows because iterrows builds a
# Series per row and does not preserve per-column dtypes; import_run only needs
# attribute access, which the named tuples provide.
try:
    with driver.session() as s:
        for row in tqdm(df.itertuples(index=False), total=len(df)):
            s.execute_write(import_run, row)
    print("✅ Imported all runs and date relations into Neo4j.")
finally:
    # Release the connection pool even if a row fails part-way through.
    # Re-run the connection cell to obtain a fresh driver afterwards.
    driver.close()


100%|████████████████████████████████████████████████████████████████████████████████| 697/697 [00:09<00:00, 69.87it/s]

✅ Imported all runs and date relations into Neo4j.



