# In [None]:
import re
import os
from neo4j import GraphDatabase
import fitz  # PyMuPDF for PDF reading

# Neo4j connection settings are read from the environment. The literals are
# fallbacks used when a variable is unset; real credentials should live in a
# proper secret store rather than in this file.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASS = os.environ.get("NEO4J_PASS", "password")

# Driver object used later in the script to open a session.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASS),
)

# PDF file path (assumes the PDF is accessible locally)
pdf_path = "Clarifications Rules of Golf 2023.pdf"

# Both patterns share the same identifier shape: a 1-2 digit rule number,
# optional dotted sections, an optional lowercase letter and an optional
# "(n)" item -- e.g. "16", "16.1", "16.1a", "16.1a(3)".

# Rule / subrule heading: identifier at line start followed by whitespace.
rule_heading_pattern = re.compile(r"^(\d{1,2}(?:\.\d+)*[a-z]?(?:\(\d+\))?)\s")

# Clarification heading: "<identifier>/<n> <dash> <title>".
# Group 1 is the clarification id (e.g. "16.1a(3)/1"), group 2 the title.
# The separator may be a hyphen, en dash or em dash, so a heading is still
# recognised if text extraction yields a different dash than the en dash.
clarification_pattern = re.compile(
    r"^(\d{1,2}(?:\.\d+)*[a-z]?(?:\(\d+\))?/\d+)\s+[-–—]\s+(.*)"
)

# In-memory graph filled while parsing.
# nodes: node id -> dict with the keys "title", "text" and "label".
nodes = dict()
# One list of (from_id, to_id) tuples per relationship type.
subrule_rels = list()      # SUBRULE_OF: (child_id, parent_id)
clarifies_rels = list()    # CLARIFIES:  (clar_id, rule_id)
refers_rels = list()       # REFERS_TO:  (source_id, target_id)

def add_node(node_id, title, text, label):
    """Create the node ``node_id`` or extend an already registered one.

    Args:
        node_id: Key into the module-level ``nodes`` dict, e.g. "Rule 16.1".
        title: Heading text; ``None`` or "" is treated as "no title".
        text: Body text segment; ``None`` or "" is treated as "no text".
        label: Node label, stored only when the node is first created.

    For an existing node the label is left untouched, the title is filled in
    only if it was blank so far, and a non-empty text segment is appended to
    the stored text separated by a single space.
    """
    title = title.strip() if title else ""
    text = text.strip() if text else ""

    node = nodes.get(node_id)
    if node is None:
        nodes[node_id] = {"title": title, "text": text, "label": label}
        return

    if title and not node["title"]:
        node["title"] = title
    if text:
        # Skip the separator when there is nothing stored yet, so repeated
        # headings with empty segments do not accumulate stray whitespace.
        node["text"] = f"{node['text']} {text}" if node["text"] else text

def add_hierarchy(child_id):
    """Record a SUBRULE_OF edge from ``child_id`` to its nearest known ancestor.

    The ancestor chain is derived purely from the identifier by removing one
    trailing component at a time, in this order of precedence:

    * "/n" clarification suffix   ("16.1a/2"  -> "16.1a")
    * "(n)" item suffix           ("16.1a(3)" -> "16.1a")
    * trailing letter             ("16.1a"    -> "16.1")
    * last dotted section         ("16.1"     -> "16")

    The first ancestor that exists in the module-level ``nodes`` dict is
    used as the parent and ``(child_id, parent_id)`` is appended to the
    module-level ``subrule_rels`` list. Nothing is recorded for a top-level
    id or when no ancestor is present in ``nodes``.

    Args:
        child_id: Node id, with or without the "Rule " prefix.
    """
    prefix = "Rule "
    key = child_id[len(prefix):] if child_id.startswith(prefix) else child_id

    while key:
        if "/" in key:
            key = key.split("/", 1)[0]
        elif key.endswith(")") and "(" in key:
            key = key[:key.rindex("(")]
        elif key[-1].isalpha():
            key = key[:-1]
        elif "." in key:
            key = key.rsplit(".", 1)[0]
        else:
            # Bare rule number: nothing left to strip, so no parent.
            return

        if not key:
            return
        parent_id = prefix + key
        if parent_id in nodes:
            subrule_rels.append((child_id, parent_id))
            return

# Extract the text of every page. The context manager closes the document
# as soon as the text has been read.
with fitz.open(pdf_path) as doc:
    # Join pages with a newline so the last line of one page cannot fuse
    # with the first line of the next; the parsing loop skips blank lines,
    # so an extra line break is harmless.
    full_text = "\n".join(page.get_text() for page in doc)

# Split the text into lines for easier parsing
lines = full_text.splitlines()

# Parser state carried from one line to the next.
current_rule = None       # latest top-level heading seen, e.g. "Rule 16"
current_section = None    # node id that body text is currently appended to
collecting = False        # True while the latest top-level rule is in 16-25

for raw_line in lines:
    line = raw_line.rstrip()
    if line == "":
        continue  # nothing to parse on a blank line

    # --- Rule / subrule heading ("16 ...", "16.1 ...", "17.1d(3) ...") ---
    heading = rule_heading_pattern.match(line)
    if heading is not None:
        rule_num = heading.group(1)

        if not rule_num.isdigit():
            # Dotted / lettered identifier: a subsection. It is only
            # recorded while inside the 16-25 range.
            if collecting:
                current_section = "Rule " + rule_num
                add_node(
                    current_section,
                    line[len(rule_num):].strip(" .-"),
                    "",
                    label="RuleSection",
                )
                add_hierarchy(current_section)
            continue

        # Bare number: a top-level rule. This switches collection on or off
        # depending on whether the number lies in 16-25.
        current_rule = "Rule " + rule_num
        current_section = current_rule
        collecting = 16 <= int(rule_num) <= 25
        if collecting:
            add_node(
                current_rule,
                line[len(rule_num):].strip(" .-"),
                "",
                label="RuleSection",
            )
        continue

    # Everything below only applies inside the 16-25 range.
    if not collecting:
        continue

    # --- Clarification heading ("16.1/1 – Title") ---
    clar = clarification_pattern.match(line)
    if clar is not None:
        clar_id_num, clar_title = clar.groups()
        clar_node_id = "Rule " + clar_id_num
        add_node(clar_node_id, clar_title, "", label="Clarification")

        # Link to the rule named before the slash, if that node is known.
        parent_node_id = "Rule " + clar_id_num.split("/")[0]
        if parent_node_id in nodes:
            clarifies_rels.append((clar_node_id, parent_node_id))

        # Following body lines belong to this clarification.
        current_section = clar_node_id
        continue

    # --- Body text: append to whichever node is currently open ---
    if current_section:
        nodes[current_section]["text"] += " " + line

# After the initial parse: tidy node text, detect cross-references and derive
# the rule hierarchy.

# Cross-reference pattern, compiled once outside the loop:
# "Rule <number>[.<section>...][letter][(<digit>)]"
ref_pattern = re.compile(r"Rule\s+(\d+(?:\.\d+)*[a-z]?(?:\(\d+\))?)")

for node_id, data in nodes.items():
    # Trim any leading/trailing whitespace left over from line accumulation
    data["text"] = data["text"].strip()

    # Record a REFERS_TO edge for every mentioned rule that is itself a
    # node in this subset; self-references are ignored.
    for ref_key in ref_pattern.findall(data["text"]):
        ref_rule_id = f"Rule {ref_key}"
        if ref_rule_id != node_id and ref_rule_id in nodes:
            refers_rels.append((node_id, ref_rule_id))

    # Determine hierarchy (SUBRULE_OF) now that all nodes are known.
    # Clarification ids contain "/" and are skipped here.
    if "/" not in node_id:
        add_hierarchy(node_id)

# Drop duplicate relationships while keeping first-seen order, so the result
# is the same from run to run (a plain set would order the string tuples
# differently between interpreter runs).
subrule_rels = list(dict.fromkeys(subrule_rels))
clarifies_rels = list(dict.fromkeys(clarifies_rels))
refers_rels = list(dict.fromkeys(refers_rels))

# Write the collected graph to Neo4j. The driver is closed afterwards (also
# when a query fails) so its connections are released.
try:
    with driver.session() as session:
        # Optional uniqueness constraints on the id property. Left disabled;
        # the form below is the FOR ... REQUIRE syntax (verify against the
        # server version in use before enabling):
        # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:RuleSection) REQUIRE n.id IS UNIQUE")
        # session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (c:Clarification) REQUIRE c.id IS UNIQUE")

        # Create all nodes. The label cannot be passed as a query parameter,
        # so it is interpolated; it only ever comes from the "label" values
        # stored in ``nodes``.
        for node_id, data in nodes.items():
            label = data["label"]
            cypher = (
                f"MERGE (n:{label} {{id: $id}}) "
                f"SET n.title = $title, n.text = $text"
            )
            session.run(cypher, id=node_id, title=data["title"], text=data["text"])

        # Create the relationships. All three types use the same
        # match-both-ends-then-MERGE query; only the type name differs.
        relationship_sets = (
            ("SUBRULE_OF", subrule_rels),
            ("CLARIFIES", clarifies_rels),
            ("REFERS_TO", refers_rels),
        )
        for rel_type, pairs in relationship_sets:
            cypher = (
                "MATCH (src {id:$s}), (tgt {id:$t}) "
                f"MERGE (src)-[:{rel_type}]->(tgt)"
            )
            for source_id, target_id in pairs:
                session.run(cypher, s=source_id, t=target_id)
finally:
    driver.close()

print(f"Created {len(nodes)} nodes in Neo4j (rules 16-25 and clarifications).")
print(f"Created {len(subrule_rels)} SUBRULE_OF, {len(clarifies_rels)} CLARIFIES, and {len(refers_rels)} REFERS_TO relationships.")