# EUR-LEX Universal Parser

In [23]:
from bs4 import BeautifulSoup
from pathlib import Path
from typing import List, Dict

import re

### Opening and accessing the HTML data

In [20]:
# Build a `Path` object (an instance of `pathlib.Path`) pointing at the raw
# HTML file. The path is relative, so it is resolved against the current
# working directory when the file is read.
html_file = Path("../data/regulations/32017R0745/EN/raw/raw.html")

# Read the whole file into a single string, decoding it as UTF-8.
raw_html = html_file.read_text(encoding="utf-8")

# Parse the HTML string with BeautifulSoup using the "lxml" parser; `soup` is
# the root of the resulting navigable tree.
soup = BeautifulSoup(raw_html, "lxml")

Up to now we have just used the `Path` class from the `pathlib` library to create a path object, which we then used to read the file's contents into `raw_html` through the `read_text()` method. Then we created the `soup` variable, which holds a navigable tree built from the HTML tag structure. Printing the `soup` variable outputs the full, messy HTML.

According to the BeautifulSoup documentation, the `find_all()` method looks through a tag's descendants and retrieves all of them that match the given filters.

In [25]:
# List every <h1> tag in the parsed document. The empty list shown as this
# cell's output means the file contains no <h1> headings.
soup.find_all("h1")

[]

In [None]:
# Anchored patterns for the element ids that are treated as provisions.
# They are applied with ``fullmatch`` so that sub-ids such as "art_1.tit_1"
# (the title wrapper of Article 1) are not mistaken for the article itself.
_RECITAL_ID = re.compile(r"rct_(\d+)")
_CHAPTER_ID = re.compile(r"cpt_([IVXLCDM]+)")
_SECTION_ID = re.compile(r"(?:cpt_([IVXLCDM]+)\.)?sct_(\d+)")
_ARTICLE_ID = re.compile(r"art_(\d+[a-z]*)")
_ANNEX_ID = re.compile(r"anx_([IVXLCDM0-9]+[a-z]*)")


def _classify_eli_id(node_id: str, celex: str):
    """Map an element id to ``(kind, number, canonical_id, citation)``.

    Returns ``None`` when the id is not a recital, chapter, section, article
    or annex id.
    """
    match = _RECITAL_ID.fullmatch(node_id)
    if match:
        number = match.group(1)
        return "recital", number, f"{celex}_rec_{number}", f"Recital ({number})"

    match = _SECTION_ID.fullmatch(node_id)
    if match:
        chapter, number = match.groups()
        # Section numbers restart in every chapter, so the chapter is kept in
        # the canonical id to stop e.g. two "Section 1" nodes from colliding.
        scope = f"cpt_{chapter}_" if chapter else ""
        return "section", number, f"{celex}_{scope}sct_{number}", f"Section {number}"

    match = _CHAPTER_ID.fullmatch(node_id)
    if match:
        number = match.group(1)
        return "chapter", number, f"{celex}_cpt_{number}", f"Chapter {number}"

    match = _ARTICLE_ID.fullmatch(node_id)
    if match:
        number = match.group(1)
        return "article", number, f"{celex}_art_{number}", f"Article {number}"

    match = _ANNEX_ID.fullmatch(node_id)
    if match:
        number = match.group(1)
        return "annex", number, f"{celex}_anx_{number}", f"Annex {number}"

    return None


def parse_eu_ai_act_html(
    html_content: str,
    celex: str = "32024R1689",
    regulation_id: str = "EU AI Act",
    lang: str = "EN",
) -> Dict:
    """Parse EUR-Lex HTML into a flat list of provision nodes.

    The document is searched for ``<div>`` elements whose ``id`` denotes a
    recital (``rct_N``), chapter (``cpt_X``), section (``cpt_X.sct_N``),
    article (``art_N``) or annex (``anx_X``). Each one becomes a node, and its
    direct ``div.eli-subdivision`` children are visited recursively. Children
    whose id is not one of the recognised forms are kept with kind
    ``"paragraph"``.

    Every element is emitted at most once. A provision that is nested inside
    an already-emitted provision, but is not one of its direct
    ``eli-subdivision`` children, is attached to its nearest emitted ancestor.

    Args:
        html_content: The HTML document as a string.
        celex: CELEX number used as prefix of every ``canonical_id``.
        regulation_id: Human-readable regulation name stored on every node
            and on the returned graph.
        lang: Language code stored on every node.

    Returns:
        A dict with graph metadata, ``provisions`` (list of node dicts in
        document order) and ``relations`` (currently always empty).
    """
    from datetime import datetime, timezone

    soup = BeautifulSoup(html_content, "lxml")

    provisions: List[Dict] = []
    relations: List[Dict] = []
    # Emitted nodes keyed by the identity of their source element; used both
    # to avoid emitting an element twice and to look up a parent node.
    walked: Dict[int, Dict] = {}

    # 1. Find main content; fall back to the whole document when neither
    #    container exists, instead of failing on ``None.find_all``.
    main = (
        soup.find("div", id="docHtml")
        or soup.find("div", class_="eli-container")
        or soup
    )

    # 2. Recursive walker (depth-first, respects DOM nesting)
    def walk(element, parent_id=None, current_path=None):
        if element is None or not element.get("id") or id(element) in walked:
            return

        node_id = element["id"]
        # Copy so that sibling nodes never share one mutable list.
        path = list(current_path or [])

        classified = _classify_eli_id(node_id, celex)
        if classified is None:
            # Fallback for paragraphs/points; to be refined later.
            kind, number, full_id, citation = "paragraph", None, f"{celex}_{node_id}", ""
        else:
            kind, number, full_id, citation = classified

        # Extract text (handle the classic EUR-Lex two-column table pattern:
        # first cell holds the marker, second cell the content).
        table = element.find("table")
        if table is not None:
            lines = []
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) >= 2:
                    marker = cells[0].get_text(strip=True)
                    content = cells[1].get_text(separator=" ", strip=True)
                    lines.append(f"{marker} {content}")
            text = "\n".join(lines)
        else:
            text = element.get_text(separator=" ", strip=True)

        # Prefer the dedicated ELI title wrapper; otherwise use the first
        # heading-like tag, as before.
        heading = element.find("div", class_="eli-title") or element.find(
            ["h1", "h2", "h3", "strong"]
        )
        title = heading.get_text(strip=True) if heading is not None else None

        node = {
            "id": node_id,
            "canonical_id": full_id,
            "celex": celex,
            "regulation_id": regulation_id,
            "lang": lang,
            "kind": kind,
            "level": kind,                     # or derive from depth
            "item_number": number,
            "citation": citation,
            "title": title,
            "text": text.strip(),
            "path": path,
            "parent_id": parent_id,
            "depth": len(path),
            "is_requirement": False,   # LLM step 2
            "requirement_type": None,
            "obligations": [],
            "references": [],
            "roles": [],
            # TODO: fill in provenance (html_start, raw_hash, etc.). Kept as
            # an empty dict so the result stays JSON-serialisable.
            "provenance": {},
        }

        provisions.append(node)
        walked[id(element)] = node

        # Recurse into direct structural children only.
        child_path = path + [full_id]
        for child in element.find_all("div", class_="eli-subdivision", recursive=False):
            walk(child, parent_id=full_id, current_path=child_path)

    # 3. Start a walk at every recognised provision not already reached by
    #    recursion. ``find_all`` yields document order, so a container is
    #    always emitted before anything nested inside it.
    for candidate in main.find_all("div", id=re.compile(r"(rct_|cpt_|art_|anx_)")):
        if id(candidate) in walked:
            continue
        if _classify_eli_id(candidate["id"], celex) is None:
            # The loose filter above also matches sub-ids such as
            # "art_1.tit_1"; those are not provisions of their own.
            continue

        parent = next(
            (walked[id(anc)] for anc in candidate.parents if id(anc) in walked),
            None,
        )
        if parent is None:
            walk(candidate)
        else:
            walk(
                candidate,
                parent_id=parent["canonical_id"],
                current_path=parent["path"] + [parent["canonical_id"]],
            )

    return {
        "graph_version": "0.2",
        "celex_id": celex,
        "regulation_id": regulation_id,
        "source_name": "EUR-Lex HTML",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "provisions": provisions,
        "relations": relations,   # built from detected "Article XX", "Annex Y" etc.
    }