In [12]:
from __future__ import annotations

from pathlib import Path
import tarfile
import xml.etree.ElementTree as ET
from typing import Iterator

# Generator function: streams parsed XML roots out of the archives
def iter_xml_roots(archives_dir: str | Path, pattern: str = "*.tar.gz"
                   ) -> Iterator[tuple[Path, str, ET.Element]]:
    """
    Iterate over every XML file stored in the .tar.gz archives of ``archives_dir``.

    Archives matching ``pattern`` are visited in sorted path order. Members are
    parsed straight from the archive; nothing is extracted to disk.

    Args:
        archives_dir: Directory whose archives are scanned.
        pattern: Glob pattern (relative to ``archives_dir``) selecting the archives.

    Yields:
        (archive_path, xml_member_name, xml_root_element)

    Errors are reported as a ``[WARN]`` line on stdout instead of being raised:
      * an XML member that fails to parse is skipped;
      * an archive that cannot be read (not a gzip/tar file, truncated,
        corrupt, I/O error) is abandoned at the point of failure and the
        iteration continues with the next archive.
    """
    # Local import: only needed for the error type caught at the bottom.
    import zlib

    archives_dir = Path(archives_dir)

    for archive_path in sorted(archives_dir.glob(pattern)):
        try:
            with tarfile.open(archive_path, mode="r:gz") as tar:
                # Iterate members as a stream (more memory-friendly than getmembers()).
                for member in tar:
                    if not member.isfile():
                        continue
                    if not member.name.lower().endswith(".xml"):
                        continue

                    extracted = tar.extractfile(member)
                    if extracted is None:
                        continue

                    try:
                        with extracted as f:
                            root = ET.parse(f).getroot()
                    except ET.ParseError as e:
                        print(f"[WARN] XML ParseError in {archive_path}::{member.name}: {e}")
                        continue

                    # Yield outside the ParseError guard and after the member
                    # handle is closed: the tree is fully parsed at this point.
                    yield archive_path, member.name, root

        # gzip signals a truncated stream with EOFError and damaged deflate
        # data with zlib.error; neither derives from OSError/ReadError, so they
        # must be listed explicitly or one bad archive aborts the whole run.
        except (tarfile.ReadError, OSError, EOFError, zlib.error) as e:
            print(f"[WARN] Konnte Archiv nicht lesen: {archive_path} ({e})")

In [13]:
def extract_s_id_and_pts(root: ET.Element):
    """
    Collect the id and the ar/dp ``pt`` attributes of every ``<s>`` child of ``root``.

    Args:
        root: Element carrying a ``station`` attribute and ``<s>`` children.

    Returns:
        ``None`` when ``root`` has no (or an empty) ``station`` attribute.
        Otherwise a tuple ``(station_name, rows)`` where ``rows`` holds one dict
        per direct ``<s>`` child:
          { "id": <s/@id>, "ar_pt": <ar/@pt or None>, "dp_pt": <dp/@pt or None> }
    """
    station_name = root.get("station")
    if not station_name:
        return None

    def _pt_of(stop, tag):
        # ``pt`` attribute of the first <tag> child, or None if there is no such child.
        child = stop.find(tag)
        return None if child is None else child.get("pt")

    rows = [
        {
            "id": stop.get("id"),
            "ar_pt": _pt_of(stop, "ar"),
            "dp_pt": _pt_of(stop, "dp"),
        }
        for stop in root.findall("s")
    ]
    return station_name, rows

In [14]:
# Directory handed to iter_xml_roots() as the archive folder; being a relative
# path, it resolves against the kernel's current working directory.
TIMETABLES_PATH = "../timetables"
# Not referenced in the visible cells; presumably the folder holding the
# timetable-change archives. TODO confirm against the cells that use it.
TIMETABLE_CHANGES_PATH = "../timetable_changes"

In [17]:
# Preview only: print the extraction result for the first three XML documents.
i = 0  # stays 0 when no XML document is found at all

for i, (path, data_name, xml_root) in enumerate(iter_xml_roots(TIMETABLES_PATH), start=1):
    result = extract_s_id_and_pts(xml_root)
    print(result)
    if i >= 3:
        break

None
None
('Berlin Alexanderplatz', [{'id': '3645252224070148387-2509021119-12', 'ar_pt': '2509021220', 'dp_pt': '2509021221'}, {'id': '8750317990040285840-2509021014-20', 'ar_pt': '2509021205', 'dp_pt': '2509021206'}, {'id': '2128666462956236056-2509021214-4', 'ar_pt': '2509021241', 'dp_pt': '2509021242'}, {'id': '8129662548196100785-2509021155-8', 'ar_pt': '2509021253', 'dp_pt': '2509021254'}, {'id': '-8551086525322998602-2509021207-8', 'ar_pt': '2509021247', 'dp_pt': '2509021248'}, {'id': '-4129528892956638430-2509021144-4', 'ar_pt': '2509021210', 'dp_pt': '2509021211'}, {'id': '-412309776969860423-2509021201-12', 'ar_pt': '2509021252', 'dp_pt': '2509021253'}, {'id': '-7177845517273939006-2509021210-2', 'ar_pt': '2509021215', 'dp_pt': '2509021217'}, {'id': '-4989708721191209857-2509021126-12', 'ar_pt': '2509021233', 'dp_pt': '2509021234'}, {'id': '-8986680849348819865-2509021012-16', 'ar_pt': '2509021205', 'dp_pt': '2509021206'}, {'id': '-2975196180293385461-2509021134-8', 'ar_pt': 