In [1]:
from __future__ import annotations

import tarfile
import xml.etree.ElementTree as ET
import zlib
from pathlib import Path
from typing import Iterator

# Iterator Funktion
def iter_xml_roots(archives_dir: str | Path, pattern: str = "*.tar.gz"
                   ) -> Iterator[tuple[Path, str, ET.Element]]:
    """
    Iteriert über alle XML-Dateien in allen .tar.gz-Archiven in archives_dir.

    Yields:
        (archive_path, xml_member_name, xml_root_element)
    """
    archives_dir = Path(archives_dir)

    for archive_path in sorted(archives_dir.glob(pattern)):
        try:
            with tarfile.open(archive_path, mode="r:gz") as tar:
                # Iteriert streamend über Members (speichersparender als getmembers())
                for member in tar:
                    if not member.isfile():
                        continue
                    if not member.name.lower().endswith(".xml"):
                        continue

                    extracted = tar.extractfile(member)
                    if extracted is None:
                        continue

                    try:
                        with extracted as f:
                            tree = ET.parse(f)
                            yield archive_path, member.name, tree.getroot()
                    except ET.ParseError as e:
                        print(f"[WARN] XML ParseError in {archive_path}::{member.name}: {e}")

        except (tarfile.ReadError, OSError) as e:
            print(f"[WARN] Konnte Archiv nicht lesen: {archive_path} ({e})")

In [2]:
import os
from getpass import getpass

import psycopg2

# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD). Host, database and user fall back to
# the previous local defaults. The password is never stored in the notebook:
# if PGPASSWORD is not set, it is asked for interactively.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
)

In [3]:
from pathlib import Path
from datetime import datetime
from zoneinfo import ZoneInfo
from pathlib import PurePosixPath

# Optional fast path: psycopg2.extras.execute_values inserts many rows per statement.
try:
    from psycopg2.extras import execute_values
except ImportError:
    # Only a missing module selects the slower executemany() fallback (callers
    # check for None); any other error while importing should surface instead
    # of being silently ignored.
    execute_values = None


# ----------------------------
# 1) Schema-Änderung (einmalig)
# ----------------------------
def ensure_actual_columns(conn):
    """
    Add the columns ``actual_arrival`` and ``actual_departure`` to ``public.stops``.

    Both columns are ``timestamp with time zone`` and are created with
    ``ADD COLUMN IF NOT EXISTS``, so calling this repeatedly is harmless.

    Args:
        conn: Open database connection (psycopg2-style: ``cursor()``,
            ``commit()``, ``rollback()``).

    Raises:
        Any exception raised by the database call is re-raised after the
        transaction has been rolled back.
    """
    add_columns_sql = """
            ALTER TABLE public.stops
                ADD COLUMN IF NOT EXISTS actual_arrival   timestamp with time zone,
                ADD COLUMN IF NOT EXISTS actual_departure timestamp with time zone;
        """
    try:
        with conn.cursor() as cur:
            cur.execute(add_columns_sql)
        conn.commit()
    except Exception:
        # Without a rollback the connection would stay in an aborted transaction
        # (every later statement fails) and keep whatever locks the ALTER TABLE
        # had already taken.
        conn.rollback()
        raise


# ----------------------------
# 2) Helpers: ct -> datetime, change_member -> timetable_member
# ----------------------------
BERLIN_TZ = ZoneInfo("Europe/Berlin")

def parse_db_ct(ct: str | None):
    """
    DB 'ct' Format: YYMMDDHHMM (10 Ziffern), z.B. 2509021817
    """
    if not ct:
        return None
    ct = ct.strip()
    if len(ct) != 10 or not ct.isdigit():
        return None
    dt = datetime.strptime(ct, "%y%m%d%H%M")
    return dt.replace(tzinfo=BERLIN_TZ)

def change_member_to_timetable_member(change_member_name: str) -> str | None:
    p = PurePosixPath(change_member_name)  # erzwingt "/" statt "\"
    parts = p.parts
    if len(parts) < 2:
        return None

    ts_dir = parts[0]  # "2510011345"
    if len(ts_dir) != 10 or not ts_dir.isdigit():
        return None

    hour_dir = ts_dir[:8] + "00"

    fname = parts[-1]
    if fname.endswith("_change.xml"):
        tt_fname = fname[:-len("_change.xml")] + "_timetable.xml"
    elif fname.endswith("change.xml"):
        tt_fname = fname[:-len("change.xml")] + "timetable.xml"
    else:
        return None

    return str(PurePosixPath(hour_dir) / tt_fname)


# ----------------------------
# 3) Batch-Update via TEMP staging table
# ----------------------------
def init_stage_table(conn):
    """
    Create the session-local staging table ``_stops_change_stage`` and its index.

    Both statements use ``IF NOT EXISTS``, so the function can be called again
    on the same connection without error.

    Args:
        conn: Open database connection (psycopg2-style: ``cursor()``,
            ``commit()``, ``rollback()``).

    Raises:
        Any exception raised by the database calls is re-raised after the
        transaction has been rolled back.
    """
    # Deliberately no "ON COMMIT DROP": the commit at the end of this function
    # would otherwise remove the table again immediately.
    create_table_sql = """
            CREATE TEMP TABLE IF NOT EXISTS _stops_change_stage (
                xml_member_name   text NOT NULL,
                stop_id           text NOT NULL,
                actual_arrival    timestamptz NULL,
                actual_departure  timestamptz NULL
            );
        """
    # The index is optional. Despite the "_pk" suffix it is a plain,
    # non-unique index on the two key columns.
    create_index_sql = """
            CREATE INDEX IF NOT EXISTS _stops_change_stage_pk
            ON _stops_change_stage (xml_member_name, stop_id);
        """
    try:
        with conn.cursor() as cur:
            cur.execute(create_table_sql)
            cur.execute(create_index_sql)
        conn.commit()
    except Exception:
        # Leave the connection usable for the next statement instead of stuck
        # in an aborted transaction.
        conn.rollback()
        raise


def _merge_stage_rows(batch_rows):
    """Collapse rows sharing (xml_member_name, stop_id) into one row; later non-None values win."""
    merged = {}
    for xml_member_name, stop_id, actual_arrival, actual_departure in batch_rows:
        key = (xml_member_name, stop_id)
        previous = merged.get(key)
        if previous is not None:
            # Same rule as the COALESCE in the UPDATE: a missing value keeps the older one.
            if actual_arrival is None:
                actual_arrival = previous[0]
            if actual_departure is None:
                actual_departure = previous[1]
        merged[key] = (actual_arrival, actual_departure)
    return [key + times for key, times in merged.items()]


def _stage_and_update(conn, stage_rows):
    """Refill the staging table with stage_rows, run the UPDATE and return its row count."""
    with conn.cursor() as cur:
        cur.execute("TRUNCATE TABLE _stops_change_stage;")

        if execute_values is not None:
            execute_values(
                cur,
                """
                INSERT INTO _stops_change_stage (xml_member_name, stop_id, actual_arrival, actual_departure)
                VALUES %s
                """,
                stage_rows,
                page_size=10_000
            )
        else:
            cur.executemany(
                """
                INSERT INTO _stops_change_stage (xml_member_name, stop_id, actual_arrival, actual_departure)
                VALUES (%s, %s, %s, %s)
                """,
                stage_rows
            )

        # COALESCE keeps the value already stored in public.stops when the
        # staged row has no value for that column.
        cur.execute("""
            UPDATE public.stops s
            SET
                actual_arrival   = COALESCE(st.actual_arrival,   s.actual_arrival),
                actual_departure = COALESCE(st.actual_departure, s.actual_departure)
            FROM _stops_change_stage st
            WHERE s.xml_member_name = st.xml_member_name
              AND s.stop_id         = st.stop_id;
        """)
        return cur.rowcount


def apply_batch(conn, batch_rows):
    """
    Write one batch of change rows to ``public.stops`` via the staging table.

    Rows with the same (xml_member_name, stop_id) are merged before staging, in
    the order given: a later non-None arrival/departure replaces an earlier one.
    This matters because PostgreSQL's ``UPDATE ... FROM`` uses only one of
    several matching source rows and does not define which one, so unmerged
    duplicates would give arbitrary results and could drop values.

    Args:
        conn: Open database connection (psycopg2-style: ``cursor()``,
            ``commit()``, ``rollback()``).
        batch_rows: Sequence of 4-tuples
            (xml_member_name, stop_id, actual_arrival, actual_departure);
            either timestamp may be None. Put older rows first.

    Returns:
        The row count reported for the UPDATE statement; 0 for an empty batch
        (the database is not touched in that case).

    Raises:
        Any exception raised by the database calls is re-raised after the
        transaction has been rolled back.
    """
    if not batch_rows:
        return 0

    try:
        # Safety net: the staging table may be missing if notebook cells were
        # re-run or a new connection is in use.
        init_stage_table(conn)
        updated = _stage_and_update(conn, _merge_stage_rows(batch_rows))
        conn.commit()
    except Exception:
        # Keep the connection usable and release locks if any statement failed.
        conn.rollback()
        raise
    return updated


# ----------------------------
# 4) Main: Changes einlesen & updaten
# ----------------------------
def process_change_archives(conn, archives_dir: str | Path, pattern: str = "*.tar.gz", batch_size: int = 50_000):
    ensure_actual_columns(conn)
    init_stage_table(conn)

    batch = []
    n_change_files = 0
    n_s_nodes = 0
    n_updates = 0
    n_skipped_no_match = 0

    for archive_path, xml_member_name, root in iter_xml_roots(archives_dir, pattern=pattern):
        # Nur Change-Files
        if not (xml_member_name.endswith("_change.xml") or xml_member_name.endswith("change.xml")):
            continue

        tt_member = change_member_to_timetable_member(xml_member_name)
        if tt_member is None:
            continue

        n_change_files += 1

        # Change-XML: <s id="..."><ar ct="...">...</ar><dp ct="...">...</dp></s>
        # (so wie in deinem Beispiel) :contentReference[oaicite:1]{index=1}
        for s in root.findall("./s"):
            stop_id = s.get("id")
            if not stop_id:
                continue

            ar = s.find("ar")
            dp = s.find("dp")

            ar_ct = ar.get("ct") if ar is not None else None
            dp_ct = dp.get("ct") if dp is not None else None

            actual_arrival = parse_db_ct(ar_ct)
            actual_departure = parse_db_ct(dp_ct)

            # Wenn beides fehlt, bringt's nichts
            if actual_arrival is None and actual_departure is None:
                continue

            batch.append((tt_member, stop_id, actual_arrival, actual_departure))
            n_s_nodes += 1

            if len(batch) >= batch_size:
                n_updates += apply_batch(conn, batch)
                batch.clear()

    # Rest flushen
    if batch:
        n_updates += apply_batch(conn, batch)
        batch.clear()

    print(f"Done.")
    print(f"  change files processed: {n_change_files}")
    print(f"  stop updates staged:    {n_s_nodes}")
    print(f"  rows updated (SQL):     {n_updates}")


# ----------------------------
# 5) AUSFÜHREN (Pfad anpassen)
# ----------------------------
# Example:
# Directory that is searched for "*.tar.gz" archives (relative path).
changes_dir = Path("../timetable_changes")
# Runs the whole import on the connection opened above and prints a summary at the end.
process_change_archives(conn, changes_dir, pattern="*.tar.gz", batch_size=50_000)

Done.
  change files processed: 540668
  stop updates staged:    1294515
  rows updated (SQL):     686855


Wichtig: Die Verbindung am Ende mit ``conn.close()`` schließen. Ich hatte einmal einen Deadlock-Fehler, der möglicherweise dadurch entstanden ist, dass mehrere Connections aus verschiedenen Notebooks gleichzeitig geöffnet waren.

In [4]:
# Close the database connection once the import is finished.
conn.close()