In [None]:
from __future__ import annotations

from pathlib import Path
import tarfile
import xml.etree.ElementTree as ET
import sqlite3


def iter_planned_stops_from_archives(
    archives_dir: str | Path,
    pattern: str = "*.tar.gz",
    skip_change_files: bool = True,
):
    """
    Yield the planned arrival/departure time of every stop in the archives.

    Archives in ``archives_dir`` matching ``pattern`` are visited in sorted
    order. Every regular ``*.xml`` member is parsed, and for each ``<s>``
    element directly below the XML root that has a non-empty ``id`` attribute
    one tuple is yielded:

      (source_label, stop_id, ar_pt, dp_pt)

    ``source_label`` is ``"<archive_name>::<member_name>"`` (useful for
    debugging and examples). ``ar_pt`` / ``dp_pt`` are the stripped ``pt``
    attributes of the ``<ar>`` / ``<dp>`` children; a missing element, a
    missing attribute and a blank attribute are all reported as ``None``.

    With ``skip_change_files`` set, members whose name ends in ``change.xml``
    are ignored. Members that are not well-formed XML are skipped. An archive
    that cannot be read (including a truncated compressed stream) is abandoned
    at the point of failure and the scan continues with the next archive.
    """

    def planned_time(node):
        # Collapse "no element", "no attribute" and "blank attribute" to None.
        if node is None:
            return None
        value = node.get("pt")
        if value is None:
            return None
        return value.strip() or None

    archives_dir = Path(archives_dir)

    for archive_path in sorted(archives_dir.glob(pattern)):
        try:
            with tarfile.open(archive_path, mode="r:*") as tf:
                # Iterate lazily: getmembers() would scan the whole compressed
                # stream first and force a rewind before the first read.
                for m in tf:
                    if not m.isfile() or not m.name.endswith(".xml"):
                        continue
                    # "change.xml" also covers the "_change.xml" suffix.
                    if skip_change_files and m.name.endswith("change.xml"):
                        continue

                    f = tf.extractfile(m)
                    if f is None:
                        continue
                    with f:
                        payload = f.read()

                    try:
                        root = ET.fromstring(payload)
                    except ET.ParseError:
                        continue

                    source = f"{archive_path.name}::{m.name}"

                    for s in root.findall("./s"):
                        stop_id = s.get("id")
                        if not stop_id:
                            continue

                        yield (
                            source,
                            stop_id,
                            planned_time(s.find("ar")),
                            planned_time(s.find("dp")),
                        )

        # EOFError is what a truncated compressed stream raises; it is neither
        # a TarError nor an OSError and would otherwise abort the whole scan.
        except (tarfile.TarError, OSError, EOFError):
            continue


def check_stop_id_pt_conflicts(
    archives_dir: str | Path,
    pattern: str = "*.tar.gz",
    chunk_size: int = 50_000,
    max_examples: int = 30,
    sqlite_path: str | Path = "stopid_pt_check.sqlite",
    skip_change_files: bool = True,
    reset_sqlite: bool = True,
):
    """
    Check whether a stop_id occurs several times with differing ar_pt/dp_pt.

    All stops yielded by ``iter_planned_stops_from_archives`` are streamed into
    a SQLite database in chunks of ``chunk_size`` rows. The first row seen for
    a stop_id is stored in ``first_seen``; every later row is compared against
    that first row. Stop ids with a difference are recorded in ``conflict_ids``
    as either ``enrichment_only`` (only NULL<->value differences) or
    ``true_conflict`` (some field is non-null on both sides and differs).

    Parameters:
      archives_dir, pattern, skip_change_files: forwarded to the iterator.
      chunk_size: number of rows buffered before they are flushed to SQLite.
      max_examples: upper bound for the number of example dicts returned.
      sqlite_path: database file used as working storage.
      reset_sqlite: when true (default), the ``first_seen`` and
        ``conflict_ids`` tables are dropped first, so rows left behind by an
        earlier run on the same file cannot leak into this run's result.
        Pass ``False`` to keep accumulating into an existing database.

    Returns: summary dict with the row/stop-id counts, the collected examples,
    the database path and a ``notes`` entry explaining the two kinds.

    The connection is closed even if the scan raises.
    """
    sqlite_path = str(sqlite_path)
    db = sqlite3.connect(sqlite_path)
    try:
        return _collect_pt_conflicts(
            db,
            archives_dir,
            pattern=pattern,
            chunk_size=chunk_size,
            max_examples=max_examples,
            sqlite_path=sqlite_path,
            skip_change_files=skip_change_files,
            reset_sqlite=reset_sqlite,
        )
    finally:
        db.close()


def _collect_pt_conflicts(
    db,
    archives_dir,
    *,
    pattern,
    chunk_size,
    max_examples,
    sqlite_path,
    skip_change_files,
    reset_sqlite,
):
    """Run the ar_pt/dp_pt comparison on an open connection and build the summary."""
    db.execute("PRAGMA journal_mode=WAL;")
    db.execute("PRAGMA synchronous=OFF;")
    db.execute("PRAGMA temp_store=MEMORY;")

    if reset_sqlite:
        # Start from empty tables so a re-run is not compared against
        # first-seen values of a previous run.
        db.execute("DROP TABLE IF EXISTS first_seen;")
        db.execute("DROP TABLE IF EXISTS conflict_ids;")

    db.execute("""
        CREATE TABLE IF NOT EXISTS first_seen (
            stop_id    TEXT PRIMARY KEY,
            ar_pt      TEXT,
            dp_pt      TEXT,
            first_src  TEXT
        );
    """)
    db.execute("""
        CREATE TABLE IF NOT EXISTS conflict_ids (
            stop_id TEXT PRIMARY KEY,
            kind    TEXT NOT NULL   -- 'enrichment_only' oder 'true_conflict'
        );
    """)

    # TEMP batch table (refilled for every chunk)
    db.execute("""
        CREATE TEMP TABLE IF NOT EXISTS batch (
            stop_id TEXT,
            ar_pt   TEXT,
            dp_pt   TEXT,
            src     TEXT
        );
    """)
    db.execute("CREATE INDEX IF NOT EXISTS batch_stop_id_idx ON batch(stop_id);")

    def is_null(x: str | None) -> bool:
        return x is None or x == ""

    def classify(f_ar, f_dp, n_ar, n_dp) -> str:
        # true_conflict if any field is non-null on both sides and differs
        for a, b in ((f_ar, n_ar), (f_dp, n_dp)):
            if (not is_null(a)) and (not is_null(b)) and a != b:
                return "true_conflict"
        return "enrichment_only"

    total_seen = 0
    unique_first_seen_inserts = 0
    mismatch_rows = 0

    examples = []  # list of dicts, at most max_examples entries

    chunk = []

    def flush_chunk(rows):
        nonlocal unique_first_seen_inserts, mismatch_rows

        if not rows:
            return

        cur = db.cursor()
        cur.execute("DELETE FROM batch;")
        cur.executemany("INSERT INTO batch (stop_id, ar_pt, dp_pt, src) VALUES (?, ?, ?, ?);", rows)

        # 1) Remember stop_ids that have not been seen before
        cur.execute("""
            INSERT OR IGNORE INTO first_seen (stop_id, ar_pt, dp_pt, first_src)
            SELECT stop_id, ar_pt, dp_pt, src FROM batch;
        """)
        unique_first_seen_inserts += cur.rowcount  # only the stop_ids that were really inserted

        # 2) Find mismatches against the first-seen values.
        #    first_seen also holds the rows inserted just above; those do not
        #    show up as mismatches because their ar_pt/dp_pt are identical.
        cur.execute("""
            SELECT
                b.stop_id,
                f.ar_pt, f.dp_pt, f.first_src,
                b.ar_pt, b.dp_pt, b.src
            FROM batch b
            JOIN first_seen f ON f.stop_id = b.stop_id
            WHERE COALESCE(f.ar_pt, '') != COALESCE(b.ar_pt, '')
               OR COALESCE(f.dp_pt, '') != COALESCE(b.dp_pt, '');
        """)
        diffs = cur.fetchall()
        mismatch_rows += len(diffs)

        # 3) Store conflicting stop_ids once each and classify them
        for stop_id, f_ar, f_dp, f_src, n_ar, n_dp, n_src in diffs:
            kind = classify(f_ar, f_dp, n_ar, n_dp)

            cur.execute("INSERT OR IGNORE INTO conflict_ids (stop_id, kind) VALUES (?, ?);", (stop_id, kind))

            # An id first stored as enrichment_only is upgraded once a real
            # conflict shows up; it is never downgraded again.
            if kind == "true_conflict":
                cur.execute("""
                    UPDATE conflict_ids
                    SET kind = 'true_conflict'
                    WHERE stop_id = ? AND kind != 'true_conflict';
                """, (stop_id,))

            # Collect examples (bounded)
            if len(examples) < max_examples:
                examples.append({
                    "stop_id": stop_id,
                    "first": {"ar_pt": f_ar, "dp_pt": f_dp, "src": f_src},
                    "other": {"ar_pt": n_ar, "dp_pt": n_dp, "src": n_src},
                    "kind": kind,
                })

        db.commit()
        cur.close()

    for src, stop_id, ar_pt, dp_pt in iter_planned_stops_from_archives(
        archives_dir, pattern=pattern, skip_change_files=skip_change_files
    ):
        total_seen += 1
        chunk.append((stop_id, ar_pt, dp_pt, src))
        if len(chunk) >= chunk_size:
            flush_chunk(chunk)
            chunk.clear()

    if chunk:
        flush_chunk(chunk)
        chunk.clear()

    # The per-kind totals are counted once at the end from the deduplicated table.
    cur = db.cursor()
    cur.execute("SELECT COUNT(*) FROM conflict_ids;")
    conflict_stopids = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM conflict_ids WHERE kind='true_conflict';")
    true_conflict_stopids = cur.fetchone()[0]
    cur.close()

    return {
        "total_seen_rows_in_xml": total_seen,
        "unique_stop_ids_first_seen": unique_first_seen_inserts,
        "mismatch_rows_seen": mismatch_rows,  # number of batch rows that differed (may exceed conflict_stopids)
        "conflicting_stop_ids": conflict_stopids,
        "true_conflict_stop_ids": true_conflict_stopids,
        "examples": examples,
        "sqlite_db_path": sqlite_path,
        "notes": {
            "kind=enrichment_only": "nur NULL<->Wert Unterschiede; kein Feld hat non-null vs non-null und verschieden",
            "kind=true_conflict": "mindestens ein Feld (ar_pt oder dp_pt) ist auf beiden Seiten non-null und unterschiedlich",
        }
    }


In [None]:
# --- Example call ---
result = check_stop_id_pt_conflicts(
    archives_dir="../timetables",
    pattern="*.tar.gz",
    chunk_size=50_000,
    max_examples=20,
)
# Number of stop ids with any difference, then those with a real conflict.
print(*(result[key] for key in ("conflicting_stop_ids", "true_conflict_stop_ids")))
for example in result["examples"]:
    print(example)

In [1]:
from __future__ import annotations

from pathlib import Path
import tarfile
import xml.etree.ElementTree as ET
import sqlite3


FIELDS = ["ar_pt", "dp_pt", "ar_ct", "dp_ct", "ar_clt", "dp_clt"]


def iter_stop_attrs_from_archives(
    archives_dir: str | Path,
    pattern: str = "*.tar.gz",
    skip_change_files: bool = True,
):
    """
    Yield the pt/ct/clt time attributes of every stop found in the archives.

    Archives in ``archives_dir`` matching ``pattern`` are visited in sorted
    order. Every regular ``*.xml`` member is parsed, and for each ``<s>``
    element directly below the XML root that has a non-empty ``id`` attribute
    one tuple is yielded:

      (source_label, stop_id, ar_pt, dp_pt, ar_ct, dp_ct, ar_clt, dp_clt)

    ``source_label`` is ``"<archive_name>::<member_name>"`` (useful for
    debugging and examples). The remaining values are the stripped ``pt``,
    ``ct`` and ``clt`` attributes of the ``<ar>`` / ``<dp>`` children; a
    missing element, a missing attribute and a blank attribute are all
    reported as ``None``.

    With ``skip_change_files`` set, members whose name ends in ``change.xml``
    are ignored. Members that are not well-formed XML are skipped. An archive
    that cannot be read (including a truncated compressed stream) is abandoned
    at the point of failure and the scan continues with the next archive.
    """

    def attr_or_none(node, attr):
        # Defined once per generator instead of once per <s> element.
        if node is None:
            return None
        value = node.get(attr)
        if value is None:
            return None
        return value.strip() or None

    archives_dir = Path(archives_dir)

    for archive_path in sorted(archives_dir.glob(pattern)):
        try:
            with tarfile.open(archive_path, mode="r:*") as tf:
                # Iterate lazily: getmembers() would scan the whole compressed
                # stream first and force a rewind before the first read.
                for m in tf:
                    if not m.isfile() or not m.name.endswith(".xml"):
                        continue
                    # "change.xml" also covers the "_change.xml" suffix.
                    if skip_change_files and m.name.endswith("change.xml"):
                        continue

                    f = tf.extractfile(m)
                    if f is None:
                        continue
                    with f:
                        payload = f.read()

                    try:
                        root = ET.fromstring(payload)
                    except ET.ParseError:
                        continue

                    source = f"{archive_path.name}::{m.name}"

                    for s in root.findall("./s"):
                        stop_id = s.get("id")
                        if not stop_id:
                            continue

                        ar = s.find("ar")
                        dp = s.find("dp")

                        yield (
                            source,
                            stop_id,
                            attr_or_none(ar, "pt"),
                            attr_or_none(dp, "pt"),
                            attr_or_none(ar, "ct"),
                            attr_or_none(dp, "ct"),
                            attr_or_none(ar, "clt"),
                            attr_or_none(dp, "clt"),
                        )

        # EOFError is what a truncated compressed stream raises; it is neither
        # a TarError nor an OSError and would otherwise abort the whole scan.
        except (tarfile.TarError, OSError, EOFError):
            continue


def check_stop_id_timeattr_conflicts(
    archives_dir: str | Path,
    pattern: str = "*.tar.gz",
    chunk_size: int = 50_000,
    max_examples: int = 30,
    sqlite_path: str | Path = "stopid_timeattr_check.sqlite",
    reset_sqlite: bool = True,
    skip_change_files: bool = True,
):
    """
    Check stop_ids for conflicting values in:
      ar/@pt, dp/@pt, ar/@ct, dp/@ct, ar/@clt, dp/@clt

    All stops yielded by ``iter_stop_attrs_from_archives`` are streamed into a
    SQLite database in chunks of ``chunk_size`` rows. The first row seen for a
    stop_id is stored in ``first_seen``; every later row is compared against
    that first row, field by field.

    Conflict   = both values non-null and different.
    Enrichment = only NULL<->value (no real contradiction).

    Parameters:
      archives_dir, pattern, skip_change_files: forwarded to the iterator.
      chunk_size: number of rows buffered before they are flushed to SQLite.
      max_examples: upper bound for the number of example dicts returned.
      sqlite_path: database file used as working storage.
      reset_sqlite: when true, an existing database file is deleted first,
        together with its ``-wal`` / ``-shm`` sidecar files.

    Returns: summary dict including counts per field and examples.

    The connection is closed even if the scan raises.
    """
    sqlite_path = Path(sqlite_path)
    if reset_sqlite:
        # The database runs in WAL mode. A sidecar left behind by an aborted
        # run must not be paired with the freshly created database file.
        for suffix in ("", "-wal", "-shm"):
            stale = Path(f"{sqlite_path}{suffix}")
            if stale.exists():
                stale.unlink()

    db = sqlite3.connect(str(sqlite_path))
    try:
        return _collect_timeattr_conflicts(
            db,
            archives_dir,
            pattern=pattern,
            chunk_size=chunk_size,
            max_examples=max_examples,
            sqlite_path=sqlite_path,
            skip_change_files=skip_change_files,
        )
    finally:
        db.close()


def _collect_timeattr_conflicts(
    db,
    archives_dir,
    *,
    pattern,
    chunk_size,
    max_examples,
    sqlite_path,
    skip_change_files,
):
    """Run the per-field comparison on an open connection and build the summary."""
    db.execute("PRAGMA journal_mode=WAL;")
    db.execute("PRAGMA synchronous=OFF;")
    db.execute("PRAGMA temp_store=MEMORY;")

    db.execute("""
        CREATE TABLE IF NOT EXISTS first_seen (
            stop_id    TEXT PRIMARY KEY,
            ar_pt      TEXT, dp_pt      TEXT,
            ar_ct      TEXT, dp_ct      TEXT,
            ar_clt     TEXT, dp_clt     TEXT,
            first_src  TEXT
        );
    """)

    # stop_id -> one conflict flag and one enrichment flag per field
    db.execute("""
        CREATE TABLE IF NOT EXISTS flags (
            stop_id TEXT PRIMARY KEY,
            ar_pt_conflict  INTEGER NOT NULL DEFAULT 0,
            dp_pt_conflict  INTEGER NOT NULL DEFAULT 0,
            ar_ct_conflict  INTEGER NOT NULL DEFAULT 0,
            dp_ct_conflict  INTEGER NOT NULL DEFAULT 0,
            ar_clt_conflict INTEGER NOT NULL DEFAULT 0,
            dp_clt_conflict INTEGER NOT NULL DEFAULT 0,

            ar_pt_enrich  INTEGER NOT NULL DEFAULT 0,
            dp_pt_enrich  INTEGER NOT NULL DEFAULT 0,
            ar_ct_enrich  INTEGER NOT NULL DEFAULT 0,
            dp_ct_enrich  INTEGER NOT NULL DEFAULT 0,
            ar_clt_enrich INTEGER NOT NULL DEFAULT 0,
            dp_clt_enrich INTEGER NOT NULL DEFAULT 0
        );
    """)

    db.execute("""
        CREATE TEMP TABLE IF NOT EXISTS batch (
            stop_id TEXT,
            ar_pt   TEXT, dp_pt   TEXT,
            ar_ct   TEXT, dp_ct   TEXT,
            ar_clt  TEXT, dp_clt  TEXT,
            src     TEXT
        );
    """)
    db.execute("CREATE INDEX IF NOT EXISTS batch_stop_id_idx ON batch(stop_id);")

    def is_null(x: str | None) -> bool:
        return x is None or x == ""

    def differs(a: str | None, b: str | None) -> bool:
        # None and "" count as the same (empty) value.
        return (a or "") != (b or "")

    total_seen = 0
    mismatch_rows = 0
    examples: list[dict] = []

    chunk = []

    def flush_chunk(rows):
        nonlocal mismatch_rows

        if not rows:
            return

        cur = db.cursor()
        cur.execute("DELETE FROM batch;")
        cur.executemany(
            """
            INSERT INTO batch (stop_id, ar_pt, dp_pt, ar_ct, dp_ct, ar_clt, dp_clt, src)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?);
            """,
            rows,
        )

        # Fill first_seen (only for stop_ids that are new)
        cur.execute("""
            INSERT OR IGNORE INTO first_seen
                (stop_id, ar_pt, dp_pt, ar_ct, dp_ct, ar_clt, dp_clt, first_src)
            SELECT stop_id, ar_pt, dp_pt, ar_ct, dp_ct, ar_clt, dp_clt, src
            FROM batch;
        """)

        # Rows of this batch that differ from the first-seen row in any field
        cur.execute("""
            SELECT
                b.stop_id,
                f.ar_pt, f.dp_pt, f.ar_ct, f.dp_ct, f.ar_clt, f.dp_clt, f.first_src,
                b.ar_pt, b.dp_pt, b.ar_ct, b.dp_ct, b.ar_clt, b.dp_clt, b.src
            FROM batch b
            JOIN first_seen f ON f.stop_id = b.stop_id
            WHERE COALESCE(f.ar_pt,'')  != COALESCE(b.ar_pt,'')
               OR COALESCE(f.dp_pt,'')  != COALESCE(b.dp_pt,'')
               OR COALESCE(f.ar_ct,'')  != COALESCE(b.ar_ct,'')
               OR COALESCE(f.dp_ct,'')  != COALESCE(b.dp_ct,'')
               OR COALESCE(f.ar_clt,'') != COALESCE(b.ar_clt,'')
               OR COALESCE(f.dp_clt,'') != COALESCE(b.dp_clt,'');
        """)
        diffs = cur.fetchall()
        mismatch_rows += len(diffs)

        # Update the flags
        for row in diffs:
            (
                stop_id,
                f_ar_pt, f_dp_pt, f_ar_ct, f_dp_ct, f_ar_clt, f_dp_clt, f_src,
                n_ar_pt, n_dp_pt, n_ar_ct, n_dp_ct, n_ar_clt, n_dp_clt, n_src
            ) = row

            first_vals = {
                "ar_pt": f_ar_pt, "dp_pt": f_dp_pt,
                "ar_ct": f_ar_ct, "dp_ct": f_dp_ct,
                "ar_clt": f_ar_clt, "dp_clt": f_dp_clt,
            }
            new_vals = {
                "ar_pt": n_ar_pt, "dp_pt": n_dp_pt,
                "ar_ct": n_ar_ct, "dp_ct": n_dp_ct,
                "ar_clt": n_ar_clt, "dp_clt": n_dp_clt,
            }

            # Make sure the flags row exists
            cur.execute("INSERT OR IGNORE INTO flags (stop_id) VALUES (?);", (stop_id,))

            diff_summary = {}
            for fld in FIELDS:
                a = first_vals[fld]
                b = new_vals[fld]
                if not differs(a, b):
                    continue

                # Column names are interpolated from the FIELDS constant,
                # never from archive content; the value is bound as parameter.
                if (not is_null(a)) and (not is_null(b)) and a != b:
                    # real conflict
                    cur.execute(f"UPDATE flags SET {fld}_conflict = 1 WHERE stop_id = ?;", (stop_id,))
                    diff_summary[fld] = {"kind": "conflict", "first": a, "other": b}
                else:
                    # only NULL<->value
                    cur.execute(f"UPDATE flags SET {fld}_enrich = 1 WHERE stop_id = ?;", (stop_id,))
                    diff_summary[fld] = {"kind": "enrich", "first": a, "other": b}

            if diff_summary and len(examples) < max_examples:
                examples.append({
                    "stop_id": stop_id,
                    "first_src": f_src,
                    "other_src": n_src,
                    "diffs": diff_summary,
                })

        db.commit()
        cur.close()

    for src, stop_id, ar_pt, dp_pt, ar_ct, dp_ct, ar_clt, dp_clt in iter_stop_attrs_from_archives(
        archives_dir, pattern=pattern, skip_change_files=skip_change_files
    ):
        total_seen += 1
        chunk.append((stop_id, ar_pt, dp_pt, ar_ct, dp_ct, ar_clt, dp_clt, src))
        if len(chunk) >= chunk_size:
            flush_chunk(chunk)
            chunk.clear()

    if chunk:
        flush_chunk(chunk)
        chunk.clear()

    # Counts per field; SUM over an empty table is NULL, hence the "or 0"
    cur = db.cursor()
    counts = {}
    for fld in FIELDS:
        cur.execute(f"SELECT SUM({fld}_conflict) FROM flags;")
        counts[f"{fld}_conflict_stop_ids"] = int(cur.fetchone()[0] or 0)
        cur.execute(f"SELECT SUM({fld}_enrich) FROM flags;")
        counts[f"{fld}_enrichment_stop_ids"] = int(cur.fetchone()[0] or 0)

    cur.execute("""
        SELECT COUNT(*) FROM flags
        WHERE ar_pt_conflict=1 OR dp_pt_conflict=1
           OR ar_ct_conflict=1 OR dp_ct_conflict=1
           OR ar_clt_conflict=1 OR dp_clt_conflict=1;
    """)
    any_conflict_stop_ids = int(cur.fetchone()[0] or 0)

    cur.execute("""
        SELECT COUNT(*) FROM flags
        WHERE ar_pt_enrich=1 OR dp_pt_enrich=1
           OR ar_ct_enrich=1 OR dp_ct_enrich=1
           OR ar_clt_enrich=1 OR dp_clt_enrich=1;
    """)
    any_enrichment_stop_ids = int(cur.fetchone()[0] or 0)

    cur.close()

    return {
        "total_seen_rows_in_xml": total_seen,
        "mismatch_rows_seen": mismatch_rows,
        "any_conflict_stop_ids": any_conflict_stop_ids,
        "any_enrichment_stop_ids": any_enrichment_stop_ids,
        "per_field": counts,
        "examples": examples,
        "sqlite_db_path": str(sqlite_path),
        "notes": {
            "conflict": "beide Werte non-null und verschieden (echter Widerspruch)",
            "enrichment": "nur NULL<->Wert (kein Widerspruch, nur 'mehr Info')",
            "skip_change_files": "Für ct/clt meist False setzen, sonst werden *_change.xml ignoriert.",
        }
    }

In [2]:
# --- Example call: archives below ../timetables ---
result = check_stop_id_timeattr_conflicts(
    archives_dir="../timetables",
    pattern="*.tar.gz",
    chunk_size=50_000,
    max_examples=20,
    skip_change_files=False,  # important for ct/clt: keeps *change.xml members in the scan
)
# Overall totals first, then the per-field breakdown, then the examples.
for summary_key in ("any_conflict_stop_ids", "any_enrichment_stop_ids"):
    print(f"{summary_key}:", result[summary_key])
print(result["per_field"])
for example in result["examples"]:
    print(example)

any_conflict_stop_ids: 0
any_enrichment_stop_ids: 0
{'ar_pt_conflict_stop_ids': 0, 'ar_pt_enrichment_stop_ids': 0, 'dp_pt_conflict_stop_ids': 0, 'dp_pt_enrichment_stop_ids': 0, 'ar_ct_conflict_stop_ids': 0, 'ar_ct_enrichment_stop_ids': 0, 'dp_ct_conflict_stop_ids': 0, 'dp_ct_enrichment_stop_ids': 0, 'ar_clt_conflict_stop_ids': 0, 'ar_clt_enrichment_stop_ids': 0, 'dp_clt_conflict_stop_ids': 0, 'dp_clt_enrichment_stop_ids': 0}


In [3]:
# --- Example call: archives below ../timetable_changes ---
result = check_stop_id_timeattr_conflicts(
    archives_dir="../timetable_changes",
    pattern="*.tar.gz",
    chunk_size=50_000,
    max_examples=20,
    skip_change_files=False,  # important for ct/clt: keeps *change.xml members in the scan
)
# Overall totals first, then the per-field breakdown, then the examples.
for summary_key in ("any_conflict_stop_ids", "any_enrichment_stop_ids"):
    print(f"{summary_key}:", result[summary_key])
print(result["per_field"])
for example in result["examples"]:
    print(example)

any_conflict_stop_ids: 149974
any_enrichment_stop_ids: 161297
{'ar_pt_conflict_stop_ids': 0, 'ar_pt_enrichment_stop_ids': 5768, 'dp_pt_conflict_stop_ids': 0, 'dp_pt_enrichment_stop_ids': 4737, 'ar_ct_conflict_stop_ids': 131065, 'ar_ct_enrichment_stop_ids': 137838, 'dp_ct_conflict_stop_ids': 123387, 'dp_ct_enrichment_stop_ids': 137766, 'ar_clt_conflict_stop_ids': 13, 'ar_clt_enrichment_stop_ids': 21223, 'dp_clt_conflict_stop_ids': 14, 'dp_clt_enrichment_stop_ids': 21352}
{'stop_id': '4396219182070181868-2509021612-16', 'first_src': '250902_250909.tar.gz::2509021600/alexanderplatz_change.xml', 'other_src': '250902_250909.tar.gz::2509021615/alexanderplatz_change.xml', 'diffs': {'ar_ct': {'kind': 'conflict', 'first': '2509021807', 'other': '2509021806'}, 'dp_ct': {'kind': 'conflict', 'first': '2509021808', 'other': '2509021807'}}}
{'stop_id': '231917818137011807-2509021414-20', 'first_src': '250902_250909.tar.gz::2509021600/alexanderplatz_change.xml', 'other_src': '250902_250909.tar.gz::25