In [3]:
import json
import tarfile
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

# Optional: nicer output as a table
try:
    import pandas as pd
except Exception:
    pd = None

# Safer XML parsing (if defusedxml is installed); otherwise fall back to the stdlib parser
try:
    from defusedxml.ElementTree import fromstring  # type: ignore
except Exception:
    from xml.etree.ElementTree import fromstring  # type: ignore


# ========= ADJUST HERE =========
# Path of the gzip-compressed tar archive whose XML members are checked.
TAR_GZ_PATH = "../timetables/250902_250909.tar.gz"
# Path of the JSON schema document the XML attributes are checked against.
SCHEMA_JSON_PATH = "../schema.json"
REPORT_UNKNOWN_ATTRIBUTES = True         # report attributes that are NOT in the schema
REPORT_INVALID_ATTRIBUTE_VALUES = True   # report attributes that are in the schema but have invalid values (enum/type)
# Upper bound on the number of violations kept per archive member.
MAX_ERRORS_PER_FILE = 500
# ================================


In [4]:
def _local_name(tag: str) -> str:
    """Strip a leading ``{namespace}`` qualifier from an XML tag or attribute name.

    Everything up to and including the first ``}`` is dropped; a name that
    contains no ``}`` is returned unchanged.
    """
    _, brace, remainder = tag.partition("}")
    return remainder if brace else tag

def _attr_map(elem) -> Dict[str, str]:
    """Return the attributes of *elem* keyed by their namespace-stripped names.

    A missing or empty ``attrib`` mapping yields an empty dict.
    """
    raw_attrs = elem.attrib or {}
    mapping: Dict[str, str] = {}
    for qualified_name, attr_value in raw_attrs.items():
        mapping[_local_name(qualified_name)] = attr_value
    return mapping

def _children_by_tag(elem, name: str):
    """Return the direct children of *elem* whose namespace-stripped tag equals *name*.

    Children are returned in document order.
    """
    matches = []
    for child in list(elem):
        if _local_name(child.tag) == name:
            matches.append(child)
    return matches

def _resolve_ref(schema_root: Dict[str, Any], ref: str) -> Dict[str, Any]:
    """Resolve a document-local ``$ref`` (JSON Pointer fragment) against *schema_root*.

    Supported forms are ``#`` (the whole document) and ``#/a/b/...``.
    Pointer tokens are unescaped (``~1`` -> ``/``, then ``~0`` -> ``~``).
    When the pointer passes through a list, the token is interpreted as a
    zero-based array index.

    Args:
        schema_root: The parsed schema document.
        ref: The reference string taken from a ``$ref`` key.

    Returns:
        The schema object the reference points to.

    Raises:
        ValueError: If the reference is not document-local, an array index is
            invalid or out of range, or the target is not a JSON object.
        KeyError: If an object along the path lacks the requested key.
    """
    if ref == "#":
        tokens: List[str] = []
    elif ref.startswith("#/"):
        tokens = ref[2:].split("/")
    else:
        raise ValueError(f"Unsupported $ref: {ref}")

    node: Any = schema_root
    for token in tokens:
        # Order matters: "~1" must be decoded before "~0" so that "~01"
        # becomes "~1" and not "/".
        token = token.replace("~1", "/").replace("~0", "~")
        if isinstance(node, list):
            # Lists cannot be indexed with a string; treat the token as an index.
            is_index = token.isascii() and token.isdigit()
            if not is_index or int(token) >= len(node):
                raise ValueError(f"Invalid array index '{token}' in $ref: {ref}")
            node = node[int(token)]
        else:
            node = node[token]

    if not isinstance(node, dict):
        raise ValueError(f"$ref did not resolve to object: {ref}")
    return node

def _resolved(schema_root: Dict[str, Any], s: Any) -> Dict[str, Any]:
    """Return *s* as a schema dict, dereferencing a top-level ``$ref`` once.

    Anything that is not a dict yields an empty dict. A dict without ``$ref``
    is returned as-is; otherwise the reference target is returned.
    """
    if not isinstance(s, dict):
        return {}
    if "$ref" not in s:
        return s
    return _resolve_ref(schema_root, s["$ref"])

def _infer_type(s: Dict[str, Any]) -> Optional[str]:
    """Return the effective type of schema *s*.

    A schema declared as ``object`` that carries an ``enum`` but no
    ``properties`` is treated as ``string``; in every other case the declared
    ``type`` (possibly ``None``) is returned unchanged.
    """
    declared = s.get("type")
    enum_without_properties = "enum" in s and "properties" not in s
    return "string" if declared == "object" and enum_without_properties else declared

def _parse_bool(v: str) -> Optional[bool]:
    """Interpret *v* as a boolean, ignoring case and surrounding whitespace.

    Returns ``True`` for true/1/yes/y, ``False`` for false/0/no/n and
    ``None`` for anything else.
    """
    spellings = {
        "true": True, "1": True, "yes": True, "y": True,
        "false": False, "0": False, "no": False, "n": False,
    }
    return spellings.get(v.strip().lower())

@dataclass
class Violation:
    """One attribute-level finding from checking an XML file against the schema."""
    file: str           # name of the archive member the finding belongs to
    element_path: str   # path of the affected element, e.g. "/timetable/s[0]/tl[0]"
    attribute: str      # attribute name with any namespace qualifier stripped
    value: str          # attribute value as found in the XML
    message: str        # human-readable description of what is wrong

def _validate_primitive(value: str, s: Dict[str, Any]) -> Optional[str]:
    """Check a single attribute value against a primitive schema.

    The value is checked against ``enum`` (compared as strings) when present,
    then against the effective type for ``integer`` and ``boolean`` schemas.
    Other types are not checked.

    Args:
        value: The raw attribute value from the XML.
        s: The (already resolved) schema of the attribute.

    Returns:
        ``None`` when the value is acceptable, otherwise an error message.
    """
    text = str(value)

    if "enum" in s:
        allowed = set(map(str, s["enum"]))
        if text not in allowed:
            return f"value '{value}' not in enum {sorted(allowed)}"

    t = _infer_type(s)
    if t == "integer":
        # int() is too lenient for XML data: it also accepts digit-group
        # underscores ("1_000") and non-ASCII digits. Require an optional
        # sign followed by ASCII digits only.
        digits = text.strip()
        if digits[:1] in ("+", "-"):
            digits = digits[1:]
        if not (digits.isascii() and digits.isdigit()):
            return f"expected integer, got '{value}'"
    elif t == "boolean":
        if _parse_bool(text) is None:
            return f"expected boolean, got '{value}'"

    return None


def _is_xml_attribute(schema: Any) -> bool:
    """Return True when *schema* is a dict flagged with ``xml.attribute: true``."""
    if not isinstance(schema, dict):
        return False
    return bool((schema.get("xml") or {}).get("attribute", False))


def validate_attributes_only(
    *,
    elem,
    schema_root: Dict[str, Any],
    schema_node: Dict[str, Any],
    file_name: str,
    path: str,
) -> List[Violation]:
    """Check ONLY the attributes of *elem* and of its schema-known descendants.

    Reported findings:
    - unknown attributes, i.e. attributes that no schema property flagged
      with ``xml.attribute: true`` describes (if ``REPORT_UNKNOWN_ATTRIBUTES``);
    - known attributes with an invalid value (if
      ``REPORT_INVALID_ATTRIBUTE_VALUES``).
    Missing attributes are NOT reported.

    Args:
        elem: The XML element to check.
        schema_root: The full schema document, used to resolve ``$ref``.
        schema_node: The schema describing *elem* (may itself be a ``$ref``).
        file_name: Name recorded in each violation.
        path: Element path of *elem*, recorded in each violation and extended
            with ``/<name>[<index>]`` for children.

    Returns:
        Violations for *elem* first (unknown attributes, then invalid values),
        followed by those of child elements in schema property order.
    """
    violations: List[Violation] = []

    node_schema = _resolved(schema_root, schema_node)
    props: Dict[str, Any] = node_schema.get("properties", {}) or {}

    # Resolve every property once and split into attributes and child elements.
    attr_schemas: Dict[str, Dict[str, Any]] = {}
    child_schemas: Dict[str, Dict[str, Any]] = {}
    for name, raw in props.items():
        resolved = _resolved(schema_root, raw)
        # The attribute flag may sit on the referenced schema or directly on
        # the property next to its "$ref"; honour both so that the latter is
        # not misreported as an unknown attribute.
        if _is_xml_attribute(resolved) or _is_xml_attribute(raw):
            attr_schemas[name] = resolved
        else:
            child_schemas[name] = resolved

    attrs = _attr_map(elem)

    # 1) Report unknown attributes.
    if REPORT_UNKNOWN_ATTRIBUTES:
        for attr_name, attr_value in attrs.items():
            if attr_name not in attr_schemas:
                violations.append(Violation(
                    file=file_name,
                    element_path=path,
                    attribute=attr_name,
                    value=str(attr_value),
                    message="unknown attribute (not in schema for this element)",
                ))

    # 2) Check value/type of known attributes; missing ones are fine.
    if REPORT_INVALID_ATTRIBUTE_VALUES:
        for attr_name, attr_schema in attr_schemas.items():
            if attr_name not in attrs:
                continue
            err = _validate_primitive(attrs[attr_name], attr_schema)
            if err:
                violations.append(Violation(
                    file=file_name,
                    element_path=path,
                    attribute=attr_name,
                    value=str(attrs[attr_name]),
                    message=err,
                ))

    # 3) Recurse into known child elements so their attributes are checked too.
    for name, child_schema in child_schemas.items():
        if _infer_type(child_schema) == "array":
            # Repeated elements are described by the array's item schema.
            child_schema = _resolved(schema_root, child_schema.get("items", {}))
        for index, child in enumerate(_children_by_tag(elem, name)):
            violations.extend(validate_attributes_only(
                elem=child,
                schema_root=schema_root,
                schema_node=child_schema,
                file_name=file_name,
                path=f"{path}/{name}[{index}]",
            ))

    return violations


def validate_tar_gz_attributes_only(
    tar_path: str,
    schema_path: str,
    max_errors_per_file: int = 500,
) -> Tuple[List[Violation], Dict[str, Any]]:
    """Check the attributes of every ``.xml`` member of a ``.tar.gz`` archive.

    Args:
        tar_path: Path of the gzip-compressed tar archive.
        schema_path: Path of the JSON schema document (UTF-8).
        max_errors_per_file: At most this many violations are kept per member.

    Returns:
        A tuple ``(violations, summary)``. ``summary`` contains:
        ``files_total`` (regular files in the archive), ``xml_files``
        (members ending in ``.xml``), ``files_with_wrong_attributes``,
        ``xml_parse_errors`` (XML members that could not be read or parsed)
        and ``violations_total`` (violations kept after the per-file cap).
    """
    with open(schema_path, "r", encoding="utf-8") as f:
        schema_root = json.load(f)

    all_violations: List[Violation] = []
    summary: Dict[str, Any] = {
        "files_total": 0,
        "xml_files": 0,
        "files_with_wrong_attributes": 0,
        "xml_parse_errors": 0,
    }

    with tarfile.open(tar_path, "r:gz") as tf:
        members = [m for m in tf.getmembers() if m.isfile()]
        summary["files_total"] = len(members)

        for m in members:
            if not m.name.lower().endswith(".xml"):
                continue
            summary["xml_files"] += 1

            try:
                extracted = tf.extractfile(m)
                if extracted is None:
                    continue
                with extracted:
                    data = extracted.read()
                root = fromstring(data)
            except Exception:
                # Unreadable/unparsable XML is not a "wrong attribute", but it
                # is counted so skipped files remain visible in the summary.
                summary["xml_parse_errors"] += 1
                continue

            vios = validate_attributes_only(
                elem=root,
                schema_root=schema_root,
                schema_node=schema_root,
                file_name=m.name,
                path=f"/{_local_name(root.tag)}",
            )

            if vios:
                summary["files_with_wrong_attributes"] += 1
                all_violations.extend(vios[:max_errors_per_file])

    summary["violations_total"] = len(all_violations)
    return all_violations, summary


# ======= Run =======
# Validate the configured archive against the configured schema.
violations, summary = validate_tar_gz_attributes_only(
    TAR_GZ_PATH,
    SCHEMA_JSON_PATH,
    MAX_ERRORS_PER_FILE,
)

print("=== Summary ===")
for k, v in summary.items():
    print(f"{k}: {v}")

if pd is None:
    # Without pandas, print the first 200 findings as plain text.
    print("\n=== Violations (erste 200) ===")
    for v in violations[:200]:
        print(f"{v.file}: {v.element_path} @{v.attribute}='{v.value}' -> {v.message}")
else:
    # With pandas, show all findings as a table (one row per violation).
    df = pd.DataFrame([vars(item) for item in violations])
    display(df)

=== Summary ===
files_total: 20648
xml_files: 20648
files_with_wrong_attributes: 19054
violations_total: 315988


Unnamed: 0,file,element_path,attribute,value,message
0,2509021200/alexanderplatz_timetable.xml,/timetable/s[0]/tl[0],t,p,unknown attribute (not in schema for this elem...
1,2509021200/alexanderplatz_timetable.xml,/timetable/s[1]/tl[0],t,p,unknown attribute (not in schema for this elem...
2,2509021200/alexanderplatz_timetable.xml,/timetable/s[2]/tl[0],t,p,unknown attribute (not in schema for this elem...
3,2509021200/alexanderplatz_timetable.xml,/timetable/s[3]/tl[0],t,p,unknown attribute (not in schema for this elem...
4,2509021200/alexanderplatz_timetable.xml,/timetable/s[4]/tl[0],t,p,unknown attribute (not in schema for this elem...
...,...,...,...,...,...
315983,2509082300/yorckstra_e_timetable.xml,/timetable/s[7]/tl[0],t,p,unknown attribute (not in schema for this elem...
315984,2509082300/yorckstra_e_timetable.xml,/timetable/s[8]/tl[0],t,p,unknown attribute (not in schema for this elem...
315985,2509082300/yorckstra_e_timetable.xml,/timetable/s[9]/tl[0],t,p,unknown attribute (not in schema for this elem...
315986,2509082300/yorckstra_e_timetable.xml,/timetable/s[10]/tl[0],t,p,unknown attribute (not in schema for this elem...
