# Results archive inspection

This notebook automatically inspects every `.zip` file stored in the `results` directory.
It parses the filename of each archive to extract useful metadata, relies on the included
`manifest.json` file to map Redis database dumps to their logical meaning, and previews
all extracted files directly below. Files larger than 200,000 bytes are truncated to their
first 10,000 bytes so the notebook stays responsive.

In [None]:
from pathlib import Path
import json
import zipfile
from IPython.display import display, Markdown

# Directory that is scanned for `.zip` result archives.
RESULTS_DIR = Path("results")

# Added to the host offset parsed from an archive name to obtain the host id.
HOST_BASE = 9

# Redis database index -> short label shown in the database overview table.
DB_LABELS = {
    0: "DATA",
    1: "CAN",
    2: "R",
    3: "NR",
    4: "CAR",
    5: "AR",
    6: "GP",
    7: "BP",
    8: "PR",
    9: "AP",
    10: "LOGS",
}

# Archive members up to this size are previewed in full ...
MAX_FULL_BYTES = 200_000
# ... larger members only show this many leading bytes.
MAX_PREVIEW_BYTES = 10_000

# Sorted so that every table and preview below is rendered in a stable order.
zip_paths = sorted(RESULTS_DIR.glob("*.zip"))

display(
    Markdown(
        f"> Found **{len(zip_paths)}** ZIP archives in `{RESULTS_DIR}`."
        if zip_paths
        else "> **No ZIP archives were found in the `results` directory.**"
    )
)


In [None]:
def format_bytes(size):
    '''Return a human-readable representation of a file size.

    `size` is a byte count, or None when the size is unknown (rendered as "—").
    Values below 1024 are shown as whole bytes; larger values are scaled by
    powers of 1024 and shown with two decimals. TB is the largest unit, so
    anything beyond it is still expressed in TB.
    '''
    if size is None:
        return "—"
    value = float(size)
    if value < 1024:
        return f"{int(value)} B"
    for unit in ("KB", "MB", "GB"):
        value /= 1024
        if value < 1024:
            return f"{value:.2f} {unit}"
    # Largest supported unit: no further scaling regardless of magnitude.
    return f"{value / 1024:.2f} TB"


def parse_zip_metadata(zip_path):
    '''Extract dataset, class, completion flag, and host numbers from the archive name.

    The stem is expected to look like `<dataset>_<class>_<flag>_<host offset>`;
    it is split from the right, so the dataset part may itself contain
    underscores. A stem with fewer than four parts does not raise: the whole
    stem is reported as the dataset and the other fields get placeholders.

    Returns a dict with the keys `zip_path`, `dataset`, `class`,
    `completion_raw`, `is_completed` (True/False, or None when the flag is not
    "true"/"false"), `size_bytes`, `host_offset` and `host_id` (both None when
    the host fragment is not an integer).
    '''
    parts = zip_path.stem.rsplit("_", 3)
    if len(parts) == 4:
        dataset, class_name, completion_flag, host_fragment = parts
    else:
        # Name does not follow the convention; keep going so that one stray
        # archive cannot abort the processing of all the others.
        dataset, class_name, completion_flag, host_fragment = zip_path.stem, "—", "—", ""

    try:
        host_offset = int(host_fragment)
    except ValueError:
        host_offset = None
        host_id = None
    else:
        host_id = host_offset + HOST_BASE

    # Anything other than a case-insensitive "true"/"false" maps to None.
    is_completed = {"true": True, "false": False}.get(completion_flag.lower())

    return {
        "zip_path": zip_path,
        "dataset": dataset,
        "class": class_name,
        "completion_raw": completion_flag,
        "is_completed": is_completed,
        "size_bytes": zip_path.stat().st_size,
        "host_offset": host_offset,
        "host_id": host_id,
    }


def detect_root_prefix(archive, zip_path):
    '''Guess the common directory prefix used inside the archive.

    Returns `"<zip stem>/"` when at least one file (not directory) entry lives
    under a folder named after the archive, and an empty string otherwise.
    '''
    wanted = f"{zip_path.stem}/"
    for entry in archive.infolist():
        if entry.is_dir():
            continue
        if entry.filename.startswith(wanted):
            return wanted
    return ""


def resolve_manifest(archive, zip_path):
    '''Return the manifest data together with the prefix used inside the archive.

    Candidate locations are tried in this order and the first existing
    `manifest.json` wins:

    1. a folder named after the archive (`"<zip stem>/"`),
    2. every explicit directory entry, in archive order,
    3. the archive root,
    4. the parent folder of any file member named `manifest.json` (covers
       archives written without explicit directory entries).

    Entries under `__MACOSX/` are ignored.

    Returns a `(prefix, manifest)` tuple where `prefix` is "" or ends with "/".
    Raises KeyError when no manifest exists; errors raised by `json.load` for a
    malformed manifest propagate to the caller.
    '''
    manifest_name = "manifest.json"
    candidates = [f"{zip_path.stem}/"]
    fallback_dirs = []
    for info in archive.infolist():
        name = info.filename
        if name.startswith("__MACOSX/"):
            continue
        if info.is_dir():
            # ZipInfo.is_dir() already guarantees the trailing slash.
            candidates.append(name)
        elif name.endswith(f"/{manifest_name}"):
            fallback_dirs.append(name[: -len(manifest_name)])
    candidates.append("")
    # Tried last so archives that worked before keep resolving identically.
    candidates.extend(fallback_dirs)

    # dict.fromkeys drops duplicates while preserving the priority order.
    for prefix in dict.fromkeys(candidates):
        try:
            with archive.open(f"{prefix}{manifest_name}") as manifest_file:
                manifest = json.load(manifest_file)
        except KeyError:
            continue
        return prefix, manifest
    raise KeyError("manifest.json not found")


def display_db_overview(manifest, archive, prefix):
    '''Show a table describing the Redis databases listed in the manifest.

    One row is rendered per entry of `manifest["databases"]`. The JSON file of
    a database is looked up in `manifest["files"]` under the stringified index
    and its size is read from the archive member `prefix + file name`. Missing
    listings or missing members are flagged in the size column.
    '''
    files_by_db = manifest.get("files", {})
    table_rows = []
    for db_index in manifest.get("databases", []):
        label = DB_LABELS.get(db_index, "Unknown")
        json_name = files_by_db.get(str(db_index))
        if not json_name:
            json_name = "—"
            size_cell = "⚠️ file not listed"
        else:
            try:
                byte_count = archive.getinfo(f"{prefix}{json_name}").file_size
            except KeyError:
                # The manifest names a file that the archive does not contain.
                size_cell = "⚠️ file missing in archive"
            else:
                size_cell = f"{format_bytes(byte_count)} ({byte_count:,} bytes)"
        table_rows.append(f"| {label} | {db_index} | {json_name} | {size_cell} |")

    if not table_rows:
        display(Markdown("### Redis databases\n> Manifest does not describe any Redis database files."))
        return

    table_head = "| Label | DB index | JSON file | Size |\n|-------|----------|-----------|------|\n"
    display(Markdown("### Redis databases\n" + table_head + "\n".join(table_rows)))


def preview_member(archive, info, prefix):
    '''Display a preview of a file stored inside the archive.

    Members of at most MAX_FULL_BYTES are shown entirely; for larger ones only
    the first MAX_PREVIEW_BYTES are read and a truncation notice is appended.
    Content is decoded as UTF-8 with undecodable bytes replaced. The heading
    shows the member name with `prefix` stripped when the name starts with it.
    '''
    member_name = info.filename
    if prefix and member_name.startswith(prefix):
        shown_name = member_name[len(prefix):]
    else:
        shown_name = member_name

    total = info.file_size
    truncated = total > MAX_FULL_BYTES

    with archive.open(member_name) as stream:
        raw = stream.read(MAX_PREVIEW_BYTES) if truncated else stream.read()
    decoded = raw.decode("utf-8", errors="replace")

    sections = [
        f"### {shown_name}",
        f"* Size: {format_bytes(total)} ({total:,} bytes)",
    ]
    if truncated:
        sections.append(f"* Previewed: first {MAX_PREVIEW_BYTES:,} bytes")
    # Four backticks so that ordinary triple-backtick content stays inside the fence.
    sections.extend(["", "````text", decoded, "````"])
    if truncated:
        hidden = max(total - MAX_PREVIEW_BYTES, 0)
        sections.extend(
            [
                "",
                f"> ⚠️ Preview truncated. {hidden:,} additional bytes are not shown.",
                "> Extract the file from the archive to inspect it entirely.",
            ]
        )
    display(Markdown("\n".join(sections)))


def display_zip_contents(zip_path):
    '''Render metadata, manifest information, and file previews for a single archive.

    Shows, in order: the metadata parsed from the archive name, the manifest
    summary with the Redis database table (or a warning when the manifest is
    missing, unreadable, or not a JSON object), and a preview of every file
    member sorted by name. An archive that cannot be opened as a ZIP file is
    reported with a warning instead of raising.
    '''
    meta = parse_zip_metadata(zip_path)
    size_text = f"{format_bytes(meta['size_bytes'])} ({meta['size_bytes']:,} bytes)"
    if meta["is_completed"] is None:
        # Flag was neither "true" nor "false": show it exactly as found.
        completion = meta["completion_raw"]
    else:
        completion = str(meta["is_completed"])
    host_offset = meta["host_offset"] if meta["host_offset"] is not None else "—"
    host_id = meta["host_id"] if meta["host_id"] is not None else "—"
    lines = [
        f"## Archive: `{zip_path.name}`",
        f"* Dataset: **{meta['dataset']}**",
        f"* Class: **{meta['class']}**",
        f"* Completed flag: **{completion}**",
        # Use the constant so the text cannot drift from the computed host id.
        f"* Host offset: **{host_offset}** (host id starting from {HOST_BASE}: **{host_id}**)",
        f"* Archive size: **{size_text}**",
    ]
    display(Markdown("\n".join(lines)))

    try:
        archive = zipfile.ZipFile(zip_path)
    except zipfile.BadZipFile as exc:
        # One corrupt archive must not stop the inspection of the remaining ones.
        display(Markdown(f"> ⚠️ Could not open archive: {exc}"))
        return

    with archive:
        manifest = None
        manifest_error = None
        try:
            prefix, manifest = resolve_manifest(archive, zip_path)
        except KeyError as exc:
            # str(KeyError) wraps the message in quotes; use the raw argument.
            manifest_error = exc.args[0] if exc.args else "manifest.json not found"
            prefix = detect_root_prefix(archive, zip_path)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # json.load on bytes can also fail while decoding the text.
            manifest_error = f"Could not parse manifest.json: {exc}"
            prefix = detect_root_prefix(archive, zip_path)
        else:
            if not isinstance(manifest, dict):
                # Valid JSON, but the lookups below need an object.
                manifest = None
                manifest_error = "manifest.json does not contain a JSON object"

        # `is not None` so that an empty `{}` manifest is still summarised.
        if manifest is not None:
            display(Markdown("### Manifest summary"))
            display(Markdown(f"* File prefix: `{manifest.get('file_prefix', '—')}`"))
            display_db_overview(manifest, archive, prefix)
        else:
            display(Markdown(f"> ⚠️ {manifest_error}"))

        members = sorted(
            (info for info in archive.infolist() if not info.is_dir()),
            key=lambda info: info.filename,
        )
        for info in members:
            preview_member(archive, info, prefix)


# Overview table: one row per archive, built from the metadata parsed out of its file name.
metadata = [parse_zip_metadata(path) for path in zip_paths]
if not metadata:
    display(Markdown("> **No archives to summarise.**"))
else:
    header = "\n".join(
        [
            "| Dataset | Class | Completed flag | Host offset | Host id | Archive size | Archive file |",
            "|---------|-------|----------------|-------------|---------|--------------|--------------|",
            "",
        ]
    )
    rows = []
    for item in metadata:
        # Show the raw flag text when it is neither "true" nor "false".
        if item["is_completed"] is True:
            completion = "True"
        elif item["is_completed"] is False:
            completion = "False"
        else:
            completion = item["completion_raw"]
        host_offset = "—" if item["host_offset"] is None else item["host_offset"]
        host_id = "—" if item["host_id"] is None else item["host_id"]
        size_text = f"{format_bytes(item['size_bytes'])} ({item['size_bytes']:,} bytes)"
        rows.append(
            f"| {item['dataset']} | {item['class']} | {completion} "
            f"| {host_offset} | {host_id} | {size_text} | {item['zip_path'].name} |"
        )
    display(Markdown("### Archive overview\n" + header + "\n".join(rows)))


In [None]:
# Render the detailed report of every archive, in the sorted order of `zip_paths`.
if not zip_paths:
    display(Markdown("> No archives to inspect."))
else:
    for path in zip_paths:
        display_zip_contents(path)
