# Results archive inspection

This notebook automatically inspects every `.zip` file stored in the `results` directory.
It parses the filename of each archive to extract useful metadata, relies on the included
`manifest.json` file to map Redis database dumps to their logical meaning, and previews
all extracted files directly below. Large files are truncated to their first few kilobytes
so the notebook stays responsive.

In [1]:
from pathlib import Path
from dataclasses import dataclass

import base64
import binascii
import json
import zipfile

# Directory (relative to the working directory) that is scanned for archives.
RESULTS_DIR = Path("results")
# Added to the numeric suffix of an archive name to obtain the host id.
HOST_BASE = 9
# Redis database index -> short label; the index is the position in this tuple.
DB_LABELS = dict(
    enumerate(
        ("DATA", "CAN", "R", "NR", "CAR", "AR", "GP", "BP", "PR", "AP", "LOGS")
    )
)
# Size threshold below which an archive member is read in full.
MAX_FULL_BYTES = 200_000
# Number of leading bytes read from members that are not read in full.
MAX_PREVIEW_BYTES = 10_000

# Every zip directly inside RESULTS_DIR, in sorted order.
zip_paths = sorted(RESULTS_DIR.glob("*.zip"))
# Summary of what the scan found.
zip_inventory = dict(
    results_dir=str(RESULTS_DIR),
    count=len(zip_paths),
    found=len(zip_paths) > 0,
    paths=list(map(str, zip_paths)),
)


In [2]:
def format_bytes(size):
    """Format a byte count as a short human-readable string.

    ``None`` is rendered as ``"-"``. Values below 1024 are shown as whole
    bytes; larger values are scaled by powers of 1024 and shown with two
    decimals, with TB as the largest unit.
    """
    if size is None:
        return "-"
    amount = float(size)
    if amount < 1024:
        return f"{int(amount)} B"
    for suffix in ("KB", "MB", "GB"):
        amount /= 1024
        if amount < 1024:
            return f"{amount:.2f} {suffix}"
    # Anything that does not fit in GB is reported in TB, however large.
    return f"{amount / 1024:.2f} TB"


def parse_zip_metadata(zip_path):
    """Extract dataset, class, completion flag, and host numbers from the archive name.

    The stem is expected to look like ``<dataset>_<class>_<flag>_<host>``;
    it is split from the right, so the dataset part may itself contain
    underscores. Parts that cannot be interpreted are reported as ``None``
    instead of raising:

    * a stem with fewer than four parts yields the whole stem as ``dataset``
      and ``None`` for the class, flag and host fields;
    * a non-numeric host fragment yields ``host_offset`` / ``host_id`` of ``None``;
    * a flag other than ``true`` / ``false`` (case-insensitive) yields
      ``is_completed`` of ``None``.

    Args:
        zip_path: ``pathlib.Path`` of an existing archive (its size is read
            from the file system).

    Returns:
        A dict with the keys ``zip_path``, ``zip_name``, ``dataset``, ``class``,
        ``completion_raw``, ``is_completed``, ``size_bytes``, ``size_text``,
        ``host_offset`` and ``host_id``.
    """
    parts = zip_path.stem.rsplit("_", 3)
    if len(parts) == 4:
        dataset, class_name, completion_flag, host_fragment = parts
    else:
        # The name does not follow the convention; keep going with what we
        # have rather than aborting the inspection of every other archive.
        dataset = zip_path.stem
        class_name = completion_flag = host_fragment = None

    try:
        host_offset = int(host_fragment)
    except (TypeError, ValueError):
        # TypeError covers the missing fragment, ValueError a non-numeric one.
        host_offset = None
    host_id = None if host_offset is None else host_offset + HOST_BASE

    flag_lower = completion_flag.lower() if completion_flag is not None else None
    is_completed = {"true": True, "false": False}.get(flag_lower)

    size_bytes = zip_path.stat().st_size
    return {
        "zip_path": str(zip_path),
        "zip_name": zip_path.name,
        "dataset": dataset,
        "class": class_name,
        "completion_raw": completion_flag,
        "is_completed": is_completed,
        "size_bytes": size_bytes,
        "size_text": format_bytes(size_bytes),
        "host_offset": host_offset,
        "host_id": host_id,
    }


def detect_root_prefix(archive, zip_path):
    """Return ``"<stem>/"`` when the archive nests its files under that folder.

    The prefix is reported as soon as one non-directory member lives below a
    top-level directory named after the archive stem; otherwise the empty
    string is returned.
    """
    root = zip_path.stem + "/"
    for member in archive.infolist():
        if member.is_dir():
            continue
        if member.filename.startswith(root):
            return root
    return ""


def resolve_manifest(archive, zip_path):
    """Locate and parse ``manifest.json`` inside the archive.

    Prefixes are tried in this order: the directory named after the archive
    stem, every directory entry of the archive (``__MACOSX/`` ones excluded),
    and finally the archive root. The first prefix that holds a manifest wins.

    Returns:
        ``(prefix, manifest)`` where ``prefix`` is the directory prefix that
        contained the manifest and ``manifest`` is the parsed JSON.

    Raises:
        KeyError: no candidate location contains a ``manifest.json``.
    """
    directory_prefixes = []
    for member in archive.infolist():
        if not member.is_dir():
            continue
        name = member.filename
        if name.startswith("__MACOSX/"):
            continue
        directory_prefixes.append(name if name.endswith("/") else name + "/")

    ordered = [f"{zip_path.stem}/", *directory_prefixes, ""]
    # dict.fromkeys drops duplicates while keeping the first-seen order.
    for prefix in dict.fromkeys(ordered):
        try:
            handle = archive.open(f"{prefix}manifest.json")
        except KeyError:
            # No such member under this prefix; try the next candidate.
            continue
        with handle:
            return prefix, json.load(handle)
    raise KeyError("manifest.json not found")


class DumpDecodeError(RuntimeError):
    """Raised when a Redis DUMP payload (or its base64 wrapper) cannot be decoded."""


@dataclass
class DumpSections:
    """The three parts of a Redis DUMP value, as produced by ``split_dump_sections``."""

    # Serialized object: everything before the 10-byte trailer.
    payload: bytes
    # RDB version, decoded from the two little-endian bytes that follow the payload.
    version: int
    # Final 8 bytes of the dump, kept verbatim (not verified here).
    checksum: bytes


class _LengthEncoding:
    __slots__ = ("value", "encoding")

    def __init__(self, value=None, encoding=None):
        self.value = value
        self.encoding = encoding


# Ids of the special string encodings carried in the low six bits of a
# length byte whose top two bits are both set.
(
    RDB_ENCODING_INT8,
    RDB_ENCODING_INT16,
    RDB_ENCODING_INT32,
    RDB_ENCODING_LZF,
) = range(4)


def split_dump_sections(raw: bytes) -> DumpSections:
    """Separate a Redis dump into payload, RDB version and checksum.

    The last ten bytes form the trailer: a two-byte little-endian version
    followed by an eight-byte checksum. Everything before it is the payload.

    Raises:
        DumpDecodeError: ``raw`` is shorter than the trailer.
    """
    trailer_size = 10  # 2 version bytes + 8 checksum bytes
    if len(raw) < trailer_size:
        raise DumpDecodeError("DUMP payload is too short to contain metadata")
    body, trailer = raw[:-trailer_size], raw[-trailer_size:]
    return DumpSections(
        payload=body,
        version=int.from_bytes(trailer[:2], "little", signed=False),
        checksum=trailer[2:],
    )


def _read_length_info(buffer: bytes, offset: int):
    """Read one RDB length prefix starting at ``offset``.

    The two most significant bits of the first byte select the form:

    * ``00`` - the length is the remaining 6 bits;
    * ``01`` - the length is 14 bits (6 bits here plus the next byte);
    * ``10`` - the length follows as a big-endian integer: 8 bytes when the
      marker byte is ``0x81``, 4 bytes otherwise;
    * ``11`` - no length; the remaining 6 bits identify a special encoding.

    Args:
        buffer: Bytes being parsed.
        offset: Index of the first byte of the length prefix.

    Returns:
        ``(info, next_offset)`` where ``info`` is a ``_LengthEncoding`` with
        either ``value`` or ``encoding`` set, and ``next_offset`` is the index
        of the first byte after the prefix.

    Raises:
        DumpDecodeError: ``offset`` is outside the buffer or the prefix is
            truncated.
    """
    if offset >= len(buffer):
        raise DumpDecodeError("Offset out of range while reading length")
    first = buffer[offset]
    prefix = first >> 6
    if prefix == 0:
        return _LengthEncoding(first & 0x3F), offset + 1
    if prefix == 1:
        if offset + 1 >= len(buffer):
            raise DumpDecodeError("Truncated 14-bit encoded length")
        length = ((first & 0x3F) << 8) | buffer[offset + 1]
        return _LengthEncoding(length), offset + 2
    if prefix == 3:
        return _LengthEncoding(None, first & 0x3F), offset + 1

    # prefix == 2: an explicit-width length. 0x81 marks a 64-bit length; any
    # other marker keeps being read as 32-bit, as this parser always did.
    width = 8 if first == 0x81 else 4
    end = offset + 1 + width
    if end > len(buffer):
        raise DumpDecodeError(f"Truncated {width * 8}-bit encoded length")
    length = int.from_bytes(buffer[offset + 1 : end], "big", signed=False)
    return _LengthEncoding(length), end


def lzf_decompress(data: bytes, expected_length: int) -> bytes:
    """Decompress an LZF stream and verify the size of the result.

    Each control byte below 32 introduces a run of ``control + 1`` literal
    bytes. Any other control byte is a back-reference into the output
    produced so far: its top three bits give the copy length (extended by one
    extra byte when they are all set) and its low five bits, combined with the
    following byte, give the distance.

    Raises:
        DumpDecodeError: the stream is truncated, a reference points outside
            the output, or the result is not ``expected_length`` bytes long.
    """
    out = bytearray()
    pos = 0
    end = len(data)
    while pos < end:
        control = data[pos]
        pos += 1

        if control < 32:
            run = control + 1
            if pos + run > end:
                raise DumpDecodeError("Truncated literal LZF sequence")
            out += data[pos : pos + run]
            pos += run
            continue

        count = control >> 5
        source = len(out) - ((control & 0x1F) << 8) - 1
        if count == 7:
            # A saturated 3-bit length is extended by the next byte.
            if pos >= end:
                raise DumpDecodeError("Truncated LZF sequence while extending length")
            count += data[pos]
            pos += 1
        if pos >= end:
            raise DumpDecodeError("Truncated LZF sequence while resolving reference")
        source -= data[pos]
        pos += 1
        count += 2
        if source < 0:
            raise DumpDecodeError("Negative LZF reference")
        # Copy byte by byte: the source range may overlap the bytes being written.
        for src in range(source, source + count):
            if src >= len(out):
                raise DumpDecodeError("LZF reference out of range")
            out.append(out[src])

    if len(out) != expected_length:
        raise DumpDecodeError(
            f"Unexpected decompressed length: expected {expected_length}, got {len(out)}"
        )
    return bytes(out)


def _decode_special_encoding(buffer: bytes, offset: int, encoding: int):
    """Decode a specially-encoded string that starts at ``offset``.

    Integer encodings are read as signed little-endian values and returned as
    their decimal ASCII text. The LZF encoding stores two length prefixes
    (uncompressed size, then compressed size) followed by the compressed data.

    Returns:
        ``(value, next_offset)`` with ``value`` as ``bytes``.

    Raises:
        DumpDecodeError: the data is truncated, the LZF lengths are not plain
            lengths, or ``encoding`` is not a known id.
    """
    int_widths = {
        RDB_ENCODING_INT8: 1,
        RDB_ENCODING_INT16: 2,
        RDB_ENCODING_INT32: 4,
    }
    width = int_widths.get(encoding)
    if width is not None:
        stop = offset + width
        if stop > len(buffer):
            raise DumpDecodeError(f"Truncated {width * 8}-bit encoded integer")
        number = int.from_bytes(buffer[offset:stop], "little", signed=True)
        return str(number).encode("ascii"), stop

    if encoding != RDB_ENCODING_LZF:
        raise DumpDecodeError("Unknown string encoding")

    plain_len, after_plain = _read_length_info(buffer, offset)
    packed_len, data_start = _read_length_info(buffer, after_plain)
    if plain_len.value is None or packed_len.value is None:
        raise DumpDecodeError("Invalid LZF length encoding")
    data_end = data_start + packed_len.value
    if data_end > len(buffer):
        raise DumpDecodeError("Truncated encoded string")
    return lzf_decompress(buffer[data_start:data_end], plain_len.value), data_end


def _read_encoded_string(buffer: bytes, offset: int):
    """Read one string (plain or specially encoded) starting at ``offset``.

    Returns:
        ``(value, next_offset)`` with ``value`` as ``bytes``.

    Raises:
        DumpDecodeError: the buffer ends before the string does.
    """
    header, body_start = _read_length_info(buffer, offset)
    if header.encoding is not None:
        return _decode_special_encoding(buffer, body_start, header.encoding)
    body_end = body_start + header.value
    if body_end > len(buffer):
        raise DumpDecodeError("Truncated encoded string")
    return buffer[body_start:body_end], body_end


def decode_string_from_dump(raw: bytes) -> bytes:
    """Return the string value stored in a Redis dump.

    Raises:
        DumpDecodeError: the dump is malformed, has an empty payload, or its
            first payload byte (the object type) is not 0.
    """
    payload = split_dump_sections(raw).payload
    if len(payload) == 0:
        raise DumpDecodeError("Empty payload")
    type_tag = payload[0]
    if type_tag != 0:
        raise DumpDecodeError(f"Non-string object type: {type_tag}")
    # The string itself starts right after the one-byte type tag.
    return _read_encoded_string(payload, 1)[0]


def decode_bytes(value: str) -> bytes:
    """Decode a base64 string into raw bytes.

    Raises:
        DumpDecodeError: ``value`` is not a ``str``, contains non-ASCII
            characters, or is not valid base64.
    """
    if isinstance(value, str):
        try:
            ascii_payload = value.encode("ascii")
            return base64.b64decode(ascii_payload)
        except (UnicodeEncodeError, binascii.Error) as exc:
            raise DumpDecodeError(f"Invalid base64 payload: {exc}") from exc
    raise DumpDecodeError("Encoded value must be a string")


def decode_key(entry):
    """Return the raw key bytes of a backup entry (its ``"key"`` field is base64)."""
    encoded_key = entry["key"]
    return decode_bytes(encoded_key)


def text_preview(value: bytes, limit: int = 120) -> str:
    """Decode ``value`` as UTF-8 (invalid bytes replaced) and cap its length.

    Text longer than ``limit`` characters is cut to ``limit - 1`` characters
    and a ``"."`` is appended as the truncation marker.
    """
    decoded = value.decode("utf-8", errors="replace")
    return decoded if len(decoded) <= limit else decoded[: limit - 1] + "."


def try_decode_value(entry):
    """Best-effort decoding of a backup entry's value.

    Returns:
        ``(preview, details)``. ``preview`` is a short text: the decoded
        string for ``"string"`` entries, otherwise a placeholder between angle
        brackets. ``details`` is a dict; when the value is missing or is not
        valid base64 it is a copy of the entry's ``"value"`` mapping,
        otherwise it describes the dump (size, RDB version, checksum and,
        where applicable, the decoded bytes or an error message).
    """
    value_info = dict(entry.get("value") or {})
    encoded = value_info.get("data")
    if not encoded:
        return "<no value>", value_info
    try:
        raw = decode_bytes(encoded)
    except DumpDecodeError as exc:
        value_info["decode_error"] = str(exc)
        return "<invalid base64>", value_info

    details = {"dump_size": len(raw)}
    try:
        sections = split_dump_sections(raw)
    except DumpDecodeError as exc:
        details["dump_error"] = str(exc)
        return "<invalid dump>", details
    details["rdb_version"] = sections.version
    details["checksum"] = sections.checksum.hex()

    entry_type = entry.get("type")
    if entry_type != "string":
        # Only plain strings are decoded; other types are summarised by size.
        return f"<{entry_type} - {len(sections.payload)} bytes>", details

    try:
        decoded = decode_string_from_dump(raw)
    except DumpDecodeError as exc:
        details["decode_error"] = str(exc)
        return "<string not decoded>", details
    details["decoded_bytes"] = decoded
    return text_preview(decoded), details


def shorten_text(text: str, limit: int = 600) -> str:
    """Break up runs of four backticks and cap the text at ``limit`` characters.

    Text longer than ``limit`` is cut to ``limit - 1`` characters and a
    ``"."`` is appended as the truncation marker.
    """
    cleaned = text.replace("````", "``` `")
    return cleaned if len(cleaned) <= limit else cleaned[: limit - 1] + "."


def summarise_backup_entries(entries, limit: int = 3):
    """Describe the first ``limit`` backup entries as a list of text lines.

    Each entry contributes a key line, a type/TTL line, the (shortened) value
    and, if decoding failed, a warning line. A final line reports how many
    entries were left out. An empty backup yields a single notice line.
    """
    if not entries:
        return ["> No entries stored in this backup."]

    report = []
    for position, item in enumerate(entries[:limit], start=1):
        try:
            key_text = decode_key(item).decode("utf-8", errors="replace") or "<empty key>"
        except (KeyError, DumpDecodeError) as exc:
            key_text = f"<unable to decode key: {exc}>"

        preview, details = try_decode_value(item)
        entry_type = item.get("type", "unknown")
        ttl = item.get("pttl")
        ttl_text = f"{ttl}" if isinstance(ttl, int) else "persistent"

        decoded_bytes = details.get("decoded_bytes")
        if isinstance(decoded_bytes, (bytes, bytearray)):
            # Prefer the full decoded value over the already-truncated preview.
            body = decoded_bytes.decode("utf-8", errors="replace")
        else:
            body = str(preview)

        report.extend(
            [
                f"Entry {position}: key `{key_text}`",
                f"Type: `{entry_type}`; TTL (ms): `{ttl_text}`",
                shorten_text(body),
            ]
        )
        error = details.get("decode_error") or details.get("dump_error")
        if error:
            report.append(f"Warning: {error}")

    hidden = len(entries) - limit
    if hidden > 0:
        report.append(f"Additional entries not shown: {hidden}")
    return report


def build_backup_preview(data):
    """Condense a parsed backup document into a small preview dict.

    Metadata fields are copied when present; ``key_count`` falls back to the
    number of entries, and ``sample_entries`` holds the text summary of the
    first few entries.
    """
    entries = data.get("entries") or []
    metadata = data.get("metadata") or {}

    preview = {}
    preview["key_count"] = metadata.get("key_count", len(entries))
    preview["created_at"] = metadata.get("created_at_utc")
    preview["source"] = metadata.get("source") or {}
    preview["type_summary"] = metadata.get("type_summary") or {}
    preview["sample_entries"] = summarise_backup_entries(entries)
    return preview


def try_render_backup_preview(relative_name: str, payload: bytes):
    """Return a backup preview for ``payload``, or ``None`` if it is not a backup.

    A backup is a UTF-8 JSON object that has both an ``"entries"`` and a
    ``"metadata"`` key. ``relative_name`` is accepted but not used.
    """
    try:
        data = json.loads(payload.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError):
        return None
    looks_like_backup = (
        isinstance(data, dict) and "entries" in data and "metadata" in data
    )
    if not looks_like_backup:
        return None
    return build_backup_preview(data)


def get_relative_member_name(info, prefix):
    """Return the member's name with a leading ``prefix`` removed, if present."""
    name = info.filename
    if not prefix or not name.startswith(prefix):
        return name
    return name[len(prefix):]


def is_logs_entry(relative_name):
    """Tell whether a member path is the ``logs`` directory or lies inside it.

    Backslashes are treated as path separators, and leading ``./`` segments
    and leading slashes are ignored. Names that merely start with a dot, such
    as ``.logs``, are not matched.
    """
    normalized = relative_name.replace('\\', '/')
    # Strip whole "./" segments and stray slashes. str.lstrip('./') would
    # instead strip any run of '.' and '/' characters, turning ".logs" into "logs".
    while True:
        if normalized.startswith('./'):
            normalized = normalized[2:]
        elif normalized.startswith('/'):
            normalized = normalized[1:]
        else:
            break
    return normalized == 'logs' or normalized.startswith('logs/')


In [3]:
# JSON members larger than this are not parsed; only a text preview is kept.
READ_JSON_LIMIT_BYTES = 5_000_000


def _build_db_overview(archive, prefix, manifest):
    """List the databases named in the manifest together with their JSON file and size."""
    overview = []
    files_map = manifest.get('files') or {}
    for db_index in manifest.get('databases') or []:
        file_name = files_map.get(str(db_index))
        if not file_name:
            continue
        try:
            size = archive.getinfo(f"{prefix}{file_name}").file_size
        except KeyError:
            # The manifest names a file that is not in the archive.
            size = None
        overview.append({
            'db_index': db_index,
            'label': DB_LABELS.get(db_index, 'Unknown'),
            'json_file': file_name,
            'size_bytes': size,
            'size_text': format_bytes(size) if size is not None else None,
        })
    return overview


def _inspect_member(archive, info, relative):
    """Read one archive member and return its preview entry."""
    size = info.file_size
    is_json = relative.endswith('.json')
    parse_json = is_json and size <= READ_JSON_LIMIT_BYTES
    entry = {
        'relative_name': relative,
        'size_bytes': size,
        'size_text': format_bytes(size),
        'json_data': None,
        'json_truncated': is_json and not parse_json,
        'text_preview': None,
        'backup_preview': None,
    }

    # Read the whole member only when it will be parsed or is small; an
    # oversized JSON file needs no more than the preview prefix.
    read_entire = parse_json or size <= MAX_FULL_BYTES
    with archive.open(info) as handle:
        payload = handle.read() if read_entire else handle.read(MAX_PREVIEW_BYTES)

    data = None
    if parse_json:
        try:
            data = json.loads(payload.decode('utf-8'))
        except (ValueError, RecursionError):
            # Not UTF-8, not valid JSON, or nested too deeply: show it as text.
            data = None

    if data is None:
        entry['text_preview'] = payload.decode('utf-8', errors='replace')[:1000]
        return entry

    entry['json_data'] = data
    # Same test as try_render_backup_preview, applied to the already parsed
    # document so the file is not decoded and parsed a second time.
    if isinstance(data, dict) and 'entries' in data and 'metadata' in data:
        entry['backup_preview'] = build_backup_preview(data)
    return entry


def collect_archive_data(zip_path):
    """Gather everything the notebook shows about one result archive.

    Args:
        zip_path: ``pathlib.Path`` of the archive to open.

    Returns:
        A dict with these keys:

        * ``zip_name`` / ``zip_path`` - name and path of the archive;
        * ``metadata`` - result of ``parse_zip_metadata``;
        * ``manifest`` - parsed ``manifest.json``, or ``None`` if it could not
          be found or read;
        * ``db_overview`` - one dict per database listed in the manifest;
        * ``members`` - one preview entry per file in the archive, sorted by
          name, with files under ``logs`` left out;
        * ``backups`` - parsed JSON of every member recognised as a backup,
          keyed by the member's relative name.
    """
    result = {
        'zip_name': zip_path.name,
        'zip_path': str(zip_path),
        'metadata': parse_zip_metadata(zip_path),
        'manifest': None,
        'db_overview': [],
        'members': [],
        'backups': {},
    }
    with zipfile.ZipFile(zip_path) as archive:
        try:
            prefix, manifest = resolve_manifest(archive, zip_path)
        except Exception:
            # Deliberately best-effort: a missing or unreadable manifest must
            # not prevent the members from being previewed.
            prefix, manifest = detect_root_prefix(archive, zip_path), None
        result['manifest'] = manifest
        # Guard against a manifest that is valid JSON but not an object.
        if isinstance(manifest, dict):
            result['db_overview'] = _build_db_overview(archive, prefix, manifest)

        members = sorted(
            (info for info in archive.infolist() if not info.is_dir()),
            key=lambda info: info.filename,
        )
        for info in members:
            relative = get_relative_member_name(info, prefix)
            if is_logs_entry(relative):
                continue
            entry = _inspect_member(archive, info, relative)
            if entry['backup_preview'] is not None:
                result['backups'][relative] = entry['json_data']
            result['members'].append(entry)
    return result

# Filename-derived metadata and full inspection results, one item per archive.
archives_metadata = list(map(parse_zip_metadata, zip_paths))
archives_data = list(map(collect_archive_data, zip_paths))
# Lookup tables keyed by archive file name.
manifests_by_archive = dict(
    (record['zip_name'], record['manifest']) for record in archives_data
)
backups_by_archive = dict(
    (record['zip_name'], record['backups']) for record in archives_data
)
