# Redis DB10 Results Explorer

This notebook loads the Redis backup for database 10 stored in the results archives and decodes the serialized JSON payloads so they can be analysed as regular pandas tables.


## Usage

1. Update `BACKUP_SOURCE` below if you want to inspect a different archive or a raw JSON file.
2. Run the notebook top-to-bottom to load the backup and build a DataFrame with decoded entries.
3. Use the provided summaries or extend the notebook with your own analysis steps.


In [1]:
from __future__ import annotations

import base64
import binascii
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from zipfile import ZipFile

import pandas as pd
from IPython.display import display


In [2]:
class DumpDecodeError(RuntimeError):
    'Raised when a Redis DUMP payload cannot be decoded.'


@dataclass
class DumpSections:
    payload: bytes
    version: int
    checksum: bytes


class _LengthEncoding:
    __slots__ = ('value', 'encoding')

    def __init__(self, value=None, encoding=None):
        self.value = value
        self.encoding = encoding


RDB_ENCODING_INT8 = 0
RDB_ENCODING_INT16 = 1
RDB_ENCODING_INT32 = 2
RDB_ENCODING_LZF = 3


def split_dump_sections(raw: bytes) -> DumpSections:
    if len(raw) < 10:
        raise DumpDecodeError('DUMP payload is too short to contain metadata')
    checksum = raw[-8:]
    version_bytes = raw[-10:-8]
    version = int.from_bytes(version_bytes, 'little', signed=False)
    payload = raw[:-10]
    return DumpSections(payload=payload, version=version, checksum=checksum)


def _read_length_info(buffer: bytes, offset: int):
    if offset >= len(buffer):
        raise DumpDecodeError('Offset out of range while reading length')
    first = buffer[offset]
    prefix = first >> 6
    if prefix == 0:
        length = first & 0x3F
        return _LengthEncoding(length), offset + 1
    if prefix == 1:
        if offset + 1 >= len(buffer):
            raise DumpDecodeError('Truncated 14-bit encoded length')
        second = buffer[offset + 1]
        length = ((first & 0x3F) << 8) | second
        return _LengthEncoding(length), offset + 2
    if prefix == 2:
        if offset + 4 >= len(buffer):
            raise DumpDecodeError('Truncated 32-bit encoded length')
        length = int.from_bytes(buffer[offset + 1 : offset + 5], 'big', signed=False)
        return _LengthEncoding(length), offset + 5
    return _LengthEncoding(None, first & 0x3F), offset + 1


def lzf_decompress(data: bytes, expected_length: int) -> bytes:
    output = bytearray()
    idx = 0
    data_len = len(data)
    while idx < data_len:
        ctrl = data[idx]
        idx += 1
        if ctrl < 32:
            literal_len = ctrl + 1
            if idx + literal_len > data_len:
                raise DumpDecodeError('Truncated literal LZF sequence')
            output.extend(data[idx : idx + literal_len])
            idx += literal_len
        else:
            length = ctrl >> 5
            ref_offset = len(output) - ((ctrl & 0x1F) << 8) - 1
            if length == 7:
                if idx >= data_len:
                    raise DumpDecodeError('Truncated LZF sequence while extending length')
                length += data[idx]
                idx += 1
            if idx >= data_len:
                raise DumpDecodeError('Truncated LZF sequence while resolving reference')
            ref_offset -= data[idx]
            idx += 1
            length += 2
            if ref_offset < 0:
                raise DumpDecodeError('Negative LZF reference')
            for _ in range(length):
                if ref_offset >= len(output):
                    raise DumpDecodeError('LZF reference out of range')
                output.append(output[ref_offset])
                ref_offset += 1
    if len(output) != expected_length:
        raise DumpDecodeError(
            f"Unexpected decompressed length: expected {expected_length}, got {len(output)}"
        )
    return bytes(output)


def _decode_special_encoding(buffer: bytes, offset: int, encoding: int):
    if encoding == RDB_ENCODING_INT8:
        if offset >= len(buffer):
            raise DumpDecodeError('Truncated 8-bit encoded integer')
        value = int.from_bytes(buffer[offset : offset + 1], 'little', signed=True)
        return str(value).encode('ascii'), offset + 1
    if encoding == RDB_ENCODING_INT16:
        if offset + 2 > len(buffer):
            raise DumpDecodeError('Truncated 16-bit encoded integer')
        value = int.from_bytes(buffer[offset : offset + 2], 'little', signed=True)
        return str(value).encode('ascii'), offset + 2
    if encoding == RDB_ENCODING_INT32:
        if offset + 4 > len(buffer):
            raise DumpDecodeError('Truncated 32-bit encoded integer')
        value = int.from_bytes(buffer[offset : offset + 4], 'little', signed=True)
        return str(value).encode('ascii'), offset + 4
    if encoding == RDB_ENCODING_LZF:
        compressed_len_info, next_offset = _read_length_info(buffer, offset)
        data_len_info, data_offset = _read_length_info(buffer, next_offset)
        if compressed_len_info.value is None or data_len_info.value is None:
            raise DumpDecodeError('Invalid LZF length encoding')
        end = data_offset + compressed_len_info.value
        if end > len(buffer):
            raise DumpDecodeError('Truncated encoded string')
        compressed = buffer[data_offset:end]
        decompressed = lzf_decompress(compressed, data_len_info.value)
        return decompressed, end
    raise DumpDecodeError('Unknown string encoding')


def _read_encoded_string(buffer: bytes, offset: int):
    length_info, next_offset = _read_length_info(buffer, offset)
    if length_info.encoding is None:
        end = next_offset + length_info.value
        if end > len(buffer):
            raise DumpDecodeError('Truncated encoded string')
        return buffer[next_offset:end], end
    return _decode_special_encoding(buffer, next_offset, length_info.encoding)


def decode_string_from_dump(raw: bytes) -> bytes:
    sections = split_dump_sections(raw)
    payload = sections.payload
    if not payload:
        raise DumpDecodeError('Empty payload')
    object_type = payload[0]
    if object_type != 0:
        raise DumpDecodeError(f'Non-string object type: {object_type}')
    value, _ = _read_encoded_string(payload, 1)
    return value


def decode_base64_bytes(value: str) -> bytes:
    if not isinstance(value, str):
        raise DumpDecodeError('Encoded value must be a base64 string')
    try:
        return base64.b64decode(value.encode('ascii'))
    except (UnicodeEncodeError, binascii.Error) as exc:
        raise DumpDecodeError(f'Invalid base64 payload: {exc}') from exc


def decode_entry(entry: dict) -> dict:
    key_bytes = decode_base64_bytes(entry['key'])
    key_text = key_bytes.decode('utf-8', errors='replace')
    value_info = entry.get('value') or {}
    data_b64 = value_info.get('data')
    if not data_b64:
        raise DumpDecodeError('Missing DUMP payload in backup entry')
    raw_value = decode_base64_bytes(data_b64)
    decoded_bytes = decode_string_from_dump(raw_value)
    text_value = decoded_bytes.decode('utf-8', errors='replace')
    try:
        json_value = json.loads(text_value)
    except json.JSONDecodeError:
        json_value = None
    return {
        'redis_key': key_text,
        'decoded_bytes': decoded_bytes,
        'text': text_value,
        'json': json_value,
    }


In [3]:
def load_backup_payload(source: Path, db_number: int = 10) -> dict:
    'Load a redis_backup_dbXX.json payload from a JSON file, directory, or ZIP archive.'
    source = source.expanduser()
    if source.is_dir():
        candidate = source / f'redis_backup_db{db_number}.json'
        if not candidate.exists():
            raise FileNotFoundError(f'Backup file not found under {source}')
        return json.loads(candidate.read_text(encoding='utf-8'))
    if not source.exists():
        raise FileNotFoundError(f'Backup source not found: {source}')
    suffix = source.suffix.lower()
    if suffix == '.json':
        return json.loads(source.read_text(encoding='utf-8'))
    if suffix == '.zip':
        target_name = f'redis_backup_db{db_number}.json'
        with ZipFile(source) as archive:
            matches = [name for name in archive.namelist() if name.endswith(target_name)]
            if not matches:
                raise FileNotFoundError(f'{target_name} not found inside {source.name}')
            if len(matches) > 1:
                print(f'Warning: multiple matches found, using {matches[0]}')
            data = archive.read(matches[0])
            return json.loads(data.decode('utf-8'))
    raise ValueError(f'Unsupported backup source: {source}')


In [4]:
results_dir = Path('results')
available_archives = sorted(results_dir.glob('*.zip'))
available_archives


[WindowsPath('results/ECG200_-1_false_0.zip'),
 WindowsPath('results/HandOutlines_0_false_0.zip'),
 WindowsPath('results/MelbournePedestrian_1_false_0.zip'),
 WindowsPath('results/MiddlePhalanxOutlineCorrect_0_false_0.zip'),
 WindowsPath('results/SonyAIBORobotSurface1_1_false_0.zip'),
 WindowsPath('results/Wafer_-1_false_0.zip'),
 WindowsPath('results/Wine_1_false_0.zip')]

In [5]:
BACKUP_SOURCE = Path('results') / 'SonyAIBORobotSurface1_1_false_0.zip'
DB_NUMBER = 10

backup_payload = load_backup_payload(BACKUP_SOURCE, db_number=DB_NUMBER)
entries = backup_payload.get('entries', [])

print(f'Loaded {len(entries)} entries from {BACKUP_SOURCE}')
metadata = backup_payload.get('metadata')
if metadata:
    print('Metadata:')
    print(json.dumps(metadata, indent=2))


Loaded 32738 entries from results\SonyAIBORobotSurface1_1_false_0.zip
Metadata:
{
  "created_at_utc": "2025-10-25T13:52:26.924139Z",
  "key_count": 32738,
  "scan_count": 1000,
  "source": {
    "db": 10,
    "host": "127.0.0.1",
    "port": 6379
  },
  "type_summary": {
    "string": 32738
  }
}


In [6]:
decoded_entries = []
failed_entries = []

for entry in entries:
    try:
        decoded_entries.append(decode_entry(entry))
    except DumpDecodeError as exc:
        failed_entries.append({'entry': entry, 'error': str(exc)})

print(f'Decoded {len(decoded_entries)} entries; {len(failed_entries)} failures.')
if failed_entries:
    print('First failure:')
    print(failed_entries[0]['error'])


Decoded 32738 entries; 0 failures.


In [7]:
if decoded_entries:
    sample = decoded_entries[0]
    print(f"Sample key: {sample['redis_key']}")
    print(json.dumps(sample['json'], indent=2))


Sample key: thething:worker_24003:14
{
  "worker_id": "thething:worker_24003",
  "iteration": 14,
  "timestamp_start": "2025-10-25T13:54:48.637414",
  "queue_size": 54,
  "car_queue_size": 1,
  "car_processing": {
    "candidate_bitmap": "111011011110111011011011101101111101101100110111111111111110110110111011011",
    "icf_size": 24,
    "result": "CONFIRMED_AR",
    "time_seconds": 0.10586810111999512,
    "raw_info": {
      "ar_iterations": 1,
      "ar_profile_dominated_by_AP": 1,
      "deleted_from_AR": 1,
      "deleted_from_AP": 2,
      "deleted_from_CAR": 9,
      "ar_extensions_total": 17,
      "ar_ext_AR_cache_checks": 3638,
      "ar_ext_R_cache_checks": 4040,
      "ar_ext_R_cache_hits": 1,
      "ar_ext_R_shares_sample": 1,
      "ar_extensions_filtered_by_R": 1,
      "ar_extensions_added": 16,
      "ar_extensions_added_to_CAR": 16
    },
    "extensions": {
      "total": 17,
      "added": 0,
      "filtered": 17,
      "filtered_by_ar": 0,
      "filtered_by_r_sha

In [8]:
records = []
for decoded in decoded_entries:
    payload = decoded['json']
    if payload is None:
        continue
    flat = pd.json_normalize(payload, sep='_')
    flat['redis_key'] = decoded['redis_key']
    records.append(flat)

if records:
    df = pd.concat(records, ignore_index=True)
    df = df.set_index('redis_key')
    df.head()
else:
    df = pd.DataFrame()
    df


KeyboardInterrupt: 

In [None]:
        if not df.empty:
            summary = {
                'worker_count': int(df['worker_id'].nunique()),
                'total_iterations': int(df['iteration'].max()),
                'total_runtime_hours': float(df['timings_total_seconds'].sum() / 3600.0),
            }
            print('Summary:')
            print(json.dumps(summary, indent=2))
            print('
CAR results:')
            print(df['car_processing_result'].value_counts())
            print('
CAN results:')
            print(df['can_processing_result'].value_counts())


In [None]:
if not df.empty:
    per_worker = (
        df.groupby('worker_id')
        .agg({
            'iteration': ['count', 'max'],
            'queue_size': ['mean', 'max'],
            'timings_total_seconds': 'sum',
        })
    )
    per_worker.columns = ['_'.join(map(str, col)).strip('_') for col in per_worker.columns]
    per_worker.head()
