In [None]:
!pip install pytest scikit-learn tensorflow numpy pandas cleanlab[all]

# Benchmark: New Secure Serialization vs. Legacy Pickle

This notebook provides a quantitative comparison of the legacy `pickle`-based serialization engine and the new, secure engine built on **Apache Parquet** and **JSON**.

We will perform two key measurements:
1.  **Execution Time:** How long does it take to `save()` and `load()` a large `Datalab` object?
2.  **Size on Disk:** How much storage space do the resulting artifacts consume?

The goal is to provide hard data showing how the new engine, which is safer by design, compares with the legacy one in speed and storage footprint.

## 1. Setup

First, we'll import the necessary libraries and create a large, realistic `Datalab` object to serve as our test subject. By default this object contains 100,000 rows (raise `NUM_ROWS` to one million for the final benchmark run) and several issue columns, simulating a real-world use case.

In [None]:
import pickle
import shutil
import time
from pathlib import Path
from typing import Any, Dict, Union

import numpy as np
import pandas as pd
from pandas.io.formats.style import Styler
from cleanlab.datalab.datalab import Datalab

print("Setting up a large Datalab object for benchmarking...")

# --- ⚠️ PERFORMANCE WARNING ---
# The NUM_ROWS variable controls the size of the dataset and has a
# major impact on the runtime of the `lab.find_issues()` step.
#
# - Small values (~10,000 rows): reported as very fast (~3 seconds).
# - Large values (>100,000 rows): reported as much slower (20-50 minutes).
#
# NOTE(review): the original notes attribute this "performance cliff" to the
# working set overflowing the CPU cache during the KNN search. That
# explanation is not verified in this notebook — profile before relying on it.
#
# The default is kept moderate for an interactive initial run.
# To reproduce the final performance report, you must use the large value
# and be prepared for a significant wait.
# ---
NUM_ROWS: int = 100_000
# NUM_ROWS: int = 1_000_000 # Uncomment for the final benchmark run.

NUM_FEATURES: int = 20
NUM_CLASSES: int = 10

# Fixed seed so that "Restart Kernel -> Run All" regenerates the exact same
# synthetic dataset (and therefore comparable artifacts) on every run.
SEED: int = 0
rng: np.random.Generator = np.random.default_rng(SEED)

# Generate synthetic data: NUM_FEATURES uniform columns plus an integer label column
features: Dict[str, np.ndarray] = {f"feature_{i}": rng.random(NUM_ROWS) for i in range(NUM_FEATURES)}
labels: np.ndarray = rng.integers(0, NUM_CLASSES, size=NUM_ROWS)
data: Dict[str, Any] = {**features, "label": labels}

# Synthetic predicted probabilities; each row is normalized to sum to 1 so it
# forms a valid probability distribution over the classes.
pred_probs: np.ndarray = rng.random((NUM_ROWS, NUM_CLASSES))
pred_probs /= pred_probs.sum(axis=1, keepdims=True)

# Create the Datalab and find some issues to make it non-trivial
lab: Datalab = Datalab(data=data, label_name="label")
# This is the long-running step. Let it complete once.
lab.find_issues(pred_probs=pred_probs)

print(f"Setup complete. Datalab has {len(lab.labels):,} rows and {len(lab.issues.columns)} issue columns.")

## 2. Define Benchmark Functions

Next, we'll define two functions to encapsulate the benchmarking logic for each serialization method. This ensures we measure both `save` and `load` operations consistently.

In [None]:
def benchmark_legacy_pickle(datalab: Datalab, path: Path) -> Dict[str, Union[str, float]]:
    """Time a pickle round-trip of ``datalab`` inside a freshly created directory.

    Args:
        datalab: Object to serialize with ``pickle.dump``.
        path: Directory for the artifact. If it already exists it is removed
            and recreated, so the run always starts from an empty directory.

    Returns:
        A dict with the format label, the save and load durations in seconds,
        and the size of ``datalab.pkl`` in mebibytes.
    """
    # Start from a clean directory so earlier artifacts cannot linger.
    if path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)

    artifact = path / "datalab.pkl"

    # Save: the elapsed time includes opening, writing and closing the file.
    save_began = time.perf_counter()
    with artifact.open("wb") as sink:
        pickle.dump(datalab, sink)
    save_elapsed = time.perf_counter() - save_began

    # Load: the deserialized object is discarded; only the duration matters.
    load_began = time.perf_counter()
    with artifact.open("rb") as source:
        pickle.load(source)
    load_elapsed = time.perf_counter() - load_began

    bytes_per_mb = 1024 * 1024
    report: Dict[str, Union[str, float]] = {
        "Format": "Legacy (pickle)",
        "Save Time (s)": save_elapsed,
        "Load Time (s)": load_elapsed,
        "Size on Disk (MB)": artifact.stat().st_size / bytes_per_mb,
    }
    return report

def benchmark_new_parquet(datalab: Datalab, path: Path) -> Dict[str, Union[str, float]]:
    """Benchmarks the new secure save/load method.

    Args:
        datalab: The ``Datalab`` instance to persist via ``datalab.save``.
        path: Directory the artifacts are written to. An existing directory
            at this location is removed first, mirroring the legacy benchmark,
            so leftovers from an earlier or aborted run cannot inflate the
            measured size.

    Returns:
        A dict with the format label, the save and load durations in seconds,
        and the combined size in mebibytes of every file under ``path``.
    """
    # Start from a clean slate (outside the timed region). Only directories
    # are removed; anything else at `path` is left for `save()` to handle.
    if path.is_dir():
        shutil.rmtree(path)

    # Time the save operation
    start_save: float = time.perf_counter()
    datalab.save(str(path), force=True)
    end_save: float = time.perf_counter()

    # Time the load operation; the loaded object itself is not needed
    start_load: float = time.perf_counter()
    _ = Datalab.load(str(path))
    end_load: float = time.perf_counter()

    # Measure total directory size by walking every file below `path`
    total_size: int = sum(f.stat().st_size for f in path.glob('**/*') if f.is_file())
    size_mb: float = total_size / (1024 * 1024)

    return {
        "Format": "New (Parquet)",
        "Save Time (s)": end_save - start_save,
        "Load Time (s)": end_load - start_load,
        "Size on Disk (MB)": size_mb,
    }

## 3. Run Benchmarks

Now we execute the benchmark functions and collect the results.

In [None]:
print("Running benchmarks...")

# Create Path objects for the benchmark directories
legacy_path: Path = Path("legacy_lab_bench")
new_path: Path = Path("new_lab_bench")

try:
    legacy_results: Dict[str, Union[str, float]] = benchmark_legacy_pickle(lab, legacy_path)
    new_results: Dict[str, Union[str, float]] = benchmark_new_parquet(lab, new_path)
finally:
    # Clean up benchmark directories even when a benchmark raises, so a failed
    # run does not leave artifacts behind. ignore_errors covers the case where
    # a directory was never created.
    for bench_dir in (legacy_path, new_path):
        shutil.rmtree(bench_dir, ignore_errors=True)

print("Benchmarks complete.")

## 4. Analyze Results

Finally, we'll format the results into a clear table and calculate the performance improvement factors.

In [None]:
# Build results DataFrame
results_df: pd.DataFrame = pd.DataFrame([legacy_results, new_results])


def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or infinity when the denominator is zero."""
    if denominator == 0:
        return float("inf")
    return numerator / denominator


# Calculate performance and size ratios. Each ratio is guarded on its own so
# a single zero measurement does not wipe out the other two results.
# A factor > 1 means the new method is slower
save_ratio: float = _safe_ratio(new_results["Save Time (s)"], legacy_results["Save Time (s)"])
load_ratio: float = _safe_ratio(new_results["Load Time (s)"], legacy_results["Load Time (s)"])

# A factor > 1 means the new method produces smaller files
size_ratio: float = _safe_ratio(legacy_results["Size on Disk (MB)"], new_results["Size on Disk (MB)"])

# Style the DataFrame for better presentation
styled_df: Styler = (
    results_df.style
    .format({
        "Save Time (s)": "{:.3f}",
        "Load Time (s)": "{:.3f}",
        "Size on Disk (MB)": "{:.2f}"
    })
    .set_caption("Serialization Performance Comparison")
    .hide(axis="index")
)

print("--- Benchmark Results ---")
display(styled_df)

# --- Conclusion and Analysis ---
print("\n--- Conclusion ---")
print("✅ Security: The new Parquet engine eliminates the RCE vulnerability of pickle.")

# Report disk usage according to what was measured instead of assuming the
# new format is always the smaller one.
if size_ratio >= 1:
    print(f"✅ Disk Efficiency: Parquet files are {size_ratio:.1f}x smaller on disk.")
else:
    print(f"🔴 Disk Efficiency: Parquet files are {_safe_ratio(1.0, size_ratio):.1f}x larger on disk.")

# Provide clearer, more intuitive performance results
for operation, ratio in (("Save", save_ratio), ("Load", load_ratio)):
    if ratio > 1:
        print(f"🔴 {operation} Speed: Parquet is {ratio:.1f}x slower than pickle for this task.")
    else:
        # If ratio is 0.5, it's 2x faster (1/0.5); a zero ratio reports as inf
        print(f"🟢 {operation} Speed: Parquet is {_safe_ratio(1.0, ratio):.1f}x faster than pickle for this task.")

# Add the more nuanced explanation if Parquet is slower
if save_ratio > 1 or load_ratio > 1:
    print("\n💡 **Performance Nuance:** The speed decrease is expected and highlights the difference in design.")
    print("   - **Pickle** is fast here because it performs a simple, raw memory dump. It's a blunt instrument with minimal overhead.")
    print("   - **Parquet** is more deliberate. It's a sophisticated format that incurs a higher upfront cost to do more work: analyzing the data schema, restructuring data into columns, and applying intelligent compression. This provides its significant advantages in security, file size, and cross-platform compatibility, especially at scale.")