In [1]:
# Demonstrating Daft DataFrame caching with HyperNodes cache system

import hashlib
import json
import os
from pathlib import Path

import daft

from hypernodes.cache import (
    DiskCache,
    compute_signature,
    hash_inputs,
)

# ---------- Cache setup ----------
CACHE_DIR = ".daft_udf_cache"
cache = DiskCache(CACHE_DIR)


def env_hash() -> str:
    """Include daft version + optional salt so upgrades/config bust the cache."""
    payload = {
        "daft": getattr(daft, "__version__", "unknown"),
        "salt": os.getenv("HN_CACHE_SALT", ""),
    }
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()


def materialize_or_load(signature: str, df_builder):
    """If signature in cache: read parquet; else build df, write parquet, record meta."""
    if cache.has(signature):
        print(f"✓ Cache HIT for signature {signature[:8]}... Loading from cache.")
        meta = cache.get(signature)
        if (
            isinstance(meta, dict)
            and meta.get("type") == "daft_df"
            and "blob_path" in meta
        ):
            parquet_path = str(Path(CACHE_DIR) / meta["blob_path"])
            return daft.read_parquet(parquet_path)
        # Fallback to legacy pickle path
        return cache.get(signature)

    # Miss → build and write
    print(f"✗ Cache MISS for signature {signature[:8]}... Building and caching.")
    df = df_builder()
    out_dir = Path(CACHE_DIR) / "blobs" / signature / "df"
    out_dir.mkdir(parents=True, exist_ok=True)
    df.write_parquet(str(out_dir))
    rel = out_dir.relative_to(Path(CACHE_DIR))
    cache.put(signature, {"type": "daft_df", "blob_path": str(rel)})
    return df


# ---------- Define computation ----------
def build_dataframe(data: dict):
    """Build a simple Daft DataFrame with transformations."""
    df = daft.from_pydict(data)
    # Simple transformations
    df = df.with_column("x_plus_one", daft.col("x") + 1)
    df = df.with_column("x_times_10", daft.col("x_plus_one") * 10)
    return df


# ---------- First run: should cache ----------
print("=" * 60)
print("RUN 1: Initial computation (should MISS cache)")
print("=" * 60)

inputs_dict = {"x": [1, 2, 3, 4], "word": ["foo", "bar", "baz", "qux"]}
inputs_h = hash_inputs(inputs_dict)

# Simple code hash for the transformation
code_h = hashlib.sha256(b"build_dataframe_v1").hexdigest()
deps_h = ""

sig = compute_signature(code_h, inputs_h, deps_h, env_hash())

df_out = materialize_or_load(sig, lambda: build_dataframe(inputs_dict))
df_out.show(10)

print("\n" + "=" * 60)
print("RUN 2: Same computation (should HIT cache)")
print("=" * 60)

# Second run with same inputs - should hit cache
df_out2 = materialize_or_load(sig, lambda: build_dataframe(inputs_dict))
df_out2.show(10)

print("\n✓ Demo complete!")


x Int64,word String,x_plus_one Int64,x_times_10 Int64
1,foo,2,20
2,bar,3,30
3,baz,4,40
4,qux,5,50



✓ Demo complete!


In [2]:
import shutil
import time
from typing import Any, Dict

from hypernodes.cache import DiskCache, compute_signature, hash_code, hash_inputs

# Initialize a conceptual disk cache for demonstration
CACHE_DIR = "./daft_cache_demo"
cache = DiskCache(CACHE_DIR)
print(f"Cache initialized at: {os.path.abspath(CACHE_DIR)}")
print(f"Cache size before run: {len(cache.meta_store)} entries\n")

# ----------------------------------------------------------------------
# 1. Function-based UDF
# ----------------------------------------------------------------------


def square_and_add(x: int, constant: int) -> int:
    """A simple UDF to square a number and add a constant."""
    # Simulate a heavy computation
    time.sleep(0.01)
    return x**2 + constant


# Conceptual Caching Wrapper for the UDF
def cached_udf_wrapper(
    func, inputs: Dict[str, Any], func_name: str = None, deps_hash: str = ""
):
    """
    Simulates the caching logic that would occur within the Daft execution engine.
    This is where the content-addressed signature is computed.
    """
    # 1. Compute the components of the signature
    try:
        code_h = hash_code(func)
    except (AttributeError, TypeError):
        # For objects without proper function attributes, use name
        code_h = hash_inputs({"func_name": func_name or str(type(func))})

    inputs_h = hash_inputs(inputs)
    env_h = "v1.0"  # Example environment hash (e.g., Daft version)

    # 2. Compute the final signature
    signature = compute_signature(
        code_hash=code_h, inputs_hash=inputs_h, deps_hash=deps_hash, env_hash=env_h
    )

    # 3. Check the cache
    display_name = func_name or (
        func.__name__ if hasattr(func, "__name__") else func.__class__.__name__
    )

    if cache.has(signature):
        print(f"CACHE HIT for {display_name} with signature: {signature[:8]}...")
        return cache.get(signature)

    print(
        f"CACHE MISS for {display_name} with signature: {signature[:8]}... -> EXECUTING"
    )

    # 4. Execute the function
    if callable(func):
        if hasattr(func, "__call__") and not hasattr(func, "__name__"):
            # Class instance with __call__
            result = func(**inputs)
        else:
            result = func(**inputs)
    else:
        raise ValueError(f"Function {display_name} is not callable")

    # 5. Store the result in the cache
    cache.put(signature, result)
    return result


# ----------------------------------------------------------------------
# 2. Class-based UDF
# ----------------------------------------------------------------------


class Multiplier:
    """A class-based UDF that multiplies by a factor."""

    def __init__(self, factor: int):
        self.factor = factor

    def __call__(self, x: int) -> int:
        # Simulate a heavy computation
        time.sleep(0.01)
        return x * self.factor


# ----------------------------------------------------------------------
# Daft Workflow Simulation
# ----------------------------------------------------------------------

# Create a Daft DataFrame
data = {"id": list(range(5)), "value": [1, 2, 3, 4, 5]}
df = daft.from_pydict(data)

# --- Run 1: Initial run (Expected: CACHE MISS) ---
print("\n--- RUN 1: Initial Execution (Expected: CACHE MISS) ---")

# Apply function-based UDF
df = df.with_column(
    "squared_plus_10",
    df["value"].apply(
        lambda x: cached_udf_wrapper(
            square_and_add, {"x": x, "constant": 10}, func_name="square_and_add"
        ),
        return_dtype=daft.DataType.int64(),
    ),
)

# Apply class-based UDF
multiplier_udf = Multiplier(factor=5)
df = df.with_column(
    "multiplied_by_5",
    df["value"].apply(
        lambda x: cached_udf_wrapper(
            multiplier_udf, {"x": x}, func_name="Multiplier(5)"
        ),
        return_dtype=daft.DataType.int64(),
    ),
)

# Trigger execution (e.g., collect)
print("\n--- Collecting results for Run 1 ---")
df.collect()

print(f"\nCache size after Run 1: {len(cache.meta_store)} entries")
print("--- Run 1 Complete ---\n")


# --- Run 2: Identical run (Expected: CACHE HIT) ---
print("\n--- RUN 2: Identical Execution (Expected: CACHE HIT) ---")

# Re-create the DataFrame and apply the *exact same* operations
df_run2 = daft.from_pydict(data)

# Apply function-based UDF (same inputs, same code)
df_run2 = df_run2.with_column(
    "squared_plus_10",
    df_run2["value"].apply(
        lambda x: cached_udf_wrapper(
            square_and_add, {"x": x, "constant": 10}, func_name="square_and_add"
        ),
        return_dtype=daft.DataType.int64(),
    ),
)

# Apply class-based UDF (same inputs, same class instance)
multiplier_udf_run2 = Multiplier(factor=5)
df_run2 = df_run2.with_column(
    "multiplied_by_5",
    df_run2["value"].apply(
        lambda x: cached_udf_wrapper(
            multiplier_udf_run2, {"x": x}, func_name="Multiplier(5)"
        ),
        return_dtype=daft.DataType.int64(),
    ),
)

# Trigger execution (e.g., collect)
print("\n--- Collecting results for Run 2 ---")
df_run2.collect()

print(f"\nCache size after Run 2: {len(cache.meta_store)} entries")
print("--- Run 2 Complete ---\n")


# --- Run 3: Modified run (Expected: CACHE MISS for modified UDF) ---
print("\n--- RUN 3: Modified Execution (Expected: CACHE MISS for modified UDF) ---")

# Apply function-based UDF (same code, but different constant input)
df_run3 = daft.from_pydict(data)
df_run3 = df_run3.with_column(
    "squared_plus_20",
    df_run3["value"].apply(
        lambda x: cached_udf_wrapper(
            square_and_add, {"x": x, "constant": 20}, func_name="square_and_add"
        ),
        return_dtype=daft.DataType.int64(),
    ),
)

# Apply class-based UDF (different factor input)
multiplier_udf_run3 = Multiplier(factor=10)
df_run3 = df_run3.with_column(
    "multiplied_by_10",
    df_run3["value"].apply(
        lambda x: cached_udf_wrapper(
            multiplier_udf_run3, {"x": x}, func_name="Multiplier(10)"
        ),
        return_dtype=daft.DataType.int64(),
    ),
)

# Trigger execution (e.g., collect)
print("\n--- Collecting results for Run 3 ---")
df_run3.collect()

print(f"\nCache size after Run 3: {len(cache.meta_store)} entries")
print("--- Run 3 Complete ---\n")

# Cleanup
if os.path.exists(CACHE_DIR):
    shutil.rmtree(CACHE_DIR)
    print(f"\nCleaned up cache directory: {CACHE_DIR}")


Cache initialized at: /Users/giladrubin/python_workspace/hypernodes/daft_cache_demo
Cache size before run: 0 entries


--- RUN 1: Initial Execution (Expected: CACHE MISS) ---

--- Collecting results for Run 1 ---
CACHE MISS for square_and_add with signature: 8e95a81e... -> EXECUTING
CACHE MISS for square_and_add with signature: 041e57ae... -> EXECUTING
CACHE MISS for square_and_add with signature: 2f0ca242... -> EXECUTING
CACHE MISS for square_and_add with signature: ab13e05d... -> EXECUTING
CACHE MISS for square_and_add with signature: 437cd9de... -> EXECUTING
CACHE MISS for Multiplier(5) with signature: d133b9de... -> EXECUTING
CACHE MISS for Multiplier(5) with signature: 5f1253fa... -> EXECUTING
CACHE MISS for Multiplier(5) with signature: 57b054fa... -> EXECUTING
CACHE MISS for Multiplier(5) with signature: 4ba936a5... -> EXECUTING
CACHE MISS for Multiplier(5) with signature: cb57de3e... -> EXECUTING

Cache size after Run 1: 10 entries
--- Run 1 Complete ---


--- RUN 2: Identical 