In [None]:
from __future__ import annotations

from dataclasses import dataclass
from typing import (
    Dict,
    List,
    Protocol,
    Sequence,
    Tuple,
    TypedDict,
    Optional,
)

import numpy as np
import numpy.typing as npt

from daft_func import Pipeline, func, NestedPipeline

# ---- Core vector type -------------------------------------------------------
Vector = npt.NDArray[np.float32]


# ---- Protocols -------------------------------------------------------------
class Encoder(Protocol):
    dim: int

    def encode(self, text: str) -> Vector: ...


class Indexer(Protocol):
    def index(self, encoded: Sequence["EncodedPassage"]) -> "BaseIndex": ...


class Reranker(Protocol):
    def rerank(
        self, query: "Query", hits: Sequence["RetrievedDoc"], top_k: int | None = None
    ) -> List["RetrievedDoc"]: ...


# ---- Data models ------------------------------------------------------------
@dataclass(frozen=True)
class Passage:
    pid: str
    text: str


@dataclass(frozen=True)
class EncodedPassage:
    pid: str
    text: str
    embedding: Vector


@dataclass(frozen=True)
class Query:
    text: str


@dataclass(frozen=True)
class RetrievedDoc:
    pid: str
    text: str
    embedding: Vector
    score: float


class SearchHit(TypedDict):
    pid: str
    score: float


class BaseIndex(Protocol):
    dim: int

    def add(self, items: Sequence[EncodedPassage]) -> None: ...
    def search(self, query_vec: Vector, top_k: int = 10) -> List[SearchHit]: ...
    def get(self, pid: str) -> EncodedPassage: ...

In [None]:
# ---- Implementations -------------------------------------------------------
class NumpyRandomEncoder:
    def __init__(self, dim: int = 4, rng: np.random.Generator | None = None) -> None:
        self.dim = dim
        self._rng = rng or np.random.default_rng()

    def encode(self, text: str) -> Vector:
        return self._rng.random(self.dim, dtype=np.float32)


class InMemoryIndex:
    def __init__(self, dim: int) -> None:
        self.dim = dim
        self._data: Dict[str, EncodedPassage] = {}

    def add(self, items: Sequence[EncodedPassage]) -> None:
        for it in items:
            self._data[it.pid] = it

    def search(self, query_vec: Vector, top_k: int = 10) -> List[SearchHit]:
        q = query_vec / (np.linalg.norm(query_vec) + 1e-12)
        hits: List[Tuple[str, float]] = []
        for pid, ep in self._data.items():
            v = ep.embedding / (np.linalg.norm(ep.embedding) + 1e-12)
            hits.append((pid, float(np.dot(q, v))))
        hits.sort(key=lambda x: x[1], reverse=True)
        return [{"pid": pid, "score": score} for pid, score in hits[:top_k]]

    def get(self, pid: str) -> EncodedPassage:
        return self._data[pid]


class SimpleIndexer:
    def __init__(self, dim: int) -> None:
        self.dim = dim

    def index(self, encoded: Sequence[EncodedPassage]) -> BaseIndex:
        idx = InMemoryIndex(self.dim)
        idx.add(encoded)
        return idx


class IdentityReranker:
    def rerank(
        self, query: Query, hits: Sequence[RetrievedDoc], top_k: int | None = None
    ) -> List[RetrievedDoc]:
        out = list(hits)
        if top_k is not None:
            out = out[:top_k]
        return out

In [None]:
# ---- Encode Pipeline ---------------------------------------------
@func(output="cleaned_text")
def clean_text(passage: Passage) -> str:
    return passage.text.strip().lower()


@func(output="embedding")
def encode_text(encoder: Encoder, cleaned_text: str, is_query: bool = False) -> Vector:
    return encoder.encode(cleaned_text, is_query)


@func(output="encoded_passage")
def pack_encoded(passage: Passage, embedding: Vector) -> EncodedPassage:
    return EncodedPassage(pid=passage.pid, text=passage.text, embedding=embedding)


single_encode = Pipeline(functions=[clean_text, encode_text, pack_encoded])
single_encode.visualize()

In [None]:
res = single_encode.run(
    inputs={"passage": Passage(pid="1", text="hello"), "encoder": Encoder()}
)  # should return a dict with keys as output names and corresponding results

In [None]:
# res == {
#     "cleaned_text": "hello",
#     "encoded_text": np.ndarray([...])
# }

In [None]:
single_encode.map(
    inputs={
        "passage": [Passage(pid="1", text="hello"), Passage(pid="2", text="world")],
        "encoder": Encoder(),
    },
    map_over="passage",
)  # this should return a dict with keys as output names and corresponding lists of results

In [None]:
# res == {
#     "cleaned_text": ["hello", "world"],
#     "encoded_text": [
#         np.ndarray([...]),  # embedding for "hello"
#         np.ndarray([...])   # embedding for "world"
#     ]
# }

In [None]:
## Index
encode_corpus = NestedPipeline(
    pipeline=single_encode,
    inputs={"corpus": "passage"},
    outputs={"encoded_passage": "encoded_corpus"},
    map_over="corpus",
)


@node(output="index")
def build_index(
    indexer: Indexer, encoded_corpus: Sequence[EncodedPassage]
) -> BaseIndex:
    return indexer.index(encoded_corpus)


# Take the mapped EncodedPassage list and build an index
encode_and_index = Pipeline(nodes=[encode_corpus, build_index])

In [None]:
# Toy data
corpus: List[Passage] = [
    Passage(pid="p1", text="Hello World"),
    Passage(pid="p2", text="The Quick Brown Fox"),
]

encoder = NumpyRandomEncoder(dim=4)
indexer = SimpleIndexer(dim=encoder.dim)
encode_and_index.visualize()

outputs = encode_and_index.run(
    inputs={
        "corpus": corpus,
        "encoder": encoder,
        "indexer": indexer,
    }
)
index: BaseIndex = outputs["index"]

In [None]:
# ---- Retrieval + Reranking --------------------------------------------------
encode_query = NestedPipeline(
    pipeline=single_encode,
    inputs={"query.text": "text"},
    outputs={"encoded_passage": "query_vec"},
)


@func(output="retrieved")
def retrieve(
    index: BaseIndex, query_vec: Vector, top_k: int = 10
) -> List[RetrievedDoc]:
    hits = index.search(query_vec, top_k=top_k)
    return [
        RetrievedDoc(
            pid=h["pid"],
            text=index.get(h["pid"]).text,
            embedding=index.get(h["pid"]).embedding,
            score=h["score"],
        )
        for h in hits
    ]


@func(output="reranked_hits")
def rerank_hits(
    reranker: Reranker,
    query: Query,
    retrieved: List[RetrievedDoc],
    final_top_k: int | None = None,
) -> List[RetrievedDoc]:
    return reranker.rerank(query, retrieved, top_k=final_top_k)


search_pipeline = Pipeline(nodes=[encode_query, retrieve, rerank_hits])

In [None]:
full_pipeline = Pipeline(nodes=[encode_and_index, search_pipeline])
full_pipeline.visualize()

In [None]:
corpus = [
    Passage(pid="p1", text="Hello World"),
    Passage(pid="p2", text="Quick Brown Fox"),
]
encoder = NumpyRandomEncoder(dim=4)
indexer = SimpleIndexer(dim=encoder.dim)

outputs = encode_and_index.run(
    inputs={"passage": corpus, "encoder": encoder, "indexer": indexer}
)
index: BaseIndex = outputs["index"]

reranker = IdentityReranker()

# Single query
search_out = search_pipeline.run(
    inputs={
        "query": Query(text="hello world"),
        "encoder": encoder,
        "index": index,
        "reranker": reranker,
        "top_k": 5,
        "final_top_k": 3,
    }
)

for doc in search_out["reranked_hits"]:
    print(doc.pid, doc.score, doc.text)

# ---- Multiple queries ---------------------------------------------------
queries = [Query(text="hello"), Query(text="quick fox"), Query(text="world")]
batch_out = search_pipeline.map(
    inputs={
        "query": queries,
        "encoder": encoder,
        "index": index,
        "reranker": reranker,
        "top_k": 5,
        "final_top_k": 3,
    },
    map_over="query",
)

for q, results in zip(queries, batch_out["reranked_hits"]):
    print(f"\nQuery: {q.text}")
    for r in results:
        print(f"  {r.pid} | {r.score:.3f} | {r.text}")

In [None]:
from __future__ import annotations

from dataclasses import dataclass
from typing import (
    Dict,
    Iterable,
    List,
    Mapping,
    MutableMapping,
    Protocol,
    Sequence,
    Tuple,
    TypedDict,
)

import numpy as np
import numpy.typing as npt

from daft_func import Pipeline, Subgraph, func, ProgressConfig

In [None]:
# ---- Core vector type -------------------------------------------------------
Vector = npt.NDArray[np.float32]

In [None]:
# ---- Protocols (clear, testable interfaces) --------------------------------
class Encoder(Protocol):
    """Any text encoder that returns a fixed-width embedding."""

    dim: int

    def encode(self, text: str) -> Vector:  # pragma: no cover - external
        ...


class Indexer(Protocol):
    """Component that builds/updates an index from encoded passages."""

    def index(
        self, encoded: Sequence["EncodedPassage"]
    ) -> "BaseIndex":  # pragma: no cover - external
        ...

In [None]:
# ---- Data models ------------------------------------------------------------
@dataclass(frozen=True)
class Passage:
    pid: str
    text: str


@dataclass(frozen=True)
class EncodedPassage:
    pid: str
    text: str
    embedding: Vector  # shape: (D,)


class SearchHit(TypedDict):
    pid: str
    score: float


class BaseIndex(Protocol):
    dim: int

    def add(
        self, items: Sequence[EncodedPassage]
    ) -> None:  # pragma: no cover - example
        ...

    def search(
        self, query_vec: Vector, top_k: int = 10
    ) -> List[SearchHit]:  # pragma: no cover - example
        ...

In [None]:
# ---- Simple reference implementations --------------------------------------
class NumpyRandomEncoder:
    """Toy encoder: DO NOT use in production. Demonstrates the interface.

    By default it is nondeterministic (np.random). For repeatability in tests,
    pass a seeded RNG.
    """

    def __init__(self, dim: int = 4, rng: np.random.Generator | None = None) -> None:
        self.dim = dim
        self._rng = rng or np.random.default_rng()

    def encode(self, text: str) -> Vector:
        # In real life: return model.encode(text).astype(np.float32)
        return self._rng.random(self.dim, dtype=np.float32)

In [None]:
class InMemoryIndex:
    """Keeps (pid, original, cleaned, embedding) in RAM.

    This illustrates a minimal index that preserves the required triplet
    (passage id, original text, embedding) plus the cleaned text.
    """

    def __init__(self, dim: int) -> None:
        self.dim = dim
        self._data: Dict[str, EncodedPassage] = {}

    def add(self, items: Sequence[EncodedPassage]) -> None:
        for it in items:
            if it.embedding.shape != (self.dim,):
                raise ValueError(
                    f"Embedding for {it.pid} has shape {it.embedding.shape}, expected ({self.dim},)"
                )
            self._data[it.pid] = it

    def search(self, query_vec: Vector, top_k: int = 10) -> List[SearchHit]:
        if query_vec.shape != (self.dim,):
            raise ValueError(f"query_vec shape {query_vec.shape} != ({self.dim},)")
        # Cosine similarity
        q = query_vec / (np.linalg.norm(query_vec) + 1e-12)
        hits: List[Tuple[str, float]] = []
        for pid, ep in self._data.items():
            v = ep.embedding / (np.linalg.norm(ep.embedding) + 1e-12)
            hits.append((pid, float(np.dot(q, v))))
        hits.sort(key=lambda x: x[1], reverse=True)
        return [{"pid": pid, "score": score} for pid, score in hits[:top_k]]

In [None]:
class SimpleIndexer:
    """Indexer that returns an InMemoryIndex."""

    def __init__(self, dim: int) -> None:
        self.dim = dim

    def index(self, encoded: Sequence[EncodedPassage]) -> BaseIndex:
        idx = InMemoryIndex(dim=self.dim)
        idx.add(encoded)
        return idx

## Encode

In [None]:
@node(output="cleaned_text")
def clean_text(passage: Passage) -> str:
    return passage.text.strip().lower()


@node(output="embedding")
def encode_text(encoder: Encoder, cleaned_text: str) -> Vector:
    return encoder.encode(cleaned_text)


@node(output="encoded_passage")
def pack_encoded(passage: Passage, embedding: Vector) -> EncodedPassage:
    return EncodedPassage(pid=passage.pid, text=passage.text, embedding=embedding)

In [None]:
# A pipeline that encodes a *single* (pid, text) into an EncodedPassage
single_encode = Pipeline(nodes=[clean_text, encode_text])
single_encode.visualize()

In [None]:
res = single_encode.run(
    inputs={"passage": Passage(pid="1", text="hello"), "encoder": Encoder()}
)  # should return a dict with keys as output names and corresponding results

In [None]:
# res == {
#     "cleaned_text": "hello",
#     "encoded_text": np.ndarray([...])
# }

In [None]:
single_encode.map(
    inputs={
        "passage": [Passage(pid="1", text="hello"), Passage(pid="2", text="world")],
        "encoder": Encoder(),
    },
    map_over="passage",
)  # this should return a dict with keys as output names and corresponding lists of results

In [None]:
# res == {
#     "cleaned_text": ["hello", "world"],
#     "encoded_text": [
#         np.ndarray([...]),  # embedding for "hello"
#         np.ndarray([...])   # embedding for "world"
#     ]
# }

## Index

In [None]:
encode_corpus = Subgraph(
    graph=single_encode,
    inputs={"corpus": "passage"},
    outputs={"encoded_passage": "encoded_corpus"},
    map_over="corpus",
)

In [None]:
@node(output="index")
def build_index(
    indexer: Indexer, encoded_corpus: Sequence[EncodedPassage]
) -> BaseIndex:
    return indexer.index(encoded_corpus)

In [None]:
# Take the mapped EncodedPassage list and build an index
encode_and_index = Pipeline(functions=[encode_corpus, build_index])

In [None]:
# Toy data
corpus: List[Passage] = [
    Passage(pid="p1", text="Hello World"),
    Passage(pid="p2", text="The Quick Brown Fox"),
]

In [None]:
encoder = NumpyRandomEncoder(dim=4)
indexer = SimpleIndexer(dim=encoder.dim)

In [None]:
encode_and_index.visualize()

In [None]:
outputs = encode_and_index.run(
    inputs={
        "corpus": corpus,
        "encoder": encoder,
        "indexer": indexer,
    }
)

In [None]:
index: BaseIndex = outputs["index"]

In [None]:
# ---- Retrieval + Reranking --------------------------------------------------

In [None]:
@func(output="retrieved")
def retrieve(
    index: BaseIndex, query_vec: Vector, top_k: int = 10
) -> List[RetrievedDoc]:
    return [
        RetrievedDoc(
            pid=h["pid"],
            text=index.get(h["pid"]).text,
            embedding=index.get(h["pid"]).embedding,
            score=h["score"],
        )
        for h in hits
    ]


@func(output="reranked_hits")
def rerank_hits(
    reranker: Reranker,
    query: Query,
    retrieved: List[RetrievedDoc],
    final_top_k: int | None = None,
) -> List[RetrievedDoc]:
    return reranker.rerank(query, retrieved, top_k=final_top_k)


search_pipeline = Pipeline(nodes=[encode_query, retrieve, rerank_hits])
search_pipeline.visualize()

In [None]:
# ---- Example usage ----------------------------------------------------------
corpus = [
    Passage(pid="p1", text="Hello World"),
    Passage(pid="p2", text="Quick Brown Fox"),
]
encoder = NumpyRandomEncoder(dim=4)
indexer = SimpleIndexer(dim=encoder.dim)

In [None]:
outputs = encode_and_index.run(
    inputs={"passage": corpus, "encoder": encoder, "indexer": indexer}
)
index: BaseIndex = outputs["index"]

In [None]:
reranker = IdentityReranker()

In [None]:
# Single query
search_out = search_pipeline.run(
    inputs={
        "query": Query(text="hello world"),
        "encoder": encoder,
        "index": index,
        "reranker": reranker,
        "top_k": 5,
        "final_top_k": 3,
    }
)

for doc in search_out["reranked_hits"]:
    print(doc.pid, doc.score, doc.text)

In [None]:
# ---- Multiple queries ---------------------------------------------------
queries = [Query(text="hello"), Query(text="quick fox"), Query(text="world")]
batch_out = search_pipeline.map(
    inputs={
        "query": queries,
        "encoder": encoder,
        "index": index,
        "reranker": reranker,
        "top_k": 5,
        "final_top_k": 3,
    },
    map_over="query",
)


for q, results in zip(queries, batch_out["reranked_hits"]):
    print(f" {r.pid} | {r.score:.3f} | {r.text}")

In [5]:
from typing import Dict, List

from daft_func import Pipeline, Runner, func
from examples.retrieval import (
    IdentityReranker,
    Query,
    RerankedHit,
    Reranker,
    RetrievalResult,
    Retriever,
    ToyRetriever,
)

In [6]:
from daft_func import ProgressConfig

# Custom configuration
progress_config = ProgressConfig(enabled=True)

In [7]:
import numpy as np


class Encoder:
    def __init__(self):
        self.dim = 4

    def encode(self, text: str) -> np.ndarray:
        return np.array(np.random.rand(self.dim))


In [None]:
import numpy as np


@func(output="cleaned_text")
def clean_text(text: str) -> str:
    return text.lower()


@func(output="encoded_text")
def encode_text(encoder: Encoder, cleaned_text: str) -> np.ndarray:
    return encoder.encode(cleaned_text)


encoding_pipeline = Pipeline(
    functions=[clean_text, encode_text],
)
encoding_pipeline.visualize()

In [None]:
res = encoding_pipeline.run(inputs={"text": "hello", "encoder": Encoder()})

⏸ cleaned_text:   0%|          | 0/1 [00:00<?, ?it/s]

⏸ encoded_text:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
res["encoded_text"]

array([0.13542552, 0.56111283, 0.64619301, 0.57526549])

In [None]:
# res == {
#     "cleaned_text": "hello",
#     "encoded_text": np.ndarray([...])  # shape (4,)
# }

# res["encoded_text"]  # → a single 1‑D embedding vector, e.g. array([0.12, 0.34, 0.56, 0.78])

In [None]:
# res = encoding_pipeline.map(inputs={"text": ["hello", "world"], "encoder": Encoder()}, map_axis="text") # this should work

In [None]:
# res == {
#     "cleaned_text": ["hello", "world"],
#     "encoded_text": [
#         np.ndarray([...]),  # embedding for "hello"
#         np.ndarray([...])   # embedding for "world"
#     ]
# }

# res["encoded_text"]  # → list of two vectors, each shape (4,)

In [None]:
class Indexer:
    def index(self, encoded_corpus: ?) -> str:
        return encoded_corpus

In [None]:
@func(output="index")
def index(indexer: Indexer, encoded_corpus: Dict[str, str], test: bool = True) -> str:
    return indexer.index(encoded_corpus)


indexing_pipeline = Pipeline(functions=[index])
indexing_pipeline.visualize()


In [None]:
indexing_pipeline.run(
    inputs={"corpus": [..., ...], "encoder": Encoder(), "indexer": Indexer()}
)

In [None]:
encode_passages = Subgraph(
    graph=encoding_pipeline,  # or a single @func
    inputs={"corpus": "text"},  # external → internal
    outputs={"encoded_text": "encoded_corpus"},  # internal → external
    map_over="corpus",
)

encoding_indexing_pipeline = Pipeline(functions=[encode_passages, index])
encoding_indexing_pipeline.visualize()  # should show the encode_passages step (with pipeline icon/color) and then index + save_index steps

In [None]:
encoding_indexing_pipeline.run(
    inputs={"corpus": [..., ...], "encoder": Encoder(), "indexer": Indexer()}
)

In [None]:
@func(output="hits")
def retrieve(
    retriever: Retriever, query: Query, top_k: int, index: str
) -> RetrievalResult:
    return retriever.retrieve(index, query, top_k=top_k)


@func(output="reranked_hits")
def rerank(
    reranker: Reranker, query: Query, hits: RetrievalResult, top_k: int
) -> List[RerankedHit]:
    return reranker.rerank(query, hits, top_k=top_k)

In [None]:
pipeline = Pipeline(functions=[retrieve, rerank])
pipeline.visualize()

In [None]:
pipeline.run(inputs={"query": [..., ...], "top_k": 2, "index": "..."})

In [None]:
# or
pipeline = Pipeline(functions=[indexing_pipeline, retrieve, rerank], map_axis="query")
pipeline.run(inputs={"query": [..., ...], "top_k": 2, "corpus": "..."})

questions/comments:
1. does it make sense to cache the corpus? to me it makes sense to avoid recomputation....... what is the cost of that? is it doable?
2. when .visualizing, only show by default zero or one (depending on how you count) level in. so within the indexing pipeline I should be seeing just one step for the encode_passages instead of seeing the internal pipeline. if level=2 then I'll see a box that says encode_passages and within it the pipeline. if level=2 and unwrap or flat=True then I'll just see the pipeline somehow in a way that maps the multi inputs to the specific inputs

In [None]:
@func(output="index_path", cache=True)
def index(retriever: Retriever, corpus: Dict[str, str], test: bool = True) -> str:
    index_path = retriever.index(corpus)
    return index_path


@func(output="hits", map_axis="query", key_attr="query_uuid", cache=True)
def retrieve(
    retriever: Retriever, query: Query, top_k: int, index_path: str
) -> RetrievalResult:
    return retriever.retrieve(index_path, query, top_k=top_k)


@func(output="reranked_hits", map_axis="query", key_attr="query_uuid", cache=True)
def rerank(
    reranker: Reranker, query: Query, hits: RetrievalResult, top_k: int
) -> List[RerankedHit]:
    return reranker.rerank(query, hits, top_k=top_k)


pipeline = Pipeline(functions=[index, retrieve, rerank])
pipeline.visualize()

In [4]:
corpus = {
    "d1": "a quick brown fox jumps",
    "d2": "brown dog sleeps",
    "d3": "five boxing wizards jump quickly",
}

single_inputs = {
    "retriever": ToyRetriever(),
    "corpus": corpus,
    "reranker": IdentityReranker(),
    "query": Query(query_uuid="q1", text="quick brown"),
    "top_k": 2,
}

In [None]:
from daft_func import CacheConfig

# Create runner with auto mode (chooses based on batch size)
runner = Runner(
    mode="local",
    batch_threshold=2,
    # cache_config=CacheConfig(enabled=True),  # , backend=DiskCache(cache_dir=".cache")),
)

In [None]:
from daft_func import Pipeline, ProgressConfig, Runner, func

# Custom configuration
progress_config = ProgressConfig(
    enabled=True,
    theme="dark",  # or "light", or None for auto
    # show_cache_indicators=True,
    # show_timing=True,
)
runner = Runner(progress_config=progress_config)

In [7]:
result = runner.run(inputs=single_inputs)  # should show misses

⏸ index_path   :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ hits         :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ reranked_hits:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [9]:
multi_inputs = {
    "corpus": corpus,
    "retriever": ToyRetriever(),
    "reranker": IdentityReranker(),
    "query": [
        Query(query_uuid="q1", text="quick brown"),
        Query(query_uuid="q2", text="wizards jump"),
        Query(query_uuid="q3", text="brown dog"),
    ],
    "top_k": 2,
}

In [10]:
result = runner.run(inputs=multi_inputs)

⏸ index_path   :   0%|          | 0/3 [00:00<?, ?it/s]

⏸ hits         :   0%|          | 0/3 [00:00<?, ?it/s]

⏸ reranked_hits:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
result = runner.run(inputs=single_inputs)  # should show misses

⏸ index_path   :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ hits         :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ reranked_hits:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
corpus = {
    "d1": "a quick brown fox jumps",
    "d2": "brown dog sleeps",
    "d3": "five boxing wizards jump quickly",
}

single_inputs = {
    "retriever": ToyRetriever(),
    "corpus": corpus,
    "reranker": IdentityReranker(),
    "query": Query(query_uuid="q1", text="quick brown"),
    "top_k": 2,
}

In [13]:
result = runner.run(inputs=single_inputs)  # should show hits

⏸ index_path   :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ hits         :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ reranked_hits:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
corpus = {
    "d1": "a quick brown fox jumps",
    "d2": "brown dog sleeps",
    "d3": "five boxing wizards jump quickly",
}

single_inputs = {
    "retriever": ToyRetriever(),
    "corpus": corpus,
    "reranker": IdentityReranker(),
    "query": Query(query_uuid="q1", text="quick brown"),
    "top_k": 2,
}

In [15]:
result = runner.run(
    inputs=single_inputs
)  # should show hits, but showing misses, probably because of the classes

⏸ index_path   :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ hits         :   0%|          | 0/1 [00:00<?, ?it/s]

⏸ reranked_hits:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
multi_inputs = {
    "corpus": corpus,
    "retriever": ToyRetriever(),
    "reranker": IdentityReranker(),
    "query": [
        Query(query_uuid="q1", text="quick brown"),
        Query(query_uuid="q2", text="wizards jump"),
        Query(query_uuid="q3", text="brown dog"),
    ],
    "top_k": 2,
}

In [17]:
result = runner.run(inputs=multi_inputs)

⏸ index_path   :   0%|          | 0/3 [00:00<?, ?it/s]

⏸ hits         :   0%|          | 0/3 [00:00<?, ?it/s]

⏸ reranked_hits:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
result["reranked_hits"]

[[RerankedHit(query_uuid='q1', doc_id='d1', score=2.0),
  RerankedHit(query_uuid='q1', doc_id='d2', score=1.0)],
 [RerankedHit(query_uuid='q2', doc_id='d3', score=2.0),
  RerankedHit(query_uuid='q2', doc_id='d1', score=1.0)],
 [RerankedHit(query_uuid='q3', doc_id='d2', score=2.0),
  RerankedHit(query_uuid='q3', doc_id='d1', score=1.0)]]

In [19]:
for mode in ["local", "daft", "auto"]:
    runner = Runner(
        mode=mode,
        batch_threshold=2,
        cache_config=CacheConfig(enabled=True, cache_dir=".cache"),
    )
    result = runner.run(pipeline, inputs=multi_inputs)
    print(
        f"✅ {mode.upper():5s} mode: {len(result['reranked_hits'])} queries processed"
    )
print("\n" + "=" * 70)
print("🎉 Demo complete!")
print("=" * 70)

TypeError: CacheConfig.__init__() got an unexpected keyword argument 'cache_dir'

In [None]:
from time import sleep

from daft_func import func


@func(output="embeddings", cache=True, cache_key="model_v1")
def encode(text: str) -> list:
    sleep(6)
    return "s"


@func(output="result", cache=True)
def process(embeddings: list, threshold: float) -> dict:
    sleep(2)
    return "t"


pipeline = Pipeline(functions=[encode, process])
cache_config = CacheConfig(enabled=True, cache_dir=".cache")
runner = Runner(cache_config=cache_config)
# First run: executes both
result1 = runner.run(pipeline, inputs={"text": "hello", "threshold": 0.5})


[CACHE] embeddings: ✗ MISS (6.01s) | result: ✗ MISS (2.01s)


In [None]:
# Second run, change threshold: encode cached, process re-executes
result2 = runner.run(inputs={"text": "hello", "threshold": 0.8})

KeyboardInterrupt: 

In [None]:
from daft_func import Pipeline, func


# Define a simple pipeline
@func(output="doubled")
def double(x: int) -> int:
    """Double the input value."""
    return x * 2


@func(output="result")
def add_value(doubled: int, offset: int = 5) -> int:
    """Add an offset to the doubled value."""
    return doubled + offset


# Create pipeline with explicit functions
pipeline = Pipeline(functions=[double, add_value])

In [None]:
# Create and display visualization
pipeline.visualize()

In [None]:
from pydantic import BaseModel

from daft_func import Runner, func


# 1. Define your data models
class Query(BaseModel):
    id: str
    text: str


class Result(BaseModel):
    id: str
    score: float


@func(output="results", map_axis="query", key_attr="id")
def process(query: Query, threshold: float) -> Result:
    score = len(query.text) * threshold
    return Result(id=query.id, score=score)


# 3. Create pipeline and runner
pipeline = Pipeline(functions=[process])
runner = Runner()

In [None]:
outputs = runner.run(
    inputs={
        "query": [Query(id="q1", text="hello")],
        "threshold": 0.5,
    }
)

print(outputs["results"])
# [Result(id='q1', score=2.5), Result(id='q2', score=2.5)]

[Result(id='q1', score=2.5)]


In [None]:
outputs = runner.run(
    inputs={
        "query": [
            Query(id="q1", text="hello"),
            Query(id="q2", text="world"),
        ],
        "threshold": 0.5,
    }
)

print(outputs["results"])
# [Result(id='q1', score=2.5), Result(id='q2', score=2.5)]

[Result(id='q1', score=2.5), Result(id='q2', score=2.5)]
