From 1907ba8decec69777618996ac25fcf0cbaf40529 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 17 Apr 2025 19:54:48 +0200 Subject: [PATCH 1/7] add draft notebook --- README.md | 1 + index.toml | 13 + .../44_Creating_Custom_SuperComponents.ipynb | 660 ++++++++++++++++++ 3 files changed, 674 insertions(+) create mode 100644 tutorials/44_Creating_Custom_SuperComponents.ipynb diff --git a/README.md b/README.md index c2b8a514..9ecd17ee 100644 --- a/README.md +++ b/README.md @@ -32,3 +32,4 @@ To contribute to the tutorials, please check out our [Contributing Guidelines](. | [Query Classification with TransformersTextRouter](./tutorials/41_Query_Classification_with_TransformersTextRouter_and_TransformersZeroShotTextRouter.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack-tutorials/blob/main/tutorials/41_Query_Classification_with_TransformersTextRouter_and_TransformersZeroShotTextRouter.ipynb) | | [Retrieving a Context Window Around a Sentence](./tutorials/42_Sentence_Window_Retriever.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack-tutorials/blob/main/tutorials/42_Sentence_Window_Retriever.ipynb) | | | [Build a Tool-Calling Agent](./tutorials/43_Building_a_Tool_Calling_Agent.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack-tutorials/blob/main/tutorials/43_Building_a_Tool_Calling_Agent.ipynb) | | +| [Creating Custom SuperComponents](./tutorials/44_Creating_Custom_SuperComponents.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack-tutorials/blob/main/tutorials/44_Creating_Custom_SuperComponents.ipynb) | | diff --git a/index.toml b/index.toml index e7994ad3..fbcfbfd3 100644 --- 
a/index.toml +++ b/index.toml @@ -513,3 +513,16 @@ created_at = 2025-04-03 haystack_2 = true dependencies = ["docstring-parser", "trafilatura"] featured = true + +[[tutorial]] +title = "Creating Custom SuperComponents" +description = "Learn how to use the @super_component decorator to create custom SuperComponents with input and output mappings" +level = "intermediate" +weight = 8 +notebook = "44_Creating_Custom_SuperComponents.ipynb" +aliases = [] +completion_time = "20 min" +created_at = 2025-04-17 +haystack_2 = true +dependencies = ["sentence-transformers>=3.0.0", "datasets"] +featured = true diff --git a/tutorials/44_Creating_Custom_SuperComponents.ipynb b/tutorials/44_Creating_Custom_SuperComponents.ipynb new file mode 100644 index 00000000..c9220eb1 --- /dev/null +++ b/tutorials/44_Creating_Custom_SuperComponents.ipynb @@ -0,0 +1,660 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "2OvkPji9O-qX" + }, + "source": [ + "# Tutorial: Creating Custom SuperComponents\n", + "\n", + "- **Level**: Intermediate\n", + "- **Time to complete**: 20 minutes\n", + "- **Concepts and Components Used**: [`@super_component`](https://docs.haystack.deepset.ai/docs/super_component), [`Pipeline`](https://docs.haystack.deepset.ai/docs/pipeline), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`TransformersSimilarityRanker`](https://docs.haystack.deepset.ai/docs/transformerssimilarityranker)\n", + "- **Goal**: After completing this tutorial, you'll have learned how to create custom SuperComponents using the `@super_component` decorator to simplify complex pipelines and make them reusable as components." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFqHcXYPO-qZ" + }, + "source": [ + "## Overview\n", + "\n", + "In this tutorial, you'll learn how to create custom SuperComponents using the `@super_component` decorator. SuperComponents are a powerful way to encapsulate complex pipelines into reusable components with simplified interfaces.\n", + "\n", + "We'll explore several examples:\n", + "\n", + "1. Creating a simple HybridRetriever SuperComponent\n", + "2. Extending our HybridRetriever with a ranker component \n", + "3. Creating a SuperComponent with custom input and output mappings\n", + "4. Creating a SuperComponent that exposes outputs from non-leaf components\n", + "\n", + "The `@super_component` decorator makes it easy to convert a class that defines a pipeline into a fully functional Haystack component that can be used in other pipelines or applications. All it requires is that the class has an attribute called `pipeline`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QXjVlbPiO-qZ" + }, + "source": [ + "## Preparing the Environment\n", + "\n", + "First, let's install Haystack and the dependencies we'll need:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UQbU8GUfO-qZ" + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "pip install haystack-ai datasets \"sentence-transformers>=3.0.0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_lvfew16O-qa" + }, + "source": [ + "## Understanding the @super_component Decorator\n", + "\n", + "The `@super_component` decorator is a powerful tool that allows you to create custom components by wrapping a Pipeline. It handles all the complexity of mapping inputs and outputs between the component interface and the underlying pipeline.\n", + "\n", + "When you use the `@super_component` decorator, you need to define a class with:\n", + "\n", + "1. 
An `__init__` method that creates a Pipeline and assigns it to `self.pipeline`\n", + "2. Optionally, `input_mapping` and `output_mapping` attributes to customize how inputs and outputs are mapped\n", + "\n", + "The decorator then:\n", + "\n", + "1. Creates a new class that inherits from `SuperComponent`\n", + "2. Copies all methods and attributes from your original class\n", + "3. Adds initialization logic to properly set up the SuperComponent\n", + "\n", + "Let's see how this works with some practical examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yL8nuJdWO-qa" + }, + "source": [ + "## Example 1: Creating a HybridRetriever SuperComponent\n", + "\n", + "Let's start with a simple example: creating a HybridRetriever that combines BM25 and embedding-based retrieval. This SuperComponent will take a query and return relevant documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XvLVaFHTO-qb" + }, + "outputs": [], + "source": [ + "from haystack import Document, Pipeline, super_component\n", + "from haystack.components.joiners import DocumentJoiner\n", + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "\n", + "@super_component\n", + "class HybridRetriever:\n", + " def __init__(self, document_store: InMemoryDocumentStore, embedder_model: str = \"BAAI/bge-small-en-v1.5\"):\n", + " # Create the components\n", + " embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n", + " bm25_retriever = InMemoryBM25Retriever(document_store)\n", + " text_embedder = SentenceTransformersTextEmbedder(embedder_model)\n", + " document_joiner = DocumentJoiner(join_mode=\"reciprocal_rank_fusion\")\n", + "\n", + " # Create the pipeline\n", + " 
self.pipeline = Pipeline()\n", + " self.pipeline.add_component(\"text_embedder\", text_embedder)\n", + " self.pipeline.add_component(\"embedding_retriever\", embedding_retriever)\n", + " self.pipeline.add_component(\"bm25_retriever\", bm25_retriever)\n", + " self.pipeline.add_component(\"document_joiner\", document_joiner)\n", + "\n", + " # Connect the components\n", + " self.pipeline.connect(\"text_embedder\", \"embedding_retriever\")\n", + " self.pipeline.connect(\"bm25_retriever\", \"document_joiner\")\n", + " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")\n", + "\n", + " # Define input and output mappings\n", + " self.input_mapping = {\n", + " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\"],\n", + " }\n", + " self.output_mapping = {\n", + " \"document_joiner.documents\": \"documents\"\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RkaQjJJX0FAU" + }, + "source": [ + "Now, let's load a dataset and test our HybridRetriever:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aNzUi4iz0FAU" + }, + "outputs": [], + "source": [ + "# Load a dataset\n", + "dataset = load_dataset(\"HaystackBot/medrag-pubmed-chunk-with-embeddings\", split=\"train\")\n", + "docs = [Document(content=doc[\"contents\"], embedding=doc[\"embedding\"]) for doc in dataset]\n", + "document_store = InMemoryDocumentStore()\n", + "document_store.write_documents(docs)\n", + "\n", + "# Create and run the HybridRetriever\n", + "query = \"What treatments are available for chronic bronchitis?\"\n", + "retriever = HybridRetriever(document_store)\n", + "result = retriever.run(query=query)\n", + "\n", + "# Print the results\n", + "print(f\"Found {len(result['documents'])} documents\")\n", + "for i, doc in enumerate(result['documents'][:3]): # Show first 3 documents\n", + " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", + " print(doc.content[:200] + \"...\")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "ZnuXEecr0FAU" + }, + "source": [ + "### How the HybridRetriever Works\n", + "\n", + "Let's break down what's happening in our HybridRetriever SuperComponent:\n", + "\n", + "1. We define a class decorated with `@super_component`\n", + "2. In the `__init__` method, we:\n", + " - Create all the components we need (embedding retriever, BM25 retriever, etc.)\n", + " - Create a Pipeline and add all components to it\n", + " - Connect the components to define the flow of data\n", + " - Define input and output mappings to simplify the interface\n", + "3. The `@super_component` decorator handles all the complexity of making our class work as a component\n", + "\n", + "The input mapping `{\"query\": [\"text_embedder.text\", \"bm25_retriever.query\"]}` means that when we call `run(query=\"...\")`, the query is automatically sent to both the text embedder and the BM25 retriever.\n", + "\n", + "The output mapping `{\"document_joiner.documents\": \"documents\"}` means that the documents from the document joiner are returned as the \"documents\" output of our component." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HryYZP9ZO-qb" + }, + "source": [ + "## Example 2: Enhancing the HybridRetriever with a Ranker\n", + "\n", + "Now, let's enhance our HybridRetriever by adding a ranker component. This will re-rank the documents based on their semantic similarity to the query, potentially improving the quality of the results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "INdC3WvLO-qb" + }, + "outputs": [], + "source": [ + "from haystack import Document, Pipeline, super_component\n", + "from haystack.components.joiners import DocumentJoiner\n", + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "from haystack.components.rankers import TransformersSimilarityRanker\n", + "from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "\n", + "@super_component\n", + "class HybridRetrieverWithRanker:\n", + " def __init__(\n", + " self, \n", + " document_store: InMemoryDocumentStore, \n", + " embedder_model: str = \"BAAI/bge-small-en-v1.5\",\n", + " ranker_model: str = \"BAAI/bge-reranker-base\"\n", + " ):\n", + " # Create the components\n", + " embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n", + " bm25_retriever = InMemoryBM25Retriever(document_store)\n", + " text_embedder = SentenceTransformersTextEmbedder(embedder_model)\n", + " document_joiner = DocumentJoiner()\n", + " ranker = TransformersSimilarityRanker(ranker_model)\n", + "\n", + " # Create the pipeline\n", + " self.pipeline = Pipeline()\n", + " self.pipeline.add_component(\"text_embedder\", text_embedder)\n", + " self.pipeline.add_component(\"embedding_retriever\", embedding_retriever)\n", + " self.pipeline.add_component(\"bm25_retriever\", bm25_retriever)\n", + " self.pipeline.add_component(\"document_joiner\", document_joiner)\n", + " self.pipeline.add_component(\"ranker\", ranker)\n", + "\n", + " # Connect the components\n", + " self.pipeline.connect(\"text_embedder\", \"embedding_retriever\")\n", + " self.pipeline.connect(\"bm25_retriever\", \"document_joiner\")\n", + " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")\n", + " 
self.pipeline.connect(\"document_joiner\", \"ranker\")\n", + "\n", + " # Define input and output mappings\n", + " self.input_mapping = {\n", + " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"],\n", + " }\n", + " self.output_mapping = {\n", + " \"ranker.documents\": \"documents\"\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yxaN3KBo65pv" + }, + "outputs": [], + "source": [ + "# Create and run the HybridRetrieverWithRanker\n", + "retriever = HybridRetrieverWithRanker(document_store)\n", + "result = retriever.run(query=query)\n", + "\n", + "# Print the results\n", + "print(f\"Found {len(result['documents'])} documents\")\n", + "for i, doc in enumerate(result['documents'][:3]): # Show first 3 documents\n", + " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", + " print(doc.content[:200] + \"...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xtORWP3_0FAU" + }, + "source": [ + "### Comparing the Two Retrievers\n", + "\n", + "The main differences between the two retrievers are:\n", + "\n", + "1. **Added Ranker Component**: The second version includes a `TransformersSimilarityRanker` that re-ranks the documents based on their semantic similarity to the query.\n", + "2. **Updated Input Mapping**: We added `\"ranker.query\"` to the input mapping to ensure the query is also sent to the ranker.\n", + "3. **Updated Output Mapping**: We changed the output mapping to return the documents from the ranker instead of the document joiner.\n", + "\n", + "The ranker can significantly improve the quality of the results by re-ranking the documents based on their semantic similarity to the query, even if they were not ranked highly by the initial retrievers." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Serialization and Deserialization of SuperComponents\n", + "\n", + "One of the key benefits of using the `@super_component` decorator is that it automatically adds serialization and deserialization capabilities to your component. This means you can easily save and load your SuperComponents using the standard Haystack serialization functions.\n", + "\n", + "Let's see how this works with the `DocumentPreprocessor` SuperComponent that we define in Example 3 below. Run the cells of Example 3 first, because the next cell relies on `DocumentPreprocessor` and `sample_docs`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from haystack import component_to_dict, component_from_dict\n", + "\n", + "# Create an instance of our SuperComponent\n", + "preprocessor = DocumentPreprocessor(document_store)\n", + "\n", + "# Serialize the component to a dictionary\n", + "serialized = component_to_dict(preprocessor, \"document_preprocessor\")\n", + "print(\"Serialized component:\")\n", + "print(serialized)\n", + "\n", + "# Deserialize the component from the dictionary\n", + "deserialized = component_from_dict(DocumentPreprocessor, serialized, \"document_preprocessor\")\n", + "print(\"\\nDeserialized component:\")\n", + "print(deserialized)\n", + "\n", + "# Verify that the deserialized component works\n", + "result = deserialized.run(documents=sample_docs, query=\"sample document\")\n", + "print(f\"\\nDeserialized component produced {len(result['embedded_documents'])} embedded documents\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The serialization and deserialization process works seamlessly with SuperComponents because the `@super_component` decorator automatically adds the necessary functionality. This is particularly useful when you want to:\n", + "\n", + "1. **Save and load pipelines**: You can save your entire pipeline, including SuperComponents, to a file and load it later.\n", + "2. 
**Deploy components**: You can deploy your SuperComponents to a server or cloud environment.\n", + "3. **Share components**: You can share your SuperComponents with others, who can then load and use them in their own pipelines.\n", + "\n", + "The serialization process captures all the initialization parameters of your SuperComponent, ensuring that when it's deserialized, it's recreated with the same configuration." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0_tne2jaFylV" + }, + "source": [ + "## Example 3: Creating a SuperComponent with Custom Input and Output Mappings\n", + "\n", + "Now, let's create a more complex SuperComponent that demonstrates the power of custom input and output mappings. We'll create a document preprocessing pipeline that can clean, split, and embed documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vfMNiQwjFjOt" + }, + "outputs": [], + "source": [ + "from haystack import Document, Pipeline, super_component\n", + "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "\n", + "@super_component\n", + "class DocumentPreprocessor:\n", + " def __init__(\n", + " self,\n", + " document_store: InMemoryDocumentStore,\n", + " embedder_model: str = \"BAAI/bge-small-en-v1.5\",\n", + " split_by: str = \"word\",\n", + " split_length: int = 200,\n", + " split_overlap: int = 20,\n", + " ):\n", + " # Create the components\n", + " cleaner = DocumentCleaner(\n", + " remove_empty_lines=True,\n", + " remove_extra_whitespaces=True,\n", + " remove_repeated_substrings=True,\n", + " )\n", + " splitter = DocumentSplitter(\n", + " split_by=split_by,\n", + " split_length=split_length,\n", + " split_overlap=split_overlap,\n", + " )\n", + " embedder = SentenceTransformersTextEmbedder(embedder_model)\n", + 
"\n", + " # Create the pipeline\n", + " self.pipeline = Pipeline()\n", + " self.pipeline.add_component(\"cleaner\", cleaner)\n", + " self.pipeline.add_component(\"splitter\", splitter)\n", + " self.pipeline.add_component(\"embedder\", embedder)\n", + "\n", + " # Connect the components\n", + " self.pipeline.connect(\"cleaner.documents\", \"splitter.documents\")\n", + " self.pipeline.connect(\"splitter.documents\", \"embedder.documents\")\n", + "\n", + " # Define custom input and output mappings\n", + " self.input_mapping = {\n", + " \"documents\": [\"cleaner.documents\"],\n", + " \"query\": [\"embedder.text\"], # For embedding queries\n", + " }\n", + " self.output_mapping = {\n", + " \"cleaner.documents\": \"cleaned_documents\",\n", + " \"splitter.documents\": \"split_documents\",\n", + " \"embedder.documents\": \"embedded_documents\",\n", + " \"embedder.embedding\": \"query_embedding\",\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "51Tu2p2C_ZxL" + }, + "outputs": [], + "source": [ + "# Create some sample documents\n", + "sample_docs = [\n", + " Document(content=\"This is a sample document with some extra whitespace. \\n\\n\\nIt has multiple lines.\"),\n", + " Document(content=\"This is another document. 
It will be processed by our pipeline.\")\n", + "]\n", + "\n", + "# Create and run the DocumentPreprocessor\n", + "preprocessor = DocumentPreprocessor(document_store)\n", + "result = preprocessor.run(documents=sample_docs, query=\"sample document\")\n", + "\n", + "# Print the results\n", + "print(f\"Cleaned documents: {len(result['cleaned_documents'])}\")\n", + "print(f\"Split documents: {len(result['split_documents'])}\")\n", + "print(f\"Embedded documents: {len(result['embedded_documents'])}\")\n", + "print(f\"Query embedding shape: {len(result['query_embedding'])}\")\n", + "\n", + "# Show a sample of the processed documents\n", + "print(\"\\nSample of cleaned document:\")\n", + "print(result['cleaned_documents'][0].content)\n", + "\n", + "print(\"\\nSample of split document:\")\n", + "print(result['split_documents'][0].content)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "czMjWwnxPA-3" + }, + "source": [ + "### Understanding Custom Input and Output Mappings\n", + "\n", + "In this example, we've created a SuperComponent with custom input and output mappings that expose more of the pipeline's functionality:\n", + "\n", + "1. **Multiple Inputs**: The component accepts both `documents` and `query` inputs, allowing it to process documents and embed queries.\n", + "2. **Multiple Outputs**: The component exposes outputs from multiple components in the pipeline:\n", + " - `cleaned_documents`: Documents after cleaning\n", + " - `split_documents`: Documents after splitting\n", + " - `embedded_documents`: Documents after embedding\n", + " - `query_embedding`: The embedding of the query\n", + "\n", + "This demonstrates how the `@super_component` decorator allows you to create components with rich interfaces that expose the functionality of multiple components in a pipeline." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9y4iJE_SrS4K" + }, + "source": [ + "## Example 4: Creating a SuperComponent with Outputs from Non-Leaf Components\n", + "\n", + "One of the powerful features of SuperComponents is the ability to expose outputs from any component in the pipeline, not just the leaf components. Let's create a SuperComponent that demonstrates this capability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HryYZP9ZO-qc" + }, + "outputs": [], + "source": [ + "from haystack import Document, Pipeline, super_component\n", + "from haystack.components.joiners import DocumentJoiner\n", + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n", + "from haystack.components.rankers import TransformersSimilarityRanker\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "\n", + "@super_component\n", + "class AdvancedHybridRetriever:\n", + " def __init__(\n", + " self, \n", + " document_store: InMemoryDocumentStore, \n", + " embedder_model: str = \"BAAI/bge-small-en-v1.5\",\n", + " ranker_model: str = \"BAAI/bge-reranker-base\"\n", + " ):\n", + " # Create the components\n", + " embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n", + " bm25_retriever = InMemoryBM25Retriever(document_store)\n", + " text_embedder = SentenceTransformersTextEmbedder(embedder_model)\n", + " document_joiner = DocumentJoiner()\n", + " ranker = TransformersSimilarityRanker(ranker_model)\n", + "\n", + " # Create the pipeline\n", + " self.pipeline = Pipeline()\n", + " self.pipeline.add_component(\"text_embedder\", text_embedder)\n", + " self.pipeline.add_component(\"embedding_retriever\", embedding_retriever)\n", + " self.pipeline.add_component(\"bm25_retriever\", bm25_retriever)\n", + " self.pipeline.add_component(\"document_joiner\", 
document_joiner)\n", + " self.pipeline.add_component(\"ranker\", ranker)\n", + "\n", + " # Connect the components\n", + " self.pipeline.connect(\"text_embedder\", \"embedding_retriever\")\n", + " self.pipeline.connect(\"bm25_retriever\", \"document_joiner\")\n", + " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")\n", + " self.pipeline.connect(\"document_joiner\", \"ranker\")\n", + "\n", + " # Define input and output mappings\n", + " self.input_mapping = {\n", + " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"],\n", + " }\n", + " \n", + " # Expose outputs from multiple components, including non-leaf components\n", + " self.output_mapping = {\n", + " \"bm25_retriever.documents\": \"bm25_documents\",\n", + " \"embedding_retriever.documents\": \"embedding_documents\",\n", + " \"document_joiner.documents\": \"joined_documents\",\n", + " \"ranker.documents\": \"ranked_documents\",\n", + " \"text_embedder.embedding\": \"query_embedding\"\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "INdC3WvLO-qc" + }, + "outputs": [], + "source": [ + "# Create and run the AdvancedHybridRetriever\n", + "retriever = AdvancedHybridRetriever(document_store)\n", + "result = retriever.run(query=query)\n", + "\n", + "# Print the results\n", + "print(f\"BM25 documents: {len(result['bm25_documents'])}\")\n", + "print(f\"Embedding documents: {len(result['embedding_documents'])}\")\n", + "print(f\"Joined documents: {len(result['joined_documents'])}\")\n", + "print(f\"Ranked documents: {len(result['ranked_documents'])}\")\n", + "print(f\"Query embedding shape: {len(result['query_embedding'])}\")\n", + "\n", + "# Compare the top document from each stage\n", + "print(\"\\nTop BM25 document:\")\n", + "print(result['bm25_documents'][0].content[:200] + \"...\")\n", + "print(f\"Score: {result['bm25_documents'][0].score:.4f}\")\n", + "\n", + "print(\"\\nTop embedding document:\")\n", + 
"print(result['embedding_documents'][0].content[:200] + \"...\")\n", + "print(f\"Score: {result['embedding_documents'][0].score:.4f}\")\n", + "\n", + "print(\"\\nTop ranked document:\")\n", + "print(result['ranked_documents'][0].content[:200] + \"...\")\n", + "print(f\"Score: {result['ranked_documents'][0].score:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wLIcnWl-66QA" + }, + "source": [ + "### Understanding Outputs from Non-Leaf Components\n", + "\n", + "In this example, we've created a SuperComponent that exposes outputs from multiple components in the pipeline, including non-leaf components:\n", + "\n", + "1. **BM25 Documents**: Documents retrieved by the BM25 retriever\n", + "2. **Embedding Documents**: Documents retrieved by the embedding retriever\n", + "3. **Joined Documents**: Documents after joining the results from both retrievers\n", + "4. **Ranked Documents**: Documents after re-ranking\n", + "5. **Query Embedding**: The embedding of the query\n", + "\n", + "This demonstrates how the `@super_component` decorator allows you to expose outputs from any component in the pipeline, not just the leaf components. This is particularly useful for debugging, analysis, or when you want to provide more detailed information to the user." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xtORWP3_0FAU" + }, + "source": [ + "## Ready-Made SuperComponents in Haystack\n", + "\n", + "Haystack provides several ready-made SuperComponents that you can use in your applications, for example\n", + "\n", + "1. **[MultiFileConverter](https://docs.haystack.deepset.ai/docs/multifileconverter)**: A SuperComponent that can convert various file types to documents.\n", + "2. 
**[DocumentPreprocessor](https://docs.haystack.deepset.ai/docs/documentpreprocessor)**: A SuperComponent that combines document cleaning and splitting.\n", + "\n", + "These SuperComponents provide a convenient way to use common pipelines without having to build them from scratch." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "czMjWwnxPA-3" + }, + "source": [ + "## Conclusion\n", + "\n", + "In this tutorial, you've learned how to create custom SuperComponents using the `@super_component` decorator. You've seen how to:\n", + "\n", + "1. Create a simple HybridRetriever SuperComponent\n", + "2. Enhance it with a ranker\n", + "3. Create a SuperComponent with custom input and output mappings\n", + "4. Create a SuperComponent that exposes outputs from non-leaf components\n", + "\n", + "SuperComponents are a powerful way to encapsulate complex pipelines into reusable components with simplified interfaces. They make it easy to create higher-level components that abstract away the details of the underlying pipeline." 
+ ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From efe4f2b31af8ec44a3b4a538e842a3ccc324f28c Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 17 Apr 2025 23:50:24 +0200 Subject: [PATCH 2/7] install dependencies, simplify, include outputs --- .../44_Creating_Custom_SuperComponents.ipynb | 294 ++++++++---------- 1 file changed, 132 insertions(+), 162 deletions(-) diff --git a/tutorials/44_Creating_Custom_SuperComponents.ipynb b/tutorials/44_Creating_Custom_SuperComponents.ipynb index c9220eb1..b0d018d4 100644 --- a/tutorials/44_Creating_Custom_SuperComponents.ipynb +++ b/tutorials/44_Creating_Custom_SuperComponents.ipynb @@ -55,7 +55,8 @@ "source": [ "%%bash\n", "\n", - "pip install haystack-ai datasets \"sentence-transformers>=3.0.0\"" + "pip install git+https://github.com/deepset-ai/haystack.git@main # pip install haystack-ai after the 2.13 release\n", + "pip install \"sentence-transformers>=3.0.0\" datasets transformers[torch,sentencepiece]" ] }, { @@ -88,7 +89,7 @@ "id": "yL8nuJdWO-qa" }, "source": [ - "## Example 1: Creating a HybridRetriever SuperComponent\n", + "## 1. Creating a HybridRetriever SuperComponent\n", "\n", "Let's start with a simple example: creating a HybridRetriever that combines BM25 and embedding-based retrieval. This SuperComponent will take a query and return relevant documents." 
] @@ -99,7 +100,16 @@ "metadata": { "id": "XvLVaFHTO-qb" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/julian/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -129,15 +139,7 @@ " # Connect the components\n", " self.pipeline.connect(\"text_embedder\", \"embedding_retriever\")\n", " self.pipeline.connect(\"bm25_retriever\", \"document_joiner\")\n", - " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")\n", - "\n", - " # Define input and output mappings\n", - " self.input_mapping = {\n", - " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\"],\n", - " }\n", - " self.output_mapping = {\n", - " \"document_joiner.documents\": \"documents\"\n", - " }" + " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")" ] }, { @@ -155,7 +157,31 @@ "metadata": { "id": "aNzUi4iz0FAU" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 6.96it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 15 documents\n", + "\n", + "Document 1 (Score: 0.9841):\n", + "[Proceedings: Long-term therapy with antibiotics in chronic bronchitis]. Longterm therapy of chronic bacterial bronchitis assumes two forms: (a) therapy of acute exacerbations, and (b) continuous long...\n", + "\n", + "Document 2 (Score: 0.9541):\n", + "An investigation of renal function in chronic bronchitis. 
An investigation has been made into various parameters of renal function in patients with chronic bronchitis and in a group of hypoxic control...\n", + "\n", + "Document 3 (Score: 0.9186):\n", + "Haematologic adaptation in patients with chronic bronchitis and pulmonary insufficiency. The relationship between respiratory insufficiency, expressed by gas tensions in blood and bone marrow, and hae...\n" + ] + } + ], "source": [ "# Load a dataset\n", "dataset = load_dataset(\"HaystackBot/medrag-pubmed-chunk-with-embeddings\", split=\"train\")\n", @@ -166,7 +192,7 @@ "# Create and run the HybridRetriever\n", "query = \"What treatments are available for chronic bronchitis?\"\n", "retriever = HybridRetriever(document_store)\n", - "result = retriever.run(query=query)\n", + "result = retriever.run(text=query, query=query)\n", "\n", "# Print the results\n", "print(f\"Found {len(result['documents'])} documents\")\n", @@ -204,9 +230,9 @@ "id": "HryYZP9ZO-qb" }, "source": [ - "## Example 2: Enhancing the HybridRetriever with a Ranker\n", + "## 2. A HybridRetriever with Re-Ranking and Custom `input_mapping`\n", "\n", - "Now, let's enhance our HybridRetriever by adding a ranker component. This will re-rank the documents based on their semantic similarity to the query, potentially improving the quality of the results." + "Now, let's enhance our HybridRetriever by adding a ranker component. This will re-rank the documents based on their semantic similarity to the query, potentially improving the quality of the results. We also define a custom input_mapping." 
] }, { @@ -256,12 +282,9 @@ " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")\n", " self.pipeline.connect(\"document_joiner\", \"ranker\")\n", "\n", - " # Define input and output mappings\n", + " # Define input mapping\n", " self.input_mapping = {\n", " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"],\n", - " }\n", - " self.output_mapping = {\n", - " \"ranker.documents\": \"documents\"\n", " }" ] }, @@ -271,11 +294,35 @@ "metadata": { "id": "yxaN3KBo65pv" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 81.05it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 10 documents\n", + "\n", + "Document 1 (Score: 0.9995):\n", + "[Proceedings: Long-term therapy with antibiotics in chronic bronchitis]. Longterm therapy of chronic bacterial bronchitis assumes two forms: (a) therapy of acute exacerbations, and (b) continuous long...\n", + "\n", + "Document 2 (Score: 0.9568):\n", + "Pharmacologic therapy of asthma. Asthma is treated by avoiding the precipitants of symptoms, by a trial of hyposensitization (immunotherapy) if the precipitant cannot be avoided, and principally by ph...\n", + "\n", + "Document 3 (Score: 0.9432):\n", + "Choosing a drug regimen for obstructive pulmonary disease. 1. Agents to achieve bronchodilatation. Many patients with pulmonary disease may have significant airflow obstruction. 
Bronchodilatation is a...\n" + ] + } + ], "source": [ "# Create and run the HybridRetrieverWithRanker\n", "retriever = HybridRetrieverWithRanker(document_store)\n", - "result = retriever.run(query=query)\n", + "result = retriever.run(query=query) # instead of retriever.run(text=query, query=query) thanks to input_mapping\n", "\n", "# Print the results\n", "print(f\"Found {len(result['documents'])} documents\")\n", @@ -295,8 +342,7 @@ "The main differences between the two retrievers are:\n", "\n", "1. **Added Ranker Component**: The second version includes a `TransformersSimilarityRanker` that re-ranks the documents based on their semantic similarity to the query.\n", - "2. **Updated Input Mapping**: We added `\"ranker.query\"` to the input mapping to ensure the query is also sent to the ranker.\n", - "3. **Updated Output Mapping**: We changed the output mapping to return the documents from the ranker instead of the document joiner.\n", + "2. **Updated Input Mapping**: We added `\"text_embedder.text\"`, `\"bm25_retriever.query\"` and `\"ranker.query\"` to the input mapping to ensure the input query is sent to all three components while simplifying the `retriever.run` method.\n", "\n", "The ranker can significantly improve the quality of the results by re-ranking the documents based on their semantic similarity to the query, even if they were not ranked highly by the initial retrievers." ] @@ -305,7 +351,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Serialization and Deserialization of SuperComponents\n", + "## 3. Serialization and Deserialization of SuperComponents\n", "\n", "One of the key benefits of using the `@super_component` decorator is that it automatically adds serialization and deserialization capabilities to your component. 
This means you can easily save and load your SuperComponents using the standard Haystack serialization functions.\n", "\n", @@ -314,14 +360,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Serialized component:\n", + "{'type': 'haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor', 'init_parameters': {'remove_empty_lines': True, 'remove_extra_whitespaces': True, 'remove_repeated_substrings': False, 'keep_id': False, 'remove_substrings': None, 'remove_regex': None, 'unicode_normalization': None, 'ascii_only': False, 'split_by': 'word', 'split_length': 250, 'split_overlap': 0, 'split_threshold': 0, 'splitting_function': None, 'respect_sentence_boundary': False, 'language': 'en', 'use_split_rules': True, 'extend_abbreviations': True}}\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'split'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 13\u001b[39m\n\u001b[32m 10\u001b[39m \u001b[38;5;28mprint\u001b[39m(serialized)\n\u001b[32m 12\u001b[39m \u001b[38;5;66;03m# Deserialize the component from the dictionary\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m13\u001b[39m deserialized = \u001b[43mcomponent_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDocumentPreprocessor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserialized\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdocument_preprocessor\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 14\u001b[39m 
\u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33mDeserialized component:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 15\u001b[39m \u001b[38;5;28mprint\u001b[39m(deserialized)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/core/serialization.py:166\u001b[39m, in \u001b[36mcomponent_from_dict\u001b[39m\u001b[34m(cls, data, name, callbacks)\u001b[39m\n\u001b[32m 163\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m default_from_dict(\u001b[38;5;28mcls\u001b[39m, data)\n\u001b[32m 165\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m callbacks \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m callbacks.component_pre_init \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdo_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 168\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _hook_component_init(component_pre_init_callback):\n\u001b[32m 169\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m do_from_dict()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/core/serialization.py:161\u001b[39m, in \u001b[36mcomponent_from_dict..do_from_dict\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdo_from_dict\u001b[39m():\n\u001b[32m 160\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfrom_dict\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m161\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfrom_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 163\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m default_from_dict(\u001b[38;5;28mcls\u001b[39m, data)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/components/preprocessors/document_preprocessor.py:190\u001b[39m, in \u001b[36mDocumentPreprocessor.from_dict\u001b[39m\u001b[34m(cls, data)\u001b[39m\n\u001b[32m 181\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 182\u001b[39m \u001b[33;03mDeserializes the SuperComponent from a dictionary.\u001b[39;00m\n\u001b[32m 183\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 187\u001b[39m \u001b[33;03m Deserialized SuperComponent.\u001b[39;00m\n\u001b[32m 188\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msplitting_function\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m data[\u001b[33m\"\u001b[39m\u001b[33minit_parameters\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m--> \u001b[39m\u001b[32m190\u001b[39m data[\u001b[33m\"\u001b[39m\u001b[33minit_parameters\u001b[39m\u001b[33m\"\u001b[39m][\u001b[33m\"\u001b[39m\u001b[33msplitting_function\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mdeserialize_callable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43minit_parameters\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msplitting_function\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 192\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 194\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
default_from_dict(\u001b[38;5;28mcls\u001b[39m, data)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/utils/callable_serialization.py:53\u001b[39m, in \u001b[36mdeserialize_callable\u001b[39m\u001b[34m(callable_handle)\u001b[39m\n\u001b[32m 45\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdeserialize_callable\u001b[39m(callable_handle: \u001b[38;5;28mstr\u001b[39m) -> Callable:\n\u001b[32m 46\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 47\u001b[39m \u001b[33;03m Deserializes a callable given its full import path as a string.\u001b[39;00m\n\u001b[32m 48\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 51\u001b[39m \u001b[33;03m :raises DeserializationError: If the callable cannot be found\u001b[39;00m\n\u001b[32m 52\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m53\u001b[39m parts = \u001b[43mcallable_handle\u001b[49m\u001b[43m.\u001b[49m\u001b[43msplit\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 55\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(parts), \u001b[32m0\u001b[39m, -\u001b[32m1\u001b[39m):\n\u001b[32m 56\u001b[39m module_name = \u001b[33m\"\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m.join(parts[:i])\n", + "\u001b[31mAttributeError\u001b[39m: 'NoneType' object has no attribute 'split'" + ] + } + ], "source": [ - "from haystack import component_to_dict, component_from_dict\n", + "from haystack.core.serialization import component_to_dict, component_from_dict\n", + "from haystack.components.preprocessors import DocumentPreprocessor\n", "\n", "# Create an instance of our SuperComponent\n", - "preprocessor = DocumentPreprocessor(document_store)\n", + "preprocessor = DocumentPreprocessor()\n", "\n", "# Serialize the component to a 
dictionary\n", "serialized = component_to_dict(preprocessor, \"document_preprocessor\")\n", @@ -334,8 +405,9 @@ "print(deserialized)\n", "\n", "# Verify that the deserialized component works\n", - "result = deserialized.run(documents=sample_docs, query=\"sample document\")\n", - "print(f\"\\nDeserialized component produced {len(result['embedded_documents'])} embedded documents\")" + "doc = Document(content=\"I love pizza!\")\n", + "result = deserialized.run(documents=[doc])\n", + "print(f\"\\nDeserialized component produced {len(result['documents'])} documents\")" ] }, { @@ -351,138 +423,15 @@ "The serialization process captures all the initialization parameters of your SuperComponent, ensuring that when it's deserialized, it's recreated with the same configuration." ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "0_tne2jaFylV" - }, - "source": [ - "## Example 3: Creating a SuperComponent with Custom Input and Output Mappings\n", - "\n", - "Now, let's create a more complex SuperComponent that demonstrates the power of custom input and output mappings. We'll create a document preprocessing pipeline that can clean, split, and embed documents." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vfMNiQwjFjOt" - }, - "outputs": [], - "source": [ - "from haystack import Document, Pipeline, super_component\n", - "from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter\n", - "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", - "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", - "\n", - "\n", - "@super_component\n", - "class DocumentPreprocessor:\n", - " def __init__(\n", - " self,\n", - " document_store: InMemoryDocumentStore,\n", - " embedder_model: str = \"BAAI/bge-small-en-v1.5\",\n", - " split_by: str = \"word\",\n", - " split_length: int = 200,\n", - " split_overlap: int = 20,\n", - " ):\n", - " # Create the components\n", - " cleaner = DocumentCleaner(\n", - " remove_empty_lines=True,\n", - " remove_extra_whitespaces=True,\n", - " remove_repeated_substrings=True,\n", - " )\n", - " splitter = DocumentSplitter(\n", - " split_by=split_by,\n", - " split_length=split_length,\n", - " split_overlap=split_overlap,\n", - " )\n", - " embedder = SentenceTransformersTextEmbedder(embedder_model)\n", - "\n", - " # Create the pipeline\n", - " self.pipeline = Pipeline()\n", - " self.pipeline.add_component(\"cleaner\", cleaner)\n", - " self.pipeline.add_component(\"splitter\", splitter)\n", - " self.pipeline.add_component(\"embedder\", embedder)\n", - "\n", - " # Connect the components\n", - " self.pipeline.connect(\"cleaner.documents\", \"splitter.documents\")\n", - " self.pipeline.connect(\"splitter.documents\", \"embedder.documents\")\n", - "\n", - " # Define custom input and output mappings\n", - " self.input_mapping = {\n", - " \"documents\": [\"cleaner.documents\"],\n", - " \"query\": [\"embedder.text\"], # For embedding queries\n", - " }\n", - " self.output_mapping = {\n", - " \"cleaner.documents\": \"cleaned_documents\",\n", - " \"splitter.documents\": \"split_documents\",\n", - " 
\"embedder.documents\": \"embedded_documents\",\n", - " \"embedder.embedding\": \"query_embedding\",\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "51Tu2p2C_ZxL" - }, - "outputs": [], - "source": [ - "# Create some sample documents\n", - "sample_docs = [\n", - " Document(content=\"This is a sample document with some extra whitespace. \\n\\n\\nIt has multiple lines.\"),\n", - " Document(content=\"This is another document. It will be processed by our pipeline.\")\n", - "]\n", - "\n", - "# Create and run the DocumentPreprocessor\n", - "preprocessor = DocumentPreprocessor(document_store)\n", - "result = preprocessor.run(documents=sample_docs, query=\"sample document\")\n", - "\n", - "# Print the results\n", - "print(f\"Cleaned documents: {len(result['cleaned_documents'])}\")\n", - "print(f\"Split documents: {len(result['split_documents'])}\")\n", - "print(f\"Embedded documents: {len(result['embedded_documents'])}\")\n", - "print(f\"Query embedding shape: {len(result['query_embedding'])}\")\n", - "\n", - "# Show a sample of the processed documents\n", - "print(\"\\nSample of cleaned document:\")\n", - "print(result['cleaned_documents'][0].content)\n", - "\n", - "print(\"\\nSample of split document:\")\n", - "print(result['split_documents'][0].content)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "czMjWwnxPA-3" - }, - "source": [ - "### Understanding Custom Input and Output Mappings\n", - "\n", - "In this example, we've created a SuperComponent with custom input and output mappings that expose more of the pipeline's functionality:\n", - "\n", - "1. **Multiple Inputs**: The component accepts both `documents` and `query` inputs, allowing it to process documents and embed queries.\n", - "2. 
**Multiple Outputs**: The component exposes outputs from multiple components in the pipeline:\n", - " - `cleaned_documents`: Documents after cleaning\n", - " - `split_documents`: Documents after splitting\n", - " - `embedded_documents`: Documents after embedding\n", - " - `query_embedding`: The embedding of the query\n", - "\n", - "This demonstrates how the `@super_component` decorator allows you to create components with rich interfaces that expose the functionality of multiple components in a pipeline." - ] - }, { "cell_type": "markdown", "metadata": { "id": "9y4iJE_SrS4K" }, "source": [ - "## Example 4: Creating a SuperComponent with Outputs from Non-Leaf Components\n", + "## 4. Creating a SuperComponent with Outputs from Non-Leaf Components\n", "\n", - "One of the powerful features of SuperComponents is the ability to expose outputs from any component in the pipeline, not just the leaf components. Let's create a SuperComponent that demonstrates this capability." + "One of the powerful features of SuperComponents is the ability to expose outputs from any component in the pipeline, not just the leaf components. With leaf components, we here refer to components that do not send any outputs to other components in a pipeline. Let's create a SuperComponent that demonstrates this capability." ] }, { @@ -491,7 +440,17 @@ "metadata": { "id": "HryYZP9ZO-qc" }, - "outputs": [], + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mnotebook controller is DISPOSED. \n", + "\u001b[1;31mView Jupyter log for further details." + ] + } + ], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -551,7 +510,17 @@ "metadata": { "id": "INdC3WvLO-qc" }, - "outputs": [], + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mnotebook controller is DISPOSED. 
\n", + "\u001b[1;31mView Jupyter log for further details." + ] + } + ], "source": [ "# Create and run the AdvancedHybridRetriever\n", "retriever = AdvancedHybridRetriever(document_store)\n", @@ -624,8 +593,8 @@ "In this tutorial, you've learned how to create custom SuperComponents using the `@super_component` decorator. You've seen how to:\n", "\n", "1. Create a simple HybridRetriever SuperComponent\n", - "2. Enhance it with a ranker\n", - "3. Create a SuperComponent with custom input and output mappings\n", + "2. Enhance it with a ranker and custom input mapping\n", + "3. Serialize and deserialize the component with out-of-the-box functionalities\n", "4. Create a SuperComponent that exposes outputs from non-leaf components\n", "\n", "SuperComponents are a powerful way to encapsulate complex pipelines into reusable components with simplified interfaces. They make it easy to create higher-level components that abstract away the details of the underlying pipeline." @@ -639,7 +608,8 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", + "language": "python", "name": "python3" }, "language_info": { @@ -652,7 +622,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.8" } }, "nbformat": 4, From 840f0fced9cc17c06eed6c43b07e1640f9a7eda8 Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Fri, 18 Apr 2025 14:06:20 +0300 Subject: [PATCH 3/7] Update the notebook * remove the `haystack_2` field from index.toml --- index.toml | 1 - .../44_Creating_Custom_SuperComponents.ipynb | 3178 ++++++++++++++++- 2 files changed, 3071 insertions(+), 108 deletions(-) diff --git a/index.toml b/index.toml index 7f369f26..4599234f 100644 --- a/index.toml +++ b/index.toml @@ -216,6 +216,5 @@ notebook = "44_Creating_Custom_SuperComponents.ipynb" aliases = [] completion_time = "20 min" created_at = 2025-04-17 -haystack_2 = true dependencies = ["sentence-transformers>=3.0.0", "datasets"] 
featured = true diff --git a/tutorials/44_Creating_Custom_SuperComponents.ipynb b/tutorials/44_Creating_Custom_SuperComponents.ipynb index b0d018d4..9214f201 100644 --- a/tutorials/44_Creating_Custom_SuperComponents.ipynb +++ b/tutorials/44_Creating_Custom_SuperComponents.ipynb @@ -10,7 +10,7 @@ "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 20 minutes\n", - "- **Concepts and Components Used**: [`@super_component`](https://docs.haystack.deepset.ai/docs/super_component), [`Pipeline`](https://docs.haystack.deepset.ai/docs/pipeline), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`TransformersSimilarityRanker`](https://docs.haystack.deepset.ai/docs/transformerssimilarityranker)\n", + "- **Concepts and Components Used**: [`@super_component`](https://docs.haystack.deepset.ai/docs/supercomponents), [`Pipeline`](https://docs.haystack.deepset.ai/docs/pipeline), [`DocumentJoiner`](https://docs.haystack.deepset.ai/docs/documentjoiner), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder), [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`TransformersSimilarityRanker`](https://docs.haystack.deepset.ai/docs/transformerssimilarityranker)\n", "- **Goal**: After completing this tutorial, you'll have learned how to create custom SuperComponents using the `@super_component` decorator to simplify complex pipelines and make them reusable as components." ] }, @@ -27,11 +27,11 @@ "We'll explore several examples:\n", "\n", "1. 
Creating a simple HybridRetriever SuperComponent\n", - "2. Extending our HybridRetriever with a ranker component \n", + "2. Extending our HybridRetriever with a ranker component\n", "3. Creating a SuperComponent with custom input and output mappings\n", "4. Creating a SuperComponent that exposes outputs from non-leaf components\n", "\n", - "The `@super_component` decorator makes it easy to convert a class that defines a pipeline into a fully functional Haystack component that can be used in other pipelines or applications. All it requires is that the class has an attribute called `pipeline`." + "The `@super_component` decorator makes it easy to convert a class that defines a pipeline into a fully functional Haystack component that can be used in other pipelines or applications without losing pipeline functionalities like content tracing and debugging. All it requires is that the class has an attribute called `pipeline`." ] }, { @@ -96,20 +96,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "XvLVaFHTO-qb" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/julian/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -157,17 +148,43 @@ "metadata": { "id": "aNzUi4iz0FAU" }, + "outputs": [], + "source": [ + "# Load a dataset\n", + "dataset = load_dataset(\"HaystackBot/medrag-pubmed-chunk-with-embeddings\", split=\"train\")\n", + "docs = [Document(content=doc[\"contents\"], embedding=doc[\"embedding\"]) for doc in dataset]\n", + "document_store = InMemoryDocumentStore()\n", + "document_store.write_documents(docs)\n", + "\n", + "# Create and run the HybridRetriever\n", + "query = \"What treatments are available for chronic bronchitis?\"\n", + "retriever = HybridRetriever(document_store)\n", + "result = retriever.run(\n", + " text=query, query=query\n", + ") # `query` variable will match with `text` and `query` inputs of components in the pipeline." + ] + }, + { + "cell_type": "code", + "source": [ + "# Print the results\n", + "print(f\"Found {len(result['documents'])} documents\")\n", + "for i, doc in enumerate(result[\"documents\"][:3]): # Show first 3 documents\n", + " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", + " print(doc.content[:200] + \"...\")" + ], + "metadata": { + "id": "ZoTmeqV_j-OI", + "outputId": "7fa55a12-563c-4341-bbaa-3c578cc5e976", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 13, "outputs": [ { - "name": "stderr", "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 6.96it/s]\n" - ] - }, - { "name": "stdout", - "output_type": "stream", "text": [ "Found 15 documents\n", "\n", @@ -181,24 +198,6 @@ "Haematologic adaptation in patients with chronic bronchitis and pulmonary insufficiency. 
The relationship between respiratory insufficiency, expressed by gas tensions in blood and bone marrow, and hae...\n" ] } - ], - "source": [ - "# Load a dataset\n", - "dataset = load_dataset(\"HaystackBot/medrag-pubmed-chunk-with-embeddings\", split=\"train\")\n", - "docs = [Document(content=doc[\"contents\"], embedding=doc[\"embedding\"]) for doc in dataset]\n", - "document_store = InMemoryDocumentStore()\n", - "document_store.write_documents(docs)\n", - "\n", - "# Create and run the HybridRetriever\n", - "query = \"What treatments are available for chronic bronchitis?\"\n", - "retriever = HybridRetriever(document_store)\n", - "result = retriever.run(text=query, query=query)\n", - "\n", - "# Print the results\n", - "print(f\"Found {len(result['documents'])} documents\")\n", - "for i, doc in enumerate(result['documents'][:3]): # Show first 3 documents\n", - " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", - " print(doc.content[:200] + \"...\")" ] }, { @@ -216,12 +215,12 @@ " - Create all the components we need (embedding retriever, BM25 retriever, etc.)\n", " - Create a Pipeline and add all components to it\n", " - Connect the components to define the flow of data\n", - " - Define input and output mappings to simplify the interface\n", "3. The `@super_component` decorator handles all the complexity of making our class work as a component\n", "\n", - "The input mapping `{\"query\": [\"text_embedder.text\", \"bm25_retriever.query\"]}` means that when we call `run(query=\"...\")`, the query is automatically sent to both the text embedder and the BM25 retriever.\n", "\n", - "The output mapping `{\"document_joiner.documents\": \"documents\"}` means that the documents from the document joiner are returned as the \"documents\" output of our component." 
+ "If we define an input mapping like `{\"query\": [\"text_embedder.text\", \"bm25_retriever.query\"]}`, we can call `retriever.run(query=query)`, and the query will automatically be routed to both the text embedder's `text` input and the BM25 retriever's `query` input.\n", + "\n", + "You can also specify how the outputs should be exposed through `output_mapping`. For example, output mapping `{\"document_joiner.documents\": \"documents\"}` means that the documents produced by the `document_joiner` will be returned under the name `documents` when you call `retriever.run(...)`." ] }, { @@ -230,14 +229,14 @@ "id": "HryYZP9ZO-qb" }, "source": [ - "## 2. A HybridRetriever with Re-Ranking and Custom `input_mapping`\n", + "## 2. A HybridRetriever with Re-Ranking and Custom 'input_mapping'\n", "\n", "Now, let's enhance our HybridRetriever by adding a ranker component. This will re-rank the documents based on their semantic similarity to the query, potentially improving the quality of the results. We also define a custom input_mapping." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "INdC3WvLO-qb" }, @@ -256,10 +255,10 @@ "@super_component\n", "class HybridRetrieverWithRanker:\n", " def __init__(\n", - " self, \n", - " document_store: InMemoryDocumentStore, \n", + " self,\n", + " document_store: InMemoryDocumentStore,\n", " embedder_model: str = \"BAAI/bge-small-en-v1.5\",\n", - " ranker_model: str = \"BAAI/bge-reranker-base\"\n", + " ranker_model: str = \"BAAI/bge-reranker-base\",\n", " ):\n", " # Create the components\n", " embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n", @@ -283,28 +282,201 @@ " self.pipeline.connect(\"document_joiner\", \"ranker\")\n", "\n", " # Define input mapping\n", - " self.input_mapping = {\n", - " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"],\n", - " }" + " self.input_mapping = {\"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"]}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { - "id": "yxaN3KBo65pv" + "id": "yxaN3KBo65pv", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 440, + "referenced_widgets": [ + "b4246cbc7b0b4f7784293fbd8337befe", + "97e4db1d981344ac80ce2e59745f50ff", + "49c3d271587a46a08edfc1a059887131", + "df28d21a35204da58c8642cfe1687a4d", + "4c933b0299f9478e9c39ba079302035f", + "6873d6a802f44e86ad11c44a47b43452", + "4f5e209fdc084e6fabbc480d673b14cf", + "c1c3b0e627944396ab365a6c7b91cc54", + "6b44fcace7a84fa9a863bb782668789c", + "fc33fd0c55554ea39216330ea4f7c745", + "547c5f3ca88747f8b9744da7456335b9", + "f41d46b53c454c6d886acb02ad054cda", + "3f9b55a507414540808db89d2af7302c", + "3155d69cb8b54b1899d447179d1eff9b", + "980f33d3987a4a21983d8e36f45f27f3", + "0974fcf1f2cf43c79d0c9e925f95872e", + "e863cb07b1134c9291cee19eeb2ee8f1", + "a2951397273141b1b5c05a3dd89dea7c", + "d0d28d8937b548fbafd2c72d33e4e4ab", + "4c78d723207346cdba9a2be2650b0bbc", + 
"e6b0473eccbf42adbc79375c48bab702", + "0aea00b2a8d443719572911525df5727", + "165ba79a9c1a465b829b16643b485e54", + "e02626edcde54a918ba58711fb78f9eb", + "a8bf397d535545cca2916a3103189591", + "362d04b496e04dd896555cdc562c0387", + "40282da198fe4350b5a806cf76492d13", + "206acb3589a049f38bab5e674a97cf75", + "5645ab5f93b9493a915f34dfbf444617", + "70a6ccd0ba6d491a9fc5284eedd0d736", + "c56370cd22b04d7d8d6b04abb22eb8d6", + "9054763a91c84642bd9d2c2db3b6f25a", + "e5bf035129f543feb9c5fa7a978fc5e8", + "c511266135c740e483df076e2812c139", + "9ebf405398e04047bcf7fdf0c4ec1beb", + "5ddb907fa72940d4b6b04f37d8c1d03c", + "66cbc136d6fa4c2ab97706cf51f691b4", + "41803f1ed4474a4991011e486ba09c3b", + "dafb0a71240c499f8481ee8ccc551289", + "269b9dc104ac47f3a6861f868f2e3419", + "046d644b846740039469eb061d47218b", + "58f6e0621d1749b0bdb3855a9af27e47", + "23cae104afce477f8514919db4e3da9b", + "da8b68a02ff642139022e5bc09d39cda", + "858d7e87d43f4f6c9c5ce83488b160b8", + "5bd56c5f80784646b74ac3fde1c9c7b9", + "290dbd2bef85499a967738a3ed87453b", + "5e17e82b3b154a8aa731833992b852e5", + "95b52f609cac4ecb916a31d1f8c13c8c", + "1027aa20fdcd4311a2409047ccf29520", + "b6fb44afe94e48f18eb41d982cd7dfdc", + "aaab7ce16fc44608815cdfea8606cb09", + "5e21b9d4e394464bb8402a586074cef7", + "569b3110771b4e2e83498ea770b97abe", + "a6e7b602094e41e5a3db0031b2ede035", + "573b84602f75411f801c7a69509754d2", + "fec4f36eb14f4ef8b2d3932327eb1311", + "e7c9affa0c6f4fa189c70188bc5fc0e6", + "0dba51b2e7cc4927bb9c94f820abbdc8", + "16ebad6926f34338a088a6cdfb8ced86", + "c040063684024574a3cb7e7db7a10297", + "0023bce9a9434741a59910e734902d83", + "a5951c7a49d24ee0904e90a1193f683f", + "f4e7897f36d94a50b0d546cdd032d1ea", + "3d678d1ee9f14c3380208c49a5108221", + "ba108e9e3b0248be9d4413bd7d1e6f2f", + "59c201c7fd8b4e04b23246db1bc7061a", + "ec9d2a77052b4979ad999c8e843e4c55", + "9a791ad8efc340c2876a5335eb32e4f5", + "300576212d2a4109a77f732e6ebe8d8f", + "256d83e568f240f5b1ea5b611b599375", + "5b24575d05a64ec98c0a514042006aac", + 
"6d55e29f7df248dda6da276bb55a9f1a", + "50dd4ef6a384418694dce3209a1a35cd", + "3bc892d8b29f4e51a7091b5a80886802", + "f6aac71ee87f4f72ab8a854e587116c6", + "936bf0c898e648cfa7a2b8d5f69c44a0" + ] + }, + "outputId": "21486dba-7914-4349-a579-2993ec212d86" }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 81.05it/s]\n" - ] + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/799 [00:00 \u001b[39m\u001b[32m13\u001b[39m deserialized = \u001b[43mcomponent_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDocumentPreprocessor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mserialized\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdocument_preprocessor\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33mDeserialized component:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 15\u001b[39m \u001b[38;5;28mprint\u001b[39m(deserialized)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/core/serialization.py:166\u001b[39m, in \u001b[36mcomponent_from_dict\u001b[39m\u001b[34m(cls, data, name, callbacks)\u001b[39m\n\u001b[32m 163\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m default_from_dict(\u001b[38;5;28mcls\u001b[39m, data)\n\u001b[32m 165\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m callbacks \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m callbacks.component_pre_init \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdo_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 168\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m 
_hook_component_init(component_pre_init_callback):\n\u001b[32m 169\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m do_from_dict()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/core/serialization.py:161\u001b[39m, in \u001b[36mcomponent_from_dict..do_from_dict\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdo_from_dict\u001b[39m():\n\u001b[32m 160\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfrom_dict\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m161\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfrom_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 163\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m default_from_dict(\u001b[38;5;28mcls\u001b[39m, data)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/components/preprocessors/document_preprocessor.py:190\u001b[39m, in \u001b[36mDocumentPreprocessor.from_dict\u001b[39m\u001b[34m(cls, data)\u001b[39m\n\u001b[32m 181\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 182\u001b[39m \u001b[33;03mDeserializes the SuperComponent from a dictionary.\u001b[39;00m\n\u001b[32m 183\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 187\u001b[39m \u001b[33;03m Deserialized SuperComponent.\u001b[39;00m\n\u001b[32m 188\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33msplitting_function\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m data[\u001b[33m\"\u001b[39m\u001b[33minit_parameters\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m--> 
\u001b[39m\u001b[32m190\u001b[39m data[\u001b[33m\"\u001b[39m\u001b[33minit_parameters\u001b[39m\u001b[33m\"\u001b[39m][\u001b[33m\"\u001b[39m\u001b[33msplitting_function\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mdeserialize_callable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43minit_parameters\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msplitting_function\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 192\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 194\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m default_from_dict(\u001b[38;5;28mcls\u001b[39m, data)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/deepset/dev/haystack-tutorials/.venv/lib/python3.12/site-packages/haystack/utils/callable_serialization.py:53\u001b[39m, in \u001b[36mdeserialize_callable\u001b[39m\u001b[34m(callable_handle)\u001b[39m\n\u001b[32m 45\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdeserialize_callable\u001b[39m(callable_handle: \u001b[38;5;28mstr\u001b[39m) -> Callable:\n\u001b[32m 46\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 47\u001b[39m \u001b[33;03m Deserializes a callable given its full import path as a string.\u001b[39;00m\n\u001b[32m 48\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 51\u001b[39m \u001b[33;03m :raises DeserializationError: If the callable cannot be found\u001b[39;00m\n\u001b[32m 52\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m53\u001b[39m parts = \u001b[43mcallable_handle\u001b[49m\u001b[43m.\u001b[49m\u001b[43msplit\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 55\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(parts), \u001b[32m0\u001b[39m, -\u001b[32m1\u001b[39m):\n\u001b[32m 56\u001b[39m module_name = \u001b[33m\"\u001b[39m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m.join(parts[:i])\n", - "\u001b[31mAttributeError\u001b[39m: 'NoneType' object has no attribute 'split'" + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# Deserialize the component from the dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mdeserialized\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcomponent_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDocumentPreprocessor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserialized\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"document_preprocessor\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nDeserialized component:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeserialized\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/core/serialization.py\u001b[0m in \u001b[0;36mcomponent_from_dict\u001b[0;34m(cls, data, name, callbacks)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallbacks\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m 
\u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomponent_pre_init\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mdo_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_hook_component_init\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomponent_pre_init_callback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/core/serialization.py\u001b[0m in \u001b[0;36mdo_from_dict\u001b[0;34m()\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdo_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"from_dict\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/components/preprocessors/document_preprocessor.py\u001b[0m in \u001b[0;36mfrom_dict\u001b[0;34m(cls, data)\u001b[0m\n\u001b[1;32m 188\u001b[0m \"\"\"\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"splitting_function\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"init_parameters\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m data[\"init_parameters\"][\"splitting_function\"] = deserialize_callable(\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"init_parameters\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"splitting_function\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/utils/callable_serialization.py\u001b[0m in \u001b[0;36mdeserialize_callable\u001b[0;34m(callable_handle)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mraises\u001b[0m \u001b[0mDeserializationError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcallable\u001b[0m \u001b[0mcannot\u001b[0m \u001b[0mbe\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \"\"\"\n\u001b[0;32m---> 53\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcallable_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'split'" ] } ], @@ -412,7 +593,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "C6zbojiWXgEe" + }, "source": [ "The serialization and deserialization process works seamlessly with SuperComponents because the `@super_component` decorator automatically adds the necessary functionality. This is particularly useful when you want to:\n", "\n", @@ -436,21 +619,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "HryYZP9ZO-qc" }, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mnotebook controller is DISPOSED. \n", - "\u001b[1;31mView Jupyter log for further details." 
- ] - } - ], + "outputs": [], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -463,10 +636,10 @@ "@super_component\n", "class AdvancedHybridRetriever:\n", " def __init__(\n", - " self, \n", - " document_store: InMemoryDocumentStore, \n", + " self,\n", + " document_store: InMemoryDocumentStore,\n", " embedder_model: str = \"BAAI/bge-small-en-v1.5\",\n", - " ranker_model: str = \"BAAI/bge-reranker-base\"\n", + " ranker_model: str = \"BAAI/bge-reranker-base\",\n", " ):\n", " # Create the components\n", " embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n", @@ -490,34 +663,78 @@ " self.pipeline.connect(\"document_joiner\", \"ranker\")\n", "\n", " # Define input and output mappings\n", - " self.input_mapping = {\n", - " \"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"],\n", - " }\n", - " \n", + " self.input_mapping = {\"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"]}\n", + "\n", " # Expose outputs from multiple components, including non-leaf components\n", " self.output_mapping = {\n", " \"bm25_retriever.documents\": \"bm25_documents\",\n", " \"embedding_retriever.documents\": \"embedding_documents\",\n", " \"document_joiner.documents\": \"joined_documents\",\n", " \"ranker.documents\": \"ranked_documents\",\n", - " \"text_embedder.embedding\": \"query_embedding\"\n", + " \"text_embedder.embedding\": \"query_embedding\",\n", " }" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { - "id": "INdC3WvLO-qc" + "id": "INdC3WvLO-qc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 373, + "referenced_widgets": [ + "1c63ba6f43574419b66fe173a763b04a", + "19f722a52a8e4e709bb99e6904187ced", + "d85c427101844bca92b072ddcad78faf", + "df0c5bd77043417fa206f28633170b75", + "efa58b66150e4f7bbce5ed3dd63f13a7", + "624236744c8043e8bed74fd8f6dceacc", + 
"6f9e161785a84bfe8b502f7cfca3bc62", + "ffac4adb3c85410992373480b9902b09", + "6072bee9c392439e95850c73df892e0c", + "7e3cfba7404a45a9920f2e9f07f1bf6e", + "90f7536c69e84b25a48386a7bffbfe42" + ] + }, + "outputId": "a9d0a257-6969-49bd-f141-7d3d4fe24813" }, "outputs": [ { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mnotebook controller is DISPOSED. \n", - "\u001b[1;31mView Jupyter log for further details." + "output_type": "display_data", + "data": { + "text/plain": [ + "Batches: 0%| | 0/1 [00:00 Date: Tue, 22 Apr 2025 11:04:51 +0300 Subject: [PATCH 4/7] Add accelerate dependency for tests --- index.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.toml b/index.toml index 4599234f..d5be85fd 100644 --- a/index.toml +++ b/index.toml @@ -216,5 +216,5 @@ notebook = "44_Creating_Custom_SuperComponents.ipynb" aliases = [] completion_time = "20 min" created_at = 2025-04-17 -dependencies = ["sentence-transformers>=3.0.0", "datasets"] +dependencies = ["sentence-transformers>=3.0.0", "datasets", accelerate] featured = true From 79e9f97cdce8bc73c50fdbc510777480ba211e06 Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Tue, 22 Apr 2025 11:06:37 +0300 Subject: [PATCH 5/7] add quote signs --- index.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.toml b/index.toml index d5be85fd..c57a4b4f 100644 --- a/index.toml +++ b/index.toml @@ -216,5 +216,5 @@ notebook = "44_Creating_Custom_SuperComponents.ipynb" aliases = [] completion_time = "20 min" created_at = 2025-04-17 -dependencies = ["sentence-transformers>=3.0.0", "datasets", accelerate] +dependencies = ["sentence-transformers>=3.0.0", "datasets", "accelerate"] featured = true From dbeae401c342650f7290b89af716533395030cfb Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Tue, 22 Apr 2025 11:05:44 +0200 Subject: [PATCH 6/7] update pip install to latest release --- tutorials/44_Creating_Custom_SuperComponents.ipynb | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/tutorials/44_Creating_Custom_SuperComponents.ipynb b/tutorials/44_Creating_Custom_SuperComponents.ipynb index 9214f201..8ea12f35 100644 --- a/tutorials/44_Creating_Custom_SuperComponents.ipynb +++ b/tutorials/44_Creating_Custom_SuperComponents.ipynb @@ -55,8 +55,8 @@ "source": [ "%%bash\n", "\n", - "pip install git+https://github.com/deepset-ai/haystack.git@main # pip install haystack-ai after the 2.13 release\n", - "pip install \"sentence-transformers>=3.0.0\" datasets transformers[torch,sentencepiece]" + "pip install haystack-ai\n", + "pip install \"sentence-transformers>=3.0.0\" datasets transformers[torch,sentencepiece]" ] }, { @@ -3591,4 +3591,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From e27e48366487194fb1596ed49e8d20a102f3291c Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Tue, 22 Apr 2025 11:06:29 +0200 Subject: [PATCH 7/7] Update created_at in index.toml --- index.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.toml b/index.toml index c57a4b4f..822e47c7 100644 --- a/index.toml +++ b/index.toml @@ -215,6 +215,6 @@ weight = 8 notebook = "44_Creating_Custom_SuperComponents.ipynb" aliases = [] completion_time = "20 min" -created_at = 2025-04-17 +created_at = 2025-04-22 dependencies = ["sentence-transformers>=3.0.0", "datasets", "accelerate"] featured = true