In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are reloaded before each cell executes (edits to local .py files are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

Import dependencies

In [2]:
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

Initialize components

In [3]:
# In-memory store; embedding comparisons use cosine similarity.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# The retriever searches the store created above.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)  # type: ignore

# The embedder is built with its default settings and turns the query text
# into an embedding for the retriever.
text_embedder = SentenceTransformersTextEmbedder()  # type: ignore

Create the pipeline

In [4]:
# Start from an empty pipeline; components and connections are added in the
# following cells.
query_pipeline = Pipeline()

Add components

In [5]:
# Register each component under the name that connect() and run() use to
# address it.
query_pipeline.add_component(name="text_embedder", instance=text_embedder)
query_pipeline.add_component(name="retriever", instance=retriever)

Connect components

In [6]:
# Route the embedding produced by the text embedder into the retriever's
# query_embedding input. The call's return value (the pipeline) is displayed.
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

# The receiving socket name can be left out when the target is unambiguous,
# so this shorter form is also accepted
# (NOTE(review): the original note says the retriever "only has one input" -
# confirm against InMemoryEmbeddingRetriever.run before relying on it):
# query_pipeline.connect("text_embedder.embedding", "retriever")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7e4951964990>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])

Run the pipeline

In [7]:
# Only the embedder needs external input; the retriever receives its query
# embedding through the connection made above.
results = query_pipeline.run(
    data={"text_embedder": {"text": "How is the weather today?"}}
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No Documents found with embeddings. Returning empty list. To generate embeddings, use a DocumentEmbedder.


In [23]:
# Show the pipeline output, keyed by component name. The document list is
# empty here because the store holds no documents with embeddings (see the
# warning printed by the run above).
results

{'retriever': {'documents': []}}

Pipeline input

In [24]:
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter

In [25]:
# A new, empty store for the indexing pipeline. This rebinds `document_store`;
# the retriever created earlier keeps its reference to the previous store.
document_store = InMemoryDocumentStore()

# Indexing stages, in the order they will be wired:
# fetch the page -> convert HTML to Documents -> write into the store.
fetcher = LinkContentFetcher()  # type: ignore
converter = HTMLToDocument()  # type: ignore
writer = DocumentWriter(document_store=document_store)  # type: ignore

In [26]:
# Build the indexing pipeline and register its three stages by name.
pipeline = Pipeline()
pipeline.add_component(name="fetcher", instance=fetcher)
pipeline.add_component(name="converter", instance=converter)
pipeline.add_component(name="writer", instance=writer)

In [27]:
# connect() hands back the pipeline, so both links are made in one chained
# expression; its final value is what the cell displays.
(
    pipeline
    .connect("fetcher.streams", "converter.sources")
    .connect("converter.documents", "writer.documents")
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f3e88b44a50>
🚅 Components
  - fetcher: LinkContentFetcher
  - converter: HTMLToDocument
  - writer: DocumentWriter
🛤️ Connections
  - fetcher.streams -> converter.sources (List[ByteStream])
  - converter.documents -> writer.documents (List[Document])

In [28]:
# Inspect the inputs the pipeline still expects from the caller. The result
# is a dict keyed by component, giving each input's type and whether it is
# mandatory (optional inputs are included, with their default value).
pipeline.inputs()

{'fetcher': {'urls': {'type': typing.List[str], 'is_mandatory': True}},
 'converter': {'meta': {'type': typing.Union[typing.Dict[str, typing.Any], typing.List[typing.Dict[str, typing.Any]], NoneType],
   'is_mandatory': False,
   'default_value': None},
  'extraction_kwargs': {'type': typing.Optional[typing.Dict[str, typing.Any]],
   'is_mandatory': False,
   'default_value': None}},
 'writer': {'policy': {'type': typing.Optional[haystack.document_stores.types.policy.DuplicatePolicy],
   'is_mandatory': False,
   'default_value': None}}}

In [29]:
# Index a single page. `urls` on the fetcher is the only mandatory pipeline
# input; the returned dict is displayed as the cell output.
pipeline.run(
    data={"fetcher": {"urls": ["https://docs.vngcloud.vn/vng-cloud-document"]}}
)

{'writer': {'documents_written': 1}}

In [30]:
# Write the store's contents to the path "vngcloud_docs", relative to the
# current working directory.
# NOTE(review): the path has no file extension - confirm this matches what the
# code that later loads the store expects.
document_store.save_to_disk("vngcloud_docs")