In [1]:
from haystack import Pipeline
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder

from datasets import load_dataset
from haystack import Document

In [2]:
# Document store backed by a Qdrant server expected on localhost:6333.
# NOTE(review): the flag semantics below are taken from the parameter names;
# confirm them against the QdrantDocumentStore documentation.
document_store = QdrantDocumentStore(
    url="http://localhost:6333",
    wait_result_from_api=True,  # presumably blocks until Qdrant acknowledges each request
    return_embedding=True,  # presumably returns stored vectors together with the documents
    recreate_index=True,  # presumably rebuilds the collection on every run -- verify before pointing at real data
)

In [3]:
# Load the finance-news dataset and index only each row's "title" field.
dataset = load_dataset(
    "PaulAdversarial/all_news_finance_sm_1h2023",
    split="train",
)
documents = [Document(content=row["title"]) for row in dataset]

# The embedder is used standalone here (not inside a Pipeline), so it is
# warmed up explicitly before being run.
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents=documents)

# Last expression of the cell: its return value is shown as the cell output.
# OVERWRITE replaces any stored document that collides with an incoming one.
document_store.write_documents(
    documents=documents_with_embeddings.get("documents"),
    policy=DuplicatePolicy.OVERWRITE,
)



Batches:   0%|          | 0/159 [00:00<?, ?it/s]

5100it [00:05, 1010.47it/s]                                                     


5062

In [4]:
# Embedding-based retriever bound to the Qdrant document store created above;
# it is registered in the pipeline under the name "retriever" in a later cell.
retriever = QdrantEmbeddingRetriever(document_store=document_store)

In [5]:
# Jinja prompt template rendered by the PromptBuilder.
#   documents -- iterable of retrieved documents; each one's `content` is listed
#                under "Context:".
#   query     -- the user's request, inserted verbatim. No punctuation is
#                appended, so imperative requests are not turned into malformed
#                questions and real questions do not end in "??".
template = """
Given only the following information, answer the question.
Ignore your own knowledge.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}


Question: {{ query }}
"""

In [6]:
# RAG pipeline: embed the query -> retrieve matching documents from Qdrant ->
# render the prompt -> generate the answer with a local Ollama model.
pipe = Pipeline()

# Register the four stages under the names used by the connections below.
pipe.add_component(name="text_embedder", instance=SentenceTransformersTextEmbedder())
pipe.add_component(name="retriever", instance=retriever)
pipe.add_component(name="prompt_builder", instance=PromptBuilder(template=template))
pipe.add_component(name="llm", instance=OllamaGenerator(model="llama3.1"))

# Wire the stages together with every socket spelled out explicitly.
pipe.connect(sender="text_embedder.embedding", receiver="retriever.query_embedding")
pipe.connect(sender="retriever.documents", receiver="prompt_builder.documents")
# Kept as the last expression so the cell still displays the pipeline summary.
pipe.connect(sender="prompt_builder.prompt", receiver="llm.prompt")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f55e568b1f0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: QdrantEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OllamaGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [8]:
query = "Give me all the news you have about CNBC"

# The same query text feeds two components: the text embedder (to build the
# retrieval vector) and the prompt builder (to fill the "Question:" slot).
response = pipe.run(
    {
        "text_embedder": {"text": query},
        "prompt_builder": {"query": query},
    }
)

# `replies` is a list of generated strings. Print the text itself rather than
# the list repr, so line breaks in the answer render instead of showing up as
# literal "\n" escapes.
print("\n\n".join(response["llm"]["replies"]))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['Based on the provided context, here is the news I have about CNBC:\n\n* CNBC Daily Open mentions that Credit Suisse has spread the banking crisis to Europe. This suggests that CNBC is reporting on financial and business news.\n* There is no specific article or update mentioned as being from CNBC itself, but rather a mention of their format ("CNBC Daily Open") in relation to another piece of information.\n\nIt appears that CNBC is not the primary source for any specific news story within this context. However, they are referenced as part of the general flow of business and financial news.']
