In [1]:
from haystack.telemetry import tutorial_running

# Telemetry hook imported from haystack.telemetry; 42 is presumably this
# tutorial's id — TODO confirm.
tutorial_running(42)

In [3]:
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
# Sample passage used to demonstrate sentence-level splitting.
text = ("Paul fell asleep to dream of an Arrakeen cavern, silent people all around  him moving in the dim light "
        "of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the "
        "drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon "
        "awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel "
        "himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or "
        "companions his own age,  perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had "
        "hinted  that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered "
        "people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people "
        "called Fremen, marked down on no  census of the Imperial Regate.")

# One sentence per chunk, with no overlap between neighbouring chunks.
splitter = DocumentSplitter(
    split_by='sentence',
    split_length=1,
    split_overlap=0,
)

# Wrap the passage in a single Document and split it; `docs` is the
# component's output dict, with the chunks under the 'documents' key.
doc = Document(content=text)
docs = splitter.run(documents=[doc])
docs

{'documents': [Document(id=8db98f03cd338b83a8b28172ebf03c77fb481c17053e792193033003daeffc8a, content: 'Paul fell asleep to dream of an Arrakeen cavern, silent people all around  him moving in the dim lig...', meta: {'source_id': 'b56504f244b7b650096b14d678bc82f3d7fe240bb135361c6a23a14c4b809596', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
  Document(id=db478194d25cedfb818f04c4bd22a5d4c058f6cddcced2461445d82f7feaf932, content: ' It was solemn there and like a cathedral as he listened to a faint sound—the drip-drip-drip of wate...', meta: {'source_id': 'b56504f244b7b650096b14d678bc82f3d7fe240bb135361c6a23a14c4b809596', 'page_number': 1, 'split_id': 1, 'split_idx_start': 117}),
  Document(id=5d093b6ec1a4bdc7e75f033ae0b570e237053213a09b42a56ad815b4d118943d, content: ' Even while he remained in the dream, Paul knew he would remember it upon awakening.', meta: {'source_id': 'b56504f244b7b650096b14d678bc82f3d7fe240bb135361c6a23a14c4b809596', 'page_number': 1, 'split_id': 2, 'spli

In [4]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Fresh in-memory store for the sentence chunks; OVERWRITE keeps the cell
# re-runnable when the same document ids are written again.
doc_store = InMemoryDocumentStore()
doc_store.write_documents(
    documents=docs['documents'],
    policy=DuplicatePolicy.OVERWRITE,
)

9

In [12]:
# Peek at the text of the sentence chunk at index 4 of the splitter output.
docs['documents'][4].content

' The dream faded.'

In [14]:
from haystack.components.retrievers import SentenceWindowRetriever

# Build a retriever over the sentence store; window_size controls how many
# neighbouring chunks are pulled in around each retrieved chunk.
retriever = SentenceWindowRetriever(
    window_size=2,
    document_store=doc_store,
)

# Use the chunk at index 4 as the "retrieved" document and expand it.
result = retriever.run(
    retrieved_documents=[docs['documents'][4]],
)
result

{'context_windows': [' Even while he remained in the dream, Paul knew he would remember it upon awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or companions his own age,  perhaps did not deserve sadness in farewell.'],
 'context_documents': [[Document(id=5d093b6ec1a4bdc7e75f033ae0b570e237053213a09b42a56ad815b4d118943d, content: ' Even while he remained in the dream, Paul knew he would remember it upon awakening.', meta: {'source_id': 'b56504f244b7b650096b14d678bc82f3d7fe240bb135361c6a23a14c4b809596', 'page_number': 1, 'split_id': 2, 'split_idx_start': 219}),
   Document(id=4ed71ff61df531053cc7d5f80e8a0bd1e702f3a396f3f3983ceeffe89878a684, content: ' He always remembered the dreams that were predictions.', meta: {'source_id': 'b56504f244b7b650096b14d678bc82f3d7fe240bb135361c6a23a14c4b809596', 'page_number': 1, 'split_id': 3, 'split_idx_star

In [15]:
from typing import List
import csv
from haystack import Document
from pathlib import Path
import requests

def read_documents(file: str) -> List[Document]:
    """Load a tab-separated file into a list of Documents.

    The first line is treated as a header and skipped. For every remaining
    row, column 0 is stored as ``meta["category"]``, column 2 as
    ``meta["title"]`` and column 3 becomes the document content; all three
    values are whitespace-stripped.

    Args:
        file: Path of the tab-separated file to read.

    Returns:
        One Document per non-empty data row, in file order.

    Raises:
        IndexError: If a non-empty row has fewer than four columns.
    """
    documents = []
    # newline="" is what the csv module asks for, so that the reader handles
    # line endings (including ones embedded in quoted fields) itself.
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm it matches the file on disk.
    # The handle gets its own name so it does not shadow the `file` parameter.
    with open(file, "r", newline="") as handle:
        reader = csv.reader(handle, delimiter="\t")
        next(reader, None)  # skip the headers
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            category = row[0].strip()
            title = row[2].strip()
            text = row[3].strip()
            documents.append(Document(content=text, meta={"category": category, "title": title}))

    return documents

# Download the news dataset. A timeout keeps the cell from hanging forever on
# a stalled connection.
doc = requests.get(
    'https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv',
    timeout=60,
)
# Fail loudly on an HTTP error instead of saving an error page as the dataset.
doc.raise_for_status()

# Store the raw bytes under ./data, creating the folder on first run.
datafolder = Path('data')
datafolder.mkdir(exist_ok=True)
datafile = datafolder/'bbc-news-data.csv'
with open(datafile, 'wb') as f:
    for chunk in doc.iter_content(512):
        f.write(chunk)

# Parse the file that was just written (same path, no second hard-coded copy).
# NOTE: this rebinds `docs`, which earlier cells used for the splitter output dict.
docs = read_documents(str(datafile))
len(docs)

2225

In [17]:
from haystack import Document, Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Store that will hold one Document per sentence of the news articles.
document_store = InMemoryDocumentStore()

# Indexing pipeline: split each article into sentences, then write the
# sentence Documents to the store (overwriting any duplicate ids).
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
    name='splitter',
    instance=DocumentSplitter(split_by='sentence', split_length=1, split_overlap=0),
)
indexing_pipeline.add_component(
    name='writer',
    instance=DocumentWriter(policy=DuplicatePolicy.OVERWRITE, document_store=document_store),
)

indexing_pipeline.connect('splitter', 'writer')
indexing_pipeline.run(data={'documents': docs})

{'writer': {'documents_written': 44186}}

In [18]:
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.retrievers import SentenceWindowRetriever

# Query pipeline: BM25 finds the best-matching sentences, then the
# sentence-window retriever expands each hit with its neighbouring sentences.
sentence_window_pipeline = Pipeline()
sentence_window_pipeline.add_component(
    name='bm25_retriever',
    instance=InMemoryBM25Retriever(document_store=document_store),
)
sentence_window_pipeline.add_component(
    name='sentence_window_retriever',
    instance=SentenceWindowRetriever(document_store=document_store, window_size=12),
)

# Feed the BM25 hits into the window retriever; the returned pipeline is the
# cell's displayed value.
sentence_window_pipeline.connect(
    'bm25_retriever.documents',
    'sentence_window_retriever.retrieved_documents',
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x000001FE3BB86AB0>
🚅 Components
  - bm25_retriever: InMemoryBM25Retriever
  - sentence_window_retriever: SentenceWindowRetriever
🛤️ Connections
  - bm25_retriever.documents -> sentence_window_retriever.retrieved_documents (List[Document])

In [19]:
# Ask BM25 for the single best-matching sentence and let the pipeline expand
# it; include_outputs_from keeps the BM25 stage's own output in `result` too.
result = sentence_window_pipeline.run(
    data={'bm25_retriever': dict(query="phishing attacks", top_k=1)},
    include_outputs_from={'bm25_retriever'},
)


In [20]:
# Hits from the BM25 stage; present in `result` because the run call listed
# 'bm25_retriever' in include_outputs_from.
result['bm25_retriever']['documents']

[Document(id=57766497f35c7ebef5c49e754b8df41a8df3d5df3e46bc595807d7420d7cef8e, content: ' The Anti-Phishing Working group reported that the number of phishing attacks against new targets wa...', meta: {'category': 'tech', 'title': 'Cyber crime booms in 2004', 'source_id': '5c81f8cbd6c9c07819bf60e484489fe0af9e6626591ec77066701cb856fb3b33', 'page_number': 1, 'split_id': 12, 'split_idx_start': 1520}, score: 17.74585935028894)]

In [22]:
# Text windows produced by the sentence-window stage for the retrieved hit.
result['sentence_window_retriever']['context_windows']

['The last 12 months have seen a dramatic growth in almost every security threat that plague Windows PCs.  The count of known viruses broke the 100,000 barrier and the number of new viruses grew by more than 50%. Similarly phishing attempts, in which conmen try to trick people into handing over confidential data, are recording growth rates of more than 30% and attacks are becoming increasingly sophisticated. Also on the increase are the number of networks of remotely controlled computers, called bot nets, used by malicious hackers and conmen to carry out many different cyber crimes.  One of the biggest changes of 2004 was the waning influence of the boy hackers keen to make a name by writing a fast-spreading virus, said Kevin Hogan, senior manager in Symantec\'s security response group. Although teenage virus writers will still play around with malicious code, said Mr Hogan, 2004 saw a significant rise in criminal use of malicious programs. The financial incentives were driving crimina

In [24]:
# The Documents that make up each window, as returned by the sentence-window stage.
result['sentence_window_retriever']['context_documents']

[[Document(id=c13d6940687df1ea885d686d5b7e5fe703a98d46a3d6fabd50eef074866e4b40, content: 'The last 12 months have seen a dramatic growth in almost every security threat that plague Windows P...', meta: {'category': 'tech', 'title': 'Cyber crime booms in 2004', 'source_id': '5c81f8cbd6c9c07819bf60e484489fe0af9e6626591ec77066701cb856fb3b33', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}),
  Document(id=8c0f586edfdf4967e15f731f736cda18d6f0bd4e332737754deb7bc118e624a3, content: '  The count of known viruses broke the 100,000 barrier and the number of new viruses grew by more th...', meta: {'category': 'tech', 'title': 'Cyber crime booms in 2004', 'source_id': '5c81f8cbd6c9c07819bf60e484489fe0af9e6626591ec77066701cb856fb3b33', 'page_number': 1, 'split_id': 1, 'split_idx_start': 103}),
  Document(id=a985729ae1cec63d984026a47e05c4fa8dd8ca81ea27bff77b387436c6d93634, content: ' Similarly phishing attempts, in which conmen try to trick people into handing over confidential dat...', meta