# Haystack RAG pipeline with Phoenix tracing and evaluation

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file into the environment; the cell displays load_dotenv()'s return value.
load_dotenv()

True

In [2]:
# Base data directory, relative to this notebook.
data_path = "../data"

# File names of the PDFs to ingest.
files = [
    "yellow_pages.pdf",
]

In [3]:
# Read the development question set from the data directory and preview its first rows.
questions_file = os.path.join(data_path, "dev_questions.json")
df = pd.read_json(questions_file)
df.head()

Unnamed: 0,reference,query,response
0,The report of YELLOW PAGES LIMITED (Petra Diam...,What was the accounts receivable of 'Petra Dia...,
1,The report is from YELLOW PAGES LIMITED (Petra...,What was the total assets of 'TransUnion' in t...,
2,The total revenues of YELLOW PAGES LIMITED (Pe...,What was the total revenue for Petra Diamonds ...,
3,The total revenues of YELLOW PAGES LIMITED (Pe...,By how much did the total revenue for Petra Di...,
4,The basic income per share in Q2 of 2021 of YE...,What was the total revenue for Petra Diamonds ...,


In [4]:
# Tracing setup, taken from https://haystack.deepset.ai/integrations/arize-phoenix
from openinference.instrumentation.haystack import HaystackInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

# The URL to your Phoenix instance
endpoint = "http://localhost:6006/v1/traces"

# Build the chain one piece at a time: exporter -> span processor -> tracer provider.
span_exporter = OTLPSpanExporter(endpoint)
span_processor = SimpleSpanProcessor(span_exporter)
tracer_provider = trace_sdk.TracerProvider()
tracer_provider.add_span_processor(span_processor)

# Instrument Haystack with the tracer provider configured above.
HaystackInstrumentor().instrument(tracer_provider=tracer_provider)

In [7]:
# Convert the configured PDFs into Haystack documents
from haystack.components.converters.pypdf import PyPDFToDocument
from datetime import datetime

# The PDFs are read from the "eval" subfolder of the data directory.
pdf_paths = []
for file_name in files:
    pdf_paths.append(os.path.join(data_path, "eval", file_name))

converter = PyPDFToDocument()
results = converter.run(
    sources=pdf_paths,
    # Record when the documents were added, as an ISO-8601 timestamp.
    meta={"date_added": datetime.now().isoformat()},
)
documents = results["documents"]
type(documents[0])

In [14]:
# Split the converted documents into a hierarchy of word-based blocks
from haystack import Document
from haystack_experimental.components.splitters import HierarchicalDocumentSplitter

# NOTE(review): block sizes of 3 and 2 words look like placeholder values for full PDF text — confirm.
splitter = HierarchicalDocumentSplitter(block_sizes={3, 2}, split_overlap=0, split_by="word")

# run() returns a dict rather than a list of documents.
# NOTE(review): the split documents are presumably under its "documents" key — confirm before indexing them.
documents_split = splitter.run(documents)

In [17]:
# Check which type the splitter's run() call returned.
type(documents_split)

dict

In [18]:
# Create the Chroma document store and index the converted documents
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack import Document

document_store = ChromaDocumentStore()

# NOTE(review): this writes `documents` (the converter output); `documents_split` is not stored — confirm that is intended.
document_store.write_documents(documents)

# Report how many documents the store now holds.
stored_count = document_store.count_documents()
print(stored_count)

Document 6ef58163625e88e0168d9c607793055ac16c619f28347caa1a8bef9878fbaad1 contains `meta` values of unsupported types for the keys: __parent_id, __children_ids. These items will be discarded. Supported types are: str, int, float, bool.
Add of existing embedding ID: 6ef58163625e88e0168d9c607793055ac16c619f28347caa1a8bef9878fbaad1
Insert of existing embedding ID: 6ef58163625e88e0168d9c607793055ac16c619f28347caa1a8bef9878fbaad1


1


In [20]:
from haystack import Document, Pipeline
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever

# Jinja template: lists the content of every retrieved document, then asks the question.
prompt_template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
    {{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

# Pipeline components: text retriever over the Chroma store, prompt builder, and OpenAI generator.
retriever = ChromaQueryTextRetriever(document_store)
prompt_builder = PromptBuilder(template=prompt_template)
llm = OpenAIGenerator()

# Register the components under the names used by the connections and run inputs below.
rag_pipeline = Pipeline()
for component_name, component in (
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", llm),
):
    rag_pipeline.add_component(component_name, component)

# Wire retriever -> prompt builder (its `documents` input) -> LLM.
for sender, receiver in (
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
):
    rag_pipeline.connect(sender, receiver)

# The same question is used as the retrieval query and as the prompt's question.
question = "Who lives in Paris?"
pipeline_inputs = {
    "retriever": {"query": question},
    "prompt_builder": {"question": question},
}
results = rag_pipeline.run(pipeline_inputs)

Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


In [22]:
# Display the raw output returned by the pipeline run.
results

{'llm': {'replies': ['The provided documents do not contain any information about individuals living in Paris or any specific context related to Paris. Therefore, I cannot provide an answer to the question regarding who lives in Paris based on the given content.'],
  'meta': [{'model': 'gpt-4o-mini-2024-07-18',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_tokens': 43,
     'prompt_tokens': 59488,
     'total_tokens': 59531,
     'completion_tokens_details': CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0),
     'prompt_tokens_details': PromptTokensDetails(audio_tokens=0, cached_tokens=0)}}]}}

## Evaluate results

In [None]:
import nest_asyncio

from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals

nest_asyncio.apply()  # This is needed for concurrency in notebook environments

# Model used by both evaluators. No API key is set in this cell.
# NOTE(review): presumably the OpenAI key comes from the environment loaded by load_dotenv() — confirm.
eval_model = OpenAIModel(model="gpt-4o-mini-2024-07-18")

# Define your evaluators
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_evaluator = QAEvaluator(eval_model)

# We have to make some minor changes to our dataframe to use the column names expected by our evaluators
# for `hallucination_evaluator` the input df needs to have columns 'output', 'input', 'context'
# for `qa_evaluator` the input df needs to have columns 'output', 'input', 'reference'
# Build the renamed frame with a chained assign/rename instead of mutating in place, so
# re-running the cell is safe and the original frame object is left as it was loaded.
# NOTE(review): the `response` column (renamed to `output`) looks empty in the df.head() preview —
# confirm it is populated with pipeline answers before evaluating.
df = df.assign(context=df["reference"]).rename(
    columns={"query": "input", "response": "output"}
)

# Fail early, naming the missing columns, if the frame does not match what the evaluators expect.
required_columns = ["output", "input", "context", "reference"]
missing_columns = [column for column in required_columns if column not in df.columns]
assert not missing_columns, f"evaluation dataframe is missing columns: {missing_columns}"

# Run the evaluators, each evaluator will return a dataframe with evaluation results
# We upload the evaluation results to Phoenix in the next step
hallucination_eval_df, qa_eval_df = run_evals(
    dataframe=df, evaluators=[hallucination_evaluator, qa_evaluator], provide_explanation=True
)