In [None]:
import dotenv
import os
# Load variables from a .env file into the process environment. The generator
# cell reads OPENROUTER_API_KEY from the environment, so this must run first.
dotenv.load_dotenv()

Constants

In [None]:
# Run configuration. The empty strings and the 0 appear to be placeholders that
# must be filled in before the notebook is executed.
QUESTIONS_PATH = ""  # Path to the MaScQA eval.json file
MODEL_NAME = ""  # Model identifier passed to OpenAIGenerator (requests go to the OpenRouter base URL)
VECTOR_STORE_URL = ""  # URL passed to QdrantDocumentStore
EMBEDDING_MODEL = ""  # Model name passed to SentenceTransformersTextEmbedder (used to embed each question)
EMBEDDING_DIMENSIONS = 0  # Passed as embedding_dim to the store; presumably must match EMBEDDING_MODEL's output size -- confirm
COLLECTION_NAME = ""  # Passed as `index` to QdrantDocumentStore; also embedded in the output file name
MAX_TOKENS = 1024  # "max_tokens" generation kwarg sent to the LLM
TEMPERATURE = 0.0  # "temperature" generation kwarg sent to the LLM
TOP_K = 60  # top_k given to the retriever; also embedded in the output file name

Questions

In [None]:
from src.import_questions import import_questions
# Load the evaluation questions from QUESTIONS_PATH. The result is later consumed
# through .keys()/.values() and handed to pd.concat, so it is presumably a mapping
# of topic name -> DataFrame -- confirm against src.import_questions.
questions = import_questions(QUESTIONS_PATH)

DocumentStore

In [None]:
from haystack.dataclasses.document import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

# Handle to the Qdrant collection that the retriever searches.
# All three values come from the constants cell.
document_store = QdrantDocumentStore(
    url=VECTOR_STORE_URL,
    index=COLLECTION_NAME,
    embedding_dim=EMBEDDING_DIMENSIONS,
)

Text-Embedder

In [None]:
from haystack.components.embedders import SentenceTransformersTextEmbedder

# Embeds each question text with EMBEDDING_MODEL; its `embedding` output is wired
# to the retriever in the pipeline cell. The embedder's own progress bar is
# switched off.
text_embedder = SentenceTransformersTextEmbedder(
    model=EMBEDDING_MODEL,
    progress_bar=False,
)

Retriever

In [None]:
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever

# Embedding-based retriever over the Qdrant document store, configured with
# top_k=TOP_K.
retriever = QdrantEmbeddingRetriever(
    document_store=document_store,
    top_k=TOP_K,
)

Prompt

In [None]:
from haystack.components.builders import PromptBuilder

# Jinja-style prompt template. It renders the `content` of every entry in
# `documents` (supplied by the retriever through the pipeline connection) under
# "Context:", followed by the `question` variable (supplied at run time).
template = """
Solve the following question. Write the correct answer inside a list at the end. Use the given context to answer the question if it's helpful.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

# Component that fills the template; its output is connected to the LLM.
prompt_builder = PromptBuilder(template=template)

LLM

In [None]:
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret

# OpenAI-compatible generator pointed at the OpenRouter endpoint. The API key is
# resolved from the OPENROUTER_API_KEY environment variable rather than being
# written into the notebook.
generator = OpenAIGenerator(
    model=MODEL_NAME,
    api_key=Secret.from_env_var("OPENROUTER_API_KEY"),
    api_base_url="https://openrouter.ai/api/v1",
    generation_kwargs={"max_tokens": MAX_TOKENS, "temperature": TEMPERATURE},
)

Pipeline

In [None]:
from haystack import Pipeline

basic_rag_pipeline = Pipeline()

# Register every component under the name used for wiring and for run() inputs
for component_name, component_instance in [
    ("text_embedder", text_embedder),
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", generator),
]:
    basic_rag_pipeline.add_component(component_name, component_instance)

# Wire the data flow: question embedding -> retriever -> prompt -> LLM
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
basic_rag_pipeline.connect("prompt_builder", "llm")

In [None]:
import pandas as pd

# Stack the per-topic frames held in `questions` into a single frame. The
# mapping keys become the outer index level "topic", which reset_index() then
# turns into an ordinary column.
df = (
    pd.concat(questions.values(), keys=questions.keys(), names=["topic"])
    .reset_index()
)

In [None]:
# Create the output columns with their initial values. "result" is filled by the
# processing loop; the others are presumably filled by a later analysis step
# (not visible in this notebook) -- confirm.
for column_name, initial_value in {
    "result": "",
    "filtered_result": "",
    "correct_result": "",
    "overlap": 0,
    "error_type": "",
}.items():
    df[column_name] = initial_value

In [None]:
# Number of rows that still need an answer: "result" is empty, equals "ERROR",
# or contains "<!DOCTYPE html>" (presumably an HTML page stored in place of a
# completion -- confirm). The processing cell recomputes this same value.
total_rows = len(df[(df["result"] == "") | (df["result"] == "ERROR") | (df["result"].str.contains("<!DOCTYPE html>", na=False))])

In [None]:
from tqdm import tqdm

# A row is pending when its stored result is empty, is the "ERROR" marker written
# by a failed attempt, or contains "<!DOCTYPE html>" (presumably an HTML page
# stored in place of a completion -- confirm). The mask is computed once and used
# both for the progress-bar total and for choosing the rows to process, so the
# two can never disagree. Non-string results (e.g. NaN) count as not pending
# because of na=False.
stored_results = df["result"]
pending_mask = (
    (stored_results == "")
    | (stored_results == "ERROR")
    | stored_results.str.contains("<!DOCTYPE html>", na=False)
)
pending_index = df.index[pending_mask]

# Total for the progress bar: only the rows that will actually be processed
total_rows = len(pending_index)

with tqdm(total=total_rows, desc="Processing rows", dynamic_ncols=True) as pbar:
    # Visit pending rows only; rows answered by an earlier run are left untouched,
    # so the cell can be re-run to resume or retry.
    # df.at needs unique index labels, which the earlier reset_index() provides.
    for index in pending_index:
        question = df.at[index, "questions"]

        # The same question text feeds the embedder (for retrieval) and the prompt
        try:
            result = basic_rag_pipeline.run(
                {
                    "text_embedder": {"text": question},
                    "prompt_builder": {"question": question},
                }
            )["llm"]["replies"][0]
        except Exception as e:
            # Best effort: report the failure with its row label, mark the row as
            # "ERROR" so a later re-run retries it, and carry on with the rest.
            tqdm.write(f"Row {index}: {e}")
            result = "ERROR"

        # Store the answer (or the error marker) back into the frame
        df.at[index, "result"] = result

        pbar.update()

In [None]:
from datetime import datetime
import json

# Minute-resolution timestamp that distinguishes runs with the same settings
date = datetime.now().strftime("%Y%m%d%H%M")

# Model identifiers may contain "/", which would act as a path separator in a
# file name, so it is replaced with "-".
MODEL_NAME_NO_SLASH = MODEL_NAME.replace("/", "-")
FILE_EMBEDDING_NAME = EMBEDDING_MODEL.replace("/", "-")
filename = f"results_{MODEL_NAME_NO_SLASH}_{FILE_EMBEDDING_NAME}_{COLLECTION_NAME}_top{TOP_K}_{date}"

# Run settings stored next to the results
metadata = dict(
    model=MODEL_NAME,
    embedding=EMBEDDING_MODEL,
    collection=COLLECTION_NAME,
    date=date,
    num_rows=len(df),
    top_k=TOP_K,
)

# Write the results table first, then the metadata, both into the working directory
df.to_csv(f"./{filename}.csv", index=False)

with open(f'{filename}.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file)