In [None]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [24]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

path = "data/"
loader = DirectoryLoader(path, glob="*.txt", loader_cls=TextLoader, recursive=True)
historical_docs = loader.load()
len(historical_docs)

66

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 550, chunk_overlap = 50)
split_documents = text_splitter.split_documents(historical_docs)
len(split_documents)

388

In [26]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vectorstore = Qdrant.from_documents(
    split_documents,
    embeddings,
    location=":memory:",
    collection_name="AmatolDocs"
)

In [27]:
naive_retriever = vectorstore.as_retriever(search_kwargs={"k" : 15})

In [28]:
from langchain_core.prompts import ChatPromptTemplate

RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [29]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(model="gpt-4.1-nano")

In [30]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

naive_retrieval_chain = (
    {"context": itemgetter("question") | naive_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [31]:
naive_retrieval_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

'Colonel Hawkins said that in order to get people to stay at the Amatol plant, which was located 25 miles from Atlantic City, they had to build attractive accommodations because it was very hard to get people to go out there, especially given the knowledge of the danger involved. He explained that the size of Amatol—6,000 acres—was intended as a “substantial safety zone.”'

## Best-Matching 25 (BM25) Retriever

In [33]:
from langchain_community.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(split_documents)

In [34]:
bm25_retrieval_chain = (
    {"context": itemgetter("question") | bm25_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

Let's look at the responses!

In [35]:
bm25_retrieval_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

'Colonel Hawkins stated that Camp Dix was not selected because the commanding officer there did not want it nearby. He also explained that the town of Amatol was designed to be attractive and served as a safety zone, covering 6,000 acres, to ensure safety for workers and residents. Additionally, he mentioned that Amatol was conveniently located between Atlantic City and Philadelphia, and that its size was intended to provide a substantial safety zone.'

## Contextual Compression (Using Reranking)

In [68]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

compressor = CohereRerank(model="rerank-v3.5")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=naive_retriever, top_k=12
)

In [69]:
contextual_compression_retrieval_chain = (
    {"context": itemgetter("question") | compression_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [70]:
contextual_compression_retrieval_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

'Colonel Hawkins said that at the loading plant at Amatol, 25 miles from Atlantic City, it was very difficult to get people to go out there because of the danger associated with the site. They had to build attractive accommodations to encourage workers to stay, especially since the knowledge of the danger made it hard to recruit and retain staff. He also mentioned that after the plant in Morgan, N.J., blew up on October 4th, they faced all sorts of trouble in getting people to work at Amatol from that time on.'

## Multi-Query Retriever

In [39]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=naive_retriever, llm=chat_model
)

In [40]:
multi_query_retrieval_chain = (
    {"context": itemgetter("question") | multi_query_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [42]:
multi_query_retrieval_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

'Colonel Hawkins said that the town of Amatol was built as a safety measure, covering about 350 acres, and was intended to accommodate approximately 20,000 people. He explained that they constructed the town with houses and amenities such as movie theaters, sewerage, waterworks, and lights. The purpose of building such a town was to support the shell-loading plant and its workers, providing a planned community close to the plant. However, he also mentioned that the town of Amatol was short-lived; after World War I ended, it became a vacant town with most buildings dismantled and little left today.'

In [43]:
from langchain_core.callbacks import CallbackManagerForRetrieverRun

# You must pass a dummy run manager
run_manager = CallbackManagerForRetrieverRun.get_noop_manager()

query = "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"
reformulated_queries = multi_query_retriever.generate_queries(query, run_manager)

print("🔁 Reformulated Queries:")
for q in reformulated_queries:
    print("-", q)

🔁 Reformulated Queries:
- 1. What are Colonel Hawkins' remarks regarding the town of Amatol, NJ, and its associated plant?  
- 2. Can you provide information on Colonel Hawkins' comments about Amatol, NJ, and its industrial facilities?  
- 3. What insights did Colonel Hawkins share concerning the town of Amatol and the Amatol plant in New Jersey?


#Semantic Retriever

In [44]:
from langchain_experimental.text_splitter import SemanticChunker

semantic_chunker = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile"
)

semantic_documents = semantic_chunker.split_documents(historical_docs)

semantic_vectorstore = Qdrant.from_documents(
    semantic_documents,
    embeddings,
    location=":memory:",
    collection_name="AmatolDocsSemantic"
)

semantic_retriever = semantic_vectorstore.as_retriever(search_kwargs={"k" : 10})

semantic_retrieval_chain = (
    {"context": itemgetter("question") | semantic_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

semantic_retrieval_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

'Colonel Hawkins said that the plant at Hammonton, which is called Amatol, was situated between Atlantic City and Philadelphia to be conveniently located, and that the size of Amatol—6,000 acres—was intended as a "substantial safety zone." Additionally, he mentioned that "We had to build rather attractive accommodations at Amatol. We had to make it just as attractive as we could in order to get people to stay there; particularly in view of the knowledge of most people of the danger."'

## Ensemble Retriever

In [45]:
from langchain.retrievers import EnsembleRetriever

retriever_list = [bm25_retriever, naive_retriever, semantic_retriever, multi_query_retriever]
equal_weighting = [1/len(retriever_list)] * len(retriever_list)

ensemble_retriever = EnsembleRetriever(
    retrievers=retriever_list, weights=equal_weighting
)

In [46]:
ensemble_retrieval_chain = (
    {"context": itemgetter("question") | ensemble_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [47]:
ensemble_retrieval_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

'Colonel Hawkins said that he had to build attractive accommodations at Amatol to get people to stay there, especially considering that most people were aware of the dangers associated with the plant. He mentioned that the size of Amatol—6,000 acres—was intended as a “substantial safety zone” to protect the workers and the surrounding area.'

# Ensemble + Cohere

In [48]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

compressor = CohereRerank(model="rerank-v3.5")

compressed_ensemble_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ensemble_retriever,
    top_k=15
)

ensemble_compressed_chain = (
    {"context": itemgetter("question") | compressed_ensemble_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)


In [49]:
ensemble_compressed_chain.invoke({"question" : "What did Colonel Hawkins say about the town and plant of Amatol, NJ?"})["response"].content

"Colonel Hawkins said that the town and plant of Amatol, NJ, were constructed rapidly, within nine months, to serve as a shell-loading munitions plant and workers' village during World War I. He described the town as a planned community that was made attractive in order to encourage workers to stay, despite the awareness of the dangers involved. Hawkins also mentioned that the size of Amatol, covering 6,000 acres, was intended as a substantial safety zone. Additionally, he noted that there were difficulties in attracting workers to the site due to safety concerns, and efforts had to be made to make accommodations appealing enough to draw and retain personnel."

# RAGAS

In [50]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Session 11 - Retrieval Pipelines"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangSmith API Key: ")

In [51]:
import random
import pandas as pd
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import (
    SingleHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
)
from langchain.callbacks import get_openai_callback

# Reproducibility
random.seed(42)

# LLM + embedding setup
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano", temperature=0.7))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

# Query types
query_distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 0.5),
    (MultiHopAbstractQuerySynthesizer(llm=generator_llm), 0.25),
    (MultiHopSpecificQuerySynthesizer(llm=generator_llm), 0.25),
]

# Generator init
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)

# Generate
print("Generating synthetic dataset...")
with get_openai_callback() as cb:
    dataset = generator.generate_with_langchain_docs(
        historical_docs,
        testset_size=10,
        query_distribution=query_distribution,
    )
    print(f"💰 Tokens used: {cb.total_tokens} | Cost: ${cb.total_cost:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


Generating synthetic dataset...


Applying HeadlineSplitter:   0%|          | 0/66 [00:00<?, ?it/s]           unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to ap

💰 Tokens used: 167482 | Cost: $0.0222


In [52]:
import pandas as pd

df = dataset.to_pandas()
pd.set_option('display.max_colwidth', 300)
display(df)

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Who was JAMES BLAINE WALKER and what was his role in relation to rapid wartime industrial mobilization?,"[INTRODUCTION BY JAMES BLAINE WALKER Secretary Public Service Commission, 1st Dist. New York; Author of ""Fifty Years of Rapid Transit,"" etc. FOR MANY YEARS travelers on the Pennsylvania Railroad, between Philadelphia and Atlantic City, have viewed from the car windows apparently interminable str...","JAMES BLAINE WALKER was the Secretary of the Public Service Commission for the 1st District of New York and the author of ""Fifty Years of Rapid Transit.""",single_hop_specifc_query_synthesizer
1,Who was Mr. Hammel and what role did he play in the context of wartime industrial development during World War I?,"[In the following pages Mr. Hammel, the Company's Chief Engineer, and a Member of the American Society of Civil Engineers, has creditably presented an all-too-brief history of this great undertaking, which includes an interesting description of the plant and the town, their construction, operati...","Mr. Hammel was the Company's Chief Engineer and a Member of the American Society of Civil Engineers. He presented a history of a significant wartime industrial undertaking, including details about the plant, the town, their construction, operation, capacity, and the organization that made them p...",single_hop_specifc_query_synthesizer
2,Y. M. C. A. what is it?,"[with drying apparatus, as well as a diet kitchen, infirmary and shower. It would, indeed, be difficult to design a building better suited to its purposes, comfortable in its construction and within its walls all the accessories for the proper housing of its occupants. This building, affording l...","The Y. M. C. A. is a projected group of buildings, some completed, others moving towards completion, that includes a swimming pool, gymnasium with stage, bowling alley, billiard rooms and refreshment and lounging rooms.",single_hop_specifc_query_synthesizer
3,What does the history say about American labor and their living conditions?,"[Closing Reflection The history of this community has again demonstrated that American labor wants not merely to exist, but to live; that upon living conditions is dependent enthusiasm for work; that to house workmen in buildings that are not both attractive and comfortable is a fundamental error.]","The history of this community has demonstrated that American labor wants not merely to exist, but to live; that upon living conditions is dependent enthusiasm for work; that to house workmen in buildings that are not both attractive and comfortable is a fundamental error.",single_hop_specifc_query_synthesizer
4,"So like when US declared war on Germany in 1917, they had to make lots of munitions and stuff, right, and that was a big deal for Germany too, right?","[SCOPE OF THE WORK In April, 1917, when the United States declared war on Germany, and began the work of enlarging the Army and Navy and providing for the needed supply of food, clothing and munitions, its greatest task was to provide for the manufacture of munitions. The needed men were here; i...","The context describes that in April 1917, when the United States declared war on Germany, its greatest task was to provide for the manufacture of munitions, including building new factories and transforming peacetime works into gun and shell producers. The focus was on enlarging the Army and Nav...",single_hop_specifc_query_synthesizer
5,How did the company town and community development of Amatol relate to its town planning and the safety measures implemented during its construction?,"[<1-hop>\n\nAmatol: A former World War I munitions factory, located in Mullica Township, NJ The Book Shell Loading at Amatol, N.J. (150 scanned pages) is a rare, oversize book that documented the building of a bomb making facility at the tail end of World War I. Amatol was an extensive and attra...","Amatol was developed as an extensive 'company town' to support the ordnance facility, with careful town planning influenced by early 20th-century principles of planned communities and English Garden Cities. The town was designed to be attractive and stimulating for workers, incorporating element...",multi_hop_abstract_query_synthesizer
6,How did the wartime industrial production at amatol relate to the us involvement in WW1 and how did the town planning support the war effort?,"[<1-hop>\n\nAmatol: A former World War I munitions factory, located in Mullica Township, NJ The Book Shell Loading at Amatol, N.J. (150 scanned pages) is a rare, oversize book that documented the building of a bomb making facility at the tail end of World War I. Amatol was an extensive and attra...","The industrial production at Amatol was directly connected to the US involvement in WW1, as the plant was constructed rapidly starting in March 1918 to support the war effort by loading shells and ordnance for American troops in France. The town built to support the plant was a planned community...",multi_hop_abstract_query_synthesizer
7,How did the co-ordination of labor and machinery for efficient production contribute to the construction and operation of the shell loading plant during WW1?,"[<1-hop>\n\nINTRODUCTION BY JAMES BLAINE WALKER Secretary Public Service Commission, 1st Dist. New York; Author of ""Fifty Years of Rapid Transit,"" etc. FOR MANY YEARS travelers on the Pennsylvania Railroad, between Philadelphia and Atlantic City, have viewed from the car windows apparently inter...","The co-ordination of labor and machinery was essential in the rapid construction and operation of the shell loading plant in Amatol, New Jersey. The organization combined expert personnel, such as engineers and administrative leaders, with innovative methods like the power belt conveyor system, ...",multi_hop_abstract_query_synthesizer
8,Wht is the relashunship between New York City and the amatoal shell plant in New York?,"[<1-hop>\n\nINTRODUCTION BY JAMES BLAINE WALKER Secretary Public Service Commission, 1st Dist. New York; Author of ""Fifty Years of Rapid Transit,"" etc. FOR MANY YEARS travelers on the Pennsylvania Railroad, between Philadelphia and Atlantic City, have viewed from the car windows apparently inter...","The context does not mention a shell plant in New York City, but it describes how a large tract of land in New Jersey near Hammonton was transformed into a shell-loading plant during wartime, with significant involvement from New York-based companies and organizations. The connection is that the...",multi_hop_specific_query_synthesizer
9,"How did the development and subsequent disappearance of the Amatol munitions factory in New Jersey reflect the United States' wartime efforts during World War I, and what lasting impacts did this have on the local community and historical documentation?","[<1-hop>\n\nSCOPE OF THE WORK In April, 1917, when the United States declared war on Germany, and began the work of enlarging the Army and Navy and providing for the needed supply of food, clothing and munitions, its greatest task was to provide for the manufacture of munitions. The needed men w...","The Amatol munitions factory in Mullica Township, New Jersey, was built as part of the United States' extensive wartime effort following the declaration of war on Germany in April 1917, which required the rapid expansion of munitions manufacturing. Construction began in March 1918, and the facto...",multi_hop_specific_query_synthesizer


In [53]:
# Save the test dataset to avoid regenerating it
dataset.to_pandas().to_csv("loan_test_dataset.csv", index=False)
print("Test dataset saved to loan_test_dataset.csv")


Test dataset saved to loan_test_dataset.csv


In [56]:
import os
import time
import copy
import pandas as pd
import json
import numpy as np
from ragas.evaluation import EvaluationDataset
from ragas import evaluate, RunConfig
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback

def evaluation_result_to_dict(result):
    """
    Convert EvaluationResult.scores (list of per-example dicts) to average metric values.
    """
    scores_list = result.scores
    if not scores_list or not isinstance(scores_list, list):
        return {}

    aggregated = {}
    for key in scores_list[0].keys():
        values = [s[key] for s in scores_list if s[key] is not None and not (isinstance(s[key], float) and np.isnan(s[key]))]
        if values:
            aggregated[key] = round(float(np.mean(values)), 4)

    return aggregated


def evaluate_retriever(name: str, pipeline, dataset):
    print(f"📊 Running evaluation for: {name}")

    test_dataset = copy.deepcopy(dataset)

    latencies = []
    total_tokens = 0
    total_cost = 0
    prompt_number = 1
    prompt_logs = []

    overall_start_time = time.time()

    for test_row in test_dataset:
        question = test_row.eval_sample.user_input
        start_time = time.time()
        with get_openai_callback() as cb:
            response = pipeline.invoke({"question": question})
        
        latency = time.time() - start_time
        latencies.append(latency)
        total_tokens += cb.total_tokens
        total_cost += cb.total_cost

        test_row.eval_sample.response = str(response["response"])
        test_row.eval_sample.retrieved_contexts = [doc.page_content for doc in response["context"]]

        token_usage = {
            "completion_tokens": response["response"].response_metadata['token_usage']["completion_tokens"],
            "prompt_tokens": response["response"].response_metadata['token_usage']["prompt_tokens"],
            "total_tokens": response["response"].response_metadata['token_usage']["total_tokens"]
        }

        model_name = response["response"].response_metadata['model_name']

        prompt_logs.append({
            "prompt_number": prompt_number,
            "question": question,
            "response": response["response"].content,
            "token_usage": token_usage,
            "model_name": model_name,
            "retrieved_contexts": [doc.page_content for doc in response["context"]],
            "retrieved_metadata": [doc.metadata for doc in response["context"]],
            "latency": round(latency, 2),
            "tokens": cb.total_tokens,
            "cost": round(cb.total_cost, 6),
            "timestamp": time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))
        })

        print(f"Prompt {prompt_number} | {response["response"].content} |⏱️ {latency:.2f}s | Tokens {cb.total_tokens} | Cost {cb.total_cost:.6f}")
        prompt_number += 1

    total_duration = time.time() - overall_start_time
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_tokens_per_query = total_tokens / len(test_dataset)

    print(f"\n📊 Summary for {name}:")
    print(f"🔢 Total tokens: {total_tokens}")
    print(f"⏱️ Avg latency: {avg_latency:.2f}s")
    print(f"🔢 Avg tokens per query: {avg_tokens_per_query:.2f}")
    print(f"⏱️ Total duration: {total_duration:.2f}s")
    print(f"💰 Total cost: {total_cost:.6f}")

    evaluator_model = "gpt-4.1-nano"
    evaluation_dataset = EvaluationDataset.from_pandas(test_dataset.to_pandas())
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=evaluator_model, temperature=0))

    with get_openai_callback() as cb:
        result = evaluate(
            experiment_name=name,
            dataset=evaluation_dataset,
            metrics=[
                LLMContextRecall(),
                Faithfulness(),
                # FactualCorrectness(),
                ResponseRelevancy(),
                ContextEntityRecall(),
                # NoiseSensitivity(),
            ],
            llm=evaluator_llm,
            run_config=RunConfig(timeout=300)
        )

    evaluation_cost = cb.total_cost
    evaluation_tokens = cb.total_tokens

    print(f"Evaluation LLM model: {evaluator_model}")
    print('Evaluation results:', result)
    print('Evaluation tokens:', evaluation_tokens)
    print(f'Evaluation cost: {evaluation_cost:.6f}')

    evaluation_result_dict = evaluation_result_to_dict(result)

    output_dir = "historical_docs_retriever_logs"
    os.makedirs(output_dir, exist_ok=True)
    log_filename = f'{name.lower().replace(" ", "_")}_logs.json'
    output_path = os.path.join(output_dir, log_filename)

    full_output = {
        "name": name,
        "summary": {
            "total_runtime": round(total_duration, 2),
            "avg_latency": round(avg_latency, 2),
            "total_tokens": int(total_tokens),
            "total_queries": len(test_dataset),
            "total_cost": round(total_cost, 6),
            "avg_tokens_per_query": round(avg_tokens_per_query, 2),
        },
        "latencies": [round(lat, 2) for lat in latencies],
        "evaluation_result": evaluation_result_dict,
        "evaluation_cost": {
            "eval_model": evaluator_model,
            "eval_tokens": evaluation_tokens,
            "eval_cost": round(evaluation_cost, 6),
        },
        "prompt_logs_path": output_path,
        "prompt_logs": prompt_logs,
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(full_output, f, indent=2, ensure_ascii=False)

    print(f"✅ Full result saved to: {output_path}")

    del full_output['prompt_logs']
    return full_output

In [57]:
from ragas.testset import Testset

subset = Testset(list(dataset)[:3])
evaluate_retriever("naive_retriever_test", naive_retrieval_chain, subset)


📊 Running evaluation for: naive_retriever_test
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York and an author, notably of "Fifty Years of Rapid Transit." In relation to rapid wartime industrial mobilization, he appears to have played a role as an example of an organized, official figure involved in infrastructure and public services during the period of increased industrial activity, such as in the context of war efforts. However, based on the provided context, there is no specific information indicating that he directly led or organized the wartime industrial mobilization efforts. |⏱️ 2.96s | Tokens 2692 | Cost 0.000303
Prompt 2 | Mr. Hammel was the company's Chief Engineer and a member of the American Society of Civil Engineers. In the context of wartime industrial development during World War I, he played a significant role in documenting and presenting the history of the construction and operation of the shell-loadin

Evaluating: 100%|██████████| 12/12 [00:11<00:00,  1.06it/s]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 1.0000, 'faithfulness': 0.9333, 'answer_relevancy': 0.6237, 'context_entity_recall': 0.4583}
Evaluation tokens: 35048
Evaluation cost: 0.004515
✅ Full result saved to: historical_docs_retriever_logs/naive_retriever_test_logs.json


{'name': 'naive_retriever_test',
 'summary': {'total_runtime': 6.93,
  'avg_latency': 2.31,
  'total_tokens': 7810,
  'total_queries': 3,
  'total_cost': 0.000872,
  'avg_tokens_per_query': 2603.33},
 'latencies': [2.96, 2.03, 1.94],
 'evaluation_result': {'context_recall': 1.0,
  'faithfulness': 0.9333,
  'answer_relevancy': 0.6237,
  'context_entity_recall': 0.4583},
 'evaluation_cost': {'eval_model': 'gpt-4.1-nano',
  'eval_tokens': 35048,
  'eval_cost': 0.004515},
 'prompt_logs_path': 'historical_docs_retriever_logs/naive_retriever_test_logs.json'}

In [166]:
# def run_all_retriever_evaluations(pipelines: dict, dataset):
#     """
#     Runs evaluation across multiple retriever pipelines.

#     Args:
#         pipelines (dict): Dictionary where key is a pipeline name and value is the retriever chain.
#         dataset: Ragas synthetic test set.

#     Returns:
#         dict: A dictionary of results per pipeline.
#     """
#     results = {}
#     for name, pipeline in pipelines.items():
#         print(f"\n🚀 Starting evaluation for: {name}")
#         results[name] = evaluate_retriever(name, pipeline, dataset)
#     return results


In [58]:
naive_evaluation = evaluate_retriever("Naive", naive_retrieval_chain, dataset)

📊 Running evaluation for: Naive
Prompt 1 | James Blaine Walker was a public service official who served as the Secretary of the Public Service Commission for the 1st District of New York. He is also noted as the author of "Fifty Years of Rapid Transit" and other works. 

In relation to rapid wartime industrial mobilization, based on the provided context, his role appears to be primarily in a leadership or administrative capacity within the public service sector, overseeing or facilitating industrial efforts during wartime. However, the specific details of his direct involvement in wartime industrial mobilization are not explicitly mentioned in the provided documents.

If you are seeking detailed information about his exact role in wartime mobilization efforts, I don't have enough information from the given context to provide a precise answer. |⏱️ 4.28s | Tokens 2729 | Cost 0.000135
Prompt 2 | Mr. Hammel was the company's Chief Engineer and a member of the American Society of Civil Engi

Evaluating: 100%|██████████| 44/44 [00:35<00:00,  1.24it/s]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 0.9848, 'faithfulness': 0.9109, 'answer_relevancy': 0.8404, 'context_entity_recall': 0.3707}
Evaluation tokens: 142369
Evaluation cost: 0.019528
✅ Full result saved to: historical_docs_retriever_logs/naive_logs.json


In [59]:
bm25_evaluation = evaluate_retriever("BM25", bm25_retrieval_chain, dataset)

📊 Running evaluation for: BM25
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York and an author known for works such as "Fifty Years of Rapid Transit." Based on the provided context, there is no direct information linking him specifically to the role of rapid wartime industrial mobilization or detailing his contributions in that area. The excerpts focus more on industrial organization and the rapid construction of military-related facilities, such as shell loading plants and associated towns, rather than on Walker's personal involvement in mobilization efforts.

Therefore, I do not have enough information to state what his role was in relation to rapid wartime industrial mobilization. |⏱️ 1.45s | Tokens 585 | Cost 0.000098
Prompt 2 | Based on the provided context, there is no information about a person named Mr. Hammel or his role in wartime industrial development during World War I. Therefore, I do not know who Mr. Hammel

Evaluating: 100%|██████████| 44/44 [00:28<00:00,  1.52it/s]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 0.9394, 'faithfulness': 0.9318, 'answer_relevancy': 0.7560, 'context_entity_recall': 0.2320}
Evaluation tokens: 99736
Evaluation cost: 0.014843
✅ Full result saved to: historical_docs_retriever_logs/bm25_logs.json


In [60]:
multi_query_evaluation = evaluate_retriever("Multi-Query", multi_query_retrieval_chain, dataset)

📊 Running evaluation for: Multi-Query
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York. He is also noted as the author of "Fifty Years of Rapid Transit." However, based on the context provided, there is no specific information indicating his direct role in relation to rapid wartime industrial mobilization. |⏱️ 3.83s | Tokens 3815 | Cost 0.000427
Prompt 2 | Mr. Hammel was the Company's Chief Engineer and a member of the American Society of Civil Engineers. During World War I, he played a significant role in presenting a history of a major wartime industrial undertaking, specifically the construction and operation of an all-encompassing munitions plant near Hammonton, New Jersey, known as Amatol. His contributions included providing a creditable overview of the plant, the town, their rapid construction, capacity, and the organization that made the project possible. His leadership and engineering expertise were instrumental

Evaluating:  98%|█████████▊| 43/44 [00:23<00:01,  1.99s/it]Exception raised in Job[39]: TimeoutError()
Evaluating: 100%|██████████| 44/44 [05:07<00:00,  6.99s/it]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 1.0000, 'faithfulness': 0.9343, 'answer_relevancy': 0.8293, 'context_entity_recall': 0.3242}
Evaluation tokens: 160050
Evaluation cost: 0.021490
✅ Full result saved to: historical_docs_retriever_logs/multi-query_logs.json


In [61]:
contextual_compression_evaluation = evaluate_retriever("Contextual Compression", contextual_compression_retrieval_chain, dataset)

📊 Running evaluation for: Contextual Compression
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York. He was also the author of "Fifty Years of Rapid Transit" and played a role related to public service and transportation. 

Based on the provided context, there is no direct information indicating that he was involved specifically in the rapid wartime industrial mobilization effort during World War I. The context discusses the general needs of mobilization, such as building factories and stimulating production, but does not explicitly connect James Blaine Walker to these activities.

Therefore, I do not know his specific role in relation to rapid wartime industrial mobilization. |⏱️ 3.37s | Tokens 632 | Cost 0.000102
Prompt 2 | Mr. Victor F. Hammel was the Chief Engineer of the company involved in the construction and operation of the shell-loading plant and the town of Amatol, New Jersey, during World War I. His role includ

Evaluating: 100%|██████████| 44/44 [00:37<00:00,  1.17it/s]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 0.9697, 'faithfulness': 0.8788, 'answer_relevancy': 0.7561, 'context_entity_recall': 0.3677}
Evaluation tokens: 95553
Evaluation cost: 0.014207
✅ Full result saved to: historical_docs_retriever_logs/contextual_compression_logs.json


In [62]:
semantic_evaluation = evaluate_retriever("Semantic", semantic_retrieval_chain, dataset)

📊 Running evaluation for: Semantic
Prompt 1 | JAMES BLAINE WALKER was not mentioned in the provided context, and there is no information indicating his identity or specific role related to rapid wartime industrial mobilization. Therefore, I do not know who he was or what his role might have been in that context. |⏱️ 1.35s | Tokens 4021 | Cost 0.000419
Prompt 2 | Mr. Hammel was the Chief Engineer of the Atlantic Loading Company during World War I. In the context of wartime industrial development, he played a significant role in the construction and organization of one of the largest shell-loading munitions plants in the United States, located at Amatol, New Jersey. He is credited with providing a detailed history of this major industrial undertaking, which included describing the plant, the town, and the efficient organization that made rapid and successful production possible during the war effort. His contributions exemplify the critical role of engineering leadership in rapidly devel

Evaluating:  95%|█████████▌| 42/44 [00:33<00:05,  2.71s/it]Exception raised in Job[15]: TimeoutError()
Evaluating:  98%|█████████▊| 43/44 [05:00<01:20, 80.41s/it]Exception raised in Job[35]: TimeoutError()
Evaluating: 100%|██████████| 44/44 [05:08<00:00,  7.01s/it]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 0.9091, 'faithfulness': 0.9545, 'answer_relevancy': 0.8457, 'context_entity_recall': 0.2980}
Evaluation tokens: 212764
Evaluation cost: 0.026711
✅ Full result saved to: historical_docs_retriever_logs/semantic_logs.json


In [63]:
ensemble_evaluation = evaluate_retriever("Ensemble", ensemble_retrieval_chain, dataset)

📊 Running evaluation for: Ensemble
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York and an author known for works like "Fifty Years of Rapid Transit." In relation to rapid wartime industrial mobilization, his role was primarily as an official overseeing public services, which likely included coordinating and facilitating efficient industrial and infrastructural efforts necessary for rapid production and mobilization during wartime. |⏱️ 7.02s | Tokens 8338 | Cost 0.000883
Prompt 2 | Mr. Hammel was the Chief Engineer of the Atlantic Loading Company during World War I. In the context of wartime industrial development, he played a key role in overseeing the construction and operation of the large-scale shell-loading plant and the associated workers' town at Amatol, New Jersey. He provided a credited and competent presentation of the plant's history, including its construction, organization, capacity, and technical details. H

Evaluating:  95%|█████████▌| 42/44 [00:53<00:10,  5.48s/it]Exception raised in Job[15]: TimeoutError()
Evaluating:  98%|█████████▊| 43/44 [05:00<01:16, 76.74s/it]Exception raised in Job[31]: TimeoutError()
Evaluating: 100%|██████████| 44/44 [05:09<00:00,  7.04s/it]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 1.0000, 'faithfulness': 0.9740, 'answer_relevancy': 0.9333, 'context_entity_recall': 0.3213}
Evaluation tokens: 291997
Evaluation cost: 0.035488
✅ Full result saved to: historical_docs_retriever_logs/ensemble_logs.json


In [64]:
ensemble_compressed_evaluation = evaluate_retriever("EnsembleCompressed", ensemble_compressed_chain, dataset)

📊 Running evaluation for: EnsembleCompressed
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York. He was also an author, known for writing "Fifty Years of Rapid Transit," among other works. 

Based on the provided context, there is no specific information indicating that James Blaine Walker played a direct role in rapid wartime industrial mobilization. The context discusses the industrial efforts during wartime, such as the construction of shell loading plants and the rapid development of factories, but it does not link these activities to Walker himself. 

Therefore, I do not know if James Blaine Walker had any direct involvement in the wartime industrial mobilization efforts. |⏱️ 6.50s | Tokens 1025 | Cost 0.000169
Prompt 2 | Mr. Hammel was the Chief Engineer of the company responsible for the construction and operation of the shell loading plant at Amatol, New Jersey, during World War I. He was a member of the American S

Evaluating: 100%|██████████| 44/44 [00:37<00:00,  1.18it/s]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 0.9848, 'faithfulness': 0.9221, 'answer_relevancy': 0.7592, 'context_entity_recall': 0.3523}
Evaluation tokens: 129435
Evaluation cost: 0.018441
✅ Full result saved to: historical_docs_retriever_logs/ensemblecompressed_logs.json


In [65]:
all_results = {
    "Naive": naive_evaluation,
    "BM25": bm25_evaluation,
    "Multi-Query": multi_query_evaluation,
    "Contextual Compression": contextual_compression_evaluation,
    "Semantic": semantic_evaluation,
    "Ensemble": ensemble_evaluation,
    "EnsembleCompressed": ensemble_compressed_evaluation
}


In [66]:
import pandas as pd
from IPython.display import display

def display_summary_table(results: dict):
    """
    Displays a summary table comparing retriever pipelines with averaged RAGAS metrics.
    
    Args:
        results (dict): A dictionary of evaluation results from evaluate_retriever().
    """
    summary_data = []

    for _, result in results.items():
        eval_scores = result["evaluation_result"]

        row = {
            "Pipeline": result["name"],
            **result["summary"],
            "ContextRecall": eval_scores.get("context_recall"),
            "Faithfulness": eval_scores.get("faithfulness"),
            # "FactualCorrectness": eval_scores.get("factual_correctness(mode=f1)"),
            "AnswerRelevancy": eval_scores.get("answer_relevancy"),
            "ContextEntityRecall": eval_scores.get("context_entity_recall"),
            # "NoiseSensitivity": eval_scores.get("noise_sensitivity(mode=relevant)"),
        }

        summary_data.append(row)

    df = pd.DataFrame(summary_data)
    display(df.sort_values(by="ContextRecall", ascending=False).reset_index(drop=True))


In [71]:
contextual_compression_evaluation = evaluate_retriever("Contextual Compression Rerun", contextual_compression_retrieval_chain, dataset)

📊 Running evaluation for: Contextual Compression Rerun
Prompt 1 | James Blaine Walker was the Secretary of the Public Service Commission for the 1st District of New York and the author of "Fifty Years of Rapid Transit." While the provided context does not explicitly detail his direct role in wartime industrial mobilization during World War I, it mentions his introduction to a related subject. The context highlights the challenges faced by the United States in rapidly increasing the production of munitions, food, clothing, and other war supplies after entering the war in 1917. 

Given his position and expertise, it is likely that James Blaine Walker contributed to or was involved in efforts related to managing or coordinating public service and transportation infrastructure, which are critical components of mobilization during wartime. However, based solely on the provided information, I cannot specify his exact role in relation to rapid wartime industrial mobilization.

If more detaile

Evaluating: 100%|██████████| 44/44 [00:30<00:00,  1.47it/s]


Evaluation LLM model: gpt-4.1-nano
Evaluation results: {'context_recall': 1.0000, 'faithfulness': 0.8874, 'answer_relevancy': 0.8484, 'context_entity_recall': 0.4032}
Evaluation tokens: 95342
Evaluation cost: 0.014162
✅ Full result saved to: historical_docs_retriever_logs/contextual_compression_rerun_logs.json


In [72]:
display_summary_table(all_results)


Unnamed: 0,Pipeline,total_runtime,avg_latency,total_tokens,total_queries,total_cost,avg_tokens_per_query,ContextRecall,Faithfulness,AnswerRelevancy,ContextEntityRecall
0,Multi-Query,49.15,4.47,44605,11,0.005378,4055.0,1.0,0.9343,0.8293,0.3242
1,Ensemble,78.74,7.16,97422,11,0.01078,8856.55,1.0,0.974,0.9333,0.3213
2,Naive,38.57,3.51,29161,11,0.003235,2651.0,0.9848,0.9109,0.8404,0.3707
3,EnsembleCompressed,68.46,6.22,20061,11,0.002938,1823.73,0.9848,0.9221,0.7592,0.3523
4,Contextual Compression,27.01,2.45,8046,11,0.001321,731.45,0.9697,0.8788,0.7561,0.3677
5,BM25,23.96,2.18,8044,11,0.001321,731.27,0.9394,0.9318,0.756,0.232
6,Semantic,30.0,2.73,53330,11,0.006005,4848.18,0.9091,0.9545,0.8457,0.298


| Pipeline               | Total Runtime | Avg Latency | Total Tokens | Total Queries | Total Cost | Avg Tokens/Query | ContextRecall                         | Faithfulness                         | AnswerRelevancy                      | ContextEntityRecall                  |
|------------------------|---------------|-------------|---------------|----------------|-------------|--------------------|----------------------------------------|--------------------------------------|--------------------------------------|--------------------------------------|
| Naive                  | 38.57         | 3.51        | 29161         | 11             | 0.003235    | 2651.00            | 0.9848                                 | 0.9109                               | 0.8404                               | <span style="background-color:#c8e6c9">0.3707</span> |
| BM25                   | 23.96         | 2.18        | <span style="background-color:#c8e6c9">8044</span>          | 11             | <span style="background-color:#c8e6c9">0.001321</span>    | <span style="background-color:#c8e6c9">731.27</span>            | 0.9394                                 | 0.9318                               | 0.7560                               | ❌ <span style="background-color:#ffcdd2">0.2320</span> |
| Multi-Query            | <span style="background-color:#c8e6c9">49.15</span>         | <span style="background-color:#c8e6c9">4.47</span>        | 44605         | 11             | 0.005378    | 4055.00            | <span style="background-color:#c8e6c9">1.0000</span>              | 0.9343                               | 0.8293                               | 0.3242                               |
| Contextual Compression | <span style="background-color:#c8e6c9">27.01</span>         | <span style="background-color:#c8e6c9">2.45</span>        | 8046          | 11             | <span style="background-color:#c8e6c9">0.001321</span>    | <span style="background-color:#c8e6c9">731.45</span>            | 0.9697                                 | 0.8788                               | 0.7561                               | 0.3677                               |
| Semantic               | 30.00         | 2.73        | 53330         | 11             | 0.006005    | 4848.18            | ❌ <span style="background-color:#ffcdd2">0.9091</span>              | 0.9545                               | 0.8457                               | 0.2980                               |
| Ensemble               | ❌ <span style="background-color:#ffcdd2">78.74</span>         | ❌ <span style="background-color:#ffcdd2">7.16</span>        | ❌ <span style="background-color:#ffcdd2">97422</span>         | 11             | ❌ <span style="background-color:#ffcdd2">0.010780</span>    | ❌ <span style="background-color:#ffcdd2">8856.55</span>           | <span style="background-color:#c8e6c9">1.0000</span>              | <span style="background-color:#c8e6c9">0.9740</span>       | <span style="background-color:#c8e6c9">0.9333</span>       | 0.3213                               |
| EnsembleCompressed     | 68.46         | 6.22        | 20061         | 11             | 0.002938    | 1823.73            | 0.9848                                 | 0.9221                               | 0.7592                               | 0.3523                               |


| Pipeline               | Total Runtime | Avg Latency | Total Tokens | Total Cost | Avg Tokens/Query | ContextRecall | Faithfulness | AnswerRelevancy | ContextEntityRecall |
|------------------------|---------------|-------------|---------------|-------------|--------------------|----------------|---------------|------------------|-----------------------|
| Naive                  | 🟡 38.57       | 🟡 3.51      | 🟡 29161       | 🟡 0.003235  | 🟡 2651.00          | 🟡 0.9848      | 🟡 0.9109     | 🟢 0.8404         | 🟢 0.3707              |
| BM25                   | 🟢 23.96       | 🟢 2.18      | 🟢 8044        | 🟢 0.001321  | 🟢 731.27           | 🔴 0.9394      | 🟡 0.9318     | 🔴 0.7560         | 🔴 0.2320              |
| Multi-Query            | 🟡 49.15       | 🟡 4.47      | 🟡 44605       | 🟡 0.005378  | 🟡 4055.00          | 🟢 1.0000      | 🟡 0.9343     | 🟢 0.8293         | 🟡 0.3242              |
| Contextual Compression | 🟢 27.01       | 🟢 2.45      | 🟢 8046        | 🟢 0.001321  | 🟢 731.45           | 🟡 0.9697      | 🔴 0.8788     | 🔴 0.7561         | 🟢 0.3677              |
| Semantic               | 🟡 30.00       | 🟡 2.73      | 🔴 53330       | 🔴 0.006005  | 🔴 4848.18          | 🔴 0.9091      | 🟢 0.9545     | 🟢 0.8457         | 🔴 0.2980              |
| Ensemble               | 🔴 78.74       | 🔴 7.16      | 🔴 97422       | 🔴 0.010780  | 🔴 8856.55          | 🟢 1.0000      | 🟢 0.9740     | 🟢 0.9333         | 🟡 0.3213              |
| EnsembleCompressed     | 🔴 68.46       | 🔴 6.22      | 🟡 20061       | 🟡 0.002938  | 🟡 1823.73          | 🟡 0.9848      | 🟡 0.9221     | 🔴 0.7592         | 🟢 0.3523              |
