In [4]:
import os
import getpass
from uuid import uuid4

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API Key:")
os.environ["LANGCHAIN_PROJECT"] = f"Sprint Scribe - {uuid4().hex[0:8]}"
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta

loader = CSVLoader(
    file_path=f"../data/TF-Task.csv",
    metadata_columns=[
        "Issue key",
        "Parent key",
        "Parent summary",
    ],
)

jira_tickets = loader.load()
jira_tickets[0]

Document(metadata={'source': '../data/TF-Task.csv', 'row': 0, 'Issue key': 'TDF-156', 'Parent key': 'TDF-1', 'Parent summary': 'Initiation:Discovery'}, page_content="Summary: Tailor the customer questionnaire\nDescription: Using information obtained from the GTM and CSA teams, as well as from the client kick-off meeting, tailor the questionnaire that will be used in the client workshops.\n\nThe goals are:\n\n# Address relevant questions with answers that we already have\n# Reduce the questionnaire to the relevant questions that we don’t already have answers for\n\nThe outcome will be a focused questionnaire that can be used in the discovery workshops when we can confirm the information we already heard, and ask only the relevant remaining questions.  This will let the client know that we’ve been listening to them since the beginning, that the different teams in Infostrux are talking to one another and that we respect the client’s time by not repeating the same set of questions.\n\n(TIP

# Sintatic data generation

### TODO
1. Use the EPIC information to create cluster

In [78]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [79]:
from ragas.testset.graph import KnowledgeGraph

kg = KnowledgeGraph()
kg

KnowledgeGraph(nodes: 0, relationships: 0)

In [80]:
from ragas.testset.graph import Node, NodeType

### NOTICE: We're using a subset of the data for this example - this is to keep costs/time down.
for ticket in jira_tickets:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={
                "page_content": ticket.page_content,
                "document_metadata": ticket.metadata,
            },
        )
    )
kg

KnowledgeGraph(nodes: 140, relationships: 0)

In [81]:
from ragas.testset.transforms import default_transforms, apply_transforms

transformer_llm = generator_llm
embedding_model = generator_embeddings

default_transforms = default_transforms(
    documents=jira_tickets, llm=transformer_llm, embedding_model=embedding_model
)
apply_transforms(kg, default_transforms)
kg

Applying SummaryExtractor:   0%|          | 0/95 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/140 [00:00<?, ?it/s]

Node 919433d5-5083-4024-9bcd-ed912ad93b1b does not have a summary. Skipping filtering.
Node 607f1b7f-d15c-491b-b479-24c74d579b2e does not have a summary. Skipping filtering.
Node 0d2b149d-7f9f-4d3c-8448-673f54d584e4 does not have a summary. Skipping filtering.
Node 9e162d88-6d28-4286-b7cb-74b5f0984906 does not have a summary. Skipping filtering.
Node 187629f5-9152-4f41-8277-c3fe99402495 does not have a summary. Skipping filtering.
Node ebaeaed9-cf09-4795-84a8-8cc84ca40498 does not have a summary. Skipping filtering.
Node e1974d52-60f5-4f47-a3a7-8ebfaeafa10f does not have a summary. Skipping filtering.
Node 532ca24c-bd37-4a81-a685-b8c106dabddd does not have a summary. Skipping filtering.
Node c41b1dff-64b8-48f5-9ebc-85c0a86d6af1 does not have a summary. Skipping filtering.
Node 68126742-58e8-4a8f-b36e-c1a38af24c08 does not have a summary. Skipping filtering.
Node 0b9149a9-a888-428f-88f4-4fd407288384 does not have a summary. Skipping filtering.
Node 190bbef0-d233-46e4-b6ea-52c19a5a47d8 d

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/375 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

KnowledgeGraph(nodes: 140, relationships: 482)

In [82]:
kg.save("loan_data_kg.json")
loan_data_kg = KnowledgeGraph.load("loan_data_kg.json")
loan_data_kg

KnowledgeGraph(nodes: 140, relationships: 482)

In [84]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(
    llm=generator_llm, embedding_model=embedding_model, knowledge_graph=loan_data_kg
)

In [88]:
from ragas.testset.synthesizers import (
    default_query_distribution,
    SingleHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
)

query_distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0)
]

In [None]:
testset = generator.generate(testset_size=120, query_distribution=query_distribution)
testset.to_pandas()

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,How should the CSA team contribute to tailorin...,[Summary: Tailor the customer questionnaire\nD...,The CSA team member should be invited to help ...,single_hop_specifc_query_synthesizer
1,Wher can I find the atlassian documments relat...,[Summary: Infrastructure Workshop\nDescription...,Example questions which can be used in some ca...,single_hop_specifc_query_synthesizer
2,where i find Infostrux folder?,[Summary: Data Workshop\nDescription: As a dev...,"""Customers/<customer>/Infostrux - <customer> S...",single_hop_specifc_query_synthesizer
3,how i use terraform-snowflake-rbac-infra for m...,[Summary: CI/CD Tunnels (Optional)\nDescriptio...,You can look at the example from the RBAC pipe...,single_hop_specifc_query_synthesizer
4,How is RBAC integrated into the CI/CD pipeline...,[Summary: CI/CD Connectors (Optional)\nDescrip...,RBAC is integrated into the CI/CD pipeline for...,single_hop_specifc_query_synthesizer
5,How can Terraform be used to set up Fivetran S...,[Summary: Fivetran:Terraform SSH Tunnels (Opti...,Terraform can be used to implement Fivetran SS...,single_hop_specifc_query_synthesizer
6,Whaat is a Terrafom providr and howw can it be...,"[Summary: Fivetran:Terraform Destinations, Con...",A Terraform provider for Fivetran allows us to...,single_hop_specifc_query_synthesizer
7,What are the requirements and considerations f...,[Summary: Communicate Requirements\nDescriptio...,"For SQL Server implementations, either CDC (Ch...",single_hop_specifc_query_synthesizer
8,Howw do I set up a Fivetran destinashun in Sno...,[Summary: Fivetran Destination\nDescription: A...,"To set up a Fivetran destination in Snowflake,...",single_hop_specifc_query_synthesizer
9,How we use Airbyte if not do Fivetran setup?,[Summary: Fivetran Account\nDescription: As a ...,"Depending on sources, Fivetran setup can be sk...",single_hop_specifc_query_synthesizer


In [15]:
from langsmith import Client

client = Client()

In [None]:
dataset_name = "Sprint Scribe v1.0"

langsmith_dataset = client.create_dataset(
    dataset_name=dataset_name, description="Sprint Scribe v1.0"
)

In [91]:
for data_row in testset.to_pandas().iterrows():
    client.create_example(
        inputs={"question": data_row[1]["user_input"]},
        outputs={"answer": data_row[1]["reference"]},
        metadata={"context": data_row[1]["reference_contexts"]},
        dataset_id=langsmith_dataset.id,
    )

# Evaluating retrival

In [16]:
from langchain_community.chat_loaders.langsmith import LangSmithDatasetChatLoader

# Load the dataset from Langchain by name
dataset = client.read_dataset(dataset_name="Sprint Scribe v1.0")

In [31]:
# Load the dataset examples from LangSmith
examples = list(client.list_examples(dataset_id=dataset.id))
import pandas as pd

# Transform the list of Example objects to a pandas DataFrame
examples_df = pd.DataFrame(
    [
        {
            "user_input": ex.inputs["question"],
            "reference": ex.outputs["answer"],
            "reference_contexts": ex.metadata["context"],
        }
        for ex in examples
    ]
)
examples_df.head()

Unnamed: 0,user_input,reference,reference_contexts
0,What responsibilities does a pod manager have ...,A pod manager is responsible for cleaning up t...,[Summary: send a reminder message to tech lead...
1,What steps should be taken to ensure that the ...,According to the documented acceptance criteri...,[Summary: T minus 4 weeks\nDescription: *Accep...
2,"As a Data Quality Project Manager, how should ...","During the final three weeks of the project, t...",[Summary: T minus 3 weeks\nDescription: *Accep...
3,wat hapens at T minus 2 week?,"At T minus 2 week, status is reported to all s...",[Summary: T minus 2 week\nDescription: *Accept...
4,What is Infostrux's role in project wrap-up pr...,Infostrux is responsible for internal communic...,[Summary: T minus 1 week\nDescription: *Accept...


# Evaluation

In [18]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

In [32]:
import copy
naive_retrieval_dataset = copy.deepcopy(examples_df)
bm25_retrieval_dataset = copy.deepcopy(examples_df)
multi_query_retrieval_dataset = copy.deepcopy(examples_df)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# Naive Retrieval

In [None]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Qdrant.from_documents(
    jira_tickets,
    embeddings,
    location=":memory:",
    collection_name="LoanComplaints",
)
naive_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

naive_retrieval_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {
        "context": itemgetter("question") | naive_retriever,
        "question": itemgetter("question"),
    }
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [33]:
naive_retrieval_dataset["response"] = ""
naive_retrieval_dataset["retrieved_contexts"] = [
    [] for _ in range(len(naive_retrieval_dataset))
]

for k, v in naive_retrieval_dataset.iterrows():
    response = naive_retrieval_chain.invoke({"question": v.user_input})
    naive_retrieval_dataset.at[k, "response"] = response["response"].content
    naive_retrieval_dataset.at[k, "retrieved_contexts"] = [
        context.page_content for context in response["context"]
    ]

In [35]:
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from ragas import evaluate, RunConfig
from ragas import EvaluationDataset

custom_run_config = RunConfig(timeout=360)

naive_retrieval_result = evaluate(
    dataset=EvaluationDataset.from_pandas(naive_retrieval_dataset),
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=RunConfig(timeout=360),
)
naive_retrieval_result

Evaluating:   0%|          | 0/108 [00:00<?, ?it/s]

Exception raised in Job[50]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[53]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[29]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[65]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[17]: TimeoutError()


{'context_recall': 0.8889, 'faithfulness': 0.8564, 'factual_correctness': 0.4935, 'answer_relevancy': 0.9127, 'context_entity_recall': 0.5356, 'noise_sensitivity_relevant': 0.2317}

# MB25 Retriever

In [36]:
from langchain_community.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(
    jira_tickets,
)

In [37]:
bm25_retrieval_chain = (
    {
        "context": itemgetter("question") | bm25_retriever,
        "question": itemgetter("question"),
    }
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [38]:
bm25_retrieval_dataset["response"] = ""
bm25_retrieval_dataset["retrieved_contexts"] = [
    [] for _ in range(len(bm25_retrieval_dataset))
]

for k, v in bm25_retrieval_dataset.iterrows():
    response = bm25_retrieval_chain.invoke({"question": v.user_input})
    bm25_retrieval_dataset.at[k, "response"] = response["response"].content
    bm25_retrieval_dataset.at[k, "retrieved_contexts"] = [
        context.page_content for context in response["context"]
    ]

In [39]:
bm25_retrieval_result = evaluate(
    dataset=EvaluationDataset.from_pandas(bm25_retrieval_dataset),
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=RunConfig(timeout=360),
)
bm25_retrieval_result

Evaluating:   0%|          | 0/108 [00:00<?, ?it/s]

{'context_recall': 0.6296, 'faithfulness': 0.6716, 'factual_correctness': 0.4572, 'answer_relevancy': 0.7520, 'context_entity_recall': 0.4301, 'noise_sensitivity_relevant': 0.1710}

# Multi-query retriever

In [40]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=naive_retriever, llm=chat_model
)

In [41]:
multi_query_retrieval_chain = (
    {
        "context": itemgetter("question") | multi_query_retriever,
        "question": itemgetter("question"),
    }
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [42]:
multi_query_retrieval_dataset["response"] = ""
multi_query_retrieval_dataset["retrieved_contexts"] = [
    [] for _ in range(len(multi_query_retrieval_dataset))
]

for k, v in multi_query_retrieval_dataset.iterrows():
    response = multi_query_retrieval_chain.invoke({"question": v.user_input})
    multi_query_retrieval_dataset.at[k, "response"] = response["response"].content
    multi_query_retrieval_dataset.at[k, "retrieved_contexts"] = [
        context.page_content for context in response["context"]
    ]

In [43]:
multi_query_retrieval_result = evaluate(
    dataset=EvaluationDataset.from_pandas(multi_query_retrieval_dataset),
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=RunConfig(timeout=360),
)
multi_query_retrieval_result

Evaluating:   0%|          | 0/108 [00:00<?, ?it/s]

Exception raised in Job[11]: TimeoutError()
Exception raised in Job[17]: TimeoutError()
Exception raised in Job[41]: TimeoutError()


{'context_recall': 1.0000, 'faithfulness': 0.8607, 'factual_correctness': 0.4522, 'answer_relevancy': 0.9662, 'context_entity_recall': 0.4989, 'noise_sensitivity_relevant': 0.2659}

# Ensemble retriever

In [44]:
from langchain.retrievers import EnsembleRetriever
ensemble_retrieval_dataset = copy.deepcopy(examples_df)

retriever_list = [
    bm25_retriever,
    naive_retriever,
    multi_query_retriever,
]
equal_weighting = [1 / len(retriever_list)] * len(retriever_list)

ensemble_retriever = EnsembleRetriever(
    retrievers=retriever_list, weights=equal_weighting
)

In [45]:
ensemble_retrieval_chain = (
    {
        "context": itemgetter("question") | ensemble_retriever,
        "question": itemgetter("question"),
    }
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [46]:
ensemble_retrieval_dataset["response"] = ""
ensemble_retrieval_dataset["retrieved_contexts"] = [
    [] for _ in range(len(ensemble_retrieval_dataset))
]

for k, v in ensemble_retrieval_dataset.iterrows():
    response = ensemble_retrieval_chain.invoke({"question": v.user_input})
    ensemble_retrieval_dataset.at[k, "response"] = response["response"].content
    ensemble_retrieval_dataset.at[k, "retrieved_contexts"] = [
        context.page_content for context in response["context"]
    ]

In [47]:
ensemble_retrieval_result = evaluate(
    dataset=EvaluationDataset.from_pandas(ensemble_retrieval_dataset),
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=RunConfig(timeout=360),
)
ensemble_retrieval_result

Evaluating:   0%|          | 0/108 [00:00<?, ?it/s]

Exception raised in Job[29]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[34]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[17]: TimeoutError()
Exception raised in Job[41]: TimeoutError()
Exception raised in Job[89]: TimeoutError()


{'context_recall': 0.9815, 'faithfulness': 0.7227, 'factual_correctness': 0.4306, 'answer_relevancy': 0.9099, 'context_entity_recall': 0.5417, 'noise_sensitivity_relevant': 0.2547}

# Result comparison

In [55]:
import pandas as pd

# Helper function to convert EvaluationResult to dict if needed
def eval_result_to_dict(result):
    # If already a dict, return as is
    if isinstance(result, dict):
        return result
    # Try to use .dict() or .to_dict() if available
    if hasattr(result, "dict"):
        return result.dict()
    if hasattr(result, "to_dict"):
        return result.to_dict()
    # Otherwise, try to use __dict__ (may include extra keys)
    if hasattr(result, "__dict__"):
        return dict(result.__dict__)
    # If all else fails, raise error
    raise TypeError(f"Cannot convert {type(result)} to dict")

# Collect results into a list of tuples: (method, scores)
results = [
    ("Naive", eval_result_to_dict(naive_retrieval_result)),
    ("BM25", eval_result_to_dict(bm25_retrieval_result)),
    ("Multi-Query", eval_result_to_dict(multi_query_retrieval_result)),
    ("Ensemble", eval_result_to_dict(ensemble_retrieval_result)),
]

# Extract the metrics from the 'scores' field of each result
# Each 'scores' is typically a list of dicts (one per example); we want the mean for each metric
import numpy as np

metrics = [
    "context_recall",
    "faithfulness",
    "factual_correctness",
    "answer_relevancy",
    "context_entity_recall"
]

table = {}
for method, result in results:
    # result['scores'] is a list of dicts, one per example
    scores = result.get("scores", [])
    # Defensive: skip if scores is empty
    if not scores:
        table[method] = {metric: None for metric in metrics}
        continue
    # Each score is a dict with the metrics
    # Compute mean for each metric
    metric_means = {}
    for metric in metrics:
        values = [score.get(metric) for score in scores if metric in score]
        # Only compute mean if there are values
        if values:
            metric_means[metric] = np.mean(values)
        else:
            metric_means[metric] = None
    table[method] = metric_means

# Create DataFrame: rows=methods, columns=metrics
metrics_df = pd.DataFrame.from_dict(table, orient="index")
metrics_df.index.name = "Method"
print(metrics_df.round(4))

             context_recall  faithfulness  factual_correctness  \
Method                                                           
Naive                0.8889        0.8564                  NaN   
BM25                 0.6296        0.6716               0.4572   
Multi-Query          1.0000        0.8607               0.4522   
Ensemble             0.9815        0.7227               0.4306   

             answer_relevancy  context_entity_recall  
Method                                                
Naive                  0.9127                 0.5356  
BM25                   0.7520                 0.4301  
Multi-Query            0.9662                 0.4989  
Ensemble               0.9099                    NaN  
