# Advanced Retrieval with LangChain

# 🤝 Breakout Room Part #1

In [1]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")

In [2]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta

loader = CSVLoader(
    file_path=f"./data/Projects_with_Domains.csv",
    metadata_columns=[
      "Project Title",
      "Project Domain",
      "Secondary Domain",
      "Description",
      "Judge Comments",
      "Score",
      "Project Name",
      "Judge Score"
    ]
)

synthetic_usecase_data = loader.load()

for doc in synthetic_usecase_data:
    doc.page_content = doc.metadata["Description"]

In [4]:
synthetic_usecase_data[0]

Document(metadata={'source': './data/Projects_with_Domains.csv', 'row': 0, 'Project Title': 'InsightAI 1', 'Project Domain': 'Security', 'Secondary Domain': 'Finance / FinTech', 'Description': 'A low-latency inference system for multimodal agents in autonomous systems.', 'Judge Comments': 'Technically ambitious and well-executed.', 'Score': '85', 'Project Name': 'Project Aurora', 'Judge Score': '9.5'}, page_content='A low-latency inference system for multimodal agents in autonomous systems.')

In [5]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vectorstore = Qdrant.from_documents(
    synthetic_usecase_data,
    embeddings,
    location=":memory:",
    collection_name="Synthetic_Usecases"
)

## Task 4: Naive RAG Chain

In [6]:
naive_retriever = vectorstore.as_retriever(search_kwargs={"k" : 10})

In [7]:
from langchain_core.prompts import ChatPromptTemplate

RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [8]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(model="gpt-4.1-nano")

In [9]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

naive_retrieval_chain = (
    {"context": itemgetter("question") | naive_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [10]:
naive_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'The most common project domain in the provided data is "Healthcare / MedTech," which appears three times among the listed projects.'

In [11]:
naive_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Yes, there are use cases related to security. Specifically, one project titled "Pathfinder 24" focuses on an AI-powered platform optimizing logistics routes for sustainability, which also has a secondary domain of Security. Additionally, there is a project called "WealthifyAI" (though its primary focus is on privacy in healthcare applications using federated learning) that could have security implications.'

In [12]:
naive_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'The judges generally had positive comments about the fintech projects. For example, one project was described as having "impressive real-world impact," and another was praised as a "clever solution with measurable environmental benefit." Overall, the comments highlight that the fintech projects were considered innovative, well-executed, and impactful by the judges.'

## Task 5: Best-Matching 25 (BM25) Retriever


In [13]:
from langchain_community.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(synthetic_usecase_data)

In [14]:
bm25_retrieval_chain = (
    {"context": itemgetter("question") | bm25_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [15]:
bm25_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'Based on the provided data, the project domains mentioned include Productivity Assistants, E‑commerce / Marketplaces, Healthcare / MedTech, and Finance / FinTech. Since there are multiple projects in the Finance / FinTech domain (examples include "PulseAI 50" and "DocuCheck 47"), it appears to be the most common project domain among the examples.'

In [16]:
bm25_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Yes, there was a use case related to security. The project titled "SecureNest 49" is in the domain of E‑commerce / Marketplaces and Legal / Compliance, and it involves a document summarization and retrieval system for enterprise knowledge bases, which can relate to security and compliance concerns.'

In [17]:
bm25_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'The judges had positive comments about the fintech projects. Specifically, for the project "SynthMind," they noted that it was "Conceptually strong but results need more benchmarking," and gave it a high score of 9.6.'

## Task 6: Contextual Compression (Using Reranking)

In [18]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

compressor = CohereRerank(model="rerank-v3.5")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=naive_retriever
)

In [19]:
contextual_compression_retrieval_chain = (
    {"context": itemgetter("question") | compression_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [20]:
contextual_compression_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'Based on the provided data, the most common project domain appears to be "Security," as it is listed as the project domain for one of the projects. However, with only three examples, it\'s not definitive. If these are representative of the larger dataset, "Security" seems to be prominent, though more data would be needed for a conclusive answer.'

In [21]:
contextual_compression_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Based on the provided context, there are no specific use cases explicitly related to security. The use cases mentioned focus on federated learning to improve privacy in healthcare applications, but there is no direct mention of security-related use cases.'

In [22]:
contextual_compression_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'The judges had positive comments about the fintech projects. For example, in the case of the project "Pathfinder 27" in the Finance / FinTech domain, the judges highlighted "excellent code quality and use of open-source libraries."'

## Task 7: Multi-Query Retriever

In [23]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=naive_retriever, llm=chat_model
) 

In [24]:
multi_query_retrieval_chain = (
    {"context": itemgetter("question") | multi_query_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [25]:
multi_query_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'The most common project domain in the provided data appears to be "Healthcare / MedTech," which is listed multiple times across different projects.'

In [26]:
multi_query_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Yes, there are use cases related to security. Specifically, one project titled "Project Aurora" focuses on a low-latency inference system for multimodal agents in autonomous systems, which is categorized under the Security domain.'

In [27]:
multi_query_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'Judges had positive remarks about the fintech projects. For example, they described the project "SkyForge" as a "clever solution with measurable environmental benefit," and "LatticeFlow" was noted for its "excellent code quality and use of open-source libraries." Overall, the judges appreciated the technical maturity, validation, and innovative aspects of these fintech projects.'

## Task 8: Parent Document Retriever

In [28]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models

parent_docs = synthetic_usecase_data
child_splitter = RecursiveCharacterTextSplitter(chunk_size=750)

In [29]:
from langchain_qdrant import QdrantVectorStore

client = QdrantClient(location=":memory:")

client.create_collection(
    collection_name="full_documents",
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
)

parent_document_vectorstore = QdrantVectorStore(
    collection_name="full_documents", embedding=OpenAIEmbeddings(model="text-embedding-3-small"), client=client
)

In [30]:
store = InMemoryStore()

parent_document_retriever = ParentDocumentRetriever(
    vectorstore = parent_document_vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [31]:
parent_document_retriever.add_documents(parent_docs, ids=None)

In [32]:
parent_document_retrieval_chain = (
    {"context": itemgetter("question") | parent_document_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [33]:
parent_document_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'Based on the provided data, the most common project domain appears to be "Healthcare / MedTech," as it is mentioned multiple times among the examples.'

In [34]:
parent_document_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Based on the provided context, there do not appear to be any use cases explicitly related to security. The projects mentioned mainly focus on federated learning to improve privacy in healthcare applications, but no specific security use cases are noted.'

In [35]:
parent_document_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'The judges had positive comments about the projects. For example, they described the project "LatticeFlow" as a "promising idea with robust experimental validation," and "PixelSense" as having a "comprehensive and technically mature approach." Additionally, "SynthMind" was recognized for being "solid work with impressive real-world impact," and "GreenPulse" was noted as "technically ambitious and well-executed." Overall, the judges praised the projects for their potential, technical quality, and real-world relevance.'

## Task 9: Ensemble Retriever

In [36]:
from langchain.retrievers import EnsembleRetriever

retriever_list = [bm25_retriever, naive_retriever, parent_document_retriever, compression_retriever, multi_query_retriever]
equal_weighting = [1/len(retriever_list)] * len(retriever_list)

ensemble_retriever = EnsembleRetriever(
    retrievers=retriever_list, weights=equal_weighting
)

In [37]:
ensemble_retrieval_chain = (
    {"context": itemgetter("question") | ensemble_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [38]:
ensemble_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'Based on the provided data, the most common project domain appears to be "Legal / Compliance," as it is listed multiple times among the sample projects.'

In [39]:
ensemble_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Yes, there are use cases related to security. One example is "Project Aurora," which involves a low-latency inference system for multimodal agents in autonomous systems.'

In [40]:
ensemble_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'Judges generally had positive comments about the fintech projects. For example, they described the Pathfinder project as having "Excellent code quality and use of open-source libraries," and the DocuCheck project as being a "Conceptually strong but results need more benchmarking." The scores also reflect favorable evaluations, with scores like 81 and 87 and judge scores of 9.8 and 9.6, indicating high regard for these projects.'

## Task 10: Semantic Chunking

In [41]:
from langchain_experimental.text_splitter import SemanticChunker

semantic_chunker = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile"
)

In [42]:
semantic_documents = semantic_chunker.split_documents(synthetic_usecase_data[:20])

In [43]:
semantic_vectorstore = Qdrant.from_documents(
    semantic_documents,
    embeddings,
    location=":memory:",
    collection_name="Synthetic_Usecase_Data_Semantic_Chunks"
)

In [44]:
semantic_retriever = semantic_vectorstore.as_retriever(search_kwargs={"k" : 10})

In [45]:
semantic_retrieval_chain = (
    {"context": itemgetter("question") | semantic_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [46]:
semantic_retrieval_chain.invoke({"question" : "What is the most common project domain?"})["response"].content

'The most common project domain in the provided data is "Legal / Compliance," which appears twice. Other domains like "Customer Support / Helpdesk," "Developer Tools / DevEx," "Writing & Content," and "Finance / FinTech" also appear multiple times, but not more frequently than "Legal / Compliance" in this dataset.'

In [47]:
semantic_retrieval_chain.invoke({"question" : "Were there any usecases about security?"})["response"].content

'Yes, there are usecases related to security. Specifically, there are projects such as "SynthMind," which involves a medical imaging solution, and "BioForge," which is a medical imaging solution with a focus on enhancing early diagnosis through vision transformers. Additionally, the project "Neural Canvas" addresses a low-latency inference system for multimodal agents in autonomous systems, which can be relevant to security applications.'

In [48]:
semantic_retrieval_chain.invoke({"question" : "What did judges have to say about the fintech projects?"})["response"].content

'Judges had positive comments about the fintech projects. For example, they described the project "WealthifyAI 16" as having a "comprehensive and technically mature approach," and "AutoMate 5" as a "forward-looking idea with solid supporting data." Overall, the judges appreciated the technical ambition, execution quality, and potential impact of these projects.'

# 🤝 Breakout Room Part #2

#### 🏗️ Activity #1

Your task is to evaluate the various Retriever methods against eachother.

You are expected to:

1. Create a "golden dataset"
 - Use Synthetic Data Generation (powered by Ragas, or otherwise) to create this dataset
2. Evaluate each retriever with *retriever specific* Ragas metrics
 - Semantic Chunking is not considered a retriever method and will not be required for marks, but you may find it useful to do a "semantic chunking on" vs. "semantic chunking off" comparision between them
3. Compile these in a list and write a small paragraph about which is best for this particular data and why.

Your analysis should factor in:
  - Cost
  - Latency
  - Performance

> NOTE: This is **NOT** required to be completed in class. Please spend time in your breakout rooms creating a plan before moving on to writing code.

##### HINTS:

- LangSmith provides detailed information about latency and cost.

In [49]:
### YOUR CODE HERE

In [50]:
## take synthetic_usecase_data and load into a HF dataset

## use ragas to generate a golden testset

## load the golden testset into a HF dataset

## load the golden testset into LangSmith

In [51]:
# synthetic_usecase_data

In [52]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta

loader = CSVLoader(
    file_path=f"./data/Projects_with_Domains.csv",
    metadata_columns=[
      "Project Title",
      "Project Domain",
      "Secondary Domain",
      "Description",
      "Judge Comments",
      "Score",
      "Project Name",
      "Judge Score"
    ]
)

ragas_usecase_data = loader.load()

for doc in ragas_usecase_data:
    title = doc.metadata.get("Project Title", "")
    domain = doc.metadata.get("Project Domain", "")
    secondary = doc.metadata.get("Secondary Domain", "")
    desc = doc.metadata.get("Description", "")
    
    doc.page_content = f"{title}\nDomain: {domain}\nSecondary Domain: {secondary}\nDescription: {desc}".strip()

In [80]:
from langchain_community.document_loaders.csv_loader import CSVLoader

ragas_loader = CSVLoader(
    file_path=f"./data/Projects_with_Domains.csv",
    metadata_columns=[
      "Judge Comments",
      "Score",
      "Project Name",
      "Judge Score"
    ]
)

ragas_usecase_data2 = ragas_loader.load()

In [81]:
ragas_usecase_data2[0]

Document(metadata={'source': './data/Projects_with_Domains.csv', 'row': 0, 'Judge Comments': 'Technically ambitious and well-executed.', 'Score': '85', 'Project Name': 'Project Aurora', 'Judge Score': '9.5'}, page_content='Project Title: InsightAI 1\nProject Domain: Security\nSecondary Domain: Finance / FinTech\nDescription: A low-latency inference system for multimodal agents in autonomous systems.')

In [53]:
len(ragas_usecase_data)

50

In [54]:
# ragas_usecase_data

In [55]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import openai


generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))
openai_client = openai.OpenAI()
generator_embeddings = OpenAIEmbeddings(client=openai_client, model="text-embedding-3-small")

  generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))


In [56]:
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType


kg = KnowledgeGraph()

for doc in ragas_usecase_data:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
        )
    )

kg

KnowledgeGraph(nodes: 50, relationships: 0)

In [69]:
from ragas.testset.transforms import apply_transforms
from ragas.testset.transforms import HeadlinesExtractor, HeadlineSplitter, KeyphrasesExtractor

headline_extractor = HeadlinesExtractor(llm=generator_llm)
headline_splitter = HeadlineSplitter(max_tokens=1500)
keyphrase_extractor = KeyphrasesExtractor(llm=generator_llm)

transforms = [
    headline_extractor,
    headline_splitter,
    keyphrase_extractor
]

apply_transforms(kg, transforms=transforms)
kg

Applying HeadlinesExtractor:   0%|          | 0/100 [00:00<?, ?it/s]

Property 'headlines' already exists in node 'ec091a'. Skipping!
Property 'headlines' already exists in node '56e3e6'. Skipping!
Property 'headlines' already exists in node 'b4aba8'. Skipping!
Property 'headlines' already exists in node 'b32f69'. Skipping!
Property 'headlines' already exists in node '260f81'. Skipping!
Property 'headlines' already exists in node '565241'. Skipping!
Property 'headlines' already exists in node 'c68718'. Skipping!
Property 'headlines' already exists in node 'f7c747'. Skipping!
Property 'headlines' already exists in node '78cb51'. Skipping!
Property 'headlines' already exists in node '63f015'. Skipping!
Property 'headlines' already exists in node '2413dd'. Skipping!
Property 'headlines' already exists in node 'c62ab3'. Skipping!
Property 'headlines' already exists in node 'f95401'. Skipping!
Property 'headlines' already exists in node '1ec025'. Skipping!
Property 'headlines' already exists in node 'c68a19'. Skipping!
Property 'headlines' already exists in n

Applying HeadlineSplitter:   0%|          | 0/100 [00:00<?, ?it/s]

Applying KeyphrasesExtractor:   0%|          | 0/200 [00:00<?, ?it/s]

Property 'keyphrases' already exists in node '565241'. Skipping!
Property 'keyphrases' already exists in node 'f7c747'. Skipping!
Property 'keyphrases' already exists in node 'b4aba8'. Skipping!
Property 'keyphrases' already exists in node 'c62ab3'. Skipping!
Property 'keyphrases' already exists in node '260f81'. Skipping!
Property 'keyphrases' already exists in node 'b32f69'. Skipping!
Property 'keyphrases' already exists in node '1ec025'. Skipping!
Property 'keyphrases' already exists in node '2413dd'. Skipping!
Property 'keyphrases' already exists in node 'ec091a'. Skipping!
Property 'keyphrases' already exists in node 'c68718'. Skipping!
Property 'keyphrases' already exists in node '609b9f'. Skipping!
Property 'keyphrases' already exists in node '56e3e6'. Skipping!
Property 'keyphrases' already exists in node 'f95401'. Skipping!
Property 'keyphrases' already exists in node '63f015'. Skipping!
Property 'keyphrases' already exists in node '61578b'. Skipping!
Property 'keyphrases' alr

KnowledgeGraph(nodes: 200, relationships: 0)

| Persona               | Use Case Type                             | Derived From                                                                |
| --------------------- | ----------------------------------------- | --------------------------------------------------------------------------- |
| Decision Analyst      | **Asking / Seeking Information**          | “Decision support and information interpretation dominate work-related use” |
| Domain Researcher     | **Knowledge Graph & Multi-hop Retrieval** | Multi-domain structure in `Projects_with_Domains.csv`                       |
| Instructional Creator | **Practical Guidance / Tutoring**         | Education & self-learning patterns (10% of usage)                           |
| AI Practitioner       | **Evaluation & Coding Assistance**        | Work-related “Doing” messages (40% overall)                                 |
| Creative Strategist   | **Self-Expression / Ideation**            | Growth of “Expressing” and “Creative Guidance” segments                     |


In [72]:
from ragas.testset.persona import Persona

persona_decision_analyst = Persona(
    name="Decision Analyst",
    role_description=(
        "Uses AI for analytical reasoning and decision support. "
        "Seeks data-driven insights, summaries, and structured outputs to inform business or policy decisions. "
        "Values concise factual responses, traceable evidence, and cost-effective solutions."
    ),
)

persona_domain_researcher = Persona(
    name="Domain Researcher",
    role_description=(
        "Explores multi-domain knowledge sources (e.g., education, health, finance, engineering). "
        "Prefers context-rich retrieval with citations and nuanced synthesis. "
        "Often asks cross-domain 'why/how' questions requiring reasoning beyond surface-level facts."
    ),
)

persona_instructional_creator = Persona(
    name="Instructional Creator",
    role_description=(
        "Designs educational or training materials using AI. "
        "Relies on clear, pedagogical explanations and consistent tone. "
        "Frequently asks for examples, analogies, or simplified explanations for learners."
    ),
)

persona_ai_practitioner = Persona(
    name="AI Practitioner",
    role_description=(
        "Implements and evaluates retrieval-augmented systems. "
        "Needs structured, reproducible outputs like JSON schemas, test cases, and evaluation metrics. "
        "Focuses on precision, recall, and factual grounding when comparing retrievers or datasets."
    ),
)

persona_creative_strategist = Persona(
    name="Creative Strategist",
    role_description=(
        "Uses AI for ideation, storytelling, and persuasive communication. "
        "Seeks novel phrasing, emotional resonance, and creative reframing of ideas. "
        "Frequently explores role-play or scenario-based reasoning."
    ),
)

personas = [
    persona_decision_analyst,
    persona_domain_researcher,
    persona_instructional_creator,
    persona_ai_practitioner,
    persona_creative_strategist,
]


In [73]:
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)

query_distibution = [
    (
        SingleHopSpecificQuerySynthesizer(llm=generator_llm, property_name="headlines"),
        0.5,
    ),
    (
        SingleHopSpecificQuerySynthesizer(
            llm=generator_llm, property_name="keyphrases"
        ),
        0.5,
    ),
]

In [74]:
kg.save("usecase_data_kg.json")
usecase_data_kg = KnowledgeGraph.load("usecase_data_kg.json")
usecase_data_kg

KnowledgeGraph(nodes: 200, relationships: 0)

In [75]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    knowledge_graph=usecase_data_kg,
    persona_list=personas,
)

In [76]:
testset = generator.generate(testset_size=10, query_distribution=query_distibution)
testset.to_pandas()

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Could you explain what AutoMate 11 is and how ...,[AutoMate 11\nDomain: E‑commerce / Marketplace...,AutoMate 11 is a reinforcement learning setup ...,single_hop_specific_query_synthesizer
1,why Pathfinder 24 use Secondary Domain Securit...,[Pathfinder 24\nDomain: Healthcare / MedTech\n...,Pathfinder 24 is an AI-powered platform primar...,single_hop_specific_query_synthesizer
2,What is AutoMate 11 used for in e-commerce?,[AutoMate 11\nDomain: E‑commerce / Marketplace...,AutoMate 11 is a reinforcement learning setup ...,single_hop_specific_query_synthesizer
3,Can you explane how the Descripton: An AI-powe...,[Pathfinder 24\nDomain: Healthcare / MedTech\n...,Pathfinder 24 is an AI-powered platform that o...,single_hop_specific_query_synthesizer
4,Can you explane what AutoMate 11 is and how it...,[AutoMate 11\nDomain: E‑commerce / Marketplace...,AutoMate 11 is a reinforcement learning setup ...,single_hop_specific_query_synthesizer
5,What is InsightAI 1 and how does it relate to ...,[InsightAI 1\nDomain: Security\nSecondary Doma...,InsightAI 1 is a low-latency inference system ...,single_hop_specific_query_synthesizer
6,what is Productivity Assistants in ShopSmart 2?,[ShopSmart 2\nDomain: Developer Tools / DevEx\...,"In ShopSmart 2, Productivity Assistants is the...",single_hop_specific_query_synthesizer
7,How does WealthifyAI 3 contribute to enhancing...,[WealthifyAI 3\nDomain: Developer Tools / DevE...,WealthifyAI 3 operates in the developer tools ...,single_hop_specific_query_synthesizer
8,Could you explain how the term Finance is rela...,[MediMind 4\nDomain: E‑commerce / Marketplaces...,MediMind 4 operates primarily in the e-commerc...,single_hop_specific_query_synthesizer
9,Could you explain how the term Finance relates...,[AutoMate 5\nDomain: Finance / FinTech\nSecond...,AutoMate 5 is primarily associated with the Fi...,single_hop_specific_query_synthesizer


In [77]:
# calculate the length of the ragas_usecase_data page_content column for each document

for doc in ragas_usecase_data:
    print(len(doc.page_content))


153
172
167
166
161
162
166
165
164
176
183
155
162
149
161
172
160
173
162
166
168
165
168
153
167
168
167
142
151
175
184
168
179
169
162
148
161
177
163
161
181
170
182
157
175
167
158
167
175
159


In [78]:
from langchain.text_splitter import TokenTextSplitter

token_splitter = TokenTextSplitter()

for i, doc in enumerate(ragas_usecase_data):
    tokens = token_splitter._tokenizer.encode(doc.page_content)
    print(f"Doc {i}: {len(tokens)} tokens")


Doc 0: 36 tokens
Doc 1: 34 tokens
Doc 2: 34 tokens
Doc 3: 36 tokens
Doc 4: 36 tokens
Doc 5: 34 tokens
Doc 6: 37 tokens
Doc 7: 35 tokens
Doc 8: 34 tokens
Doc 9: 41 tokens
Doc 10: 40 tokens
Doc 11: 35 tokens
Doc 12: 37 tokens
Doc 13: 32 tokens
Doc 14: 40 tokens
Doc 15: 39 tokens
Doc 16: 31 tokens
Doc 17: 37 tokens
Doc 18: 33 tokens
Doc 19: 35 tokens
Doc 20: 37 tokens
Doc 21: 36 tokens
Doc 22: 34 tokens
Doc 23: 30 tokens
Doc 24: 36 tokens
Doc 25: 33 tokens
Doc 26: 35 tokens
Doc 27: 32 tokens
Doc 28: 33 tokens
Doc 29: 37 tokens
Doc 30: 38 tokens
Doc 31: 32 tokens
Doc 32: 37 tokens
Doc 33: 37 tokens
Doc 34: 34 tokens
Doc 35: 31 tokens
Doc 36: 38 tokens
Doc 37: 40 tokens
Doc 38: 35 tokens
Doc 39: 37 tokens
Doc 40: 37 tokens
Doc 41: 35 tokens
Doc 42: 35 tokens
Doc 43: 36 tokens
Doc 44: 39 tokens
Doc 45: 37 tokens
Doc 46: 33 tokens
Doc 47: 38 tokens
Doc 48: 36 tokens
Doc 49: 36 tokens


In [79]:
# ALTERNATE

from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset2 = generator.generate_with_langchain_docs(ragas_usecase_data, testset_size=10)

ValueError: Documents appears to be too short (ie 100 tokens or less). Please provide longer documents.