# Advanced Retrieval with LangChain

In [1]:
# Standard Library Imports
import getpass
import os
from datetime import datetime, timedelta
from operator import itemgetter
from uuid import uuid4

# Third-Party Imports
# LangChain Core
from langchain.retrievers import EnsembleRetriever, ParentDocumentRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# LangChain Community
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Qdrant

# LangChain Integrations
from langchain_cohere import CohereRerank
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

# Qdrant
from qdrant_client import QdrantClient, models

# Local Application Imports
# (none yet)


In [2]:
# Ask for each secret interactively so that no key is ever stored in the notebook.
for _env_name, _prompt_text in (
    ("OPENAI_API_KEY", "Enter your OpenAI API Key:"),
    ("COHERE_API_KEY", "Cohere API Key:"),
    ("LANGCHAIN_API_KEY", "LangChain API Key:"),
):
    os.environ[_env_name] = getpass.getpass(_prompt_text)

# Enable tracing and give this session its own project name
# (fixed prefix plus the first 8 hex characters of a random UUID).
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": f"AIM - SDG - {uuid4().hex[:8]}",
    }
)

In [3]:
# Shared model handles: one chat model for answer generation and one embedding
# model that the vector stores and the semantic chunker below are built on.
chat_model = ChatOpenAI(
    model="gpt-4.1-nano",
)
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [4]:
# Prompt text shared by every chain in this notebook. The `{question}` and
# `{context}` placeholders are the two keys each chain feeds into the prompt.
RAG_TEMPLATE = (
    "You are a helpful and kind assistant. Use the context provided below to answer the question.\n"
    "\n"
    "If you do not know the answer, or are unsure, say you don't know.\n"
    "\n"
    "Query:\n"
    "{question}\n"
    "\n"
    "Context:\n"
    "{context}\n"
)

# Wrap the raw template text in a chat prompt object that the chains can pipe into the model.
rag_prompt = ChatPromptTemplate.from_template(
    RAG_TEMPLATE,
)

In [5]:
# Load the project CSV with every listed column routed into `metadata`.
# The path is a plain string literal: it has no placeholders, so the original
# f-string prefix did nothing and has been dropped.
loader = CSVLoader(
    file_path="../data/Projects_with_Domains.csv",
    metadata_columns=[
        "Project Title",
        "Project Domain",
        "Secondary Domain",
        "Description",
        "Judge Comments",
        "Score",
        "Project Name",
        "Judge Score",
    ],
)

synthetic_usecase_data = loader.load()

# Use the free-text description as the document body, so that is the text the
# retrievers index and search. "Description" also stays available in metadata.
for doc in synthetic_usecase_data:
    doc.page_content = doc.metadata["Description"]

In [6]:
synthetic_usecase_data[0]

Document(metadata={'source': '../data/Projects_with_Domains.csv', 'row': 0, 'Project Title': 'InsightAI 1', 'Project Domain': 'Security', 'Secondary Domain': 'Finance / FinTech', 'Description': 'A low-latency inference system for multimodal agents in autonomous systems.', 'Judge Comments': 'Technically ambitious and well-executed.', 'Score': '85', 'Project Name': 'Project Aurora', 'Judge Score': '9.5'}, page_content='A low-latency inference system for multimodal agents in autonomous systems.')

## Vector Stores

### Naive Vector Store

In [7]:
# Index every project description in an in-memory Qdrant collection.
# Built with `QdrantVectorStore` (langchain_qdrant), the class this notebook already
# uses for the parent-document store, instead of the deprecated
# `langchain_community.vectorstores.Qdrant` wrapper.
vectorstore = QdrantVectorStore.from_documents(
    synthetic_usecase_data,
    embeddings,
    location=":memory:",
    collection_name="Synthetic_Usecases",
)

In [8]:
# Retriever over the naive store, configured to return 10 results per query (k=10).
naive_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 10},
)

### Semantic Vector Store

In [9]:
# Chunker that uses the shared embedding model to choose split points,
# configured with the "percentile" breakpoint threshold type.
semantic_chunker = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",
)

In [10]:
# Semantically chunk the first 20 loaded documents only.
# NOTE(review): the other stores index the full dataset, so the semantic store
# covers a subset — confirm this limit is intentional (e.g. to save embedding calls).
semantic_documents = semantic_chunker.split_documents(
    synthetic_usecase_data[:20],
)

In [11]:
# Separate in-memory Qdrant collection holding the semantically chunked documents.
# Built with `QdrantVectorStore` (langchain_qdrant), the class this notebook already
# uses for the parent-document store, instead of the deprecated
# `langchain_community.vectorstores.Qdrant` wrapper.
semantic_vectorstore = QdrantVectorStore.from_documents(
    semantic_documents,
    embeddings,
    location=":memory:",
    collection_name="Synthetic_Usecase_Data_Semantic_Chunks",
)

In [12]:
# Retriever over the semantic-chunk store, configured to return 10 results per query (k=10).
semantic_retriever = semantic_vectorstore.as_retriever(
    search_kwargs={"k": 10},
)

### Parent Document Vector Store

In [13]:
# The loaded documents are used unchanged as the "parent" documents; the splitter
# below (chunk_size=750) is handed to the parent-document retriever as its child splitter.
parent_docs = synthetic_usecase_data
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=750,
)

In [14]:
# Dedicated in-memory Qdrant instance for the parent-document retriever.
client = QdrantClient(location=":memory:")

# The collection is created by hand here, so the vector size must be given up front.
# NOTE(review): 1536 is assumed to be the output dimension of the model configured
# in `embeddings` — confirm whenever that model is changed.
client.create_collection(
    collection_name="full_documents",
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)

# Reuse the shared `embeddings` object rather than constructing a second, identically
# configured OpenAIEmbeddings, so this store cannot drift from the model used elsewhere.
parent_document_vectorstore = QdrantVectorStore(
    collection_name="full_documents",
    embedding=embeddings,
    client=client,
)

In [15]:
# In-memory document store handed to the retriever as its `docstore`.
store = InMemoryStore()

# Wire together the Qdrant-backed vector store, the docstore, and the child splitter.
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=parent_document_vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [16]:
# Populate the parent-document retriever; `ids=None` leaves ID assignment to the retriever.
parent_document_retriever.add_documents(
    parent_docs,
    ids=None,
)

### Other Retrievers

In [17]:
# Keyword-based (BM25) retriever built directly from the loaded documents; no embeddings involved.
bm25_retriever = BM25Retriever.from_documents(
    synthetic_usecase_data,
)

In [18]:
# Cohere reranker used as the compressor on top of the naive retriever's results.
compressor = CohereRerank(
    model="rerank-v3.5",
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=naive_retriever,
)

In [19]:
# Multi-query retriever: built from the chat model and the naive retriever.
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=naive_retriever,
    llm=chat_model,
)

In [20]:
# Every retriever defined above except the semantic one takes part in the ensemble.
retriever_list = [
    bm25_retriever,
    naive_retriever,
    parent_document_retriever,
    compression_retriever,
    multi_query_retriever,
]
# Same weight for each member: 1 / number of retrievers.
equal_weighting = [1 / len(retriever_list) for _ in retriever_list]

ensemble_retriever = EnsembleRetriever(
    retrievers=retriever_list,
    weights=equal_weighting,
)

## LangChain Retrieval Chains

### Naive

In [21]:
naive_retrieval_chain = (
    # Stage 1: look up context for the question with the naive retriever; keep the question.
    {
        "context": itemgetter("question") | naive_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### BM25

In [22]:
bm25_retrieval_chain = (
    # Stage 1: look up context for the question with the BM25 retriever; keep the question.
    {
        "context": itemgetter("question") | bm25_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Contextual Compression

In [23]:
contextual_compression_retrieval_chain = (
    # Stage 1: look up context with the reranking (compression) retriever; keep the question.
    {
        "context": itemgetter("question") | compression_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Multi-Query

In [24]:
multi_query_retrieval_chain = (
    # Stage 1: look up context with the multi-query retriever; keep the question.
    {
        "context": itemgetter("question") | multi_query_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Parent Document

In [25]:
parent_document_retrieval_chain = (
    # Stage 1: look up context with the parent-document retriever; keep the question.
    {
        "context": itemgetter("question") | parent_document_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Ensemble

In [26]:
ensemble_retrieval_chain = (
    # Stage 1: look up context with the equally weighted ensemble retriever; keep the question.
    {
        "context": itemgetter("question") | ensemble_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Semantic Retriever

In [27]:
semantic_retrieval_chain = (
    # Stage 1: look up context with the semantic-chunk retriever; keep the question.
    {
        "context": itemgetter("question") | semantic_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: NOTE(review): this assigns `context` to its own value, which looks
    # redundant — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it together with the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

## Sample Requests

In [28]:
# Naive chain, dataset-wide question. The model only sees the retrieved documents,
# not the whole CSV. Only the answer text is displayed.
naive_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain is "Healthcare / MedTech," which appears multiple times in the dataset.'

In [29]:
# Naive chain, topical lookup question. Only the answer text is displayed.
naive_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are use cases related to security. Specifically, one project titled "PlanPilot 35" involves a federated learning toolkit aimed at improving privacy in healthcare applications, which can be associated with security and privacy enhancement. Additionally, there is a project titled "LatticeFlow" within the healthcare/MedTech domain that mentions security in its secondary domain, potentially indicating a use case involving security considerations.'

In [30]:
# Naive chain, question about judge feedback for one domain. Only the answer text is displayed.
naive_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges generally had positive comments about the fintech projects, highlighting their technical quality, promising ideas, and real-world impact. For example, one project was noted as "Technically ambitious and well-executed," while another was described as a "promising idea with robust experimental validation." Overall, the judges appreciated the innovative approaches and strong execution of the fintech-related projects.'

In [31]:
# BM25 chain, dataset-wide question. The model only sees the retrieved documents,
# not the whole CSV. Only the answer text is displayed.
bm25_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain is not explicitly stated. However, among the sample projects listed, the domains include Productivity Assistants, E-commerce / Marketplaces, Healthcare / MedTech, and Finance / FinTech. Since only a few entries are provided and they are all different, I cannot determine the most common project domain from this limited data.'

In [32]:
# BM25 chain, topical lookup question. Only the answer text is displayed.
bm25_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there is a use case related to security. The project titled "SecureNest 49" falls under the domains of E‑commerce / Marketplaces and Legal / Compliance, and involves a document summarization and retrieval system for enterprise knowledge bases.'

In [33]:
# BM25 chain, question about judge feedback for one domain. Only the answer text is displayed.
bm25_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges viewed the fintech project, "SynthMind," positively. They described it as conceptually strong, indicating confidence in the idea and its potential. However, they also noted that the results need more benchmarking, suggesting that while the concept is promising, further evaluation and comparison are necessary to fully establish its effectiveness.'

In [34]:
# Contextual-compression chain, dataset-wide question. The model only sees the
# retrieved documents, not the whole CSV. Only the answer text is displayed.
contextual_compression_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain appears to be "Security," as it is explicitly mentioned for one project, but the dataset snippet includes multiple domains such as Healthcare / MedTech, Creative / Design / Media, and Security. Since the sample shows only a few entries, I cannot conclusively determine the most frequent domain across the entire dataset. However, from this small sample, "Security" does not seem to be the most common. \n\nIf you need a definitive answer, I would recommend analyzing the full dataset for the frequency of each project domain.'

In [35]:
# Contextual-compression chain, topical lookup question. Only the answer text is displayed.
contextual_compression_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Based on the provided context, there are no explicit use cases related to security. The examples mentioned focus on federated learning toolkits aimed at improving privacy in healthcare applications, but none specifically mention security.'

In [36]:
# Contextual-compression chain, question about judge feedback for one domain.
# Only the answer text is displayed.
contextual_compression_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges had positive comments about the fintech project "Pathfinder 27." They noted it had excellent code quality and made good use of open-source libraries.'

In [37]:
# Multi-query chain, dataset-wide question. The model only sees the retrieved
# documents, not the whole CSV. Only the answer text is displayed.
multi_query_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'The most common project domain in the provided data is "Healthcare / MedTech," which appears multiple times.'

In [38]:
# Multi-query chain, topical lookup question. Only the answer text is displayed.
multi_query_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are usecases related to security. Specifically, the projects titled "A federated learning toolkit improving privacy in healthcare applications" and "A document summarization and retrieval system for enterprise knowledge bases" are relevant to security, as they focus on privacy and secure information management.'

In [39]:
# Multi-query chain, question about judge feedback for one domain. Only the answer text is displayed.
multi_query_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges generally had positive feedback on the fintech projects. For example, the project "Pathfinder 27" received a score of 81 and was praised for its "excellent code quality and use of open-source libraries." Another project, "SkyForge," scored 94 and was described as a "clever solution with measurable environmental benefit." Similarly, "CreateFlow 43" scored 62, with judges noting it as "a forward-looking idea with solid supporting data." Overall, the judges appreciated the conceptual strength, potential impact, and quality of work on these fintech projects.'

In [40]:
# Parent-document chain, dataset-wide question. The model only sees the retrieved
# documents, not the whole CSV. Only the answer text is displayed.
parent_document_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the project domains mentioned are Security, Creative / Design / Media, Healthcare / MedTech, and Productivity Assistants. Since the sample includes only a few projects, I cannot definitively determine the most common domain overall. However, within this sample, each domain appears only once, so there is no clear majority. If you have a larger dataset, I could help analyze it to find the most common project domain.'

In [41]:
# Parent-document chain, topical lookup question. Only the answer text is displayed.
parent_document_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Based on the provided context, there are no specific use cases explicitly related to security. The projects mentioned focus on federated learning to improve privacy in healthcare applications, but security is not directly highlighted as a use case.'

In [42]:
# Parent-document chain, question about judge feedback for one domain.
# Only the answer text is displayed.
parent_document_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges had positive comments about the fintech projects. For example, TrendLens was described as "Technically ambitious and well-executed," and WealthifyAI was noted for being a "Comprehensive and technically mature approach." Overall, the judges recognized the promising and well-developed nature of these fintech projects.'

In [43]:
# Ensemble chain, dataset-wide question. The model only sees the retrieved
# documents, not the whole CSV. Only the answer text is displayed.
ensemble_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain among the listed projects is "Finance / FinTech," which appears multiple times in the dataset.'

In [44]:
# Ensemble chain, topical lookup question. Only the answer text is displayed.
ensemble_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there was at least one use case related to security. The project titled "SecureNest 49" involves a document summarization and retrieval system for enterprise knowledge bases, which is a security-related application in the context of legal and compliance domains. Additionally, "SecureNest 28" focuses on hardware-aware model quantization benchmarking, which can also relate to security in hardware and model robustness.'

In [45]:
# Ensemble chain, question about judge feedback for one domain. Only the answer text is displayed.
ensemble_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges\' comments on the fintech projects were generally positive. For example, the project "Pathfinder 27" received praise for "excellent code quality and use of open-source libraries," and "SecureNest 47" was described as having a "comprehensive and technically mature approach." Overall, the judges appreciated the quality, innovation, and technical strength of the fintech projects.'

In [46]:
# Semantic-chunk chain, dataset-wide question. The model only sees the retrieved
# documents, not the whole CSV. Only the answer text is displayed.
semantic_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain appears to be "Legal / Compliance," which is mentioned multiple times. Would you like to see a detailed count or further analysis?'

In [47]:
# Semantic-chunk chain, topical lookup question. Only the answer text is displayed.
semantic_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are usecases related to security in the provided context. Specifically, the projects "SynthMind" and "BioForge" are associated with the security domain. "SynthMind" involves a medical imaging solution improving early diagnosis, and "BioForge" is a medical imaging solution that exceeds expectations in creativity and usability within the security secondary domain.'

In [48]:
# Semantic-chunk chain, question about judge feedback for one domain. Only the answer text is displayed.
semantic_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges had the following comments about the fintech projects:\n\n- For "TrendLens 19," judges described it as "Technically ambitious and well-executed."\n- For "WealthifyAI 16," the project was called "Comprehensive and technically mature approach."\n- For "AutoMate 5," judges noted it as "A forward-looking idea with solid supporting data."\n- For "InsightAI 1," judges considered it "Technically ambitious and well-executed."\n\nOverall, the judges praised the fintech projects for their technical ambition, thoroughness, and promising approach.'

### Running Questions Across All Chains

In [49]:
# --- 1) Chain registry (use your existing chain objects) ---
# Short label -> chain object, in the order the results should be reported.
CHAINS = dict(
    naive=naive_retrieval_chain,
    bm25=bm25_retrieval_chain,
    compression=contextual_compression_retrieval_chain,
    multi_query=multi_query_retrieval_chain,
    parent_doc=parent_document_retrieval_chain,
    ensemble=ensemble_retrieval_chain,
    semantic=semantic_retrieval_chain,
)

# --- 2) Minimal helpers to normalize outputs ---
def _to_text(resp_dict):
    """Return the answer stored under ``"response"`` as a plain ``str``.

    The chains return ``{'response': <AIMessage|str>, 'context': [...]}``.
    Callers immediately call ``.strip()`` on the result, so this always returns
    a string:

    * missing / ``None`` response, or a message whose ``content`` is ``None`` -> ``""``
    * object with a string ``content`` attribute -> that string
    * object whose ``content`` is a list of blocks -> the string blocks and the
      ``"text"`` field of dict blocks, concatenated; other blocks are skipped
    * anything else -> ``str(...)`` of it
    """
    r = resp_dict.get("response")
    if r is None:
        return ""
    if not hasattr(r, "content"):
        return str(r)

    content = r.content
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # Message content may be a list of blocks; keep only the textual parts.
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                parts.append(block["text"])
        return "".join(parts)
    return str(content)


def _to_context(resp_dict):
    """Return the value stored under ``"context"``, or ``[]`` when the key is absent."""
    return resp_dict.get("context", [])


# --- 3) Run a single question across selected chains ---
def run_all(question: str, chains=None):
    """Invoke every chain with the same question and collect normalized results.

    Args:
        question: The question text sent to each chain as ``{"question": question}``.
        chains: Mapping of chain name -> object exposing ``invoke``. ``None``
            (the default) means the module-level ``CHAINS`` registry, looked up
            at call time so that re-running the registry cell is picked up
            without re-defining this function.

    Returns:
        ``dict[chain_name] -> {"answer": str, "contexts": <value of "context">}``,
        in the iteration order of ``chains``.
    """
    if chains is None:
        chains = CHAINS

    results = {}
    for name, chain in chains.items():
        out = chain.invoke({"question": question})
        results[name] = {
            "answer": _to_text(out),
            "contexts": _to_context(out),
        }
    return results

# --- 4) Convenience: quick pretty print for ad-hoc inspection ---
def print_quick(results, max_len=200):
    """Print one ``[chain] answer`` line per entry of ``results``.

    Each answer is stripped, has its newlines replaced by spaces, and is cut to
    ``max_len`` characters; an ellipsis marks answers that were cut.
    """
    for chain_name in results:
        flat = results[chain_name]["answer"].strip().replace("\n", " ")
        marker = "…" if len(flat) > max_len else ""
        print(f"[{chain_name}] {flat[:max_len]}{marker}")


In [50]:
# Send one question through every registered chain, then print a one-line summary per chain.
res = run_all(
    "What is the most common project domain?",
)
print_quick(res)

[naive] Based on the provided data, the most common project domain is "Healthcare / MedTech," which appears multiple times in the dataset.
[bm25] Based on the provided data, the project domains mentioned include 'Productivity Assistants,' 'E‑commerce / Marketplaces,' 'Healthcare / MedTech,' and 'Finance / FinTech.' Since this is just a small sa…
[compression] Based on the provided data, the most common project domain appears to be "Security," as it is mentioned alongside other project domains, and the context suggests multiple projects are categorized unde…
[multi_query] The most common project domain in the provided data appears to be "Writing & Content," which is mentioned multiple times across different projects.
[parent_doc] Based on the provided data, the most common project domain is not explicitly detailed across all entries. However, among the samples shown, the domain "Healthcare / MedTech" appears twice, while "Secu…
[ensemble] Based on the provided data, the most common proj

In [51]:
def run_batch(questions, chains=None):
    """
    Ask every question of every chain through each chain's ``batch`` method.

    Args:
        questions: Iterable of question strings.
        chains: Mapping of chain name -> object exposing ``batch``. ``None``
            (the default) means the module-level ``CHAINS`` registry, looked up
            at call time. The registry lives in a different cell, so binding it
            as a definition-time default would keep using a stale dict after
            that cell is re-run.

    Returns: dict[chain_name] -> list of {question, answer, contexts}
    """
    if chains is None:
        chains = CHAINS

    # Build the payloads once; the same list is sent to every chain.
    payloads = [{"question": q} for q in questions]
    all_results = {}
    for name, ch in chains.items():
        outs = ch.batch(payloads)
        # Pair each output with the payload that produced it (same order).
        all_results[name] = [
            {
                "question": q["question"],
                "answer": _to_text(o),
                "contexts": _to_context(o),
            }
            for q, o in zip(payloads, outs)
        ]
    return all_results


def print_results(all_results, max_answer=150, max_context=100, max_ctxs=2):
    """
    Print an abridged view of batched chain results.

    For every chain: a ``=== NAME ===`` header, then per record the question,
    the answer cut to ``max_answer`` characters, and up to ``max_ctxs`` context
    snippets (read from ``page_content``) cut to ``max_context`` characters.
    Cut text is marked with an ellipsis, and a blank line follows each record.
    """

    def _abridge(text, limit):
        # Collapse to one trimmed line, cut to `limit`, flag the cut with an ellipsis.
        flat = text.strip().replace("\n", " ")
        return flat[:limit] + ("…" if len(flat) > limit else "")

    for chain_name in all_results:
        print(f"\n=== {chain_name.upper()} ===")
        for record in all_results[chain_name]:
            print(f"Q: {record['question']}")
            print(f"A: {_abridge(record['answer'], max_answer)}")
            shown = record['contexts'][:max_ctxs]
            for position, document in enumerate(shown, start=1):
                print(f"  [ctx{position}] {_abridge(document.page_content, max_context)}")
            print()  # blank line between questions


In [52]:
# Questions sent to every chain in one batched run.
QUESTIONS = [
    "What is the most common project domain?",
    "Were there any usecases about security?",
]

# Run the whole batch across all registered chains, then print an abridged report.
batched_results = run_batch(
    QUESTIONS,
)
print_results(batched_results)



=== NAIVE ===
Q: What is the most common project domain?
A: Based on the provided data, the most common project domain appears to be "Healthcare / MedTech," which is listed multiple times among the projects.
  [ctx1] A synthetic data generator for low-resource domain adaptation tasks.
  [ctx2] A synthetic data generator for low-resource domain adaptation tasks.

Q: Were there any usecases about security?
A: Yes, there are usecases related to security. For example, the project titled "WealthifyAI 16" involves a federated learning toolkit that improves priv…
  [ctx1] A federated learning toolkit improving privacy in healthcare applications.
  [ctx2] A federated learning toolkit improving privacy in healthcare applications.


=== BM25 ===
Q: What is the most common project domain?
A: Based on the provided data, there are multiple project domains mentioned, including Productivity Assistants, E‑commerce / Marketplaces, Healthcare / M…
  [ctx1] An adaptive fine-tuning pipeline for multiling