# Advanced Retrieval with LangChain

In [1]:
# Standard Library Imports
import getpass
import os
from datetime import datetime, timedelta
from operator import itemgetter
from uuid import uuid4

# Third-Party Imports
# LangChain Core
from langchain.retrievers import EnsembleRetriever, ParentDocumentRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# LangChain Community
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Qdrant

# LangChain Integrations
from langchain_cohere import CohereRerank
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

# Qdrant
from qdrant_client import QdrantClient, models

# Local Application Imports
# (none yet)


In [2]:
# Collect the API credentials interactively (nothing is hard-coded in the notebook) and
# switch on LangChain tracing under a project name with a random 8-hex-character suffix.
os.environ.update(
    {
        "OPENAI_API_KEY": getpass.getpass("Enter your OpenAI API Key:"),
        "COHERE_API_KEY": getpass.getpass("Cohere API Key:"),
        "LANGCHAIN_API_KEY": getpass.getpass("LangChain API Key:"),
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": f"AIM - SDG - {uuid4().hex[:8]}",
    }
)

In [3]:
# Embedding model used to index documents, and the chat model used to generate answers.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
chat_model = ChatOpenAI(model="gpt-4.1-nano")

In [4]:
# Prompt used by the retrieval chains below: the model receives the user question plus
# the retrieved context and is told to say so when it does not know the answer.
# Placeholders: {question} and {context}.
RAG_TEMPLATE = """\
You are a helpful and kind assistant. Use the context provided below to answer the question.

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(template=RAG_TEMPLATE)

In [5]:
# Load one Document per CSV row, copying every listed column into the document metadata.
loader = CSVLoader(
    # Plain string literal: the path has no placeholders, so no f-string is needed.
    file_path="../data/Projects_with_Domains.csv",
    metadata_columns=[
        "Project Title",
        "Project Domain",
        "Secondary Domain",
        "Description",
        "Judge Comments",
        "Score",
        "Project Name",
        "Judge Score",
    ],
)

synthetic_usecase_data = loader.load()

# Use only the project description as the searchable text; the remaining columns
# stay available through each document's metadata.
for doc in synthetic_usecase_data:
    doc.page_content = doc.metadata["Description"]

In [6]:
# Inspect the first loaded document to check the metadata / page_content layout.
synthetic_usecase_data[0]

Document(metadata={'source': '../data/Projects_with_Domains.csv', 'row': 0, 'Project Title': 'InsightAI 1', 'Project Domain': 'Security', 'Secondary Domain': 'Finance / FinTech', 'Description': 'A low-latency inference system for multimodal agents in autonomous systems.', 'Judge Comments': 'Technically ambitious and well-executed.', 'Score': '85', 'Project Name': 'Project Aurora', 'Judge Score': '9.5'}, page_content='A low-latency inference system for multimodal agents in autonomous systems.')

## Vector Stores

### Naive Vector Store

In [7]:
# Index every loaded document in an in-memory Qdrant collection.
# Uses QdrantVectorStore (langchain_qdrant) rather than the deprecated
# langchain_community `Qdrant` wrapper, matching the parent-document store below.
vectorstore = QdrantVectorStore.from_documents(
    synthetic_usecase_data,
    embeddings,
    location=":memory:",
    collection_name="Synthetic_Usecases",
)

In [8]:
# Plain similarity retriever over the full collection, returning 10 results per query.
naive_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 10},
)

### Semantic Vector Store

In [9]:
# Embedding-based splitter; chunk boundaries are chosen with the "percentile" threshold rule.
semantic_chunker = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")

In [10]:
# Semantically chunk only the first 20 loaded documents.
# NOTE(review): the other retrievers index the full dataset — presumably this slice
# limits embedding cost; confirm it is intentional before comparing retrievers.
semantic_documents = semantic_chunker.split_documents(synthetic_usecase_data[:20])

In [11]:
# Index the semantically chunked documents in their own in-memory Qdrant collection.
# Uses QdrantVectorStore (langchain_qdrant) rather than the deprecated
# langchain_community `Qdrant` wrapper, matching the parent-document store below.
semantic_vectorstore = QdrantVectorStore.from_documents(
    semantic_documents,
    embeddings,
    location=":memory:",
    collection_name="Synthetic_Usecase_Data_Semantic_Chunks",
)

In [12]:
# Retriever over the semantic-chunk collection, returning 10 results per query.
semantic_retriever = semantic_vectorstore.as_retriever(
    search_kwargs={"k": 10},
)

### Parent Document Vector Store

In [13]:
# Splitter for the child chunks (chunk_size=750), and the parent documents
# themselves — the loaded dataset is used as-is.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=750)
parent_docs = synthetic_usecase_data

In [14]:
# Dedicated in-memory Qdrant instance for the parent-document retriever.
client = QdrantClient(location=":memory:")

# The collection is created by hand, so the vector size must match the embedding model.
# 1536 is the default output dimension of text-embedding-3-small; update it if the
# model behind `embeddings` changes.
client.create_collection(
    collection_name="full_documents",
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)

# Reuse the notebook-wide `embeddings` object rather than constructing a second,
# identical OpenAIEmbeddings instance, so every store shares one embedding model.
parent_document_vectorstore = QdrantVectorStore(
    collection_name="full_documents",
    embedding=embeddings,
    client=client,
)

In [15]:
# Wire the parent-document retriever: the Qdrant store above as its vector store,
# an in-memory store as its docstore, and `child_splitter` for splitting.
store = InMemoryStore()

parent_document_retriever = ParentDocumentRetriever(
    vectorstore=parent_document_vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [16]:
# Ingest the parent documents; `ids` is left at its default (None).
parent_document_retriever.add_documents(parent_docs)

### Other Retrievers

In [17]:
# BM25 retriever built directly from the loaded documents (no vector store is passed in).
bm25_retriever = BM25Retriever.from_documents(synthetic_usecase_data)

In [18]:
# Contextual compression: wrap the naive retriever with a Cohere reranker as compressor.
compressor = CohereRerank(model="rerank-v3.5")
compression_retriever = ContextualCompressionRetriever(
    base_retriever=naive_retriever,
    base_compressor=compressor,
)

In [19]:
# Multi-query retriever: built from `chat_model` on top of the naive retriever.
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=chat_model,
    retriever=naive_retriever,
)

In [20]:
# Ensemble of the five retrievers defined above, each given the same weight
# (1 / number of retrievers).
retriever_list = [
    bm25_retriever,
    naive_retriever,
    parent_document_retriever,
    compression_retriever,
    multi_query_retriever,
]
equal_weighting = [1 / len(retriever_list) for _ in retriever_list]

ensemble_retriever = EnsembleRetriever(
    retrievers=retriever_list,
    weights=equal_weighting,
)

## LangChain Retrieval Chains

### Naive

In [21]:
# Naive-retriever RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
naive_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | naive_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### BM25

In [22]:
# BM25 RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
bm25_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | bm25_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Contextual Compression

In [23]:
# Contextual-compression (reranked) RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
contextual_compression_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | compression_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Multi-Query

In [24]:
# Multi-query RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
multi_query_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | multi_query_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Parent Document

In [25]:
# Parent-document RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
parent_document_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | parent_document_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Ensemble

In [26]:
# Ensemble RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
ensemble_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | ensemble_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

### Semantic Retriever

In [27]:
# Semantic-chunk RAG chain. Invoke with {"question": "<user question>"};
# the result is a dict with "response" (chat model output) and "context" (retrieved docs).
semantic_retrieval_chain = (
    # Stage 1: retrieve context for the question and carry the question forward.
    {
        "context": itemgetter("question") | semantic_retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: pass-through that re-assigns "context" to itself. Because it is a Runnable,
    # the `|` operators compose a chain instead of merging the two dict literals.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return it alongside the retrieved context.
    | {
        "response": rag_prompt | chat_model,
        "context": itemgetter("context"),
    }
)

## Sample Requests

In [28]:
# Naive retriever: most-common-domain question; display only the answer text.
naive_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain appears to be "Healthcare / MedTech," which is mentioned multiple times among the projects.'

In [29]:
# Naive retriever: security use-case question; display only the answer text.
naive_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are usecases related to security. Specifically, one of the projects, "Pathfinder 24," is described as an "AI-powered platform optimizing logistics routes for sustainability" with the secondary domain being "Security."'

In [30]:
# Naive retriever: judges-on-fintech question; display only the answer text.
naive_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges had positive comments about the fintech projects. For example, one judge described the project "TrendLens" as "Technically ambitious and well-executed," and another praised "Pathfinder" as a "Promising idea with robust experimental validation." Overall, the judges recognized the quality, impact, and promise of the fintech-related projects, highlighting their strong technical execution and potential.'

In [31]:
# BM25 retriever: most-common-domain question; display only the answer text.
bm25_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'The most common project domain is not explicitly provided in the data snippets, but based on the visible entries, the projects listed fall into the following domains: Productivity Assistants, E-commerce / Marketplaces, Healthcare / MedTech, and Finance / FinTech. Since only a few project entries are available, it is difficult to determine the overall most common domain among all projects.\n\nHowever, if considering these examples alone, each domain appears only once in the sample, so I do not have enough information to confidently identify the most common project domain.'

In [32]:
# BM25 retriever: security use-case question; display only the answer text.
bm25_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there was a use case related to security. The project called "SecureNest 49" involved a document summarization and retrieval system for enterprise knowledge bases within the E‑commerce / Marketplaces and Legal / Compliance domains.'

In [33]:
# BM25 retriever: judges-on-fintech question; display only the answer text.
bm25_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges had the following comments about the fintech projects:\n\n- For "SynthMind" in the finance/fintech domain, they said: "Conceptually strong but results need more benchmarking." The judge score for this project was 9.6.\n\nOverall, the judges acknowledged the strength in the concept of the fintech project, though they noted that more benchmarking of results could strengthen it further.'

In [34]:
# Contextual compression: most-common-domain question; display only the answer text.
contextual_compression_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the project domains listed are "Security," "Creative / Design / Media," and "Productivity Assistants." Since only a few sample entries are shown, and there is no indication that these are the only domains present, I cannot determine the most common project domain overall. However, in the sample provided, each domain appears only once.\n\nIf you have access to the full dataset, I recommend analyzing the frequency of each domain within it to identify the most common one.'

In [35]:
# Contextual compression: security use-case question; display only the answer text.
contextual_compression_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Based on the provided context, there are no specific use cases directly related to security. The mentioned projects focus on privacy improvements in healthcare applications through federated learning, but security as a distinct use case is not explicitly discussed.'

In [36]:
# Contextual compression: judges-on-fintech question; display only the answer text.
contextual_compression_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges had positive remarks about the fintech projects. For example, they praised the project "Pathfinder 27" for its excellent code quality and the use of open-source libraries, giving it a high judge score of 9.8.'

In [37]:
# Multi-query retriever: most-common-domain question; display only the answer text.
multi_query_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'The most common project domain in the provided data appears to be "Healthcare / MedTech," as it is mentioned multiple times among the projects. However, to be certain, a full count of all domain occurrences would be ideal. Based on the numbers observed in the sample, "Healthcare / MedTech" seems to be the most frequent.'

In [38]:
# Multi-query retriever: security use-case question; display only the answer text.
multi_query_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are use cases related to security mentioned in the provided context. Specifically, the project "SecureNest" involves a document summarization and retrieval system for enterprise knowledge bases, which relates to security and compliance in handling sensitive information.'

In [39]:
# Multi-query retriever: judges-on-fintech question; display only the answer text.
multi_query_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges generally had positive comments about the fintech projects. They praised them for being clever, impressive, solid, and promising, often noting their real-world impact, technical maturity, and strong code quality. Specific remarks included that some projects were "a clever solution with measurable environmental benefit," "solid work with impressive real-world impact," and "promising idea with robust experimental validation." Overall, the judges viewed these fintech projects favorably, highlighting their innovative approaches and potential.'

In [40]:
# Parent-document retriever: most-common-domain question; display only the answer text.
parent_document_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain appears to be "Healthcare / MedTech," as it is mentioned more than once in the sample. However, since this is only a small snippet, I cannot definitively determine the most common domain across the entire dataset. \n\nIf you have the full dataset or more context, I can help analyze it further.'

In [41]:
# Parent-document retriever: security use-case question; display only the answer text.
parent_document_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Based on the provided context, there are no specific use cases explicitly related to security mentioned. The projects focus on federated learning and privacy in healthcare applications, which relate to security in terms of data privacy, but there are no direct references to security-centric use cases.'

In [42]:
# Parent-document retriever: judges-on-fintech question; display only the answer text.
parent_document_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'The judges had positive comments about the fintech projects. They noted aspects such as "solid work with impressive real-world impact," "comprehensive and technically mature approach," "promising idea with robust experimental validation," and "technically ambitious and well-executed." Overall, the judges recognized the projects for their promising ideas, technical quality, and practical impact.'

In [43]:
# Ensemble retriever: most-common-domain question; display only the answer text.
ensemble_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'The most common project domain in the provided data appears to be "E‑commerce / Marketplaces," which is mentioned more than once.'

In [44]:
# Ensemble retriever: security use-case question; display only the answer text.
ensemble_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are usecases related to security. For example, the project "LearnWise 39" involves an AI model compression suite enabling on-device reasoning for IoT sensors, which can enhance security by minimizing data transmission and processing on secure local devices. Additionally, other projects like "SecureNest 49" and "SecureNest 28" focus on legal and compliance-related solutions, such as document summarization and retrieval systems for enterprise knowledge bases and hardware-aware model quantization benchmarks, which can contribute to security and compliance standards.'

In [45]:
# Ensemble retriever: judges-on-fintech question; display only the answer text.
ensemble_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges had varied comments about the fintech projects. For example:\n\n- The project "SecureNest 28," which is a hardware-aware model quantization benchmark suite in the finance/fintech secondary domain, was described as "Conceptually strong but results need more benchmarking," with a high judge score of 9.0.\n- "DocuCheck 47," an AI-powered platform optimizing logistics routes for sustainability within fintech, received a judge comment stating it was "Conceptually strong but results need more benchmarking," and scored 9.6.\n- "PulseAI 50," an adaptive fine-tuning pipeline for multilingual reasoning models in fintech, was deemed "Technically ambitious and well-executed" with a judge score of 8.0.\n\nOverall, judges recognized the fintech projects as conceptually strong and ambitious, often noting that they are promising but could benefit from further benchmarking or stronger evaluation metrics.'

In [46]:
# Semantic-chunk retriever: most-common-domain question; display only the answer text.
semantic_retrieval_chain.invoke(
    {"question": "What is the most common project domain?"}
)["response"].content

'Based on the provided data, the most common project domain is "Legal / Compliance," appearing twice in the sample. However, since this is a limited dataset, I cannot determine with certainty if it is the most common overall. If considering the sample data, "Legal / Compliance" seems to be a frequently occurring domain.'

In [47]:
# Semantic-chunk retriever: security use-case question; display only the answer text.
semantic_retrieval_chain.invoke(
    {"question": "Were there any usecases about security?"}
)["response"].content

'Yes, there are usecases related to security in the provided data. Specifically, one project titled "Project Aurora" involves a low-latency inference system for multimodal agents in autonomous systems, which is within the security domain. Additionally, "Neural Canvas" is another project in the security domain focusing on a low-latency inference system for autonomous systems.'

In [48]:
# Semantic-chunk retriever: judges-on-fintech question; display only the answer text.
semantic_retrieval_chain.invoke(
    {"question": "What did judges have to say about the fintech projects?"}
)["response"].content

'Judges had various comments about the fintech projects. For example, they described the projects as "Technically ambitious and well-executed," "Comprehensive and technically mature," and "A forward-looking idea with solid supporting data." Overall, the judges generally viewed the fintech projects positively, highlighting their technical quality, maturity, and forward-looking nature.'