In [54]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chat_models import init_chat_model
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.prompts import PromptTemplate

from dotenv import dotenv_values
import os


In [53]:
# Read key/value pairs from a local .env file, if one exists.
env_vars = dotenv_values()

# Prefer the .env value, but fall back to a key that is already exported in the
# process environment so the notebook also runs where no .env file is present.
google_api_key = env_vars.get("GOOGLE_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    # Still a KeyError (as before), but with a message that says how to fix it.
    raise KeyError(
        "GOOGLE_API_KEY is not set: add it to a .env file or export it in the environment"
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

# Prompt for the answer-generation step. It has two placeholders:
#   {context}  - text the answer must be based on
#   {question} - the user's question
RAG_PROMPT_TEMPLATE = """
Use the following context to answer the question at the end. 
If you don't know the answer, just say you don't know — don't make anything up.

{context}

Question: {question}
Answer:
"""

In [None]:
# Embedding model handed to the vector store for indexing and querying.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Splitter given to the retriever as its child splitter.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Was planned but not used, the code retrieves the whole document instead
parent_text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)

# os.listdir returns entries in arbitrary order; sort the names so the documents
# are loaded (and later indexed) in the same order on every run.
pdf_docs_names = sorted(doc for doc in os.listdir("docs") if doc.endswith(".pdf"))
pdf_loaders = [PyPDFLoader(f"docs/{doc}") for doc in pdf_docs_names]

# Each loader returns a list of documents; flatten them in one pass so that
# `pdf_docs` is only ever a flat list (never a list of lists).
pdf_docs = [item for loader in pdf_loaders for item in loader.load()]




In [46]:
# Store for the parent documents (an InMemoryStore instance).
docstore = InMemoryStore()

# Vector index for the embedded chunks; no persist directory is passed here.
vectorstore = Chroma(collection_name="pdf_docs", embedding_function=embeddings)

# No parent splitter is supplied, only the child splitter.
retriever = ParentDocumentRetriever(
    docstore=docstore,
    child_splitter=child_text_splitter,
    vectorstore=vectorstore,
)

# Index every loaded PDF document through the retriever.
retriever.add_documents(pdf_docs)


In [None]:
query = "What is the best gun for hunting birds?"

# Retrieval settings, named so they can be tuned in one place.
TOP_K = 3                  # number of chunks requested from the vector store
MIN_RELEVANCE_SCORE = 0.6  # chunks scoring at or below this are discarded

# Search the indexed chunks directly to get (document, relevance score) pairs.
docs = retriever.vectorstore.similarity_search_with_relevance_scores(
    query=query,
    k=TOP_K,
    filter=None,
)

relevant_doc_metadata = [
    doc.metadata for doc, score in docs if score > MIN_RELEVANCE_SCORE
]

# Build the context from the full text of each distinct source file that had at
# least one sufficiently relevant chunk. Pieces are collected in a list and
# joined once instead of growing a string with repeated `+=`.
seen_sources = set()
context_parts = []
for metadata in relevant_doc_metadata:
    source = metadata.get("source")
    # Skip chunks with no recorded source and files that were already added.
    if not source or source in seen_sources:
        continue
    seen_sources.add(source)
    context_parts.append(f"Source: {source}\nContent: ")
    # Re-read the whole file from disk and append every page's text.
    context_parts.extend(
        page.page_content + "\n" for page in PyPDFLoader(source).load()
    )
relevant_doc_contents = "".join(context_parts)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=RAG_PROMPT_TEMPLATE,
)

chat_model = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

# Pipe the filled-in prompt into the chat model.
chain = prompt | chat_model

response = chain.invoke({
    "context": relevant_doc_contents,
    "question": query,
})

print(response.content)

Based on the provided context, the Marlin Glenfield Model 60 .22 LR rifle is mentioned as being used for hitting "crows at well over 100 yards" and is enjoyed by "small-game hunters." This suggests it is suitable for hunting birds.
