Metadata-based query builders – attach filters or tags (e.g., dates, topic keywords) to documents so that retrieval can be narrowed by metadata.

In [5]:
# ===================== INSTALL DEPENDENCIES =====================
!pip install -q langchain chromadb sentence-transformers groq langchain-community langchain_groq lark pypdf

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
# =================== IMPORTS ===================
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import get_query_constructor_prompt, StructuredQueryOutputParser
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnableMap, RunnablePassthrough
from langchain_groq import ChatGroq
from google.colab import userdata

In [7]:
# ================== LOAD & SPLIT PDF ==================
# Read the PDF; `raw_docs` is whatever list of documents the loader returns.
loader = PyPDFLoader("/content/solid-python.pdf")
raw_docs = loader.load()

# Stamp the same topic/year metadata onto every loaded document so that
# later stages can filter on these fields.
PAGE_METADATA = {"topic": "python", "year": 2021}
for doc in raw_docs:
    doc.metadata.update(PAGE_METADATA)

# Chunk the tagged documents (chunk_size=500, chunk_overlap=50).
splitter = CharacterTextSplitter(chunk_overlap=50, chunk_size=500)
docs = splitter.split_documents(raw_docs)

In [8]:
# ================== EMBEDDINGS + CHROMA ==================
# Embedding model used for the chunks in `docs`.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Build the Chroma collection "solid_docs" from the chunks and their embeddings.
vectorstore = Chroma.from_documents(
    docs,
    embedding=embedding_model,
    collection_name="solid_docs",
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# ================== DEFINE LLM ==================
# `ChatGroq` and `userdata` are already imported in the imports cell, so they are
# not re-imported here. The API key is read from Colab secrets, which keeps it
# out of the notebook source.
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    api_key=userdata.get("GROQ_API_KEY"),
    # Minimise sampling randomness so structured-query generation and answers
    # are as reproducible as possible across re-runs.
    temperature=0,
)
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7a3b4f5984d0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7a3b65859550>, model_name='llama-3.3-70b-versatile', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [10]:
# ================== METADATA-BASED QUERY CONSTRUCTOR ==================
import lark  # imported here but not referenced directly in this cell

# Free-text description of what the indexed chunks contain.
document_contents = "Sections from a PDF on advanced Python programming concepts."

# Metadata fields (name, description, type) that a constructed query may filter on.
attribute_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        ("topic", "The topic of the document", "string"),
        ("year", "The publication year", "integer"),
    )
]

# Prompt built from the description + field list, and the parser for the LLM reply.
prompt = get_query_constructor_prompt(document_contents, attribute_info)
output_parser = StructuredQueryOutputParser.from_components()

# Query construction pipeline: prompt -> LLM -> parsed structured query.
# NOTE(review): the later cells shown in this notebook do not reference
# `query_constructor_chain`; confirm whether it is meant to drive retrieval.
query_constructor_chain = (
    prompt
    | llm
    | output_parser
)

In [11]:
# ================== PROMPT FOR ANSWERING ==================
# `PromptTemplate` is already imported in the imports cell, so it is not
# re-imported here. The template has two placeholders: {context} (the retrieved
# text) and {question} (the user's question).
rag_prompt = PromptTemplate.from_template(
    "Use the following context to answer the question:\n\n{context}\n\nQuestion: {question}"
)

In [12]:
# ================== RETRIEVERS =============================
# Self-query retriever: the LLM rewrites the user's question into a search string
# plus a metadata filter over the fields declared in `attribute_info` (topic, year),
# so a phrase like "from the year 2021" becomes a filter on `year` instead of
# being matched only by embedding similarity. MMR search with k=5 is kept.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_contents,
    metadata_field_info=attribute_info,
    search_type="mmr",
    search_kwargs={"k": 5},
)

In [23]:
# ================== BUILD FULL RAG CHAIN ==================

def _format_docs(documents):
    """Join the `page_content` of each retrieved document into one context string.

    Without this step the prompt's {context} slot would receive the repr of a
    list of Document objects rather than their text.
    """
    return "\n\n".join(document.page_content for document in documents)


# Normalises the chain input to {"question": <question>}.
query_runnable = RunnableLambda(lambda x: {"question": x["question"]})

# Fetches documents for the question. `retriever` is looked up when the lambda
# runs, so re-running the retriever cell takes effect without rebuilding the chain.
# `.invoke` replaces the deprecated `get_relevant_documents`.
retrieve_runnable = query_runnable | RunnableLambda(lambda inp: retriever.invoke(inp["question"]))

# Input: {"question": <question>}. Output: the LLM's reply message.
rag_chain = (
    RunnableMap({
        "context": retrieve_runnable | RunnableLambda(_format_docs),
        # Pass only the question text; forwarding the whole input dict would
        # render as "{'question': ...}" inside the prompt.
        "question": RunnableLambda(lambda inp: inp["question"]),
    })
    | rag_prompt
    | llm
)


In [25]:
# ================== ASK A QUESTION ==================
# Run the chain on one question and print the text of the reply under a heading.
question = "Show me tips on Python from the year 2021"
response = rag_chain.invoke({"question": question})

# One print call; sep="\n" reproduces the newline that separate print calls would emit.
print("Final Answer:\n", response.content, sep="\n")

Final Answer:

Based on the provided context, here are some tips on Python from the year 2021:

1. **Prefer Composition over Inheritance**: Python does not force type inheritance. For API implementation and code reuse, prefer composition over inheritance.

2. **Minimize Dependency**: Be mindful of compilation and startup time dependencies. When importing modules, use `from <package> import module` to avoid unnecessary dependencies.

These tips are derived from the page contents of the documents, which discuss Python development principles, including the SOLID principles, and provide guidance on best practices for coding in Python.
