In [None]:
# ===================== INSTALL DEPENDENCIES =====================
!pip install -q langchain sentence-transformers faiss-cpu pypdf langchain-community langchain-groq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.6/304.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.9/438.9 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ===================== IMPORTS =====================
import os
import uuid

import numpy as np
from google.colab import userdata
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_groq import ChatGroq

In [None]:
# ===================== STEP 1: LOAD TWO PDFs =====================
# Every PDF yields:
#   * three small "field" documents (title / summary / body) that get embedded;
#     each carries metadata["parent_id"] pointing at its parent, and
#   * one "parent" document (title + summary + body combined) that is kept in
#     the docstore under that same parent_id.
pdf_paths = [
    "/content/solid-python.pdf",
    "/content/LLD_Best_Practices_Python.pdf"
]

# Number of leading characters of the first page that are used as the summary.
SUMMARY_CHARS = 700

# Both containers are rebuilt from scratch here, so re-running this cell does
# not accumulate duplicate entries.
all_embedding_docs = []
docstore = InMemoryStore()

for path in pdf_paths:
    pages = PyPDFLoader(path).load()
    if not pages:
        # Fail with a readable message instead of an IndexError on pages[0].
        raise ValueError(f"No pages could be loaded from {path!r}")

    parent_id = str(uuid.uuid4())
    # splitext removes only the trailing extension; str.replace(".pdf", "")
    # would also strip ".pdf" occurring in the middle of a file name.
    filename = os.path.splitext(os.path.basename(path))[0]

    # Summary = first SUMMARY_CHARS characters of page 1; body = all pages.
    summary_text = pages[0].page_content[:SUMMARY_CHARS]
    full_body = "\n\n".join(p.page_content for p in pages)

    # ---- Create Embedding Fields ----
    # NOTE(review): the whole body is embedded as a single vector; embedding
    # models usually truncate long inputs, so this vector may only reflect the
    # beginning of the document -- consider one body document per page/chunk.
    shared_metadata = {"doc_title": filename, "parent_id": parent_id}
    field_texts = {"title": filename, "summary": summary_text, "body": full_body}
    all_embedding_docs.extend(
        Document(page_content=text, metadata={"field": field, **shared_metadata})
        for field, text in field_texts.items()
    )

    # ---- Store Parent Doc (for generation) ----
    parent_combined = f"Title: {filename}\n\nSummary:\n{summary_text}\n\nBody:\n{full_body}"
    # The evaluation cell reads metadata["doc_id"] from the documents returned
    # by the retriever, i.e. from these parent documents. Without metadata the
    # id is always None and no retrieved document can match a ground truth.
    parent_doc = Document(
        page_content=parent_combined,
        metadata={"doc_id": filename, "doc_title": filename, "parent_id": parent_id},
    )
    docstore.mset([(parent_id, parent_doc)])

In [None]:
# ===================== EMBEDDING & VECTOR STORE =====================
# Embed every field document (title / summary / body) with the MiniLM
# sentence-transformers model and index the vectors in a FAISS store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = FAISS.from_documents(
    documents=all_embedding_docs,
    embedding=embedding_model,
)

**MultiVectorRetriever** stores multiple vectors per document (e.g. for sections, titles, summaries). It is preferred for structured data such as manuals or FAQs; the trade-off is that it generally needs extra embeddings per document.

In [None]:
# ===================== MULTIVECTOR RETRIEVER =====================
# Wire the FAISS index (field-level vectors) to the docstore (parent documents).
retriever = MultiVectorRetriever(
    # Must be the same metadata key that the embedded field documents carry.
    id_key="parent_id",
    docstore=docstore,
    vectorstore=vectorstore,
)

In [None]:
# ===================== LLM SETUP =====================
# The same model generates the answers and acts as the yes/no judge in the
# faithfulness check, so sampling noise would make the reported metrics vary
# between runs. temperature=0 keeps the evaluation as repeatable as the API
# allows.
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    # Read the key from Colab's secret store; never hardcode it in the notebook.
    api_key=userdata.get("GROQ_API_KEY"),
    temperature=0,
)

In [None]:
# ===================== PROMPT =====================
# Template with two placeholders: {context} and {question}.
prompt_template = PromptTemplate.from_template(
    "Use the following context to answer the question:\n\n"
    "{context}\n\n"
    "Question: {question}"
)

In [None]:
# ===================== BUILD RAG CHAIN =====================
# Retrieval + generation chain built from the retriever, the LLM and the
# custom prompt, using the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=dict(prompt=prompt_template),
)

In [None]:
# ===================== RUN A TEST QUESTION =====================
# Smoke test: send one question through the full chain and print the answer
# text, which the chain returns under the "result" key.
question = "Explain Open/Closed Principle in software design?"
result = qa_chain.invoke(dict(query=question))
print("Answer:", result["result"])

Answer: **Open/Closed Principle (OCP) in Software Design**

The Open/Closed Principle (OCP) is a fundamental concept in software design that states:

**"Software entities (classes, modules, functions, etc.) should be open for extension but closed for modification."**

In other words, the OCP principle suggests that a software component should be designed in such a way that it can be extended or modified without altering its underlying structure or code.

**Key Aspects of OCP:**

1. **Open for extension**: The component should be able to accommodate new functionality or features without requiring significant changes to its existing code.
2. **Closed for modification**: The component's existing code should not be modified or altered in any way, ensuring that its behavior and functionality remain consistent.

**Benefits of OCP:**

1. **Reduced maintenance costs**: By avoiding modifications to existing code, the risk of introducing bugs or breaking existing functionality is minimized.
2. *

In [None]:
# ===================== METRIC EVALUATION HELPERS =====================
def precision_recall_f1(retrieved_ids, relevant_ids):
    """Compute set-based precision, recall and F1 for one query.

    Both inputs are treated as sets, so a document id that appears several
    times in ``retrieved_ids`` is counted once. (Dividing a de-duplicated hit
    count by the raw list length would deflate precision whenever the list
    contains duplicates.)

    Args:
        retrieved_ids: iterable of hashable ids returned by the retriever.
        relevant_ids: iterable of hashable ids that are relevant to the query.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats in ``[0.0, 1.0]``. A metric
        whose denominator would be zero is reported as ``0.0``.
    """
    retrieved = set(retrieved_ids)
    relevant = set(relevant_ids)
    hits = len(retrieved & relevant)

    precision = hits / len(retrieved) if retrieved else 0.0
    recall = hits / len(relevant) if relevant else 0.0
    # hits > 0 implies precision + recall > 0, so no epsilon is needed; an
    # epsilon in the denominator would keep a perfect score slightly below 1.0.
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0
    return precision, recall, f1

def mean_reciprocal_rank(retrieved_ids, relevant_ids):
    """Return the reciprocal rank of the first relevant hit for one query.

    Ranks are 1-based positions in ``retrieved_ids``. The result is
    ``1 / rank`` of the first id that is contained in ``relevant_ids``, or
    ``0.0`` when none of the retrieved ids is relevant. Averaging this value
    over several queries is left to the caller.
    """
    relevant_ranks = (
        rank
        for rank, doc_id in enumerate(retrieved_ids, start=1)
        if doc_id in relevant_ids
    )
    first_rank = next(relevant_ranks, None)
    return 0.0 if first_rank is None else 1 / first_rank

def hit_at_k(retrieved_ids, relevant_ids, k):
    """Return 1 if any relevant id is among the first ``k`` retrieved ids, else 0."""
    for wanted_id in relevant_ids:
        if wanted_id in retrieved_ids[:k]:
            return 1
    return 0

def faithfulness_check(llm, answer, context_docs):
    """Ask ``llm`` whether ``answer`` is supported by the retrieved context.

    Args:
        llm: object with an ``invoke(prompt)`` method. The reply is read from
            its ``content`` attribute when present, otherwise the reply itself
            is used as text.
        answer: the generated answer to be judged.
        context_docs: sequence of documents exposing ``page_content``; only the
            first five are included in the judge prompt.

    Returns:
        ``1`` if the judge's verdict is "yes", ``0`` otherwise (including when
        the reply contains neither "yes" nor "no").
    """
    context = "\n\n".join(doc.page_content for doc in context_docs[:5])
    prompt = f"Context:\n{context}\n\nAnswer:\n{answer}\n\nIs the answer supported by the context? Answer 'yes' or 'no'."
    verdict = llm.invoke(prompt)
    verdict_text = str(getattr(verdict, "content", verdict)).lower()

    # Match whole words only. A plain substring test would also accept "eyes"
    # or a reply such as "No ... yes ..." as a positive verdict.
    words = "".join(ch if ch.isalpha() else " " for ch in verdict_text).split()
    for word in words:
        # The first explicit yes/no in the reply decides the verdict.
        if word == "yes":
            return 1
        if word == "no":
            return 0
    return 0

In [None]:
# ===================== QUERY SETUP =====================
# Each evaluation case pairs a query with the ids of the documents that are
# expected to be relevant to it. `queries` and `ground_truths` are parallel
# lists derived from these pairs.
# NOTE(review): the evaluation cell compares these ids with
# `doc.metadata.get("doc_id")` of the retrieved documents. The recorded run
# reports 0.00 for every retrieval metric, and the "doc_N" values look like
# placeholders -- TODO replace them with ids that exist in the indexed corpus.
_eval_cases = [
    ("What are the five solid principles?", ["doc_3", "doc_7"]),
    ("What is most used practice in python?", ["doc_5"]),
    (
        "What is the difference between interface segregation and Liskov substitution?",
        ["doc_6", "doc_8"],
    ),
]

queries = [query_text for query_text, _ in _eval_cases]
ground_truths = [expected_ids for _, expected_ids in _eval_cases]

In [None]:
# ===================== RUN EVALUATION =====================
# For every query: retrieve documents, score the retrieval against the ground
# truth, generate an answer and let the LLM judge its faithfulness.
# The accumulators are re-created here so re-running the cell starts clean.
all_precisions, all_recalls, all_f1s, all_mrrs, all_hits, all_faith = [], [], [], [], [], []

assert len(queries) == len(ground_truths), "every query needs a ground-truth id list"

for question, relevant_ids in zip(queries, ground_truths):
    print(f"\n Query: {question}")

    # `invoke` replaces the deprecated `get_relevant_documents`, which emits a
    # deprecation warning on every call.
    retrieved_docs = retriever.invoke(question)
    retrieved_ids = [doc.metadata.get("doc_id") for doc in retrieved_docs]
    if any(doc_id is None for doc_id in retrieved_ids):
        # Make the problem visible instead of silently reporting zeros.
        print("Warning: some retrieved documents have no 'doc_id' metadata; "
              "retrieval metrics for this query are unreliable.")

    precision, recall, f1 = precision_recall_f1(retrieved_ids, relevant_ids)
    mrr = mean_reciprocal_rank(retrieved_ids, relevant_ids)
    hit = hit_at_k(retrieved_ids, relevant_ids, k=5)

    # The chain performs its own retrieval for the same question; the judge
    # below is given the documents retrieved above.
    answer = qa_chain.invoke({"query": question})
    is_faithful = faithfulness_check(llm, answer["result"], retrieved_docs)

    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)
    all_mrrs.append(mrr)
    all_hits.append(hit)
    all_faith.append(is_faithful)

    print(f"Answer: {answer['result']}")
    print(f"Precision: {precision:.2f} | Recall: {recall:.2f} | F1: {f1:.2f} | MRR: {mrr:.2f} | Hit@5: {hit} | Faithful: {is_faithful}")


 Query: What are the five solid principles?


  retrieved_docs = retriever.get_relevant_documents(question)


Answer: The five SOLID principles are:

1. **S** - Single Responsibility Principle (SRP): Each class should have one and only one reason to change.
2. **O** - Open/Closed Principle (OCP): Software entities should be open for extension but closed for modification.
3. **L** - Liskov Substitution Principle (LSP): Derived classes must be substitutable for their base classes.
4. **I** - Interface Segregation Principle (ISP): Prefer many specific interfaces over one general-purpose interface.
5. **D** - Dependency Inversion Principle (DIP): Depend on abstractions, not on concretions.

These principles aim to promote simpler, more robust, and updatable code for software development in object-oriented languages like Python.
Precision: 0.00 | Recall: 0.00 | F1: 0.00 | MRR: 0.00 | Hit@5: 0 | Faithful: 1

 Query: What is most used practice in python?
Answer: Based on the provided context, the most used practices in Python are:

1. **Follow SOLID Principles**: This includes Single Responsibility P

In [None]:
# ===================== METRIC SUMMARY =====================
# Every figure is the plain mean of the per-query values collected by the
# evaluation loop. Precision and recall are computed over all retrieved
# documents (no fixed cutoff is applied), so they are not labelled "@k".
print("\n Average Metrics Across Queries")
print(f"Precision: {np.mean(all_precisions):.2f}")  # share of retrieved documents that are relevant
print(f"Recall: {np.mean(all_recalls):.2f}")  # share of relevant documents that were retrieved
print(f"F1 Score: {np.mean(all_f1s):.2f}")  # harmonic mean of precision and recall
print(f"MRR: {np.mean(all_mrrs):.2f}")  # Mean Reciprocal Rank: 1 / rank of the first relevant document
print(f"Hit@5: {np.mean(all_hits):.2f}")  # share of queries with a relevant document among the top 5
print(f"Faithfulness: {np.mean(all_faith):.2f}")  # share of answers the LLM judge marked as supported


 Average Metrics Across Queries
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
MRR: 0.00
Hit@5: 0.00
Faithfulness: 1.00
