In [None]:
# ===================== INSTALL DEPENDENCIES =====================
!pip install -q langchain sentence-transformers faiss-cpu pypdf langchain-community langchain-groq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.6/304.6 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.9/438.9 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ===================== IMPORTS =====================
import os
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

from sentence_transformers.cross_encoder import CrossEncoder
from IPython.display import display, Markdown
import numpy as np

In [None]:
# ===================== LOAD PDF =====================
# Read the PDF into a list of documents that the rest of the notebook indexes.
loader = PyPDFLoader("/content/solid-python.pdf")
documents = loader.load()

# Tag every loaded document with a positional identifier ("doc_0", "doc_1", ...);
# the evaluation cells compare retrieved documents against these ids.
for position, loaded_doc in enumerate(documents):
    loaded_doc.metadata["doc_id"] = "doc_" + str(position)

In [None]:
# ===================== SPLITTING =====================
# Character-based splitter; it is passed to the parent-document retriever as the
# child splitter further down.
# NOTE(review): no separator is set here, so the library default applies — confirm
# that chunks really come out near the 500-character target for this PDF.
splitter_settings = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = CharacterTextSplitter(**splitter_settings)
text_splitter  # echo the configured splitter as the cell output

<langchain_text_splitters.character.CharacterTextSplitter at 0x7a5c67d73050>

In [None]:
# ===================== EMBEDDINGS =====================
# Sentence-transformers checkpoint used to embed text for the FAISS index.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
embedding_model  # echo the loaded embedding wrapper as the cell output

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# ===================== VECTOR STORE + DOC STORE =====================
# In-memory key-value store that will hold the parent documents.
docstore = InMemoryStore()

# FAISS index built by embedding the loaded documents with `embedding_model`.
# NOTE(review): this same index is later handed to ParentDocumentRetriever, which
# presumably adds child chunks on top of these full-document vectors — confirm that
# mixing both in one index is intended.
vectorstore = FAISS.from_documents(documents, embedding_model)

In [None]:
# =====================  RETRIEVER WITH CUSTOM MMR =====================
# retriever = vectorstore.as_retriever(
#     search_type="mmr", search_kwargs={"k": 15, "fetch_k": 30}, lambda_mult=0.3
# )

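ParentDocumentRetriever: matches the query against small child chunks but returns the parent document each chunk came from, which preserves context. Preferred for long documents with a semantic hierarchy; requires the ParentDocumentRetriever setup below.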

In [None]:
# ===================== PARENT DOCUMENT RETRIEVER =====================
# No search_type / search_kwargs are passed, so the retriever's default search is
# used (MMR is not configured here).
retriever = ParentDocumentRetriever(
    docstore=docstore,             # holds the parent documents
    vectorstore=vectorstore,       # vector index backing the retriever
    parent_splitter=None,          # parents are not split: the original docs are used
    child_splitter=text_splitter,  # cuts the child chunks
)
retriever  # echo the configured retriever as the cell output

ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7a5c6b47ba10>, docstore=<langchain_core.stores.InMemoryStore object at 0x7a5c6b23dc90>, search_kwargs={}, child_splitter=<langchain_text_splitters.character.CharacterTextSplitter object at 0x7a5c67d73050>)

In [None]:
# ===================== ADD DOCUMENTS =====================
# Register each parent under the same "doc_id" that was written into its metadata.
# Without explicit ids the retriever generates its own keys, so the full-document
# vectors already in the FAISS index (tagged "doc_0", "doc_1", ...) would point at
# keys missing from the docstore and be silently dropped at query time, wasting
# top-k slots.
parent_ids = [doc.metadata["doc_id"] for doc in documents]
retriever.add_documents(documents, ids=parent_ids)

In [None]:
# ===================== LLM SETUP =====================
# Colab helper for reading notebook secrets.
from google.colab import userdata

# Groq-hosted chat model; the key is read from the "GROQ_API_KEY" secret so it is
# not hardcoded in the notebook.
# NOTE(review): no temperature is set, so the library default applies — consider
# pinning it if evaluation runs need to be repeatable.
llm = ChatGroq(
    api_key=userdata.get("GROQ_API_KEY"),
    model_name="llama-3.3-70b-versatile",
)
llm  # echo the client as the cell output

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7a5c67721790>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7a5c6771fd50>, model_name='llama-3.3-70b-versatile', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [None]:
# ===================== PROMPT =====================
# Template with two placeholders: {context} (retrieved text) and {question}.
rag_template_text = "Use the following context to answer the question:\n\n{context}\n\nQuestion: {question}"
prompt_template = PromptTemplate.from_template(rag_template_text)

In [None]:
# ===================== BUILD RAG CHAIN =====================
# Retrieval-QA chain of type "stuff", wired to the retriever, the LLM and the
# custom prompt defined above.
chain_options = {"prompt": prompt_template}
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_options,
)
qa_chain  # echo the assembled chain as the cell output

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Use the following context to answer the question:\n\n{context}\n\nQuestion: {question}'), llm=ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7a5c67721790>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7a5c6771fd50>, model_name='llama-3.3-70b-versatile', model_kwargs={}, groq_api_key=SecretStr('**********')), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), retriever=ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7a5c6b47ba10>, docstore=<langchain_core.stores.InMemoryStore object at 0x7a5c6b23dc9

In [None]:
# ===================== RUN A QUERY =====================
# Smoke-test the chain with one question: the chain takes the question under the
# "query" key and returns the answer under "result".
question = "What is the main objective of the document?"
chain_input = {"query": question}
result = qa_chain.invoke(chain_input)
print(" Answer:", result["result"])

 Answer: The main objective of the document is to discuss the principles of SOLID software design, specifically the 5 aspects of a class and the corresponding 5 principles (SOLID) for software design, as presented by Mike Lindner.


In [None]:
# ===================== METRIC EVALUATION HELPERS =====================

def precision_recall_f1(retrieved_ids, relevant_ids):
    """Compute set-overlap precision, recall and F1 for a single query.

    Args:
        retrieved_ids: Ids of the documents the retriever returned.
        relevant_ids: Ground-truth ids of the documents that should be returned.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Precision is 0.0 when
        nothing was retrieved, recall is 0.0 when there are no relevant ids,
        and F1 is 0.0 when precision and recall are both zero.
    """
    # Overlap is counted on unique ids; the denominators use the raw lengths.
    hits = len(set(retrieved_ids) & set(relevant_ids))
    precision = hits / len(retrieved_ids) if retrieved_ids else 0.0
    recall = hits / len(relevant_ids) if relevant_ids else 0.0

    # The zero case is guarded explicitly, so no epsilon is needed in the
    # denominator (an epsilon would bias F1, e.g. 0.999999995 for a perfect hit).
    total = precision + recall
    f1 = 2 * precision * recall / total if total > 0 else 0.0
    return precision, recall, f1

def mean_reciprocal_rank(retrieved_ids, relevant_ids):
    """Return the reciprocal rank of the first relevant id for one query.

    Ranks are 1-based: a relevant id in first position scores 1.0, in second
    position 0.5, and so on. Returns 0.0 when no retrieved id is relevant.
    """
    ranked = enumerate(retrieved_ids, start=1)
    return next(
        (1 / rank for rank, candidate in ranked if candidate in relevant_ids),
        0.0,
    )

def hit_at_k(retrieved_ids, relevant_ids, k):
    """Return 1 if any relevant id is among the first ``k`` retrieved ids, else 0."""
    for wanted in relevant_ids:
        if wanted in retrieved_ids[:k]:
            return 1
    return 0

def faithfulness_check(llm, answer, context_docs):
    """Ask the LLM whether ``answer`` is supported by the retrieved context.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` string.
        answer: The generated answer to judge.
        context_docs: Retrieved documents; only the first five are shown to the
            judge, each contributing its ``page_content``.

    Returns:
        1 if the first standalone "yes"/"no" word in the verdict is "yes",
        otherwise 0 (including when the verdict contains neither word).
    """
    import re  # local import keeps this helper self-contained

    context = "\n\n".join(doc.page_content for doc in context_docs[:5])
    prompt = f"Context:\n{context}\n\nAnswer:\n{answer}\n\nIs the answer supported by the context? Answer 'yes' or 'no'."
    verdict = llm.invoke(prompt)

    # Match whole words only and trust the first one: a plain substring test would
    # treat "No, the eyes ..." or "No. Yes would require ..." as a positive verdict.
    first_verdict = re.search(r"\b(yes|no)\b", verdict.content.lower())
    return 1 if first_verdict is not None and first_verdict.group(1) == "yes" else 0

In [None]:
# ===================== QUERY SETUP =====================
# Each evaluation case pairs a question with the doc ids expected to be relevant.
evaluation_cases = [
    ("What are the five solid principles?", ["doc_3", "doc_7"]),
    ("Why is dependency inversion important?", ["doc_5"]),
    (
        "What is the difference between interface segregation and Liskov substitution?",
        ["doc_6", "doc_8"],
    ),
]

# Parallel lists consumed by the evaluation loop: queries[i] goes with ground_truths[i].
queries = [case_question for case_question, _ in evaluation_cases]
ground_truths = [expected_ids for _, expected_ids in evaluation_cases]

In [None]:
# ===================== RUN EVALUATION =====================
# Per-query metric accumulators, rebuilt on every run of this cell.
all_precisions, all_recalls, all_f1s, all_mrrs, all_hits, all_faith = [], [], [], [], [], []

# Every query needs its own ground-truth list; fail up front rather than partway
# through the loop (or by silently scoring against the wrong list).
assert len(queries) == len(ground_truths), "queries and ground_truths must have the same length"

for question, relevant_ids in zip(queries, ground_truths):
    print(f"\n Query: {question}")

    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated.
    retrieved_docs = retriever.invoke(question)
    retrieved_ids = [doc.metadata.get("doc_id") for doc in retrieved_docs]

    # Retrieval quality against the hand-labelled relevant ids.
    precision, recall, f1 = precision_recall_f1(retrieved_ids, relevant_ids)
    mrr = mean_reciprocal_rank(retrieved_ids, relevant_ids)
    hit = hit_at_k(retrieved_ids, relevant_ids, k=5)

    # Generation quality: answer with the chain, then let the LLM judge whether
    # the answer is supported by the retrieved documents.
    answer = qa_chain.invoke({"query": question})
    is_faithful = faithfulness_check(llm, answer["result"], retrieved_docs)

    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)
    all_mrrs.append(mrr)
    all_hits.append(hit)
    all_faith.append(is_faithful)

    print(f"Answer: {answer['result']}")
    print(f"Precision: {precision:.2f} | Recall: {recall:.2f} | F1: {f1:.2f} | MRR: {mrr:.2f} | Hit@5: {hit} | Faithful: {is_faithful}")


 Query: What are the five solid principles?
Answer: The five SOLID principles are:

1. Single Responsibility Principle
2. Open-Closed Principle
3. Liskov Substitution Principle
4. Interface Segregation Principle
5. Dependency Inversion Principle

These principles are guidelines for designing solid, maintainable, and scalable software systems.
Precision: 0.00 | Recall: 0.00 | F1: 0.00 | MRR: 0.00 | Hit@5: 0 | Faithful: 1

 Query: Why is dependency inversion important?
Answer: The context provided doesn't explicitly state why dependency inversion is important. However, based on general knowledge of the SOLID principles, dependency inversion is important because it helps to reduce coupling between classes and makes the system more modular, flexible, and easier to test.

Dependency inversion principle states that high-level modules should not depend on low-level modules, but both should depend on abstractions. This means that instead of a high-level module depending directly on a low-leve

In [None]:
# ===================== METRIC SUMMARY =====================
# Averages of the per-query metrics collected by the evaluation loop.
# Precision and recall are computed over everything the retriever returned — no
# cutoff of 10 is applied anywhere — so they are reported without an "@10" suffix.
print("\n Average Metrics Across Queries")
print(f"Precision: {np.mean(all_precisions):.2f}")  # share of retrieved documents that are relevant
print(f"Recall: {np.mean(all_recalls):.2f}")  # share of relevant documents that were retrieved
print(f"F1 Score: {np.mean(all_f1s):.2f}")  # harmonic mean of precision and recall
print(f"MRR: {np.mean(all_mrrs):.2f}")  # Mean Reciprocal Rank: 1 / rank of the first relevant document
print(f"Hit@5: {np.mean(all_hits):.2f}")  # share of queries with a relevant document in the top 5
print(f"Faithfulness: {np.mean(all_faith):.2f}")  # share of answers the LLM judge marked as supported


 Average Metrics Across Queries
Precision: 0.17
Recall: 0.33
F1 Score: 0.22
MRR: 0.33
Hit@5: 0.33
Faithfulness: 0.67
