##### Step 1

In [None]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()

# MongoDB
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
VECTOR_INDEX = os.getenv("VECTOR_INDEX")

# Create a new client and connect to the server
client = MongoClient(MONGO_URI)
collection = client[DB_NAME][COLLECTION_NAME]

# Send a ping to confirm a successful connection
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


##### Step 2

In [30]:
# ingest documents
from langchain_community.document_loaders import PyPDFLoader

file_path = "Swapnil_Resume_Nov.pdf"

loader = PyPDFLoader(file_path)
docs = loader.load()
docs

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-03-26T10:09:11+05:30', 'author': 'Austin, Heather', 'moddate': '2025-03-26T10:09:11+05:30', 'source': 'Swapnil_Resume_Nov.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Swapnil Katiyar  \n+91-7503533658     •     swapnil240695@gmail.com    •     Noida, India \nFront-End Developer \nNext.js    |    React.js    |    JavaScript (ES6+)    |    Git   |    Jira \nPassionate Front-End Developer skilled in HTML, CSS, JavaScript, React, and Next.js, with hands-on experience in \nbuilding responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and \ncrafting exceptional user experiences through collaboration and innovation. \nPROFESSIONAL SKILLS \n• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js \n• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS \n• Version Control

##### Step 3

In [31]:
# clean documents
import re


def clean_text(text):
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r"Page \d+", "", text)
    text = re.sub(r"\(cid:\d+\)", "", text)
    text = re.sub(r"[-_]{2,}", "", text)
    return text.strip()


for doc in docs:
    doc.page_content = clean_text(
        doc.page_content
    )  ## why is this line required since docs is directly used in text_splitter
    print(doc.page_content)

Swapnil Katiyar  
+91-7503533658     •     swapnil240695@gmail.com    •     Noida, India 
Front-End Developer 
Next.js    |    React.js    |    JavaScript (ES6+)    |    Git   |    Jira 
Passionate Front-End Developer skilled in HTML, CSS, JavaScript, React, and Next.js, with hands-on experience in 
building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and 
crafting exceptional user experiences through collaboration and innovation. 
PROFESSIONAL SKILLS 
• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js 
• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS 
• Version Control & Collaboration: Git, Jira 
• Testing: React Testing Library, Jest, Vitest 
WORK EXPERIENCE 
Treeroot Informatics – Ahmedabad, Gujarat, India January 2023 – April 2024 
Front End Developer 
• Worked closely with a team of 8 developers to deliver high-quality front-end features for projects, utilizing 
React and 

##### Step 4

In [32]:
# split to chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=80)
chunks = splitter.split_documents(docs)
chunks

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-03-26T10:09:11+05:30', 'author': 'Austin, Heather', 'moddate': '2025-03-26T10:09:11+05:30', 'source': 'Swapnil_Resume_Nov.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Swapnil Katiyar  \n+91-7503533658     •     swapnil240695@gmail.com    •     Noida, India \nFront-End Developer \nNext.js    |    React.js    |    JavaScript (ES6+)    |    Git   |    Jira \nPassionate Front-End Developer skilled in HTML, CSS, JavaScript, React, and Next.js, with hands-on experience in'),
 Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-03-26T10:09:11+05:30', 'author': 'Austin, Heather', 'moddate': '2025-03-26T10:09:11+05:30', 'source': 'Swapnil_Resume_Nov.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='building responsive and interactive web applications. Strong focus on delivering high

##### Step 5

In [None]:
# create embeddings and insert into vector database
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

for i, chunk in enumerate(chunks):
    embedding = embeddings.embed_query(chunk.page_content)

    collection.insert_one(
        {
            "text": chunk.page_content,
            "embedding": embedding,
            "metadata": {
                "page": chunk.metadata.get("page"),
                "source": file_path,
                "chunk_id": i,
            },
        }
    )

"""
Note : after this step you can preform verctorSearch (step 6 & 7) or 
convert db to a retriever class (step 8 & 9)
"""

In [38]:
collection.count_documents({})


13

In [39]:
doc = collection.find_one()
type(doc["embedding"]), len(doc["embedding"])

(list, 3072)

##### Step 6

In [40]:
# perform vector search and retrieve relevant contexts

query = "List all the professional skills"

query_embedding = embeddings.embed_query(query)

pipeline = [
    {
        "$vectorSearch": {
            "index": VECTOR_INDEX,
            "path": "embedding",
            "queryVector": query_embedding,
            "numCandidates": 100,
            "limit": 5,
        }
    },
    {
        "$project": {
            "_id": 0,
            "text": 1,
            "metadata": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    },
]

results = list(collection.aggregate(pipeline))
results

[{'text': 'building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and \ncrafting exceptional user experiences through collaboration and innovation. \nPROFESSIONAL SKILLS \n• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js',
  'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 1},
  'score': 0.6837207078933716},
 {'text': 'work focused on front-end development technologies. \n• Mentored 8+ aspiring developers, offering guidance and support to accelerate their skill development. \n• Completed diverse self-learning projects, showcasing practical skills and a commitment to improvement.',
  'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 8},
  'score': 0.6762061715126038},
 {'text': '• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js \n• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS \n• Version Control & Collabora

In [79]:
for i, res in enumerate(results):
    print(f"Result {i+1}: {res['text']}\n")

Result 1: building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and 
crafting exceptional user experiences through collaboration and innovation. 
PROFESSIONAL SKILLS 
• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js

Result 2: work focused on front-end development technologies. 
• Mentored 8+ aspiring developers, offering guidance and support to accelerate their skill development. 
• Completed diverse self-learning projects, showcasing practical skills and a commitment to improvement.

Result 3: • Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js 
• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS 
• Version Control & Collaboration: Git, Jira 
• Testing: React Testing Library, Jest, Vitest 
WORK EXPERIENCE

Result 4: EDUCATION   
ABES Engineering College – Ghaziabad, Uttar Pradesh, India 
Bachelor of Technology – Civil Engineering, June 2018 
 
PROFILES 


##### Step 7

In [None]:
# generate answer with LLM
llm = ChatOpenAI(model="gpt-4o", temperature=0, streaming=True)

contexts = [r["text"] for r in results]
context_text = "\n\n".join(contexts)  ## It combines multiple text chunks into one single string, separating each chunk with two newlines.

prompt = f"""
You are a helpful assistant.
Answer ONLY using the context below.
If the answer is not found, say "Not found in the document."

Context:
{context_text}

Question:
{query}
"""

## explore if retrieval_chain and ChatPromptTemplate can be used here...

answer = llm.stream(prompt)

full_answer = ""
for chunk in answer:
    full_answer += chunk.content

print(full_answer)

• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js
• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS
• Version Control & Collaboration: Git, Jira
• Testing: React Testing Library, Jest, Vitest


##### Step 8

In [41]:
# Create a LangChain wrapper around existing MongoDB collection for retrieval

from langchain_mongodb import MongoDBAtlasVectorSearch

vectorstore = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embeddings,
    index_name="vector_index"
)
vectorstore

<langchain_mongodb.vectorstores.MongoDBAtlasVectorSearch at 0x1b360ab7770>

In [None]:
# checking if documents are retrieved through similarity search (only for debugging)

vectorstore.similarity_search(query, k=5)

[Document(id='696d0cd901ff9b3c3c054ddd', metadata={'_id': '696d0cd901ff9b3c3c054ddd', 'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 1}}, page_content='building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and \ncrafting exceptional user experiences through collaboration and innovation. \nPROFESSIONAL SKILLS \n• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js'),
 Document(id='696d0cdd01ff9b3c3c054de4', metadata={'_id': '696d0cdd01ff9b3c3c054de4', 'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 8}}, page_content='work focused on front-end development technologies. \n• Mentored 8+ aspiring developers, offering guidance and support to accelerate their skill development. \n• Completed diverse self-learning projects, showcasing practical skills and a commitment to improvement.'),
 Document(id='696d0cda01ff9b3c3c054dde', metadata={'_id': '696d0cda01ff9b3c3c054

##### Step 9

In [44]:
# Create a Retriever

query = "List all the professional skills"

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},  # retrieve more → better recall → rerank implicitly
)

retrieval_results = retriever.invoke(query)

for i, res in enumerate(retrieval_results):
    print(f"Result {i+1}: {res.page_content}\n")

Result 1: building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and 
crafting exceptional user experiences through collaboration and innovation. 
PROFESSIONAL SKILLS 
• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js

Result 2: work focused on front-end development technologies. 
• Mentored 8+ aspiring developers, offering guidance and support to accelerate their skill development. 
• Completed diverse self-learning projects, showcasing practical skills and a commitment to improvement.

Result 3: • Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js 
• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS 
• Version Control & Collaboration: Git, Jira 
• Testing: React Testing Library, Jest, Vitest 
WORK EXPERIENCE

Result 4: EDUCATION   
ABES Engineering College – Ghaziabad, Uttar Pradesh, India 
Bachelor of Technology – Civil Engineering, June 2018 
 
PROFILES 


##### Step 10

In [45]:
# create a chat prompt template

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a helpful assistant. "
        "Answer strictly using the provided context. "
        "If the answer is not in the context, say you don't know."
    ),
    (
        "human",
        "Context:\n{context}\n\nQuestion:\n{input}"
    )
])


##### Step 11

In [46]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0
)

document_chain = create_stuff_documents_chain(llm, prompt) ## responsible to fill the context in the prompt
retrieval_chain = create_retrieval_chain(retriever, document_chain)

##### Step 10

In [50]:
# Invoke the chain

response = retrieval_chain.invoke(
    {"input": query}
)

print(response["answer"])

- Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js
- Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS
- Version Control & Collaboration: Git, Jira
- Testing: React Testing Library, Jest, Vitest


##### Add Re-ranking & Add Re-ranking  

In [49]:
def retrieve_with_scores(query, k=20):
    results = vectorstore.similarity_search_with_score(query, k=k)
    # results = [(Document, distance), ...]
    return results


def distance_to_similarity(distance):
    return 1 / (1 + distance)


##### Re-rank results

In [51]:
def rerank_results(docs_with_scores, top_n=5):
    reranked = []

    for doc, distance in docs_with_scores:
        similarity = distance_to_similarity(distance)
        reranked.append({
            "doc": doc,
            "distance": distance,
            "similarity": similarity
        })

    # Higher similarity = better
    reranked.sort(key=lambda x: x["similarity"], reverse=True)

    return reranked[:top_n]


##### Build score-aware RetrievalChain

In [52]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    """
You are a helpful assistant.

Use ONLY the context below to answer.
Each context chunk has a similarity score (higher = more relevant).

Context:
{context}

Question:
{question}

Answer:
"""
)

##### Format context WITH scores

In [53]:
def format_context(reranked_docs):
    contexts = []
    for item in reranked_docs:
        text = item["doc"].page_content
        score = round(item["similarity"], 3)
        contexts.append(f"[Score: {score}]\n{text}")
    return "\n\n".join(contexts)


##### Full Retrieval + Generation Chain

In [54]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0)

def rag_with_reranking(query):
    # 1. Retrieve
    raw_results = retrieve_with_scores(query, k=20)

    # 2. Rerank
    reranked = rerank_results(raw_results, top_n=5)

    # 3. Format context
    context_text = format_context(reranked)

    # 4. Prompt LLM
    response = llm.invoke(
        prompt.format(
            context=context_text,
            question=query
        )
    )

    return response.content, reranked


In [55]:
answer, ranked_chunks = rag_with_reranking(
    "List all the professional skills"
)

print(answer)

print("\n--- Debug scores ---")
for item in ranked_chunks:
    print(item["similarity"], item["doc"].metadata)


- Version control best practices using Git
- Building responsive and interactive user interfaces with React and Next.js
- Using libraries such as Material-UI and React Hook Form
- Collaborating with architects and construction teams
- Structural analysis and design
- Ensuring accuracy and compliance with industry standards

--- Debug scores ---
0.6242476669708922 {'_id': '696d0cdc01ff9b3c3c054de2', 'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 6}}
0.6193117042757317 {'_id': '696d0cdb01ff9b3c3c054de0', 'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 4}}
0.6187335822151624 {'_id': '696d0cde01ff9b3c3c054de7', 'metadata': {'page': 1, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 11}}
0.6177198900040497 {'_id': '696d0cdd01ff9b3c3c054de5', 'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 9}}
0.6122028330281968 {'_id': '696d0cde01ff9b3c3c054de6', 'metadata': {'page': 0, 'source': 'Swapnil_Resume_Nov.pdf', 'chunk_id': 1

##### Debugging the retrieval chain

In [65]:
# retrieving with scores

results = vectorstore.similarity_search_with_score(query, k=20)
print(len(results))

for doc, score in results:
    print(doc.page_content)
    print(score)
    print("\n")

13
building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and 
crafting exceptional user experiences through collaboration and innovation. 
PROFESSIONAL SKILLS 
• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js
0.6837207078933716


work focused on front-end development technologies. 
• Mentored 8+ aspiring developers, offering guidance and support to accelerate their skill development. 
• Completed diverse self-learning projects, showcasing practical skills and a commitment to improvement.
0.6762061715126038


• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js 
• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS 
• Version Control & Collaboration: Git, Jira 
• Testing: React Testing Library, Jest, Vitest 
WORK EXPERIENCE
0.6723581552505493


EDUCATION   
ABES Engineering College – Ghaziabad, Uttar Pradesh, India 
Bachelor of Technology – Civil Engineering, 

In [None]:
# calculate the similarity_to_distance 

for doc, distance in results:
    print(doc.page_content) 
    print(1 / (1 + distance))
    print("\n")

building responsive and interactive web applications. Strong focus on delivering high-quality, maintainable code and 
crafting exceptional user experiences through collaboration and innovation. 
PROFESSIONAL SKILLS 
• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js
0.5939227303625519


work focused on front-end development technologies. 
• Mentored 8+ aspiring developers, offering guidance and support to accelerate their skill development. 
• Completed diverse self-learning projects, showcasing practical skills and a commitment to improvement.
0.5965853228529775


• Frontend Development: HTML5, CSS3, JavaScript, TypeScript, React, Next.js 
• Frameworks/Libraries: Redux, Material-UI, React Hook Form, Tailwind CSS 
• Version Control & Collaboration: Git, Jira 
• Testing: React Testing Library, Jest, Vitest 
WORK EXPERIENCE
0.5979580371945996


EDUCATION   
ABES Engineering College – Ghaziabad, Uttar Pradesh, India 
Bachelor of Technology – Civil Engineering, Jun

In [97]:
# re-rank results

reranked = []

for doc, distance in results:
    similarity = 1 / (1 + distance)
    reranked.append({"doc": doc, "distance": distance, "similarity": similarity})

reranked.sort(key=lambda x: x['similarity'], reverse=True)
top_5 = reranked[:5]

for item in top_5:
    print(item['similarity'], item['doc'].page_content)
    print("\n")


0.6242476669708922 ensuring alignment and progress within the team. 
• Followed version control best practices using Git, ensuring seamless collaboration, clear commit history, 
and reliable code integration across the team.


0.6193117042757317 React and Next.js to build responsive and interactive user interfaces. 
• Leveraged libraries such as Material-UI and React Hook Form to streamline UI components, improve form 
handling, and enhance the overall user experience.


0.6187335822151624 Swapnil Katiyar  
 
• Collaborated with architects and construction teams to optimize designs for efficiency and cost-effectiveness, 
resulting in successful project outcomes. 
EDUCATION   
ABES Engineering College – Ghaziabad, Uttar Pradesh, India


0.6177198900040497 Econstruct Design and Build Pvt. Ltd.- Bangalore, Karnataka, India        June 2019 – January 2022 
Structural Design Engineer 
• Applied structural analysis and design software to ensure accuracy and compliance with industry standards

In [101]:
# format context

contexts = []

for item in top_5:
    text = item["doc"].page_content
    score = round(item["similarity"], 3)
    contexts.append(f"[Score: {score}]\n{text}")

final_context = "\n\n".join(contexts)
final_context


'[Score: 0.624]\nensuring alignment and progress within the team. \n• Followed version control best practices using Git, ensuring seamless collaboration, clear commit history, \nand reliable code integration across the team.\n\n[Score: 0.619]\nReact and Next.js to build responsive and interactive user interfaces. \n• Leveraged libraries such as Material-UI and React Hook Form to streamline UI components, improve form \nhandling, and enhance the overall user experience.\n\n[Score: 0.619]\nSwapnil Katiyar  \n \n• Collaborated with architects and construction teams to optimize designs for efficiency and cost-effectiveness, \nresulting in successful project outcomes. \nEDUCATION   \nABES Engineering College – Ghaziabad, Uttar Pradesh, India\n\n[Score: 0.618]\nEconstruct Design and Build Pvt. Ltd.- Bangalore, Karnataka, India        June 2019 – January 2022 \nStructural Design Engineer \n• Applied structural analysis and design software to ensure accuracy and compliance with industry standa