In [1]:
# `langchain` is imported directly by this notebook (text_splitter, embeddings, chains),
# so install it explicitly instead of relying on it arriving as a transitive dependency.
# It is held below 1.0 because those legacy modules are not part of the 1.x `langchain` package.
%pip install --upgrade "langchain<1" langchain-community langchain-google-genai faiss-cpu \
    pypdf torch torchvision torchaudio sentence-transformers python-dotenv google-genai

Note: you may need to restart the kernel to use updated packages.


In [17]:
import os

from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

In [18]:
# Load API key
# -----------------------
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# GEMINI_API_KEY is this notebook's variable name; GOOGLE_API_KEY is accepted as a
# fallback because it is the name the Google GenAI integration looks for itself.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")

# Fail here with a clear message rather than at the first LLM call further down.
if not GEMINI_API_KEY:
    raise RuntimeError(
        "No Gemini API key found. Set GEMINI_API_KEY (or GOOGLE_API_KEY) in the "
        "environment or in a .env file before running this notebook."
    )


In [19]:
# -----------------------
def load_pdf_file(data_path):
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory handed to ``DirectoryLoader``; files are selected
            with the glob pattern ``*.pdf`` and parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    pdf_loader = DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    documents = pdf_loader.load()
    return documents


In [20]:
# Read every PDF in the local Data/ folder.
extracted_data = load_pdf_file("Data/")

# Stop immediately if nothing was loaded: an empty document list would otherwise
# only surface later, as a much less helpful error when the vector index is built.
if not extracted_data:
    raise FileNotFoundError(
        "No PDF content was loaded from 'Data/'. Add at least one .pdf file to that folder."
    )




Chunks: 5961


In [31]:
# Splitter configuration: chunk_size=500 with chunk_overlap=50
# (both measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)



In [30]:
# Break the loaded documents into retrieval-sized chunks.
text_chunks = text_splitter.split_documents(documents=extracted_data)

In [29]:
# Sanity check: report how many chunks the splitter produced.
print(f"Chunks: {len(text_chunks)}")

Chunks: 5961


In [21]:
# all-MiniLM-L6-v2 is a plain sentence-transformers model, so use the generic
# HuggingFace wrapper. The BGE-specific class prepends a BGE retrieval instruction
# to every query (but not to documents), which skews query embeddings for a model
# that was not trained with that instruction.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')



In [23]:
# Embed every chunk and build a FAISS index over the resulting vectors.
vectorstore = FAISS.from_documents(
    documents=text_chunks,
    embedding=embeddings,
)


In [22]:
# Expose the index as a retriever: similarity search, k=3 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)


In [24]:
# Gemini chat model that writes the final answer from the retrieved context.
llm = ChatGoogleGenerativeAI(
    google_api_key=GEMINI_API_KEY,
    model="gemini-1.5-flash",  # author's note: free & fast
    max_output_tokens=300,     # cap on the length of each reply
    temperature=0.2,           # low sampling temperature
)

E0000 00:00:1758274616.338918 3668417 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [25]:
# Retrieval Chain
# -----------------------
# System instructions; "{context}" is the placeholder for the retrieved text.
# Written line by line so the exact whitespace of the template is visible.
system_prompt = (
    "You are a helpful medical assistant.\n"
    "Use the following context to answer concisely.\n"
    "If you don’t know, say 'I don’t know'. \n"
    "Maximum 3 sentences.\n"
    "\n"
    "{context}"
)

# Two-message chat template: the system instructions, then the user's "{input}".
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)


In [26]:
# Chain that fills the prompt with the supplied documents and calls the LLM.
qa_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
)


In [27]:
# Full RAG pipeline: retrieve documents for the input, then pass them to qa_chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qa_chain,
)

In [28]:
# Test the chatbot
# -----------------------
# One end-to-end query through the chain; only the generated answer is shown.
response = rag_chain.invoke(
    {"input": "What is acne and how is it treated?"}
)
print(f"Answer: {response['answer']}")

Answer: Acne is a common skin disease with pimples on the face, chest, and back, caused by clogged pores from oil, dead skin cells, and bacteria.  Treatment options are discussed in several of the provided articles but specifics aren't listed.


In [35]:
def ask_question(query: str):
    """
    Run one query through the RAG chain and report where the answer came from.

    Args:
        query: The user's question; sent to the chain under the "input" key.

    Returns:
        A tuple ``(answer, sources)``: the chain's "answer" value and a list
        with one entry per retrieved document, holding that document's
        metadata "source" value or "Unknown source" when the key is absent.
        The list is empty if the chain result has no "context" entry.
    """
    result = rag_chain.invoke({"input": query})

    # Generated reply.
    reply = result["answer"]

    # Retrieved documents, if the chain returned any.
    retrieved_docs = result["context"] if "context" in result else ()
    origins = [
        doc.metadata.get("source", "Unknown source")
        for doc in retrieved_docs
    ]

    return reply, origins
