In [9]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
# Show the kernel's current working directory before it is changed below.
%pwd

'/Users/gaurabacharya/Documents/Medical-RAG-Chatbot/research'

In [11]:
import os

# The notebook starts inside the `research/` folder, while later cells use
# paths such as 'data/' that are relative to the parent directory.
# Step up only while still inside `research/`, so re-running this cell does
# not keep climbing one directory further each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [12]:
# Confirm the working directory is now one level above `research/`.
%pwd

'/Users/gaurabacharya/Documents/Medical-RAG-Chatbot'

In [13]:
def load_pdf_file(data):
    """Load the PDF files of a directory through ``DirectoryLoader``.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    # show_progress=True keeps a progress bar visible while the files load.
    pdf_loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
    )
    return pdf_loader.load()

In [14]:
# Load the PDFs under data/; in the recorded run the single file took about 11 minutes.
extracted_data = load_pdf_file(data='data/')

100%|██████████| 1/1 [10:47<00:00, 647.07s/it]


In [None]:
#extracted_data

In [15]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
            to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [16]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 39994


In [None]:
# Preview a few chunks only: the full list holds tens of thousands of entries
# and displaying all of them would swamp the notebook output.
text_chunks[:3]

In [17]:
from langchain.embeddings import HuggingFaceEmbeddings
#from sentence_transformers import SentenceTransformer

def download_hugging_face_embeddings():
    """Create the embedding wrapper used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )



In [18]:
# Build the embedding object once; it is reused for the test query and the vector store below.
embeddings = download_hugging_face_embeddings()


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [19]:
# Sanity check: embed a short string and print the vector length
# (384 in the recorded run, the same value used as the index dimension later).
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


In [None]:
#query_result

In [36]:
from dotenv import load_dotenv
import os
# Presumably reads a local .env file so the API keys fetched in the next cell
# are available through os.environ — confirm the file exists in the project.
load_dotenv()

True

In [37]:
# Read the service credentials from the environment instead of hard-coding them.
# NOTE(review): os.environ.get returns None when a variable is unset, so a
# missing key is not reported here — confirm both values are set.
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
GOOGLE_API_KEY=os.environ.get('GOOGLE_API_KEY')

In [38]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client used below to list, create and open the index.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [8]:

import time

index_name = "medical-chatbot"  # change if desired

# Names of the indexes returned by the client.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only when it is missing, so re-running this cell does not
# attempt to create it a second time.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # 384 matches the vector length printed by the embed_query check above.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports ready.
    # NOTE(review): the loop has no timeout — it waits for as long as it takes.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [20]:
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert the embeddings into the Pinecone index, but only
# while the index is still empty. from_documents embeds and uploads every chunk
# each time it runs, so re-executing this cell unguarded would store the whole
# corpus again next to the vectors already there.
# NOTE(review): an index left partially filled by an interrupted upload is
# treated as populated; clear it manually before re-running in that case.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings
    )
else:
    # Already populated: attach to the stored vectors instead of re-uploading.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings
    )

In [23]:

# Attach to the already-populated index without embedding or uploading anything;
# this rebinds `docsearch`, so a fresh session can start here once the index is filled.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [24]:
# Similarity-search retriever configured with k=3, i.e. three chunks per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [25]:
# Try the retriever on its own with a sample question before wiring it into a chain.
retrieved_docs = retriever.invoke("How do you manage type 2 diabetes?")

In [26]:
# Inspect the retrieved chunks together with their metadata.
retrieved_docs

[Document(id='0bfb91b6-0f55-472e-9967-fe7744522a7e', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 1187.0, 'page_label': '1158', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505.0}, page_content='exercise, and blood glucose levels are as important as\nthe use of insulin or oral medications in preventing\ncomplications of diabetes. In 2003, the American\nDiabetes Association updated its Standards of Care\nfor the management of diabetes. These standards help\nmanage health care providers in the most recent\nrecommendations for diagnosis and treatment of the\ndisease.\nDietary changes\nDiet and moderate exercise are the first treatments\nimplemented in diabetes. For many Type II diabetics,'),
 Document(id='e7ec0ade-3eb6-415c-ac45-d5fe78f4a79b', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat

In [55]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model that generates the answers; the key is the one read from the environment above.
llm = ChatGoogleGenerativeAI(google_api_key=GOOGLE_API_KEY, model="gemini-2.0-flash")

In [56]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage



# Adjacent string literals are joined with nothing in between, so every
# sentence except the last carries its own trailing space; without it the
# model receives run-together text such as "...this chat.If you don't...".
system_prompt = (
    "I want you to act as a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If the question is not medical related or the answer is not in the "
    "retrieved context, say it is out of the context of this chat. "
    "If you don't know the answer, say that you don't know. "
    "Use five sentences maximum and keep the answer concise.\n\n"
    "{context}"
)

# Sample question; the invoke cell further down repeats the same text as a literal.
question = "How do you manage type 2 diabetes?"

# The system message is supplied as a ("system", template) tuple so that
# {context} is handled as a template variable. A ready-made SystemMessage
# instance would presumably be inserted as-is, leaving {context} unfilled —
# which is why the earlier SystemMessage-based variant was dropped.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

In [57]:
# Answering step: the LLM combined with the prompt defined above
# (presumably the retrieved documents are placed into the prompt's {context} slot).
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full pipeline: the retriever feeds the answering step.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [58]:
# Run the whole pipeline on the sample question; the generated text is under the "answer" key.
response = rag_chain.invoke({"input": "How do you manage type 2 diabetes?"})
print(response["answer"])

Diet and moderate exercise are the first treatments implemented in diabetes. Maintaining a healthy lifestyle is critical to preventing the onset of Type II diabetes and preventing further complications of the disease. A doctor can recommend a proper diet, and there are many cookbooks available for diabetics. In addition, exercise and blood glucose levels are as important as the use of insulin or oral medications in preventing complications of diabetes.
