In [1]:
%pwd

'g:\\GKML\\2025_GKTrainings_GenAI\\GKGenAIProjects_2025\\Medical-Chatbot\\research'

In [2]:
import os
os.chdir('../')

In [3]:
%pwd

'g:\\GKML\\2025_GKTrainings_GenAI\\GKGenAIProjects_2025\\Medical-Chatbot'

# Extract Data from PDF files

In [4]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
def load_pdf_file(data):
    """Load the files matching ``*.pdf`` in a directory as LangChain documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        file being parsed with ``PyPDFLoader``.
    """
    pdf_loader_options = {
        "path": data,
        "glob": "*.pdf",
        "loader_cls": PyPDFLoader,
    }
    return DirectoryLoader(**pdf_loader_options).load()

In [7]:
extracted_data = load_pdf_file(data='Data/')

In [8]:
type(extracted_data)

list

In [9]:
len(extracted_data)

637

# Split the documents to text chunks

In [10]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping text chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [11]:
text_chunks = text_split(extracted_data)
print("Length of chunks : ",len(text_chunks))

Length of chunks :  5860


# Embedding Model

In [12]:

# Using Google Gemini Embeddings instead of HuggingFace
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Initialize Google Gemini Embeddings
def get_gemini_embeddings(model="models/embedding-001"):
    """Build the Google Gemini embedding client used for indexing and querying.

    Args:
        model: Embedding model identifier. Defaults to the model the notebook
            was originally written against.

    Returns:
        A ``GoogleGenerativeAIEmbeddings`` instance for ``model``.
    """
    # This function is called before the notebook's own dotenv cell runs, so on
    # a fresh kernel the key stored in .env would not be loaded yet. Load it
    # here; load_dotenv() leaves already-set variables untouched by default.
    from dotenv import load_dotenv

    load_dotenv()
    return GoogleGenerativeAIEmbeddings(model=model)

embeddings = get_gemini_embeddings()


In [13]:

# Embed a probe sentence: the length of the returned vector is the embedding
# dimension, which is the value the Pinecone index must be created with.
query_result = embeddings.embed_query("Hello world")
print("Embedding dimension:", len(query_result))


Context Length 768


# Load environment variables from .env file

In [14]:

from dotenv import load_dotenv
import os
load_dotenv()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')  # Added for Gemini
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

In [15]:
# Re-export the keys so the client libraries can read them from the process
# environment. Assigning None to os.environ raises an opaque TypeError, so
# check first and report exactly which variables are missing or empty.
_api_keys = {
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'GOOGLE_API_KEY': GOOGLE_API_KEY,
    'GROQ_API_KEY': GROQ_API_KEY,
}
_missing_keys = [name for name, value in _api_keys.items() if not value]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the .env file or the shell environment."
    )
os.environ.update(_api_keys)

# Vector Store

- Convert the text chunks to embedding vectors and store them in a vector store (Pinecone vector DB)
- Pinecone index creation

In [16]:
from pinecone.grpc import PineconeGRPC as Pinecone 
from pinecone import ServerlessSpec
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'gkbot'

## Pinecone index creation

In [19]:
index_list = pc.list_indexes().names()
index_list

['medicalbot']

In [22]:
# Ask Pinecone for the current index names at decision time instead of relying
# on `index_list` from an earlier cell, which may be stale if cells are re-run
# or run out of order.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the embedding vector length printed earlier
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

## Embed and upsert to Pinecone 

In [23]:
from langchain_pinecone import PineconeVectorStore

# from_documents() embeds and upserts every chunk each time it is called, so
# re-running this cell against an already-populated index would pay the
# embedding cost again and store the same chunks a second time.
# Ingest only when the index is empty; otherwise attach to what is stored.
existing_vector_count = pc.Index(index_name).describe_index_stats().total_vector_count
if existing_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        index_name=index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

## Load Existing index

In [24]:
# Attach to the index named `index_name` as it already exists in Pinecone,
# using the same embedding model for query-time embedding.
# NOTE(review): `docsearch1` is not referenced by any later cell visible here;
# the retriever below is built from `docsearch`.
docsearch1 = PineconeVectorStore.from_existing_index(index_name=index_name,
                                                     embedding=embeddings)

# Create retriever

In [26]:
# Similarity-search retriever over the Pinecone store; "k" is the number of
# chunks requested per query.
retrieval_settings = {"k": 3}
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=retrieval_settings,
)


In [27]:
retrieved_docs = retriever.invoke("Antianxiety drugs")
retrieved_docs

[Document(id='b53cc754-5cfa-4b04-a1be-d1ddc4043a2d', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 246.0, 'page_label': '247', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='of buspirone at the same time as mono-amine oxidase\ninhibitors (MAOIs, phenelzine, tranycypromine) may\ncause severe blood pressure elevations. Use of buspirone\nwith MAOIs should be avoided.\nSamuel Uretsky, PharmD\nAntiarrhythmic drugs\nDefinition\nAntiarrhythmic drugs are medicines that correct irreg-\nular heartbeats and slow down hearts that beat too fast.\nKEY TERMS\nAnxiety—Worry or tension in response to real or\nimagined stress, danger, or dreaded situations.'),
 Document(id='3bdc987d-2f78-4740-8d36-589521d18a2f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 333.0, 'page_label': '334', 'p

In [28]:
# Print the text of every retrieved chunk, each followed by an 80-dash rule.
separator = '-'*80
for doc in retrieved_docs:
    print(doc.page_content, separator, sep="\n")

of buspirone at the same time as mono-amine oxidase
inhibitors (MAOIs, phenelzine, tranycypromine) may
cause severe blood pressure elevations. Use of buspirone
with MAOIs should be avoided.
Samuel Uretsky, PharmD
Antiarrhythmic drugs
Definition
Antiarrhythmic drugs are medicines that correct irreg-
ular heartbeats and slow down hearts that beat too fast.
KEY TERMS
Anxiety—Worry or tension in response to real or
imagined stress, danger, or dreaded situations.
--------------------------------------------------------------------------------
work directly on the chemicals in the brain that are
thought to underlie the anxiety.
ANXIOLYTICS. Anxiolytics are sometimes called
tranquilizers. Most anxiolytic drugs are either benzodi-
azepines or barbiturates. Barbiturates, once commonly
used, are now rarely used in clinical practice. Barbitu-
rates work by slowing down the transmission of nerve
impulses from the brain to other parts of the body. They
include such drugs as phenobarbital (Luminal) 

# LLM

In [29]:
from langchain_groq import ChatGroq
# Groq-hosted chat model used as the answer generator by both chains below.
# NOTE(review): no API key is passed explicitly — presumably read from the
# GROQ_API_KEY environment variable exported earlier; confirm. Also confirm
# this model id is still offered by the provider before re-running.
llm = ChatGroq(model="llama3-70b-8192")

In [30]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

In [31]:

# System instructions for the QA assistant. The adjacent string literals are
# concatenated into one string; the trailing "{context}" placeholder is the
# template variable that receives the retrieved document text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [32]:

# Two-message chat prompt: the system instructions (which carry the
# "{context}" slot) followed by the user's question in the "{input}" slot.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

## Legacy Chains

In [33]:
# Combine the LLM and the prompt into a document-answering chain, then wrap it
# with the retriever. The resulting chain is invoked with {"input": question}
# and returns a dict that includes the 'input', 'context' and 'answer' keys.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [34]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response)


{'input': 'what is Acromegaly and gigantism?', 'context': [Document(id='11540a84-238e-4cc7-9305-64be88c75b68', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 45.0, 'page_label': '46', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='Whitehouse Station, NJ: Merck Research Laboratories,\n1997.\nLarsen, D. E., ed. Mayo Clinic Family Health Book.New York:\nWilliam Morrow and Co., Inc., 1996.\nJohn T. Lohr, PhD\nAcromegaly and gigantism\nDefinition\nAcromegaly is a disorder in which the abnormal\nrelease of a particular chemical from the pituitary gland\nin the brain causes increased growth in bone and soft tis-\nsue, as well as a variety of other disturbances throughout\nthe body. This chemical released from the pituitary gland'), Document(id='7afdc60c-2733-408b-8331-8411877aedb4', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPD

In [35]:
print(response['answer'])

Acromegaly is a disorder in which the abnormal release of a particular chemical from the pituitary gland in the brain causes increased growth in bone and soft tissue, as well as a variety of other disturbances throughout the body. This leads to physical changes such as a thickening of the skin, a coarsening of facial features, and a deeper, hollow-sounding voice. Gigantism is a related condition that occurs when the same hormonal imbalance occurs in children or adolescents, leading to excessive growth and tall stature.


In [36]:

response1 = rag_chain.invoke({"input": "What is stats?"})
print(response1["answer"])

I don't know what "stats" refers to in this context. It's possible that it's an abbreviation for "statistics," but without more information, I can't provide a specific answer.


## LCEL Chains

In [37]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [38]:
def _join_page_contents(docs):
    """Concatenate the retrieved documents' text, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# LCEL pipeline: retrieve -> flatten documents to text -> prompt -> LLM -> str.
# NOTE: this rebinds `rag_chain` (previously the legacy retrieval chain). From
# here on it is invoked with a plain question string and returns a plain string.
rag_chain = (
    {
        "context": retriever | _join_page_contents,
        "input": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [39]:
# Usage
response = rag_chain.invoke("what is Acromegaly and gigantism?")
print(response)

Acromegaly is a disorder in which the abnormal release of a particular chemical from the pituitary gland in the brain causes increased growth in bone and soft tissue, as well as a variety of other disturbances throughout the body. This leads to physical changes such as thick and doughy skin, a coarsened facial feature, and a deeper voice. Gigantism is a related condition that occurs when the abnormal growth hormone release begins in childhood or adolescence, leading to abnormal growth and tall stature.


In [41]:
response = rag_chain.invoke("What is stats?")
print(response)

I don't know what "stats" refers to in this context. It's possible that it's an abbreviation for "statistics," but without more information, I can't provide a specific answer.
