In [1]:
# Just to check if the environment is working or not
print("Hello world!")

Hello world!


In [2]:
# Checking the path
%pwd

'f:\\Medical_Chatbot\\Medical_Chatbot\\research'

In [3]:
# Change the working directory to the project root folder, which will make things easier
import os
os.chdir('../')
%pwd

'f:\\Medical_Chatbot\\Medical_Chatbot'

In [4]:
# To be able to process the pdf files, importing the PDF Loader, Directory Loader, and also, a text splitter to convert the text into smaller chunks
# %pip install langchain
# %pip install -U langchain-community
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Extract data from the pdf files
def load_pdf_files(data_directory):
    """Read the PDF files in ``data_directory`` and return their contents.

    A ``DirectoryLoader`` is pointed at the folder with the ``*.pdf`` glob and
    ``PyPDFLoader`` as the per-file loader.

    Args:
        data_directory: Path of the folder that holds the PDF files.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(
        data_directory,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    ).load()

In [9]:
# %pip install pypdf

extracted_data = load_pdf_files('./data/')

Collecting pypdf
  Using cached pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Using cached pypdf-5.3.0-py3-none-any.whl (300 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.3.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
# Let's take a look at the data
extracted_data[:10]

[Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2016-02-07T11:23:03+07:00', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505, 'page': 0, 'page_label': 'i'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2016-02-07T11:23:03+07:00', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505, 'page': 1, 'page_label': 'ii'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2016-02-07T11:23:03+07:00', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505, 'page': 2, 'page_label': 'iii-1'}, page_content='T

In [11]:
# Now, let's split the data into smaller chunks
def split_data(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split the loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of ``load_pdf_files``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``; defaults to 500,
            the value this notebook has always used.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``; defaults to 20.

    Returns:
        The chunks produced by ``text_splitter.split_documents(extracted_data)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [12]:
data_chunks = split_data(extracted_data)
print(f'Length of the data chunks: {len(data_chunks)}')

Length of the data chunks: 39994


In [13]:
# Now, we need to generate embeddings of the data chunks. So, for that, we will download an embeddings model from Hugging Face
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings_model(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embeddings model used to embed the data chunks.

    Args:
        model_name: Hugging Face model id passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook was written for. If you change it,
            make sure the Pinecone index dimension matches the new model's output
            dimension.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

NOTE: When creating an index in the Pinecone vector DB, we need to specify the dimension of our embedding vectors. So, make sure that you check the output dimension of whichever embedding model you use before creating the index.

For example, the model that we are using to create embeddings outputs a 384-dimensional dense vector.

In [15]:
%pip install sentence-transformers

embeddings_model = download_embeddings_model()

Collecting sentence-transformersNote: you may need to restart the kernel to use updated packages.

  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-11.1.0-cp3

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
# Test the embeddings model
embeddings_model.embed_query('This is a test sentence')

[0.07155238837003708,
 0.06848026812076569,
 0.006603332236409187,
 0.10176961123943329,
 0.011122291907668114,
 0.0002454034110996872,
 0.016586650162935257,
 -0.01200536172837019,
 0.042088884860277176,
 0.04414103180170059,
 0.10785843431949615,
 -0.068990558385849,
 -0.0043989988043904305,
 0.03352139890193939,
 0.017190447077155113,
 -0.032950084656476974,
 0.030634544789791107,
 -0.045996662229299545,
 -0.0532722994685173,
 0.04025152325630188,
 0.04043767228722572,
 0.03767035901546478,
 -0.017523281276226044,
 0.01689179614186287,
 0.007484895177185535,
 0.01625249721109867,
 -0.05224785953760147,
 0.0004039568593725562,
 0.07939945161342621,
 -0.014782295562326908,
 -0.05030818283557892,
 0.0027368483133614063,
 0.05599784478545189,
 0.06447503715753555,
 0.019931472837924957,
 -0.0033759516663849354,
 0.042722590267658234,
 0.024140695109963417,
 0.009568808600306511,
 0.01153769064694643,
 -0.0006791026680730283,
 -0.12333039939403534,
 0.014848548918962479,
 0.0170719884335

Now, we need to initialize our Pinecone account so that we can store these embedding vectors in the vector DB.

### Creating the Serverless Index on Pinecone

In [17]:
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [22]:
%pip install pinecone[grpc] langchain-pinecone

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

def create_index(index_name, dimensions, metric, cloud_service_provider="aws", region="us-east-1"):
    """Create a serverless Pinecone index.

    Args:
        index_name: Name of the index to create.
        dimensions: Dimension of the vectors the index will hold.
        metric: Similarity metric passed to Pinecone (the notebook uses 'cosine').
        cloud_service_provider: Cloud for the serverless spec; defaults to "aws".
        region: Region for the serverless spec; defaults to "us-east-1".
    """
    # The client authenticates with the module-level PINECONE_API_KEY
    client = Pinecone(api_key=PINECONE_API_KEY)
    serverless_spec = ServerlessSpec(cloud=cloud_service_provider, region=region)
    client.create_index(
        name=index_name,
        dimension=dimensions,
        metric=metric,
        spec=serverless_spec,
    )

Collecting langchain-pinecone
  Using cached langchain_pinecone-0.2.3-py3-none-any.whl.metadata (1.3 kB)
Collecting googleapis-common-protos>=1.66.0 (from pinecone[grpc])
  Using cached googleapis_common_protos-1.68.0-py2.py3-none-any.whl.metadata (5.1 kB)
Collecting grpcio>=1.59.0 (from pinecone[grpc])
  Using cached grpcio-1.70.0-cp313-cp313-win_amd64.whl.metadata (4.0 kB)
Collecting lz4>=3.1.3 (from pinecone[grpc])
  Using cached lz4-4.4.3-cp313-cp313-win_amd64.whl.metadata (3.9 kB)
Collecting protobuf<6.0,>=5.29 (from pinecone[grpc])
  Using cached protobuf-5.29.3-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone[grpc])
  Using cached protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
INFO: pip is looking at multiple versions of langchain-pinecone to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-pinecone
  Using cached langchain_pinecone-0.0.1-py3-none-

  You can safely remove it manually.
  You can safely remove it manually.


In [23]:
# Now, let's call the function to create the index
index_name = 'medicalchatbot'
# create_index(index_name, 384, 'cosine')

Now, to avoid passing the API key explicitly every time we call Pinecone to store or fetch data, we can set the loaded API keys as environment variables.

In [24]:
import os

# os.environ only accepts string values, so assigning a key that os.getenv() could not
# find (None) would raise a TypeError. Export only the keys that were actually loaded.
for _env_name, _env_value in (
    ('PINECONE_API_KEY', PINECONE_API_KEY),
    ('OPENAI_API_KEY', OPENAI_API_KEY),
):
    if _env_value is None:
        print(f'Warning: {_env_name} is not set; skipping it.')
    else:
        os.environ[_env_name] = _env_value

In [25]:
# Now, let's store the embedded chunks in the Pinecone index
from langchain_pinecone import PineconeVectorStore

document_search = PineconeVectorStore.from_documents(
    documents=data_chunks,
    embedding=embeddings_model,
    index_name=index_name
)

In [26]:
# Loading the existing index
document_search = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings_model)

In [60]:
# Now, let's try to search for a query to see whether similarity search is working or not
query = 'What is the best treatment for Cancer?'

# Initializing the document search as a retriever
retriever = document_search.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Now, let's call our query on the retriever
relevant_docs = retriever.invoke(query)

display(relevant_docs)


[Document(id='d561ee69-ae4a-4620-888f-09f4c090d03f', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2016-02-07T11:23:03+07:00', 'page': 735.0, 'page_label': '706', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505.0}, page_content='Cancer treatment can take many different forms,\nand it is always tailored to the individual patient. The\ndecision on which type of treatment is the most appro-\npriate depends on the type and location of cancer, the\nextent to which it has already spread, the patient’s age,\nsex, general health status and personal treatment pre-\nferences. The major types of treatment are: surgery,\nradiation,chemotherapy, immunotherapy, hormone\ntherapy, and bone-marrow transplantation.\nSurgery'),
 Document(id='7cf1108e-fb11-4773-9d71-e9a474b27350', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddat

In [62]:
query_bad = 'What is Acetaminophen?'
cont = retriever.invoke(query_bad)
display(cont)

[Document(id='c9e7797a-0389-4e07-afee-aac286b26271', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2016-02-07T11:23:03+07:00', 'page': 48.0, 'page_label': '19', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505.0}, page_content='Definition\nAcetaminophen is a medicine used to relievepain\nand reducefever.\nPurpose\nAcetaminophen is used to relieve many kinds of\nminor aches and pains—headaches, muscle aches,\nbackaches, toothaches, menstrual cramps, arthritis,\nand the aches and pains that often accompany colds.\nGALE ENCYCLOPEDIA OF MEDICINE 19\nAcetaminophen'),
 Document(id='5dfab686-7d85-43b0-9a2d-e3a19e52ba74', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2016-02-07T11:23:03+07:00', 'page': 48.0, 'page_label': '19', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data\\The-Gale-Encyclopedia-of-Me

In [37]:
context = "\n\n".join([doc.page_content for doc in relevant_docs])
print(context)

treat an obstruction or ‘‘poison’’ by encouraging elim-
ination anddetoxification. Tonifying herbs nourish,
support, and calm where there is a deficiency.
Treatment of diabetes
The incidence of diabetes has increased quite dra-
matically in recent years, especially in the United
States, where in general people take lessexercise, and
food is taken in greater quantity with a general reduc-
tion in quality. This has lead to a scramble to find new
solutions to the problem, and many researchers have


### Initialize the LLM (free Hugging Face model; the OpenAI alternative is commented out below)

In [27]:
# Define the query to the free hugging face model
import os
from langchain_pinecone import PineconeVectorStore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
import requests
import time

# Step 2: Define the Free LLM Query Function
def query_free_llm(prompt, max_retries=3, retry_delay=10, timeout=120):
    """Send ``prompt`` to the hosted GPT-NeoX-20B model and return its reply.

    Args:
        prompt: Full prompt text sent as the ``inputs`` field of the request.
        max_retries: How many times to retry after an HTTP 503 response.
        retry_delay: Seconds to wait between 503 retries.
        timeout: Seconds to wait for the HTTP request before ``requests`` raises.

    Returns:
        The decoded JSON body when the API answers with HTTP 200. On any other
        outcome, a string of the form ``"Error: <status> - <detail>"``. That
        includes a 503 that persists through every retry, which previously
        returned ``None``.
    """
    # Hugging Face Inference API URL for EleutherAI's GPT-NeoX-20B
    API_URL = "https://api-inference.huggingface.co/models/EleutherAI/gpt-neox-20b"

    # Read the token from the environment instead of hard-coding it in the notebook;
    # when it is not set, the request is sent without an Authorization header.
    api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    HEADERS = {"Authorization": f"Bearer {api_token}"} if api_token else {}

    def _error_message(failed_response):
        # The error body is not guaranteed to be JSON, nor a JSON object.
        try:
            payload = failed_response.json()
        except ValueError:
            payload = None
        detail = payload.get('error', 'Unknown error') if isinstance(payload, dict) else 'Unknown error'
        return f"Error: {failed_response.status_code} - {detail}"

    total_attempts = max(0, max_retries) + 1
    for attempt in range(total_attempts):
        response = requests.post(API_URL, headers=HEADERS, json={"inputs": prompt}, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            return _error_message(response)

        # 503: the model is unavailable; wait and retry a bounded number of times
        print("Model is currently unavailable. Please try again later.")
        if attempt < total_attempts - 1:
            time.sleep(retry_delay)

    # Still unavailable after every retry: report it the same way as other errors
    return _error_message(response)

In [28]:
# from langchain_openai import OpenAI

# llm = OpenAI(temperature=0.3, max_tokens=200)

In [2]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question answering. "
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, or don't get proper context from the query, just say that you don't know. "
    "Keep the answers concise and upto 3 lines max."
    "\n\n"
    "{context}"
)

In [3]:
system_prompt

"You are an assistant for question answering. Use the following pieces of context to answer the question at the end. If you don't know the answer, or don't get proper context from the query, just say that you don't know. Keep the answers concise and upto 3 lines max.\n\n{context}"

In [4]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question answering. Use the following pieces of context to answer the question at the end. If you don't know the answer, or don't get proper context from the query, just say that you don't know. Keep the answers concise and upto 3 lines max.\n\n{context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [42]:
# Truncate Context
def truncate_text(text, max_length=512):
    """Cap ``text`` at ``max_length`` characters, appending "..." when it was cut."""
    fits = len(text) <= max_length
    # Text that already fits is returned untouched; otherwise mark the cut with "..."
    return text if fits else text[:max_length] + "..."

In [None]:
# Define RAG Chain
def rag_chain(query):
    """Answer ``query`` with retrieval-augmented generation.

    The retriever supplies the relevant chunks, their text is placed into the
    system prompt as context, the question and an answer marker are appended,
    and the resulting prompt is sent to ``query_free_llm``.

    Args:
        query: The user's question.

    Returns:
        Whatever ``query_free_llm`` returns for the assembled prompt.
    """
    # Fetch the matching chunks and merge their text, separated by blank lines
    passages = [hit.page_content for hit in retriever.invoke(query)]
    filled_system_prompt = system_prompt.format(context="\n\n".join(passages))

    # Append the question and the marker that callers later split the output on.
    # (If the prompt gets too long, the context can be shortened with truncate_text.)
    question_block = f"\n\n####Question: {query}#### \n\n ####Answer####"

    return query_free_llm(filled_system_prompt + question_block)

In [44]:
# question_answer_chain = create_stuff_documents_chain(llm, prompt)
# rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [54]:
# Now, we are ready to chat with the model
query = "What is Acetaminophen?"
llm_output = rag_chain(query)
# rag_chain returns the raw query_free_llm result: a list of generations on success,
# but an error string or None on failure, so only index into it when it is a list.
if isinstance(llm_output, list) and llm_output:
    generated_text = llm_output[0]['generated_text']
    # The generated text repeats the prompt; keep what follows the answer marker,
    # or the whole text if the marker is missing.
    answer_parts = generated_text.split("####Answer####")
    response = answer_parts[1] if len(answer_parts) > 1 else generated_text
else:
    response = f"Sorry, the model could not answer ({llm_output})"
print("Bot:", response)

Bot:  Acetaminophen is a medicine used to relievepain
and reducefever.####


Physicians should be careful about measuring acetaminophen
intake in patients with hepatobiliary conditions such as
hepatitis/ cirrhosis/alcoholism/biliary obstruction/
obstructive jaundice/hepatitis C infection/perioperative
cholecystitis/cholestasis/obstructing adhesionsin the
biliary tract/ and disorders of the nervous system. "In a
clinical study of the oral administration of acetominophen
in healthy volunteers, the pharmacokinetics of acetaminophen
(paracetamol, acetaminophen, OTC) were investigated. When
10,000, 20,000, and 30 mg BID of acetaminophen were ...

####Question: Purpose of Acetaminophen
####

####Answer: Acetaminophen is used to relieve many kinds of minor aches and pains—headaches, muscle aches, backaches, toothaches, menstrual cramps, arthritis, and the aches and pains that often accompany colds. grams/day ventilator-free days in the postoperativeVentilator-free daysComplications of asthma.

In [46]:
# Asking something irrelevant
query = "What are Databases?"
llm_output = rag_chain(query)
# rag_chain returns the raw query_free_llm result: a list of generations on success,
# but an error string or None on failure, so only index into it when it is a list.
if isinstance(llm_output, list) and llm_output:
    generated_text = llm_output[0]['generated_text']
    # The generated text repeats the prompt; keep what follows the answer marker,
    # or the whole text if the marker is missing.
    answer_parts = generated_text.split("####Answer####")
    response = answer_parts[1] if len(answer_parts) > 1 else generated_text
else:
    response = f"Sorry, the model could not answer ({llm_output})"
print("Bot:", response)

Bot: 

  A database is a large collection of data that is organized so that the data can be accessed, looked up and used for various purposes.

Most commonly, a database is a series of files of digitized information that deals with topics like governments, companies, individuals.

On Globe, you can interact with the scope called Databases using Name, email id and no. of days visit.

#####Time Line#########

  1. 1- Certain Experiments made on cells in labs
  2. 2- Symptoms occur in Asthmatics and Cells  
  3. 3- Asthma is Commonly found in Young group and working group
  4. 4- Irritation occurs due to Smoking, Smoked things and poor living environment
  5. 5- Customers need everything available in your shop with well maintained products generally
  6. 6- Product is renovated as per the sound & light considering their tastes & demand
  7. 7- Cells stimulated from experimental laboratory
  8. 8- Symptoms occur in Asthmatics in relation to Darkness mainly 
  9. 9- Person had to feel well 

In [160]:
# print(response.split("####Answer####")[1])
# NOTE(review): the chat cells above assign `response` the already-extracted answer string,
# while this line indexes it like the raw rag_chain() output. It looks like it relies on
# kernel state from a different run (execution count 160) — confirm before re-running.
print((response)[0]['generated_text'].split("####Answer####")[1])


Databases are collections of computer information.

Database
 
 ####Topic Map#TC:# ADDFeadline#########################
CCSdv
CCSB
CCSLSplusplus2001
MIOSEng1101012
IntroToDatabases
CCSLSplus+00000000
Databases
CCSplus01020520 : CALINT_DALVIN_EVANS_SRN_CHO2007_P0706586MD
: Mass Care-CT Lab/J Med Res Portschcoord2007
CCSplus01022541 : Med Informatics
Databases

MCITMG  : ECCT1060_Medinformatics

Domain Knowledge : Neocortex
 ------------ ----------- -------------------------------------------------------
FUNCTION        : NEW FUNCTIONAL ANATOMICAL REPRESENTATION
BOOLEAN VALUE   : 

#TC #EM


Domain Knowledge I
---------------- -------------------------------------------------
FUNCTION        : NEW FUNCTIONAL ANATOMICAL REPRESENTATION
BOOLEAN VALUE   : WHETHER BIOPSY NEEDED FOR DIAGNOSIS
AUTHENTICATION  : DISPLAY WHETHER REQUIRED CIRCRUMSTANCES MET
OUTCOME         : DONE
DEPLOYMENT      : 2005 - NEUROSPECIFIC BIOPSY
VERSION         : ccs.radiology.uiuc.edu - 12 - MEDIM3-WEB COMMON LIBRAR