In [3]:
pwd

'd:\\Learning\\ChatBot_Learning\\MedicalChabot'

In [2]:
import os
# Step up to the project root, where the "data" folder used below lives.
# Guarded so that re-running this cell does not climb one more directory
# each time (a bare relative chdir is not idempotent).
if not os.path.isdir("data"):
    os.chdir("../")

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` (a list of ``Document``
        objects in the run recorded in this notebook).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
extracted_data = load_pdf_files("data")

In [7]:
type(extracted_data)

list

In [8]:
extracted_data[:3]

[Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 1, 'page_label': '2'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 2, 'page_label': '3'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1')]

In [9]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep only the ``source`` metadata entry.

    Args:
        docs: Documents whose ``page_content`` and ``metadata["source"]``
            are carried over.

    Returns:
        A new list, in the same order as ``docs``, of ``Document`` objects
        with the original ``page_content`` and a metadata dict containing
        just ``"source"`` (``None`` when the input had no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [11]:
minimal_documents = filter_to_minimal_docs(extracted_data)

In [12]:
minimal_documents[:3]

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content=''),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1')]

In [13]:
def text_split(minimal_documents, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, overlapping text chunks.

    Args:
        minimal_documents: Documents passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_documents)

In [14]:
text_chunks = text_split(minimal_documents)

In [15]:
print(f"Number of text chunks: {len(text_chunks)}")

Number of text chunks: 5859


In [16]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Build the embedding model used for indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model (its query vector
        had length 384 in the run recorded in this notebook).
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [17]:
vector = embeddings.embed_query("Hello world")

In [18]:
print(len(vector))

384


In [19]:
from dotenv import load_dotenv
load_dotenv()

True

In [20]:
# Read the API keys from the process environment (load_dotenv() ran in the
# previous cell). Either value is None when the variable is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values, so writing back a missing key (None)
# raises TypeError. Re-export each key only when it is actually set; the
# OpenAI key in particular is optional because the ChatOpenAI model is
# commented out further down.
if PINECONE_API_KEY is not None:
    os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [21]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [22]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the serverless index only when it does not exist yet, so the cell
# can be re-run safely. The dimension must match the embedding model's
# vector length (384 in the recorded run above).
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        spec=ServerlessSpec(region="us-east-1", cloud="aws"),
        metric="cosine",
        dimension=384,
    )

# Handle to the index, whether it was just created or already existed.
index = pc.Index(index_name)

In [23]:
# Create a vector store from the documents
from langchain_pinecone import PineconeVectorStore

# Build the vector store from all text chunks using the embedding model.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# the index may be only partially populated — confirm before relying on it.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

KeyboardInterrupt: 

In [24]:
# Loading from existing index
from langchain_pinecone import PineconeVectorStore

# Attach to the index that already exists rather than uploading the chunks
# again; the same embedding model is supplied for later queries.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [25]:
# Append one extra document to the existing index.
new_text_chunks = Document(
    metadata={"source": "new_document"},
    page_content="New document content goes here.",
)

# Last expression of the cell: the notebook displays the call's return value
# (a one-element list of ids in the recorded output).
docsearch.add_documents([new_text_chunks])

['d3568e60-8450-4b51-88e7-a73b58545af3']

In [26]:
# Similarity-search retriever over the vector store, configured with k=3.
retriver = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [27]:
# Smoke-test the retriever with a sample in-domain question.
retrieved_docs = retriver.invoke("What is diabetes?")
# Bare last expression so the notebook displays the retrieved documents.
retrieved_docs

[Document(id='c7cac104-a224-468b-bb70-3129238bd46c', metadata={'source': 'data\\Medical_book.pdf'}, page_content='begin to fall. A person with diabetes mellitus either does\nnot make enough insulin, or makes insulin that does not\nwork properly. The result is blood sugar that remains\nhigh, a condition called hyperglycemia.\nDiabetes must be diagnosed as early as possible. If\nleft untreated, it can damage or cause failure of the eyes,\nkidneys, nerves, heart, blood vessels, and other body\norgans. Hypoglycemia, or low blood sugar, may also be\ndiscovered through blood sugar testing. Hypoglycemia is'),
 Document(id='4d6afef4-a67b-4eec-88cb-abcd8afec5a6', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Resources\nBOOKS\nBerkow, Robert, ed. The Merck Manual of Medical Informa-\ntion: Home Edition. Whitehouse Station, NJ: Merck &\nCo., Inc., 1997.\nKEY TERMS\nAplastic —Exhibiting incomplete or faulty devel-\nopment.\nDiabetes mellitus —A disorder of carbohydrate\nmetabolism b

In [28]:
# from langchain_openai import ChatOpenAI

# chatmodel = ChatOpenAI(model="gpt-4o")

from langchain_community.llms import Ollama
# Chat model: "llama3" reached through an Ollama endpoint on localhost:11434.
chatmodel = Ollama(
    base_url="http://localhost:11434",
    model="llama3",
)

  chatmodel = Ollama(


In [29]:
# Query the model directly, without any retrieved context, as a baseline
# to compare against the retrieval chain built later in the notebook.
print(chatmodel.invoke("What is diabetes?"))

Diabetes is a group of metabolic disorders characterized by high blood sugar levels. It occurs when the body becomes resistant to insulin, a hormone produced by the pancreas that regulates blood sugar levels. Insulin helps glucose (a type of sugar) enter cells throughout the body, where it can be used for energy or stored for later use.

There are three main types of diabetes:

1. **Type 1 Diabetes**: An autoimmune disease in which the body's immune system attacks and destroys the insulin-producing beta cells in the pancreas. People with Type 1 diabetes typically develop symptoms suddenly and need to take insulin injections to control their blood sugar levels.
2. **Type 2 Diabetes**: The most common form of diabetes, accounting for about 90% of all cases. In Type 2 diabetes, the body becomes resistant to insulin, making it harder for cells to absorb glucose. As a result, blood sugar levels rise, and the pancreas produces more insulin to try to compensate. Eventually, the pancreas may n

In [30]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [31]:
# Adjacent string literals are concatenated with nothing in between, so each
# sentence carries its own trailing space; without it the model receives
# run-together text such as "tasks.Use" and "question.If".
system_prompt = (
    "you are a medical assistant for question-answering tasks. "
    "Use the following context to answer the question. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Use three sentences maximum to answer the question."
    "\n\n"
    # Placeholder filled with the retrieved documents when the chain runs.
    "{context}"
)

# Two-message chat prompt: the system instructions (which contain the
# {context} placeholder) followed by the user's question as {input}.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", "{input}")]
)

In [32]:
# Chain that feeds documents plus the question through the prompt to the model.
question_answer_chain = create_stuff_documents_chain(llm=chatmodel, prompt=prompt)
# Full RAG pipeline: retrieve documents first, then hand them to the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriver,
    combine_docs_chain=question_answer_chain,
)

In [33]:
# Out-of-domain question: checks that the chain follows the system prompt's
# instruction to say it doesn't know rather than make up an answer.
response = rag_chain.invoke({"input": "Who is Modi?"})
# Only the "answer" entry of the chain's result is printed.
print(response["answer"])

I'm just a medical assistant, I don't know who Modi is.
