In [6]:
# Smoke test: prints a fixed string to confirm the kernel is executing cells.
print('OK')

OK


In [7]:
# Show the kernel's current working directory before changing it.
%pwd

'd:\\AI\\Projects\\Complete-Medical-Chatbot\\research'

In [8]:
import os
# Step up one directory so relative paths such as "data" resolve from the
# notebook's parent folder.
# NOTE(review): not idempotent — each re-run of this cell climbs one more level.
os.chdir('../')

In [9]:
# Confirm the working directory after the chdir above.
%pwd

'd:\\AI\\Projects\\Complete-Medical-Chatbot'

In [23]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [61]:
def load_pdf(data: str):
    """Load every PDF found under a directory.

    The directory is searched with the glob ``**/*.pdf`` and each match is
    read with ``PyPDFLoader``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (presumably a list of ``Document`` objects — confirm against the
        installed LangChain version).
    """
    pdf_loader = DirectoryLoader(path=data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [62]:
# Load all PDFs under the relative "data" directory (resolved against the
# current working directory).
extracted_data = load_pdf("data")

In [63]:
# Number of documents returned by the loader.
len(extracted_data)

637

In [69]:
# Spot-check the "source" metadata recorded on one loaded document.
extracted_data[100].metadata.get('source')

'data\\Medical_book.pdf'

In [70]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_document(docs: List[Document]) -> List[Document]:
    """Rebuild each document keeping only its text and its ``source`` metadata.

    Args:
        docs: Documents whose metadata should be reduced.

    Returns:
        A new list, in the same order, of new ``Document`` objects. Each one
        carries the original ``page_content`` and a metadata dict with the
        single key ``"source"`` (``None`` when the original had no such key).
        The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [71]:
# Reduce every loaded document's metadata to just the "source" key.
minimal_docs = filter_to_minimal_document(extracted_data)

In [73]:
# Confirm that only the "source" key remains in the metadata.
minimal_docs[100].metadata

{'source': 'data\\Medical_book.pdf'}

In [74]:
def text_split(
    minimal_docs: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 100,
):
    """Split documents into overlapping text chunks.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook
            previously hard-coded.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 100, the value this notebook previously hard-coded.

    Returns:
        The result of ``split_documents`` on ``minimal_docs``.
    """
    # Sizes are parameters so chunking can be tuned without editing the
    # function; the defaults reproduce the original behaviour exactly.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    texts_chunks = text_splitter.split_documents(minimal_docs)
    return texts_chunks

In [75]:
# Split the reduced documents into chunks for embedding.
text_chunks = text_split(minimal_docs)

In [76]:
# Number of chunks produced by the splitter.
len(text_chunks)

6600

In [77]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

def download_embeddings():
    """Build the embedding model used for indexing chunks and embedding queries.

    Returns:
        A ``HuggingFaceBgeEmbeddings`` instance configured for the
        ``ibm-granite/granite-embedding-english-r2`` model.
    """
    model_name = 'ibm-granite/granite-embedding-english-r2'
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        # The BGE wrapper prepends a BGE-specific retrieval instruction to
        # every query by default. This model is not a BGE checkpoint, so the
        # prefix is disabled to embed queries as plain text, matching how
        # documents are embedded (the document instruction defaults to empty).
        query_instruction="",
    )
    return embedding_model

# Instantiate the embedding model once; later cells reuse this object for the
# vector store.
embedding_model = download_embeddings()


In [78]:
# Embed a sample sentence to sanity-check the model.
response = embedding_model.embed_query("Hello, my name is abc")

In [79]:
# Length of the embedding vector; the Pinecone index below is created with
# this same value as its dimension.
len(response)

768

In [80]:
from pinecone import Pinecone
# No API key is passed explicitly — presumably it is read from the environment
# (e.g. PINECONE_API_KEY); confirm before running on a fresh machine.
pc = Pinecone()

In [81]:
from pinecone import ServerlessSpec

# Name of the Pinecone index used by the rest of the notebook.
index_name = 'medical-chatbot'

# Create the index only if it is missing, so re-running this cell does not
# attempt to create it a second time.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,  # equals the embedding length measured earlier (len(response) == 768)
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )

# Handle to the index; not referenced again in the cells visible here.
index = pc.Index(index_name)

In [82]:
from langchain_pinecone import PineconeVectorStore

# Wrap the already-existing Pinecone index as a LangChain vector store that
# uses embedding_model for its embeddings.
vector_store = PineconeVectorStore.from_existing_index(
    embedding=embedding_model,
    index_name=index_name
)

In [83]:
import hashlib

# Upload the chunks in batches of 100, giving each chunk a deterministic id.
# Without explicit ids the store generates fresh random ones, so re-running
# this cell would add a second copy of every chunk to the persistent index.
# The id hashes the chunk's source together with its position in text_chunks,
# so a re-run over the same chunks overwrites rather than duplicates.
# NOTE(review): vectors written by earlier runs without explicit ids are not
# removed by this change — clear the index once if those duplicates matter.
for i in range(0, len(text_chunks), 100):
    batch = text_chunks[i:i+100]
    batch_ids = [
        hashlib.sha1(
            f"{doc.metadata.get('source')}#{i + offset}".encode("utf-8")
        ).hexdigest()
        for offset, doc in enumerate(batch)
    ]
    vector_store.add_documents(batch, ids=batch_ids)
    print(i, end=" ")

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 

In [84]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

In [85]:
# Try the retriever on a sample question; the result is stored, not displayed.
retrieved_docs = retriever.invoke("What is acne")

In [88]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used to generate answers. No credentials are passed here —
# presumably they come from the environment; confirm before a fresh run.
chat_model = ChatGoogleGenerativeAI(model='gemini-2.5-flash')

In [90]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

In [105]:
# System message for the QA chain. "{context}" is a template placeholder left
# for the prompt template to fill. The text, including the trailing space after
# "concise.", is kept exactly as it was.
system_prompt = (
    "\n"
    "You are an medical-assistant for question-answering tasks.\n"
    "Use the following pieces of retrieved context to answer the question.\n"
    "If you don't know the answer, say that you don't know.\n"
    "Use the three sentences maximum and keep the answer concise. \n"
    "\n"
    "{context}\n"
)

In [106]:
# Two-message chat template: the system instructions followed by the user's
# question, which is supplied through the "input" variable.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [107]:
# First build the chain that feeds the retrieved documents and the prompt to
# the chat model, then put the retriever in front of it.
question_answer_chain = create_stuff_documents_chain(llm=chat_model, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [108]:
# Run the full retrieval + answer chain on a sample question.
# NOTE: this rebinds `response`, which earlier held the embedding vector
# returned by embed_query.
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism ?"})

In [113]:
# Display the generated text stored under the 'answer' key of the chain output.
response['answer']

'Acromegaly is a disorder where an abnormal release of a specific chemical from the pituitary gland in the brain leads to increased growth in bone and soft tissue. This condition is most commonly caused by a noncancerous tumor.'