In [26]:

import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#os.chdir("../")

def load_pdf_file(data, glob="*.pdf"):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.
        glob: Filename pattern forwarded to ``DirectoryLoader`` to select the
            files to load. Defaults to ``"*.pdf"``, the pattern that used to be
            hard-coded, so existing calls behave as before. A different
            pattern (e.g. ``"**/*.pdf"``) can be passed to widen the match.

    Returns:
        The result of ``DirectoryLoader.load()``; every matched file is read
        with ``PyPDFLoader``.
    """
    loader = DirectoryLoader(data, glob=glob, loader_cls=PyPDFLoader)
    return loader.load()


In [34]:
os.chdir("/Users/debasishmallick/workspace/medihealth/research")
%pwd

extracted_data=load_pdf_file(data='data/')



In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build slimmed-down copies of the given Document objects.

    Each returned Document carries the original page_content and a metadata
    dict with a single 'source' entry (None when the input metadata has no
    'source' key). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]
minimal_docs = filter_to_minimal_docs(extracted_data)
# Show only a short preview: displaying the full list would write every page of
# every PDF into the notebook output.
minimal_docs[:3]

In [None]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping text chunks.

    Args:
        minimal_docs: Documents passed to ``split_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)
    
text_chunks=text_split(minimal_docs)
print("Length of Text Chunks", len(text_chunks))

# Show only a short preview: displaying the full list would write every chunk
# into the notebook output.
text_chunks[:3]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Return a HuggingFaceEmbeddings instance for the MiniLM-L6-v2 model."""
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)
# Notebook-level embedding model, reused by the embedding check and the
# vector-store cells that follow.
embeddings=download_hugging_face_embeddings()



In [93]:

# Sanity check: embed a short string and report the vector length. The saved
# output shows 384, the same value passed as `dimension` when the Pinecone
# index is created later in this notebook.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [84]:
from dotenv import load_dotenv
import os
# Populate the process environment before the os.environ lookups in the next
# cells. The `True` in the saved output is load_dotenv()'s return value.
load_dotenv()

True

In [85]:
# Read the Pinecone key from the environment; .get() yields None rather than
# raising when the variable is missing.
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')


In [86]:
#OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
# Never echo the key itself: notebook outputs are saved with the file and
# would expose the secret to anyone who can read it. Report only whether a
# non-empty value was found.
print("GOOGLE_API_KEY set:", bool(GOOGLE_API_KEY))

[REDACTED: an API key was present in this saved cell output; revoke and rotate that key]


In [75]:
from pinecone import Pinecone
# Create the Pinecone client using the key read from the environment earlier.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [55]:
from pinecone import ServerlessSpec
index_name = "medihealth-index"  
# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # same as the vector length printed by the "Hello world" embedding check
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
# Handle to the index named above.
index = pc.Index(index_name)

In [57]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk with `embeddings` and store the vectors in the Pinecone
# index named by `index_name`.
# NOTE(review): re-running this cell presumably writes the chunks again;
# confirm whether that leaves duplicate vectors before re-executing.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [58]:
from langchain_pinecone import PineconeVectorStore
# Connect to the index that already exists instead of building it from
# documents: this rebinds `docsearch` to a vector store backed by `index_name`,
# with `embeddings` supplied as its embedding model.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [59]:
# Hand-written sample document (source "github") used in the next cell to
# check that adding to and searching the vector store works.
dswith = Document(
    page_content="Debasish Mallick is a SEM who is exploring the GEN AI features.",
    metadata={"source": "github"}
)

In [94]:
# Insert the sample document, then run a top-1 similarity search; the saved
# output shows the sample document's text coming back.
# NOTE(review): every run of this cell calls add_documents again; confirm
# whether repeated runs leave duplicate entries in the index.
docsearch.add_documents(documents=[dswith])
query = "What is the role of Debasish Mallick in GEN AI?"
docs = docsearch.similarity_search(query, k=1)
print(docs[0].page_content)

Debasish Mallick is a SEM who is exploring the GEN AI features.


In [61]:
# Expose the vector store as a retriever using similarity search with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [None]:
# Try the retriever on a sample question and display what it returns.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs


In [88]:
# Earlier OpenAI setup, disabled:
#from langchain_openai import ChatOpenAI
#chatModel = ChatOpenAI(model="gpt-4o")

# Active model: "gemini-2.0-flash" through langchain_google_genai, using the
# GOOGLE_API_KEY read from the environment in an earlier cell.
from langchain_google_genai import GoogleGenerativeAI
chatModel =  GoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=GOOGLE_API_KEY)

In [89]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [90]:
# System instructions for the model. The adjacent string literals are joined
# into a single string that ends with the "{context}" placeholder.
# NOTE(review): "{context}" is presumably filled with the retrieved documents
# and "{input}" with the user's question when the chain runs; confirm against
# the LangChain retrieval-chain documentation.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
# Two-message chat template: the system instructions followed by the human input.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [91]:
# question_answer_chain combines chatModel with the prompt; rag_chain pairs it
# with the retriever to form the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [92]:
# Ask a sample question; the generated text is read from the "answer" key of
# the response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue. When this abnormality occurs before bone growth stops, it results in unusual height, known as gigantism. Acromegaly is relatively rare, affecting approximately 50 out of every one million people, and diagnosis is often delayed due to the gradual onset of symptoms.
