In [1]:
# Smoke test: prints "ok" to confirm the kernel is executing cells.
print("ok")

ok


In [2]:
# Display the kernel's current working directory before changing it.
%pwd

'/Users/deepak.panwar/ai-personal-projects/ai-health-assistant-langchain/research'

In [3]:
import os
os.chdir("../")
%pwd

'/Users/deepak.panwar/ai-personal-projects/ai-health-assistant-langchain'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
def load_pdf_files(data):
    """Extract text from the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; only entries matching ``*.pdf``
            are picked up.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, with each
        matched file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs from the "data" folder (resolved against the current working directory).
extracted_data = load_pdf_files("data")

In [7]:
from typing import List
from langchain.schema import Document

def filter_extracted_data(docs: List[Document]) -> List[Document]:
    """Rebuild each document keeping only its text and its ``source`` metadata.

    Args:
        docs: Documents whose metadata may carry additional keys.

    Returns:
        A new list, in the same order as ``docs``, of fresh ``Document``
        objects. Each one carries the original ``page_content`` and a metadata
        dict holding just ``"source"`` (``None`` when the input document had
        no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [8]:
# Reduce every loaded document to its text plus the "source" metadata key.
filtered_extracted_data = filter_extracted_data(extracted_data)

In [9]:
def text_to_chunks(filtered_extracted_data, chunk_size=500, chunk_overlap=20):
    """Split the documents into smaller, slightly overlapping chunks.

    Args:
        filtered_extracted_data: Documents to split.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            20, the value that used to be hard-coded here.

    Returns:
        The chunk documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(filtered_extracted_data)

In [10]:
# Chunk the filtered documents and report how many chunks were produced.
text_chunks = text_to_chunks(filtered_extracted_data)
print(f"Number of chunks: {len(text_chunks)}")

Number of chunks: 5859


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embedding_model():
    """Build the HuggingFace embeddings wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; later cells reuse it for indexing and retrieval.
embedding_model = download_embedding_model()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment so the API keys
# can be read with os.getenv in the next cell.
load_dotenv()

True

In [13]:
# Read the API keys from the process environment (populated by load_dotenv()
# or the shell).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail early with a readable message. Without this check a missing key makes
# the os.environ assignment below raise "TypeError: str expected, not NoneType".
# An empty value is treated as missing as well, since it cannot authenticate.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your .env file or shell before running this cell."
    )

# Re-export the keys so libraries that read credentials from the environment
# can find them.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [14]:
from pinecone import Pinecone

# Pinecone client authenticated with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [15]:
from pinecone import ServerlessSpec

index_name = "ai-health-assistant-langchain"

# Provision the serverless index on first run only; later runs reuse it.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,    # dimension of the embeddings
        metric="cosine",  # cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the (newly created or pre-existing) index.
index = pc.Index(index_name)

In [19]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and upsert it into the Pinecone index -- but only while the
# index is still empty. from_documents stores each chunk under a newly
# generated id, so re-running this cell against a populated index would store
# every chunk a second time (and repeat the expensive embedding pass).
# NOTE(review): a partially populated index is treated as complete; clear the
# index first if an earlier ingestion run was interrupted.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embedding_model,
        index_name=index_name,
    )
else:
    # Already ingested: just connect to the stored vectors.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model,
    )

In [None]:
# Connect to the existing index instead of ingesting the chunks again.

from langchain_pinecone import PineconeVectorStore
# The embedding model passed here is the one used when the index was populated,
# so queries are embedded the same way as the stored chunks.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding_model,
    index_name=index_name,
)

In [24]:
# Similarity-search retriever over the vector store; k=6 asks for six documents per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":6})

In [25]:
# Try the retriever on its own with a sample question before wiring in the LLM.
retrieved_docs = retriever.invoke("What is Arthritis?")

In [26]:
# Inspect the retrieved documents (id, source metadata and chunk text).
retrieved_docs

[Document(id='eb334360-8604-4bce-b85e-c6f22ef4dab4', metadata={'source': 'data/Medical_book.pdf'}, page_content='of the joints in the spine. AS is also known as rheumatoid\nspondylitis or Marie-Strümpell disease (among other\nnames).\nDescription\nA form of arthritis, AS is characterized by chronic\ninflammation, causing pain and stiffness of the back,\nGALE ENCYCLOPEDIA OF MEDICINE 2208\nAnkylosing spondylitis\nGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 208'),
 Document(id='d109069a-acd4-47ce-a377-7b5fd9447e13', metadata={'source': 'data/Medical_book.pdf'}, page_content='Swezey, Robert L. Straight Talk on Spondylitis.Spondylitis\nAssociation of America, 1994.\nTaurog, Joel D., and Peter E. Lipsky. “Ankylosing Spondylitis,\nReactive Arthritis, and Undifferentiated Spondy-\nloarthropathy.” In Harrison’s Principles of Internal Medi-\ncine, ed. Anthony S. Fauci, et al. New York: McGraw-Hill,\n1997.\nORGANIZATIONS\nArthritis Foundation.1300 W. Peachtree St., Atlanta, GA\n30309. (800) 2

In [27]:
from langchain_openai import ChatOpenAI

# Chat model that writes the final answers.
# NOTE(review): no key is passed explicitly -- presumably it is picked up from
# the OPENAI_API_KEY environment variable set earlier; confirm.
chatModel = ChatOpenAI(model="gpt-4o")

In [28]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [30]:
# System message for the question-answering chain. "{context}" is kept as a
# template placeholder so the retrieved documents can be substituted in when
# the prompt is rendered.
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


In [31]:
# Two-message chat prompt: the system message carries the instructions and the
# {context} placeholder; the human message carries the user's {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [32]:
# Answering chain: the chat model driven by the prompt defined above.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Full RAG chain: the retriever feeds the answering chain. It is invoked with
# {"input": ...} and the generated text is read from the "answer" key.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [33]:
# End-to-end check: run the RAG chain on a definition question and print the answer.
response = rag_chain.invoke({"input": "what is Arthritis?"})
print(response["answer"])

Arthritis is the inflammation of a joint that may lead to changes in the joint's structure, causing pain and swelling. It includes various types, such as rheumatoid arthritis, which is a chronic auto-immune disease leading to painful, stiff, and swollen joints. Rheumatoid arthritis can also result in crippling deformities and affect general health.


In [34]:
# Same chain, treatment question; `response` is overwritten with the new result.
response = rag_chain.invoke({"input": "How to treat Arthritis?"})
print(response["answer"])

To treat arthritis, start by managing pain and inflammation with nonsteroidal anti-inflammatory drugs (NSAIDs) like aspirin, ibuprofen, or naproxen. Consider alternative treatments such as acupuncture or herbal remedies like white willow and yarrow to reduce inflammation. For those with joint deformity, physical therapy or, in severe cases, surgery may be considered to improve function and quality of life.


In [35]:
# Same chain, prevention question; `response` is overwritten with the new result.
response = rag_chain.invoke({"input": "How to prevent Arthritis?"})
print(response["answer"])

There is no definitive way to prevent arthritis, especially since its development can depend on various genetic and environmental factors. However, maintaining a healthy weight, engaging in regular physical activity, and avoiding joint injuries may help reduce the risk. Additionally, managing underlying health conditions that can contribute to arthritis, such as diabetes, is beneficial.
