In [37]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 

In [None]:
import os

# Switch the working directory to the project folder so the relative paths
# used by later cells resolve against it.
# The chdir target is relative, so running this cell a second time (when the
# kernel is already inside the folder) would raise FileNotFoundError; skip the
# call when the current directory already has the project folder's name.
PROJECT_DIR = "MedicalChatBotGenAI"
if os.path.basename(os.getcwd()) != PROJECT_DIR:
    os.chdir(PROJECT_DIR)


'c:\\Users\\KIIT\\Documents\\forage-midas\\MedicalChatBotGenAI'

In [38]:
# Load the PDF files found in a directory.
def load_pdf_file(data):
    """Load every file matching ``*.pdf`` in the directory ``data``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matching file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [39]:
# Load the PDFs from the Data/ directory (relative to the working directory).
extracted_data = load_pdf_file(data='Data/')

In [40]:
# Split the loaded documents into smaller text chunks.
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [41]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(len(text_chunks))

5859


In [None]:
# Embedding model from Hugging Face
from langchain_huggingface import HuggingFaceEmbeddings


def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # NOTE(review): presumably the model weights are fetched on first use —
    # confirm against the langchain_huggingface documentation.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


In [43]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

In [44]:
# Sanity check: embed a short string and print the length of the result.
query_result = embeddings.embed_query("Hello Wrold")
print("Length", len(query_result))

Length 384


In [79]:
from dotenv import load_dotenv

# Load environment variables via python-dotenv; the call is left as the last
# expression so its return value is shown as the cell output.
load_dotenv()

True

In [None]:
import os

# Read the Pinecone API key from the environment.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Fail fast with a clear message: without this check a missing key silently
# becomes None and only surfaces as an error in a later cell.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment "
        "(for example in a .env file) before running this notebook."
    )



In [48]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medchatbot"

# Creating an index whose name is already taken is rejected by Pinecone, so
# only create it when it does not exist yet. This keeps the cell re-runnable.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length; the embedding sanity check
        # earlier in this notebook printed 384.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Display the index description whether it was just created or already there.
pc.describe_index(index_name)

{
    "name": "medchatbot",
    "metric": "cosine",
    "host": "medchatbot-x4wnlsl.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
import os

# Write the key held in PINECONE_API_KEY back into the process environment.
# NOTE(review): presumably so langchain_pinecone can read it from there — confirm.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY



In [72]:
#Embed each chunk and upload the embeddings into the Pinecone index
from langchain_pinecone import PineconeVectorStore
docsearch=PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [73]:
#Load Existing index
from langchain_pinecone import PineconeVectorStore
docsearch=PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [74]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model.
# Adjacent string literals are concatenated with no separator, so every
# fragment that continues a sentence ends with an explicit space. Without
# them the prompt reads "tasks.Use", "answerthe", "youdon't" and "theanswer".
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use 3 sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Chat prompt: the system message carries the instructions plus the
# {context} placeholder; the human message carries the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [101]:
from langchain_community.llms import Ollama

# LLM served through Ollama, using the "mistral" model.
llm = Ollama(model="mistral")

In [102]:
# Combine the LLM and prompt into a document chain, then wrap it with the
# retriever to form the retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(
    retriever,
    question_answer_chain,
)

In [103]:
# Ask a sample question through the chain and print the "answer" field.
response = rag_chain.invoke({"input": "What is acne?"})
print(response["answer"])



 Acne is a skin disorder characterized by inflammation of the sebaceous glands. It can affect various parts of the body, but often appears on the face.
