In [None]:
import os

# A bare os.chdir("../") climbs one more directory every time this cell is
# re-run. Pin the target (the parent of the directory the kernel started in)
# on first execution so repeated runs always land in the same place.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())

os.chdir(PROJECT_ROOT)
os.getcwd()

In [None]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Extract data from the PDF files
def load_pdf_file(path):
    """Load the PDF files found in a directory.

    Args:
        path: Directory handed to ``DirectoryLoader``; files are selected with
            the glob pattern ``*.pdf`` and parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns (presumably a list of
        LangChain ``Document`` objects -- TODO confirm).
    """
    return DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [None]:
# Load the PDFs from data/ (relative to the current working directory)
extracted_data = load_pdf_file("data/")

In [None]:
# Split the data into text chunks
def text_split(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        data: Documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            500, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 20.

    Returns:
        The result of ``split_documents(data)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [None]:
# Chunk the loaded documents and show how many chunks were produced
text_chunks = text_split(data=extracted_data)
len(text_chunks)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [ ]:
# Build the Hugging Face sentence-embedding model
def download_hugging_face_embeddings():
    """Return a ``HuggingFaceEmbeddings`` instance for all-MiniLM-L6-v2.

    The model is selected with the ``model_name`` keyword: the
    ``HuggingFaceEmbeddings`` class declares its field under that name, and
    the class re-exported through ``langchain.embeddings`` rejects an unknown
    ``model=`` keyword instead of selecting the requested model.

    NOTE(review): the Pinecone index in this notebook is created with
    ``dimension=384``; the embedding size of this model must match it.

    Returns:
        The configured ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [ ]:
# Instantiate the embedding model once; later cells pass this same object to
# the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

In [None]:
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import os

# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "medibot"

# create_index fails when the index already exists, which would break every
# re-run of this cell after the first successful one. Create only if missing.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the size of the embedding vectors
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
import os

# Fail fast with a readable message when the key is absent or empty, so a
# missing key does not surface as an opaque TypeError or a later auth failure.
# When the key is present the environment is left untouched.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or export it "
        "before running the Pinecone cells."
    )

In [ ]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks and store them in the Pinecone index named by index_name.
# NOTE(review): re-running this cell presumably inserts the chunks again --
# confirm before executing it more than once.
docs_search = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Load existing index
from langchain_pinecone import PineconeVectorStore

# Rebinds docs_search to a store backed by the already-created index
docs_search = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [ ]:
# Similarity search: retriever configured with k=3, tried on a sample question
retriever = docs_search.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)
retrieved_docs = retriever.invoke("What is Acromegaly and gigantism?")

In [ ]:
from langchain_ollama import OllamaLLM
# LLM used to generate answers, addressed by the Ollama tag "llama3.2:latest".
# NOTE(review): presumably needs a running Ollama server with this model
# already pulled -- confirm.
model = OllamaLLM(model="llama3.2:latest")

In [ ]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions; "{context}" and "{input}" are template placeholders
# (presumably filled with the retrieved text and the user question by the
# chain -- confirm).
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "don't know. Use three sentences maximum and keep the ",
        "answer concise.",
        "\n\n",
        "{context}",
    ]
)

# Two-message chat template: the system instructions, then the user's input
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [ ]:
# Chain that combines the LLM with the prompt template (presumably it places
# the retrieved documents into the prompt's {context} slot -- confirm).
question_answer_chain = create_stuff_documents_chain(model, prompt)
# Full RAG pipeline: the retriever feeds the question-answer chain. Later
# cells call it with an "input" key and read the "answer" key of the result.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [ ]:
# Ask a question through the RAG chain and print only the answer text
response = rag_chain.invoke(
    {"input": "What is Acromegaly and gigantism?"}
)
print(response["answer"])

In [ ]:
# Second sample question through the same chain; overwrites `response`
response = rag_chain.invoke(
    {"input": "What is stats?"}
)
print(response["answer"])