In [1]:
# %uv pip install ipykernel


In [4]:
import os
# Move the working directory up one level so that the relative paths used
# later in the notebook (e.g. "Data/") resolve against the parent directory.
# NOTE(review): the path is relative to the current directory, so re-running
# this cell moves up another level each time — restart the kernel before
# running it again.
os.chdir("../")
# Display the resulting working directory to confirm the move.
%pwd



In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
#Extract data from pdf file.
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files are selected with the
            ``*.pdf`` glob pattern and read with ``PyPDFLoader``.

    Returns:
        Whatever the directory loader's ``load()`` call produces for the
        matched files.
    """
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()


In [4]:
# Load the PDFs from the Data/ directory, which is resolved against the
# current working directory. The stray `%pwd` that used to precede this line
# was leftover debugging: it was not the cell's last expression, so it
# displayed nothing.
extracted_data = load_pdf_file(data="Data/")


In [6]:
#Split the data into chunks.
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (for example the output of
            ``load_pdf_file``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook
            originally hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value this notebook
            originally hard-coded.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)


In [7]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print("length of text chunks", len(text_chunks))


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings


In [9]:
def download_hugging_face_embeddings():
    """Create the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)


In [10]:
# Instantiate the embedding model once; the same object is passed to the
# Pinecone vector store calls further down.
embeddings = download_hugging_face_embeddings()


In [11]:
# query_result = embeddings.embed_query("What is Langchain?")
# print("Length of query_result:", len(query_result))
# query_result


In [6]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API
# keys below can be read through os.environ.
load_dotenv()


In [7]:
# Read the service credentials from the environment.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Report only whether each key is present. Printing the key itself would
# store the secret in the saved notebook output.
print("PINECONE_API_KEY set:", bool(PINECONE_API_KEY))
print("OPENAI_API_KEY set:", bool(OPENAI_API_KEY))


In [14]:
import os
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from pydantic import SerializationInfo

# Fail fast with a clear message when the Pinecone key is missing or empty,
# instead of letting the client fail later with a less obvious error.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise ValueError("PINECONE_API_KEY environment variable not set. Please set it in your .env file or environment.")

# Pinecone client and the index name shared by the vector-store cells below.
pc = Pinecone(api_key=api_key)
index_name = "medicalchatbot"


# Create the serverless index only when it does not exist yet, so this cell
# can be re-run without attempting to create the index twice.
# NOTE(review): dimension=384 has to match the vector size produced by the
# embedding model — presumably 384 for all-MiniLM-L6-v2; confirm if the
# model is ever changed.
if not pc.has_index(index_name):
    pc.create_index(name=index_name, dimension=384, metric="cosine", 
                    spec=ServerlessSpec(cloud="aws", 
                                   region="us-east-1"))


In [15]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): re-running this cell presumably uploads the same chunks
# again — confirm whether that creates duplicate vectors in the index.
docsearch = PineconeVectorStore.from_documents(documents=text_chunks, embedding=embeddings, index_name=index_name)


In [16]:
# Connect to the already-created Pinecone index instead of uploading documents.
from langchain_pinecone import PineconeVectorStore

# Rebinds `docsearch` to a vector store backed by the index named in
# `index_name`; no documents are passed to this call.
docsearch = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=index_name)
docsearch

    

In [17]:
# Use docsearch as a retriever to fetch relevant documents.
# Similarity search that returns the 3 closest matches per query (k=3).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})
# Try the retriever on a sample question and display what it returns.
retrived_docs = retriever.invoke("What is Acne ?")
retrived_docs


In [18]:
# Set up the LLM that answers the user's query using the knowledge base.
# The import previously read `OpenAIß` (a stray character), which made the
# import fail and left `OpenAI` undefined for the constructor call below.
from langchain_openai import OpenAI

# Read the key from the environment rather than hard-coding it.
open_api_key = os.environ.get("OPENAI_API_KEY")

# Sampling temperature of 0.4 and a 500-token cap on each completion.
llm = OpenAI(api_key=open_api_key, temperature=0.4, max_tokens=500)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

# System instructions for the answering model. Adjacent string literals are
# concatenated with no separator, so every fragment that continues a sentence
# ends with an explicit space to keep the words from running together.
# "{context}" is left as a template placeholder for the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, just say that you "
    "don't know. Use three sentences maximum and keep the answer concise.\n\n"
    "{context}"
)

# Build the chat prompt: the system message carries the instructions plus the
# retrieved context, and the human message carries the user's question.
# "{input}" must be a plain string so it stays a template placeholder. As an
# f-string it was replaced by the repr of Python's built-in `input`, leaving
# no placeholder, so the question was never included in the prompt.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])


In [None]:
# Chain that passes the retrieved documents and the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full retrieval pipeline: the retriever supplies documents, which are then
# handed to the question-answering chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)


In [21]:
# Ask a sample question; the result is indexed like a mapping and the
# generated text is read from its "answer" entry.
response = rag_chain.invoke({"input": "What is gigantism?"})
print(response["answer"])
