# Import libraries

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from dotenv import load_dotenv

In [None]:
import os
# Move the working directory to the parent directory so that relative paths
# used later (e.g. "data/") resolve from there.
# NOTE(review): this is not idempotent -- re-running the cell moves up one
# more level each time.
os.chdir("../")

# data loading

In [None]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files matching ``*.pdf`` are
            read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (the loaded documents).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


data = load_pdf("data/")

# Splitting the data into chunks

In [None]:
def text_split(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split (e.g. the result of ``load_pdf``).
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 20, the value
            that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)


text_chunks = text_split(data)
print("length of my chunk:", len(text_chunks))

# Upserting every chunk takes a long time, so only the first 100 chunks are
# kept for embedding and upload.
chunks = text_chunks[:100]


# embedding model

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Load environment variables (python-dotenv) before reading the API key.
load_dotenv()

# OpenAI API key (obtainable from platform.openai.com) and embedding model.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
model_name = "text-embedding-ada-002"

# Embedding client shared by the indexing and retrieval steps.
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

# Embedding the chunks and preparing them for upserting to Pinecone

In [None]:
# Raw text of every selected chunk, in chunk order.
page_contents = [t.page_content for t in chunks]

# Embed all texts through the batch API instead of issuing one embed_query
# call (one network round trip) per chunk. The result is one vector per
# text, in the same order as page_contents.
embeddinged_chunks = embed.embed_documents(page_contents)

In [None]:
# One sequential id per embedded chunk: "id-0", "id-1", ...
ids = [f"id-{position}" for position, _ in enumerate(embeddinged_chunks)]

# Records to upload: id, embedding vector, and the chunk text stored as
# metadata under the "text" key.
upsert_data = [
    {
        "id": ids[position],
        "values": vector,
        "metadata": {"text": page_contents[position]},
    }
    for position, vector in enumerate(embeddinged_chunks)
]

# initializing Pinecone

In [None]:
# NOTE: this rebinds the name `Pinecone` (previously imported from
# langchain.vectorstores) to the Pinecone client class.
from pinecone import Pinecone

# Name of the Pinecone index to connect to, and the API key read from the
# environment.
index_name = "medical-chatbot"
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Create the client and get a handle on the index.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)

In [None]:
# Upload the prepared records (id / values / metadata) to the index.
index.upsert(vectors=upsert_data)

In [None]:
# Re-import the LangChain vector store class: the name `Pinecone` was rebound
# to the Pinecone client by the earlier `from pinecone import Pinecone`.
from langchain.vectorstores import Pinecone

# Metadata key under which each chunk's text was stored when upserting.
text_field = "text"

# Wrap the Pinecone index as a LangChain vector store, passing
# embed.embed_query as the embedding function and text_field as the
# metadata key to read the text from.
# NOTE(review): presumably requires a LangChain version compatible with the
# installed pinecone client -- confirm.
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

# Prompt

In [None]:
# Prompt text with two placeholders: {context} and {question}. It instructs
# the model to answer only from the supplied context and to admit when it
# does not know the answer.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
# Prompt object built from prompt_template, declaring its two placeholders.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Extra keyword arguments forwarded to the QA chain so it uses PROMPT.
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
from langchain.chat_models import ChatOpenAI

# Initialize the LLM.
# - The previous `openai.api_key = ...` assignment raised NameError because
#   the `openai` module is never imported; the key is now passed directly to
#   the model wrapper instead.
# - "gpt-3.5-turbo" / "gpt-4" are chat models, so the chat wrapper is used
#   rather than the completion-style `OpenAI` class.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",  # or "gpt-4"
    temperature=0.8,
    max_tokens=512,
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Retrieval QA chain: the retriever is asked for k=2 results from the vector
# store, the "stuff" chain type is used with the custom prompt, and source
# documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
# Interactive question/answer loop.
# The loop ends when the user types "exit" or "quit", or when input is
# closed/interrupted (EOF / Ctrl-C / kernel interrupt), instead of running
# forever or dying with a traceback. Blank input is skipped so that no empty
# query is sent to the model.
while True:
    try:
        user_input = input("Input Prompt:").strip()
    except (EOFError, KeyboardInterrupt):
        break

    if user_input.lower() in {"exit", "quit"}:
        break
    if not user_input:
        continue

    result = qa({"query": user_input})
    print("Response : ", result["result"])
    