In [None]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# anything below runs.
# NOTE(review): presumably this is where the Cohere / LLM API keys come from;
# confirm against the project's .env template.
load_dotenv()

In [None]:
import fitz
from io import BytesIO
from langchain import FAISS
from typing import Optional
from langchain_cohere import CohereEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from model import LLM, LLMBuilder
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [None]:
def extract_text_and_urls(
        uploaded_pdf: Optional[BytesIO] = None,
        pdf_path: Optional[str] = None
) -> str:
    """Extract the text of a PDF, followed per page by its hyperlinks.

    Exactly one source is used: ``pdf_path`` wins when it is truthy,
    otherwise the bytes of ``uploaded_pdf`` are read and parsed in memory.

    Args:
        uploaded_pdf: File-like object holding the PDF bytes. Only consulted
            when ``pdf_path`` is empty or ``None``.
        pdf_path: Filesystem path of the PDF to open.

    Returns:
        The text of every page in order. After each page's text, every link
        that carries both a ``"uri"`` and a ``"from"`` rectangle contributes
        one extra entry of the form ``"\\n<anchor text> (<uri>)"``.

    Raises:
        ValueError: If neither ``pdf_path`` nor ``uploaded_pdf`` is provided.
    """
    if pdf_path:
        doc = fitz.open(pdf_path)
    elif uploaded_pdf is not None:
        doc = fitz.open(stream=uploaded_pdf.read(), filetype="pdf")
    else:
        # Previously this fell through to `None.read()` and surfaced as an
        # opaque AttributeError.
        raise ValueError("Either 'pdf_path' or 'uploaded_pdf' must be provided.")

    # Collect fragments in a list and join once instead of repeatedly
    # concatenating onto a growing string.
    parts = []

    # The context manager closes the document even if a page fails to parse.
    with doc:
        for page in doc:
            parts.append(page.get_text())

            for link in page.get_links():
                # Only external links with a known on-page rectangle are kept.
                if "uri" in link and "from" in link:
                    link_uri = link["uri"]
                    link_rect = fitz.Rect(link["from"])

                    # Text inside the link rectangle is the anchor text.
                    link_text = page.get_text("text", clip=link_rect)
                    parts.append(f"\n{link_text} ({link_uri})")

    return "".join(parts)

In [None]:
def process_pdf(
        pdf_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        embedding_model: str = "embed-multilingual-v2.0",
) -> FAISS:
    """Build a FAISS vector store from the text (and link URIs) of a PDF.

    The PDF is read with ``extract_text_and_urls``, split into overlapping
    character chunks, embedded with Cohere and indexed with FAISS.

    Args:
        pdf_path: Filesystem path of the PDF to index.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.
        embedding_model: Name of the Cohere embedding model to use.

    Returns:
        A FAISS vector store containing one entry per text chunk.

    Raises:
        ValueError: If no text could be extracted from the PDF, so there is
            nothing to index.
    """
    # Parse the PDF first so file problems surface before an embeddings
    # client is created.
    text = extract_text_and_urls(pdf_path=pdf_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(text)

    if not chunks:
        # An empty chunk list would otherwise fail deep inside the vector
        # store construction with an unhelpful error.
        raise ValueError(f"No text could be extracted from PDF: {pdf_path!r}")

    embeddings = CohereEmbeddings(model=embedding_model)
    return FAISS.from_texts(chunks, embeddings)

In [None]:
import os

# Location of the PDF to index. Set RESUME_PDF_PATH (for example in the .env
# file) to run this notebook on another machine; the original local path is
# kept as the fallback so existing behaviour is unchanged.
PDF_PATH = os.environ.get(
    "RESUME_PDF_PATH",
    "/Users/chaitanyabasava/Desktop/Sai Naga Viswa Chaitanya_Basava_resume.pdf",
)

# Index the PDF and expose the vector store as a retriever for the chains.
knowledge_base = process_pdf(PDF_PATH)
retriever = knowledge_base.as_retriever()

In [None]:
# Instantiate the chat model used by every chain in this notebook.
# NOTE(review): `5` is an id resolved by the project-local `model.LLM` registry;
# which model it maps to is not visible here — confirm and consider a named constant.
llm = LLMBuilder.get_llm(LLM.get_llm_by_id(5))

In [None]:
# System message for the answering step. It ends with a `{context}`
# template variable, separated from the instructions by a blank line.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

In [None]:
# System message for the question-rewriting step: the model must only turn
# the latest user message into a standalone question, never answer it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might "
    "reference context in the chat history, formulate a standalone "
    "question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and "
    "otherwise return it as is."
)

# Prompt for the rewriting step: system instructions, then the prior
# conversation, then the latest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever wrapper that is given the LLM and the rewriting prompt alongside
# the base retriever.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [None]:
# Prompt for the answering step: system instructions (including the
# `{context}` variable), the prior conversation, then the user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that feeds documents and the QA prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full RAG pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
from langchain_community.chat_message_histories import SQLChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# NOTE(review): not referenced by any code visible in this notebook —
# `get_session_history` persists to SQLite rather than to this dict.
# Looks like a leftover; confirm no later cell uses it before removing.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the SQL-backed chat history for ``session_id``.

    A new ``SQLChatMessageHistory`` bound to the ``sqlite:///memory.db``
    connection string is created on every call.

    Args:
        session_id: Identifier of the conversation whose history is wanted.

    Returns:
        The message history object for that session.
    """
    history = SQLChatMessageHistory(session_id, "sqlite:///memory.db")
    return history


# Wrap the RAG chain with per-session message history. The key names tell
# the wrapper which input field holds the user message ("input"), which
# prompt variable receives the history ("chat_history"), and which output
# field holds the reply ("answer").
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="chat_history",
)

In [None]:
# Run one conversational turn for session "abc123".
# The input is an empty-string placeholder: put the actual question there.
conversational_rag_chain.invoke(
    {"input": ""},
    config={"configurable": {"session_id": "abc123"}},
)