In [None]:
!pip install langchain
!pip install python-dotenv
!pip install langchain_google_genai
!pip install pypdf

In [None]:
!pip install beautifulsoup4
!pip install mistletoe
!pip install langchain_community

In [None]:
!pip install chromadb

In [15]:

import os
from dotenv import load_dotenv
load_dotenv()
import html
import bs4
from mistletoe import markdown
from prompts import system_prompt as sys_prompt, contextualize_q_system_prompt_lookup
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import TextLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.document_loaders import TextLoader, DirectoryLoader


from langchain_community.document_loaders import PyPDFLoader

In [16]:
import random
import string


In [26]:

class ChainSetup:
    """Conversational retrieval-augmented-generation (RAG) pipeline over local PDFs.

    On construction the class loads the PDF files, splits them into chunks,
    embeds the chunks into a Chroma vector store and wires a history-aware
    retrieval chain around a Google Generative AI chat model. Each instance
    owns one randomly generated session id and an in-memory chat history
    store keyed by session id.
    """

    def __init__(self, llm: str = "gemini-1.5-flash", temperature: float = .2):
        """Build the full RAG chain.

        Args:
            llm: Name of the Google Generative AI chat model to use.
            temperature: Sampling temperature passed to the chat model.

        Raises:
            KeyError: If the ``GOOGLE_API_KEY`` environment variable is not set.
            ValueError: If no document text could be loaded for indexing.
        """
        # A new session id per instance, so each app run gets its own history.
        self.session_id = self.generate_session_id()

        # In-memory chat histories keyed by session id. Created before the
        # chain is wired so `get_session_history` can never observe an
        # instance that lacks the attribute.
        self.store = {}

        # Read the key once; both the chat model and the embedder need it.
        api_key = os.environ["GOOGLE_API_KEY"]

        # Chat model used both to rewrite questions and to answer them.
        model = ChatGoogleGenerativeAI(
            model=llm,
            google_api_key=api_key,
            temperature=temperature,
            verbose=True
        )

        # Embedding model used to turn document chunks into vectors.
        embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=api_key
        )

        # Load the source documents and cut them into overlapping chunks.
        docs = self.load_context_files()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(docs)
        if not splits:
            # Fail early with a readable message instead of a vector-store
            # error (or an empty index) further down.
            raise ValueError(
                "No document text could be loaded for indexing; "
                "check that the context directory contains readable PDF files."
            )

        # Use a collection that is unique to this session. Without an explicit
        # name every instance writes to Chroma's default collection, so
        # re-running this cell in the same kernel can index the same chunks
        # again and return duplicated context.
        vector_store = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            collection_name=f"rag-{self.session_id}"
        )
        retriever = vector_store.as_retriever()

        # Prompt that rewrites the latest user message using the chat history.
        contextualize_q_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", contextualize_q_system_prompt_lookup),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

        # History-aware retriever built from the model, retriever and prompt.
        history_aware_retriever = create_history_aware_retriever(
            model, retriever, contextualize_q_prompt
        )

        # Prompt for the final question-answering step.
        qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", sys_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )
        question_answer_chain = create_stuff_documents_chain(model, qa_prompt)

        # Retrieval chain: fetch documents, then answer with them.
        rag_chain = create_retrieval_chain(
            history_aware_retriever,
            question_answer_chain
        )

        # Final conversational chain; reads and writes history through
        # `get_session_history`.
        self.conversational_rag_chain = RunnableWithMessageHistory(
            rag_chain,
            self.get_session_history,
            input_messages_key="input",
            history_messages_key="chat_history",
            output_messages_key="answer"
        )

        print("""\n\nRAG is Ready\n\n""")

    def load_context_files(self, directory: str = './paper/'):
        """Load every PDF found (recursively) under ``directory``.

        Args:
            directory: Folder searched for ``**/*.pdf``. Defaults to the
                notebook's ``./paper/`` folder.

        Returns:
            The list of documents produced by the loader. Prints the document
            count and each document's metadata as a loading sanity check.
        """
        # Only PDFs are loaded here; other formats would need their own loader.
        pdf_loader = DirectoryLoader(directory, glob='**/*.pdf', loader_cls=PyPDFLoader, use_multithreading=True)
        docs = pdf_loader.load()

        # Sanity check that all expected documents were picked up.
        print(f"Retrival Doc Count: {len(docs)}")
        for doc in docs:
            print(doc.metadata)
        return docs

    def get_rag_output(self, input: str, debug: bool = True):
        """Ask the conversational chain a question and return the answer as HTML.

        Args:
            input: The user's question. (The name shadows the builtin but is
                kept because callers pass it by keyword.)
            debug: When true (the default), print the whole in-memory chat
                history and the session id after the call.

        Returns:
            The model's answer rendered from Markdown to HTML, with HTML
            entities unescaped.

        Raises:
            ValueError: If ``input`` is empty or only whitespace.
        """
        question = f"{input}"
        if not question.strip():
            # Reject blank questions before they reach the retriever and model.
            raise ValueError("input must be a non-empty question")

        # Every invocation carries the session id so the chain can find (or
        # create) this session's entry in `store`.
        response = self.conversational_rag_chain.invoke(
            {"input": question},
            config={
                "configurable": {"session_id": f"{self.session_id}"}
            },
        )

        # NOTE(review): html.unescape reverses entity escaping in the rendered
        # HTML. If this string is ever shown in a browser, model output could
        # inject markup -- confirm this is intended.
        rendered = html.unescape(markdown(response["answer"]))

        if debug:
            print(f"""\n\n
              In memory chat history: \n\n{self.store} \n\n""")
            print(f"""SessionID: {self.session_id}""")
        return rendered

    def get_session_history(self, session_id: str) -> "BaseChatMessageHistory":
        """Return the chat history for ``session_id``, creating it on first use."""
        if session_id not in self.store:
            self.store[session_id] = ChatMessageHistory()
        return self.store[session_id]

    def generate_session_id(self) -> str:
        """Return a random 6-character id made of ASCII letters and digits.

        Uses the `random` module, so the id is not cryptographically strong.
        """
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=6))


In [27]:
# Build the pipeline once: loads the PDFs under ./paper/, indexes their chunks
# in Chroma and wires the conversational chain (prints "RAG is Ready" when done).
result = ChainSetup()

I0000 00:00:1722884265.819134  293056 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1722884265.819771  293056 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1722884265.821095  293056 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


Retrival Doc Count: 9
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 0}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 1}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 2}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 3}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 4}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 5}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 6}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 7}
{'source': 'paper/KGAT-Knowledge Graph Attention Network for Recommendation.pdf', 'page': 8}


RAG is Ready




In [28]:
# First turn of the session: ask for a tabular literature review. The call
# returns the answer rendered from Markdown to an HTML string.
# NOTE(review): the recorded reply digresses into time management, presumably
# driven by the system prompt imported from `prompts` -- confirm.
result.get_rag_output(input="""Give me literature review of all the documents in markdown, tabular format""")




              In memory chat history: 

{'9egV2T': InMemoryChatMessageHistory(messages=[HumanMessage(content='Give me literature review of all the documents in markdown, tabular format'), AIMessage(content="Okay, I can help you with that! But first, let's talk about time management.  You're working on a literature review, which is a big task.  Have you broken it down into smaller, manageable chunks?  What's your strategy for staying on track and avoiding procrastination? \n\nNow, let's get back to your literature review.  I need a little more information to create a table for you.  Could you please tell me:\n\n1. **What are the specific research questions you are trying to answer?** This will help me identify the most relevant information from the documents.\n2. **What are the key themes or categories you want to include in your table?** For example, you might want to include columns for author, year, methodology, findings, etc.\n\nOnce I have this information, I can create a table 

"<p>Okay, I can help you with that! But first, let's talk about time management.  You're working on a literature review, which is a big task.  Have you broken it down into smaller, manageable chunks?  What's your strategy for staying on track and avoiding procrastination?</p>\n<p>Now, let's get back to your literature review.  I need a little more information to create a table for you.  Could you please tell me:</p>\n<ol>\n<li><strong>What are the specific research questions you are trying to answer?</strong> This will help me identify the most relevant information from the documents.</li>\n<li><strong>What are the key themes or categories you want to include in your table?</strong> For example, you might want to include columns for author, year, methodology, findings, etc.</li>\n</ol>\n<p>Once I have this information, I can create a table that summarizes the key findings from your documents.</p>\n<p>Remember, time is precious!  Let's make sure you're using it wisely.  How do you plan 

In [29]:
# Second turn in the same session: send a structured extraction prompt listing
# the fields wanted for each paper. The whole prompt is passed as the user
# question, so it is also the text handed to the history-aware retriever.
result.get_rag_output("""

You are a [Research Assitant] bot. You help with creating [Literature Review]. 

Input :You will be given access to a [research paper]


Task: You have to [extract] the following information:
Information To BE extracted is present in backticks: 
```
Paper Name
Focus Area of the paper
Methodology: (eg Qualitative, Quantitative, Review, Conceptual, Report)
Key Findings: in 10 words what the paper has implemented/achieved
Application: real life potential use cases (summerize in 10 words or 1-2 sentences)
Challenges:  Drawbacks of this paper/approach (summerize 1-2 short sentences)
Opportunities: Future scope/possibilities of paper (summerize in 1-2 sentences)
```

""")




              In memory chat history: 

{'9egV2T': InMemoryChatMessageHistory(messages=[HumanMessage(content='Give me literature review of all the documents in markdown, tabular format'), AIMessage(content="Okay, I can help you with that! But first, let's talk about time management.  You're working on a literature review, which is a big task.  Have you broken it down into smaller, manageable chunks?  What's your strategy for staying on track and avoiding procrastination? \n\nNow, let's get back to your literature review.  I need a little more information to create a table for you.  Could you please tell me:\n\n1. **What are the specific research questions you are trying to answer?** This will help me identify the most relevant information from the documents.\n2. **What are the key themes or categories you want to include in your table?** For example, you might want to include columns for author, year, methodology, findings, etc.\n\nOnce I have this information, I can create a table 

"<p>Okay, I'm ready to help you with your literature review!  Just paste the research paper here, and I'll extract the information you need.  I'll do my best to summarize the key findings, applications, challenges, and opportunities in a concise and clear way.</p>\n<p>Remember, time is valuable!  How are you managing your time while working on this literature review?  Are you using any specific techniques to stay focused and productive?</p>\n"