<a href="https://colab.research.google.com/github/deep-diver/agentic-system/blob/main/notebooks/document_parse_w_lang_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langgraph
!pip install openai
!pip install chromadb

In [None]:
import os

# Placeholder credential: replace "YOUR_API_KEY" with a real key before running
# the remaining cells. Presumably this variable is what the Upstage client
# libraries read for authentication -- confirm against their documentation.
os.environ["UPSTAGE_API_KEY"] = "YOUR_API_KEY"

In [None]:
from langchain_upstage import ChatUpstage

# NOTE(review): `tools` is assigned here but never read in the visible code.
tools = []

# Chat model with the agent's tool functions attached; `llm` is the object the
# `chatbot` node invokes.
llm = ChatUpstage()
# NOTE(review): `to_download_and_parse_paper_agent` and
# `to_retrive_paper_content_to_answer_question_agent` are defined further down
# in this file, so this cell can only run after those definitions have been
# executed. `to_paper_search_agent` is not defined anywhere in the visible
# source -- confirm where it comes from.
llm = llm.bind_tools(
    [
        to_paper_search_agent,
        to_download_and_parse_paper_agent,
        to_retrive_paper_content_to_answer_question_agent
    ]
)

In [None]:
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver

# from state import State
# from graph.nodes import chatbot, summarizer, summary_grader # test_edge, test_edge2
# from graph.edges import summary_grader_edge

from typing import Annotated
from typing_extensions import TypedDict

from langgraph.graph.message import add_messages

class State(TypedDict):
    """Shared state passed between the nodes and edges of the graph."""

    # Conversation history. `add_messages` is attached as annotation metadata;
    # presumably LangGraph uses it as the reducer that merges messages returned
    # by a node into this list -- confirm against the LangGraph docs.
    messages: Annotated[list, add_messages]
    # Flag read by `search_cached_paper_edge`. Declared as `bool`, but that
    # edge also tests it against `None`, so it may be unset at runtime.
    found: bool

def search_cached_paper_edge(state: State):
    """Return the routing key for the conditional edge based on ``found``.

    Args:
        state: Graph state; only the ``found`` entry is read.

    Returns:
        ``"qualified"`` when ``found`` is truthy, ``"unqualified"`` when it is
        missing, ``None`` or ``False``.
    """
    # ``found`` may never have been written to the state, so read it with
    # ``get`` instead of indexing; a missing or falsy flag means "not found".
    if state.get("found"):
        print("---Paper Found on ChromaDB---")
        return "qualified"

    # The two outcomes must map to different keys, otherwise the conditional
    # edge can never take its "unqualified" route.
    print("---Paper Not Found on ChromaDB---")
    return "unqualified"

def chatbot(state: State):
    """Invoke the module-level ``llm`` on the conversation and return the update.

    Args:
        state: Graph state; ``messages`` is sent to the model and an optional
            ``new_summary`` entry is copied into the result.

    Returns:
        A dict with the model reply under ``messages`` and, when the state
        contains ``new_summary``, that value wrapped in a list under
        ``summary``.
    """
    # NOTE(review): neither "new_summary" nor "summary" is declared on the
    # `State` TypedDict visible in this file -- confirm this branch is still
    # reachable.
    reply = llm.invoke(state["messages"])
    update = {"messages": [reply]}

    if "new_summary" not in state:
        print("---chatbot(NO NEW SUMMARY FOUND)---")
        return update

    print("---chatbot(NEW SUMMARY FOUND)---")
    update["summary"] = [state["new_summary"]]
    return update

def compile_graph(memory: MemorySaver):
    """Build the state graph and compile it with ``memory`` as checkpointer.

    Args:
        memory: Checkpointer handed to ``compile``.

    Returns:
        The object returned by ``graph_builder.compile``.
    """
    graph_builder = StateGraph(State)

    # "chatbot" is the only node registered on the builder.
    graph_builder.add_node("chatbot", chatbot)
    # NOTE(review): "summarizer" and "summary_grader" are referenced by the
    # edges below but are never added with `add_node` in this function, and
    # "chatbot" has no outgoing edge. LangGraph is expected to reject edges to
    # unknown nodes when compiling -- confirm and wire the intended nodes.
    graph_builder.add_edge("summarizer", "summary_grader")
    # `search_cached_paper_edge` returns the key that is looked up in the
    # mapping below to choose the next step.
    graph_builder.add_conditional_edges(
        "summary_grader",
        search_cached_paper_edge,
        {
            "qualified": END,
            "unqualified": "summary_grader"
        }
    )
    graph_builder.set_entry_point("chatbot")
    return graph_builder.compile(checkpointer=memory)

In [None]:
from langgraph.graph import StateGraph
from langgraph.checkpoint.memory import MemorySaver

from graph import compile_graph

def stream_graph_updates(graph: StateGraph, user_input: str, config: dict):
    """Send one user message through ``graph`` and print every streamed value.

    Args:
        graph: Object exposing ``stream(payload, config)`` that yields
            dict-like events.
        user_input: Text wrapped as a single ``user`` message.
        config: Passed unchanged as the second argument to ``graph.stream``.
    """
    payload = {"messages": [{"role": "user", "content": user_input}]}

    # Flatten lazily so each value is printed as soon as its event arrives.
    node_outputs = (
        output
        for event in graph.stream(payload, config)
        for output in event.values()
    )
    for output in node_outputs:
        print(output)

def main():
    """Run an interactive console chat loop against the compiled graph.

    Re-prompts until a non-blank line is entered, leaves the loop on "quit",
    "exit" or "q" (case-insensitive, surrounding whitespace ignored), and
    otherwise streams the graph's updates for the input. Every turn uses the
    same ``thread_id`` ("1") in the config passed to the graph.
    """
    memory = MemorySaver()
    graph = compile_graph(memory)
    config = {"configurable": {"thread_id": "1"}}

    while True:
        try:
            user_input = input("User: ")
            while not user_input.strip():
                user_input = input("User: ")

            if user_input.strip().lower() in ["quit", "exit", "q"]:
                print("Goodbye!")
                break

            stream_graph_updates(graph, user_input, config)
        except EOFError:
            # stdin was closed: nothing more to read, end the session quietly.
            break
        except Exception as e:
            # Still stop the loop on failure, but say why instead of exiting
            # without any output.
            print(f"Error: {e!r}")
            break

if __name__ == "__main__":
    main()

In [None]:
import json
import requests
import chromadb
import numpy as np
from PyPDF2 import PdfReader, PdfWriter
from chromadb import Documents, EmbeddingFunction, Embeddings

# Chroma client persisted under ./chroma_db (relative to the working directory).
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Maximum number of characters per chunk when parsed markdown is split before
# being added to a collection (see `get_md_with_document_parse`).
embedding_context_length = 4000

class UpstageEmbeddingFunction(EmbeddingFunction[Documents]):
    """Chroma embedding function that delegates to an embeddings client.

    Args:
        client: Object exposing ``embeddings.create(model=..., input=...)``
            whose result has a ``data`` sequence of items with an
            ``embedding`` attribute.
        model_name: Model identifier forwarded to ``embeddings.create``.
    """

    def __init__(self, client, model_name: str = "embedding-query"):
        self.client = client
        self.model_name = model_name

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` in one request and return a float32 array.

        Raises:
            ValueError: If any item of ``input`` is not a string.
        """
        for item in input:
            if not isinstance(item, str):
                raise ValueError("Solar embedding only supports text documents, not images")

        response = self.client.embeddings.create(model=self.model_name, input=input)
        vectors = [entry.embedding for entry in response.data]
        return np.array(vectors, dtype=np.float32)

# Shared embedding function used when creating and querying collections.
# NOTE(review): `client` is not defined in the visible part of this file;
# presumably it is an OpenAI-compatible client pointed at the Upstage API --
# confirm it is created before this cell runs.
embedding_fn = UpstageEmbeddingFunction(client)

def get_md_with_document_parse(root_path, paper_url, paper_id):
    """Download a paper PDF, parse it to markdown and index it in Chroma.

    The PDF is saved as ``{root_path}/paper.pdf``, split with
    ``split_pdf_by_pages`` and each part is sent to
    ``get_document_parse_response``. Every raw response is written to
    ``{root_path}/response_{n}.json``. The collected markdown is cut into
    chunks of at most ``embedding_context_length`` characters and stored in a
    new Chroma collection named ``paper_id``.

    Args:
        root_path: Existing directory for the PDF and JSON responses.
        paper_url: URL the PDF is downloaded from.
        paper_id: Collection name and prefix for the chunk ids.

    Returns:
        The newly created Chroma collection.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    # NOTE(review): `split_pdf_by_pages`, `get_document_parse_response` and
    # `UPSTAGE_API_KEY` are not defined in the visible part of this file --
    # confirm they exist before this function is called.

    # Fail fast on HTTP errors instead of saving an error page as "paper.pdf";
    # the timeout keeps an unresponsive server from hanging the tool call.
    response = requests.get(paper_url, timeout=60)
    response.raise_for_status()

    pdf_path = f"{root_path}/paper.pdf"
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    split_factor = 1
    split_pdfs = split_pdf_by_pages(pdf_path, root_path, split_factor)

    markdown_parts = []
    for part_no, split_pdf in enumerate(split_pdfs, start=1):
        upstage_response = get_document_parse_response(split_pdf, UPSTAGE_API_KEY)

        # Persist the raw response next to the PDF.
        json_output_path = f"{root_path}/response_{part_no}.json"
        with open(json_output_path, "w") as json_file:
            json.dump(upstage_response, json_file, indent=2)

        try:
            markdown_parts.append(upstage_response['content']['markdown'])
        except KeyError:
            # Best effort: keep going, but make the gap visible.
            print(f"No markdown in parse response for part {part_no}; skipping")

    markdown = "".join(markdown_parts)

    collection = chroma_client.create_collection(name=paper_id, embedding_function=embedding_fn)

    # Fixed-size character chunks; empty markdown is still stored as a single
    # empty document.
    chunks = [
        markdown[start:start + embedding_context_length]
        for start in range(0, len(markdown), embedding_context_length)
    ]
    processed_input = chunks or [markdown]
    ids = [f"{paper_id}_{i}" for i in range(len(processed_input))]

    collection.add(documents=processed_input, ids=ids)
    return collection

def to_download_and_parse_paper_agent(paper_url: str):
    """Use this to download and parse paper only when paper URL is found."""
    # The docstring above is kept verbatim: this function is passed to
    # `llm.bind_tools`, so it is presumably sent to the model as the tool
    # description -- details therefore live in comments.
    #
    # paper_url: URL of the paper; its last path segment becomes the paper id,
    #            which is also the cache directory and the collection name.
    # Returns:   a status sentence that includes the paper id.
    # Raises:    ValueError when no paper id can be derived from the URL.
    import shutil

    # Ignore trailing slashes so ".../1234.5678/" yields "1234.5678", not "".
    paper_id = paper_url.rstrip("/").rsplit("/", 1)[-1]
    if not paper_id:
        raise ValueError(f"cannot derive a paper id from URL: {paper_url!r}")
    root_path = paper_id

    if os.path.exists(root_path):
        print(f"Found cached markdown for {paper_id}")
        return f"we already have the paper content stored in our database in the id of {paper_id}"

    print(f"No cached markdown found for {paper_id}, parsing from URL")
    os.makedirs(root_path, exist_ok=True)
    try:
        get_md_with_document_parse(root_path, paper_url, paper_id)
    except Exception:
        # The directory doubles as the "already parsed" marker, so remove it
        # when parsing fails; otherwise later calls would report a cache hit
        # for a paper that was never stored.
        shutil.rmtree(root_path, ignore_errors=True)
        raise
    return f"we have parsed the paper content and stored in our database in the id of {paper_id}"

def to_retrive_paper_content_to_answer_question_agent(question: str, paper_id: str):
    """Use this to answer question about the paper."""
    # The docstring above is kept verbatim: this function is passed to
    # `llm.bind_tools`, so it is presumably sent to the model as the tool
    # description -- details therefore live in comments.
    #
    # question: text used as the single query against the collection.
    # paper_id: name of the Chroma collection to search.
    # Returns:  a header followed by one numbered line per retrieved passage.
    collection = chroma_client.get_collection(name=paper_id, embedding_function=embedding_fn)
    results = collection.query(query_texts=[question], n_results=10)

    # Chroma returns one list of documents per query text. Only one question
    # is sent, so the passages are the first inner list; iterating the outer
    # list would produce a single line containing the repr of the whole list.
    documents_per_query = results["documents"] or [[]]
    passages = documents_per_query[0]

    results_str = ["Retrieved Paper Content\n-----------------------------------\n"]
    results_str.extend(f"{i}: {passage}" for i, passage in enumerate(passages))
    return "\n".join(results_str)
