# SEC 10-Q Eval

Evaluating Docugami KG-RAG against OpenAI Assistants Retrieval for this dataset: https://github.com/docugami/KG-RAG-datasets/tree/main/sec-10-q

## Set up Eval

In [None]:
# Start from a clean checkout: remove any previous "temp" directory, then
# clone the KG-RAG datasets repository into "temp".
!rm -rf temp
!git clone https://github.com/docugami/KG-RAG-datasets.git temp

In [None]:
import os
from pathlib import Path
from datetime import datetime

# Important: create your OpenAI assistant at https://platform.openai.com/playground
#            and paste its ID below. Upload to the assistant exactly the files named
#            in FILE_NAMES (the same files are uploaded to Docugami automatically).
OPENAI_ASSISTANT_ID = "asst_qY1M0SeFYlmqkEZsMVZX2VAK"

DOCSET_NAME = "SEC 10Q Filings"
# The eval name is the docset name followed by today's date (YYYY-MM-DD).
EVAL_NAME = f"{DOCSET_NAME} {datetime.now():%Y-%m-%d}"

# Location of the PDFs inside the cloned dataset, relative to the working directory.
FILES_DIR = Path.cwd() / "temp" / "sec-10-q" / "docs"

# Twenty PDFs named "<year> <quarter> <ticker>.pdf": five tickers for each of
# four quarters, ordered by quarter and then by ticker.
FILE_NAMES = [
    f"{quarter} {ticker}.pdf"
    for quarter in ("2022 Q3", "2023 Q1", "2023 Q2", "2023 Q3")
    for ticker in ("AAPL", "AMZN", "INTC", "MSFT", "NVDA")
]

# The mini set keeps costs down while developing; switch to the full set for
# actual runs (~$300 in OpenAI costs per run).
GROUND_TRUTH_CSV = (
    Path.cwd() / "temp" / "sec-10-q" / "data" / "v1" / "qna_data_mini.csv"
)

# Each experiment is repeated this many times and averaged,
# because results vary slightly from run to run.
PER_EXPERIMENT_RUN_COUNT = 5

# Note: Docugami needs ~6 (or more!) similar files processed together as a
# document set in order to automatically detect motifs across the set and
# generate a semantic XML Knowledge Graph.
assert len(FILE_NAMES) >= 6, "Please provide at least 6 files"

In [None]:
import pandas as pd
from langsmith import Client

# Load the ground-truth question/answer pairs.
df = pd.read_csv(GROUND_TRUTH_CSV)

# Reuse the LangSmith dataset named after this eval when one already exists;
# otherwise create it and add one example per CSV row.
client = Client()
dataset_name = EVAL_NAME
existing_datasets = list(client.list_datasets(dataset_name=dataset_name))
if not existing_datasets:
    dataset = client.create_dataset(dataset_name=dataset_name)
    for _, row in df.iterrows():
        client.create_example(
            inputs={"question": row["Question"]},
            outputs={"answer": row["Answer"]},
            dataset_id=dataset.id,
        )
else:
    # An existing dataset is read back as-is; it is not repopulated.
    dataset = client.read_dataset(dataset_name=dataset_name)

## Set up Docugami KG-RAG

#### Upload files to Docugami

In [None]:
from docugami import Docugami
from docugami.lib.upload import upload_to_named_docset, wait_for_dgml

dg_client = Docugami()
file_paths = [FILES_DIR / name for name in FILE_NAMES]

# Files that were previously uploaded (matched by name) are not uploaded again.
dg_docs = upload_to_named_docset(dg_client, file_paths, DOCSET_NAME)

# Derive the docset id and name from the returned docs.
docset_id = ""
docset_name = ""
for doc in dg_docs:
    if docset_id:
        # Every doc must belong to the same docset as the first one.
        assert docset_id == doc.docset.id
    else:
        docset_id = doc.docset.id

    if docset_name:
        continue
    docset_name = dg_client.docsets.retrieve(doc.docset.id).name

In [None]:
# Wait for files to finish processing (OCR, and zero-shot creation of the XML knowledge graph).

# Note: This can take some time on the free Docugami tier (up to ~20 mins). Please contact us for faster paid plans.
wait_for_dgml(dg_client, dg_docs)

In [None]:
# Run indexing
from docugami_kg_rag.helpers.indexing import index_docset

# Both values come from the upload cell; stop here if they are still empty
# (e.g. the upload cell was not run or returned no docs).
assert docset_id
assert docset_name

# Note: This can take some time since it is embedding and creating summaries for all the docs and chunks.
# NOTE(review): overwrite=True is passed on every run -- presumably this rebuilds
# any existing index for the docset; confirm against index_docset.
index_docset(docset_id=docset_id, name=docset_name, overwrite=True)

#### Create Docugami Agent

In [None]:
from langchain.agents import AgentExecutor
from docugami_kg_rag.chain import agent as docugami_agent, _get_tools, AgentInput

def predict_docugami_agent(input: dict) -> dict:
    """Answer one eval question with the Docugami KG-RAG agent.

    Args:
        input: Mapping with a "question" entry holding the question text.

    Returns:
        The "output" entry of the agent executor's result.
        NOTE(review): this is presumably the answer text rather than a dict
        as annotated -- confirm before tightening the annotation.
    """
    question = input["question"]

    # A fresh executor is built for every call.
    executor = AgentExecutor(agent=docugami_agent, tools=_get_tools())
    typed_executor = executor.with_types(input_type=AgentInput)

    payload = {
        "input": question,
        "use_reports": False,
        "chat_history": [],
    }
    return typed_executor.invoke(payload)["output"]

In [None]:
# Smoke-test the Docugami agent with a sample question to make sure it is working
predict_docugami_agent({"question": "How much did Microsoft spend for opex in the latest quarter?"})

## Set up OpenAI Assistants Retrieval

### Create OpenAI Agent

Please go to https://platform.openai.com/playground and create your assistant, then set its ID as `OPENAI_ASSISTANT_ID` in the setup cell.

In [None]:
from langchain.agents.openai_assistant import OpenAIAssistantRunnable

def predict_openai_agent(input: dict, config: dict = None) -> dict:
    """Answer one eval question with the OpenAI assistant.

    Args:
        input: Mapping with a "question" entry holding the question text.
        config: Optional config forwarded to ``with_config``; defaults to None.

    Returns:
        The "output" entry of the invocation result's ``return_values``.
        NOTE(review): this is presumably the answer text rather than a dict
        as annotated -- confirm before tightening the annotation.
    """
    assistant = OpenAIAssistantRunnable(
        assistant_id=OPENAI_ASSISTANT_ID,
        as_agent=True,
    )
    configured_assistant = assistant.with_config(config)

    finish = configured_assistant.invoke({"content": input["question"]})
    return finish.return_values["output"]

In [None]:
# Smoke-test the OpenAI assistant with a sample question to make sure it is working
predict_openai_agent({"question": "How much did Microsoft spend for opex in the latest quarter?"})

## Run Evals


In [None]:
import uuid
from langsmith.client import Client
from langchain.smith import RunEvalConfig
from langchain.globals import set_llm_cache, get_llm_cache

# Evaluation settings shared by every experiment: only the "qa" evaluator is requested.
eval_config = RunEvalConfig(
    evaluators=["qa"],
)


def run_eval(eval_func, eval_run_name):
    """Evaluate one prediction function against the EVAL_NAME dataset.

    Args:
        eval_func: Passed to ``run_on_dataset`` as ``llm_or_chain_factory``.
        eval_run_name: Passed to ``run_on_dataset`` as ``project_name``.

    Returns:
        None; the value returned by ``run_on_dataset`` is discarded.
    """
    langsmith_client = Client()
    run_options = {
        "dataset_name": EVAL_NAME,
        "llm_or_chain_factory": eval_func,
        "evaluation": eval_config,
        "verbose": True,
        "project_name": eval_run_name,
        # Reduced to help with rate limits, but will take longer
        "concurrency_level": 2,
    }
    langsmith_client.run_on_dataset(**run_options)


# Experiments: maps a project-name prefix to the prediction function that is
# evaluated under that prefix.
agent_map = {
    "docugami_kg_rag_zero_shot": predict_docugami_agent,
    "openai_assistant_retrieval": predict_openai_agent,
}

# Remember the current global LLM cache *before* entering the try block, so the
# finally clause always has a value to restore. (If this lookup lived inside the
# try and failed, the finally clause would raise a NameError that hides the
# real error.)
cache = get_llm_cache()
try:
    # Disable the global cache to get fresh results every time for all experiments,
    # since neither caching nor temperature-0 is supported for the OpenAI Assistants
    # API and we want to measure under similar conditions.
    set_llm_cache(None)

    for i in range(PER_EXPERIMENT_RUN_COUNT):
        # One shared id per repetition, so the projects of both agents from the
        # same repetition carry the same suffix.
        run_id = str(uuid.uuid4())
        for project_name, agent in agent_map.items():
            run_eval(agent, f"{project_name}_{run_id}")
finally:
    # Restore the cache that was active before the experiments started.
    set_llm_cache(cache)