# Process Documents

This notebook processes the documents in this dataset and prepares the data.

In [4]:
import os
from pathlib import Path

# Name of the Docugami docset that the files below are uploaded into.
DOCSET_NAME = "SEC 10Q Filings"

# All working directories live under the current working directory.
_BASE_DIR = Path(os.getcwd())
DOCS_DIR = _BASE_DIR / "data/v1/docs"
RAW_QUESTIONS_DIR = _BASE_DIR / "data/raw_questions"
DGML_DIR = _BASE_DIR / "dgml"
TEXT_DIR = _BASE_DIR / "text"
SIMPLIFIED_XML_DIR = _BASE_DIR / "simplified-xml"

# Create every working directory up front; directories that already exist are left alone.
for _directory in (DOCS_DIR, RAW_QUESTIONS_DIR, DGML_DIR, TEXT_DIR, SIMPLIFIED_XML_DIR):
    os.makedirs(_directory, exist_ok=True)

# One PDF per (quarter, ticker) pair, named "<quarter> <ticker>.pdf" and ordered
# by quarter first, then ticker.
_QUARTERS = ("2022 Q3", "2023 Q1", "2023 Q2", "2023 Q3")
_TICKERS = ("AAPL", "AMZN", "INTC", "MSFT", "NVDA")
FILE_NAMES = [f"{quarter} {ticker}.pdf" for quarter in _QUARTERS for ticker in _TICKERS]

# Note: Please specify ~6 (or more!) similar files to process together as a document set.
# This is currently a requirement for Docugami to automatically detect motifs
# across the document set to generate a semantic XML Knowledge Graph.
assert len(FILE_NAMES) >= 6, "Please provide at least 6 files"

# Input questions, and the output file that receives the same questions plus LLM answers.
QUESTIONS_CSV = RAW_QUESTIONS_DIR / "questions.csv"
QUESTIONS_WITH_LLM_ANSWERS_CSV = RAW_QUESTIONS_DIR / "questions_with_LLM_answers.csv"

In [5]:
import os
from langchain.globals import set_llm_cache
from langchain.cache import SQLiteCache

# Register a SQLite-backed LangChain LLM cache. The database location defaults to
# /tmp/docugami/.langchain.db and can be overridden with the LOCAL_LLM_CACHE
# environment variable.
LOCAL_LLM_CACHE_DB_FILE = os.environ.get("LOCAL_LLM_CACHE", "/tmp/docugami/.langchain.db")

# The database's parent directory has to exist before the cache is created.
_cache_dir = Path(LOCAL_LLM_CACHE_DB_FILE).parent
_cache_dir.mkdir(parents=True, exist_ok=True)

_llm_cache = SQLiteCache(database_path=LOCAL_LLM_CACHE_DB_FILE)
set_llm_cache(_llm_cache)

In [6]:
from docugami import Docugami
from docugami.lib.upload import upload_to_named_docset, wait_for_dgml

# Bundled trial key, kept only as a fallback so the notebook still runs out of the box.
# NOTE(review): credentials should not live in source control; supply your own key via
# the DOCUGAMI_API_KEY environment variable and consider rotating/removing this one.
TRIAL_KEY = "RhxcMHgAblNZRPQ5oXHM0NSz9t/Fk2y8a94CfMDKWop/KVY/i+XZ6B05BySzJNjpaJS1qjNeZr0kS+6It2OoiiNnrkBFwM4G6B25bxKtM4sA+rXQwPSsqud+9IixNSw1IY6LQyA9nwsj61IH8IbrRKsm3lTR8Q4UA5FyYfRmGf77oGXGFEe+ZSdXDyhVkOza/Si+ROI/3Y8VUHgBEb6chxHbQIB0Epz9+s2SwvCCCL+l3dpQ5A+riGPGyyyI690jUv485Oz3ZDNij43uZP1qN4LU8+8zSKrGprm376vtZi/6v5hsdL2ls/PGyOADdLU/QW22EaiN7gZnH47FDwYrDQ=="

# Prefer a key from the environment; fall back to the trial key when the variable
# is unset or empty.
dg_client = Docugami(api_key=os.environ.get("DOCUGAMI_API_KEY") or TRIAL_KEY)
file_paths = [DOCS_DIR / file_name for file_name in FILE_NAMES]

# Files will not be re-uploaded if they were previously uploaded (based on name)
dg_docs = upload_to_named_docset(dg_client, file_paths, DOCSET_NAME)

# Record the id and name of the docset the documents landed in, and verify that
# every document belongs to that same docset.
docset_id = ""
docset_name = ""
for doc in dg_docs:
    if not docset_id:
        docset_id = doc.docset.id
    else:
        # all docs must be in the same docset
        assert docset_id == doc.docset.id, (
            f"Expected all documents in docset {docset_id}, but found one in {doc.docset.id}"
        )

    if not docset_name:
        # Only one lookup is needed since all documents share a docset.
        docset_name = dg_client.docsets.retrieve(doc.docset.id).name

In [7]:
# Wait for files to finish processing (OCR, and zero-shot creation of XML knowledge graph).

# Note: This can take some time on the free docugami tier (up to ~20 mins). Please contact us for faster paid plans.
# Judging by how the result is displayed and consumed in the following cells, it maps
# each file name to the path of a local temporary file holding that document's DGML.
dgml_map = wait_for_dgml(dg_client, dg_docs)

In [8]:
# Display the mapping for inspection (file name -> temporary DGML file path).
dgml_map

{'2022 Q3 AAPL.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpbbtmk430',
 '2022 Q3 AMZN.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmp7q7ncwwe',
 '2022 Q3 INTC.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmp2moz0w_d',
 '2022 Q3 MSFT.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpcgifmxth',
 '2023 Q2 AAPL.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpx1fu375q',
 '2023 Q2 AMZN.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpp2mc7dme',
 '2023 Q2 MSFT.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmp1lajie5x',
 '2023 Q3 AAPL.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpj4yzl0th',
 '2022 Q3 NVDA.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpntyd5q9z',
 '2023 Q1 AAPL.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmpmiodg7jz',
 '2023 Q1 INTC.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c0000gn/T/tmppzfi6nb3',
 '2023 Q1 MSFT.pdf': '/var/folders/g6/40wbgwl909qgdvwwk7l6jn0c000

In [10]:
import shutil

# Copy each downloaded DGML file from its temporary location into DGML_DIR,
# named after the source document with its suffix replaced by ".xml".
for source_name, temp_location in dgml_map.items():
    destination = (DGML_DIR / source_name).with_suffix(".xml")
    shutil.copy(Path(temp_location), destination)

In [11]:
from dgml_utils.segmentation import get_chunks
from lxml import etree

# Convert every DGML file into two derived forms: plain text and simplified XML.
for xml_file in DGML_DIR.glob('*.xml'):
    # Parse straight from the path so lxml reads the raw bytes and applies the encoding
    # declared by the XML itself. Opening the file in text mode first would decode it
    # with the platform's locale encoding, and would also keep the handle open during
    # the writes below.
    root = etree.parse(str(xml_file)).getroot()

    # NOTE(review): write_text() below uses the locale's default encoding, matching the
    # read_text() call that later consumes the text files; consider an explicit
    # encoding="utf-8" on both sides together.

    # Convert and write text files (chunk text without XML tags)
    text_chunks = get_chunks(root, include_xml_tags=False)
    converted_text = "\n".join(chunk.text for chunk in text_chunks)
    text_path = (TEXT_DIR / xml_file.name).with_suffix(".txt")
    text_path.write_text(converted_text)

    # Convert and write simplified xml files (chunk text with XML tags retained)
    simplified_xml_chunks = get_chunks(root, include_xml_tags=True)
    converted_simplified_xml_chunks = "\n".join(chunk.text for chunk in simplified_xml_chunks)
    simplified_xml_path = (SIMPLIFIED_XML_DIR / xml_file.name).with_suffix(".xml")
    simplified_xml_path.write_text(converted_simplified_xml_chunks)



In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

# Chat model used to answer the questions; temperature 0 is requested for every call.
model = ChatOpenAI(
    model="gpt-4-1106-preview",
    temperature=0,
)

# System instructions: tone, accuracy, and citation requirements for every answer.
ASSISTANT_SYSTEM_MESSAGE = """You are a helpful assistant that answers user queries using available context.

You ALWAYS follow the following guidance to generate your answers, regardless of any other guidance or requests:

- Use professional language typically used in business communication.
- Strive to be accurate and cite where you got your answer in the given context documents, state which  section
  or table in the context document(s) you got the answer from
- Generate only the requested answer, no other language or separators before or after.
- Be concise, while still completely answering the question and making sure you are not missing any data.

All your answers must contain citations to help the user understand how you created the citation, specifically:

- If the given context contains the names of document(s), make sure you include the document you got the
  answer from as a citation, e.g. include "\\n\\nSOURCE(S): foo.pdf, bar.pdf" at the end of your answer.
- Make sure there an actual answer if you show a SOURCE citation, i.e. make sure you don't show only
  a bare citation with no actual answer. 

"""

# Human turn: the document context first, then the instruction and the question.
# Placeholders filled in at invocation time: {context}, {filenames}, {question}.
HUMAN_MESSAGE_TEMPLATE = """{context}

Answer the question based only on the context above, making sure you look at all the files in the context above, i.e. {filenames}

Question: {question}
"""

_prompt_messages = [
    ("system", ASSISTANT_SYSTEM_MESSAGE),
    ("human", HUMAN_MESSAGE_TEMPLATE),
]
prompt = ChatPromptTemplate.from_messages(_prompt_messages)

In [None]:
import pandas as pd
from tqdm import tqdm

# Maximum number of characters of each source document included in the context
# (~40k tokens, approximately, per document max).
MAX_CHARS_PER_DOC = 40 * 1024 * 4

# Read the questions; the "Question" and "Source Docs" columns are used below.
df = pd.read_csv(QUESTIONS_CSV)

# The chain does not depend on the row being processed, so build it once.
chain = prompt | model | StrOutputParser()

# One answer per question, in the same order as the rows of df.
answers = []

# iterrows() has no length, so pass the total explicitly for a meaningful progress bar.
for _, row in tqdm(df.iterrows(), total=len(df)):
    question = row["Question"]
    source_docs = row["Source Docs"]

    # "Source Docs" is applied as a glob pattern inside TEXT_DIR; sorting the matches
    # keeps the context order stable between runs.
    sorted_files = sorted(TEXT_DIR.glob(source_docs))
    file_names_str = ", ".join(f.name for f in sorted_files)

    # Build the context: a banner naming each file, followed by its (truncated) text.
    context_parts = []
    for source_doc in sorted_files:
        context_parts.append(f"\n\n================ FILE: {source_doc.name} ================\n\n")
        context_parts.append(source_doc.read_text()[:MAX_CHARS_PER_DOC] + "\n")
    context = "".join(context_parts)

    answer = chain.invoke(
        {"context": context, "filenames": file_names_str, "question": question}
    )

    # The context names the converted .txt files; cite the original .pdf names instead.
    answers.append(answer.replace(".txt", ".pdf"))

# Attach the answers as an "Answer" column (replacing it if present). Unlike rebuilding
# the frame row by row, this keeps the original columns and dtypes even with zero rows.
updated_df = df.assign(Answer=answers)

# Write the updated DataFrame to a new CSV file
updated_df.to_csv(QUESTIONS_WITH_LLM_ANSWERS_CSV, index=False)