In [1]:
import os
from dotenv import load_dotenv
import guardrails as gd
import openai
from unstructured.partition.api import partition_via_api


In [2]:
# Pull API keys from a local dotenv file when one exists (see .env.example).
ENV_FILE = "../.env.local"
if not os.path.exists(ENV_FILE):
    print("No .env.local found")
else:
    # override=True: values from the file replace variables already set in the environment.
    load_dotenv(ENV_FILE, override=True)
    print("Loaded .env.local")

Loaded .env.local


In [5]:
# PDF to partition; also echoed in the final attribution text.
PDF_INPUT_FILE = "msft.pdf"
# Read the key from the environment rather than hardcoding it.
# Raises KeyError if the variable is not set.
UNSTRUCTURED_API_KEY = os.environ["UNSTRUCTURED_API_KEY"]

In [6]:
# Partition the PDF into elements through the Unstructured API.
# coordinates=True is required here: later cells read each element's
# metadata.coordinates.points to attribute excerpts to a location.
elements = partition_via_api(
    filename=PDF_INPUT_FILE,
    api_key=UNSTRUCTURED_API_KEY,
    coordinates=True,
)

In [33]:
# One "<element id> -- <element text>" line per element; this is the text handed to the LLM,
# so it can cite element IDs alongside the excerpts it quotes.
pdf_input = "\n".join(" -- ".join((element.id, element.text)) for element in elements)
# Lookup table from element ID to that element's coordinate points.
element_coordinates = dict(
    (element.id, element.metadata.coordinates.points) for element in elements
)

## Generate and Attribute

In [25]:
# Build the Guard from the RAIL spec file "pdf.rail" (resolved relative to the working directory).
# NOTE(review): the spec presumably defines the "pdf_qa" output schema and the prompt that
# consumes `pdf_text` / `user_question` — confirm against pdf.rail.
guard = gd.Guard.from_rail("pdf.rail")

In [43]:
question = "What is this form about?"

# Clear results left over from an earlier run of this cell. Without this, a
# failed call (the error branch below, or an exception raised by the API call)
# would leave the previous question's answer/excerpts/ids in the kernel, and a
# later cell would present them as if they answered the new question.
answer, excerpts, ids = None, [], []

# The guard wraps the LLM call and returns the raw LLM text together with the
# validated output.
# NOTE(review): prompt_params presumably fill the prompt template in pdf.rail — confirm.
raw_llm_output, validated_output = guard(
    openai.ChatCompletion.create,
    prompt_params={"pdf_text": pdf_input, "user_question": question},
    model="gpt-3.5-turbo-16k",
    temperature=0,
)

# Treat a missing result, or one without the "pdf_qa" section, as a failure.
if validated_output is None or "pdf_qa" not in validated_output:
    print(f"Error: {validated_output}")
else:
    pdf_qa = validated_output["pdf_qa"]
    answer = pdf_qa["answer"]
    # Each entry pairs a quoted excerpt with the ID of the element it was taken from.
    excerpts = [entry["excerpt"] for entry in pdf_qa["pdf_excerpts"]]
    ids = [entry["id"] for entry in pdf_qa["pdf_excerpts"]]

    print(f"Answer: {answer}")
    print(f"Excerpts: {excerpts}")
    print(f"IDs: {ids}")

Answer: This form is the Microsoft Corporation Annual Report for the fiscal year ended June 30, 2022.
Excerpts: ['Microsoft Corporation Annual Report 2022', 'Form 10-K (NASDAQ:MSFT)', 'For the Fiscal Year Ended June 30, 2022']
IDs: ['0ee518dec3470425c21d8cfbb139f25d', '59479c9296a32396cd0319b19c1bad0e', '5f77f3fd8691310f6e0a2dead18977aa']


## Visualize
Many different UIs could be built to show the results of the GAL process, but in this notebook we will simply
print the results as a formatted string.

Here we use the ID returned by the LLM to look up the coordinates of the corresponding element in the PDF. These coordinates are the corner points of the element's bounding box, so they can be used to draw a box around the element in the PDF.

In [45]:
# Header of the report: the answer, followed by one attribution line per excerpt.
output_str = f"""
{answer}

I generated this answer based on the following excerpts:
"""

# The loop variable is `element_id`, not `id`: a top-level loop variable stays
# in the kernel namespace, so `id` would shadow the builtin id() for every
# later cell.
for excerpt, element_id in zip(excerpts, ids):
    print(excerpt)
    print(element_id)
    # The ID comes from the LLM, so it may not match any parsed element.
    # Report that in the output instead of aborting the cell with a KeyError.
    coordinates = element_coordinates.get(element_id, "unknown coordinates")
    output_str += f"\t{excerpt} found at {coordinates} in {PDF_INPUT_FILE}\n"

print(output_str)

Microsoft Corporation Annual Report 2022
0ee518dec3470425c21d8cfbb139f25d
Form 10-K (NASDAQ:MSFT)
59479c9296a32396cd0319b19c1bad0e
For the Fiscal Year Ended June 30, 2022
5f77f3fd8691310f6e0a2dead18977aa

This form is the Microsoft Corporation Annual Report for the fiscal year ended June 30, 2022.

I generated this answer based on the following excerpts:
	Microsoft Corporation Annual Report 2022 found at ((103.82100870000001, 311.31328289759995), (103.82100870000001, 330.5186396976), (490.97291395368, 330.5186396976), (490.97291395368, 311.31328289759995)) in msft.pdf
	Form 10-K (NASDAQ:MSFT) found at ((201.6482949, 347.2897175232), (201.6482949, 361.6937351232), (392.78744784936003, 361.6937351232), (392.78744784936003, 347.2897175232)) in msft.pdf
	For the Fiscal Year Ended June 30, 2022 found at ((222.3540702, 75.14256885840007), (222.3540702, 82.94474505840003), (370.92917109113995, 82.94474505840003), (370.92917109113995, 75.14256885840007)) in msft.pdf

