## Setup

In [1]:
import os

from dotenv import dotenv_values
from azure.identity import DefaultAzureCredential
from modules.app_settings import AppSettings
from modules.model_training_client import ModelTrainingClient
from modules.document_canvas import (DocumentCanvas)
from modules.document_intelligence_label import DocumentIntelligenceLabel
from modules.document_intelligence_result_formatter import DocumentIntelligenceResultFormatter


In [2]:
# Absolute path of the current working directory; every other path in the
# notebook is built relative to it.
working_dir = os.path.abspath(os.curdir)

In [3]:
# Load the application settings from config.env in the working directory.
# The path is built with os.path.join, like the other paths in this notebook,
# instead of hard-coding a "/" separator inside an f-string.
settings = AppSettings(dotenv_values(os.path.join(working_dir, "config.env")))

In [4]:
# Options for the credential chain: the environment, managed identity, shared
# token cache, interactive browser and PowerShell credentials are switched off,
# while the Azure CLI and Visual Studio Code credentials are explicitly kept on.
credential_options = {
    "exclude_environment_credential": True,
    "exclude_managed_identity_credential": True,
    "exclude_shared_token_cache_credential": True,
    "exclude_interactive_browser_credential": True,
    "exclude_powershell_credential": True,
    "exclude_visual_studio_code_credential": False,
    "exclude_cli_credential": False,
}
azure_credential = DefaultAzureCredential(**credential_options)

In [5]:
# Client object used by the demo cells below.
# NOTE(review): a credential is supplied although use_azure_credential is
# False — confirm this combination is intentional.
model_training_client = ModelTrainingClient(
    settings=settings,
    azure_credential=azure_credential,
    use_azure_credential=False,
)

## Demo

In [6]:
    # case 1: 
    #     model_name = 'Contoso'
    #     pdf_file_name = 'invoice-logic-apps-test 2.pdf'

    # case 2:
    #     model_name = 'Contoso'
    #     pdf_file_name = 'invoice_AG42923.pdf'

    # case 3:
    #     model_name = 'Alton'
    #     pdf_file_name = 'invoice_AG42923.pdf'

    # case 4:
    #     model_name = 'Combivan'
    #     pdf_file_name = 'invoice-logic-apps-test 2.pdf'

    # case 5:
    #     model_name = 'Combivan'
    #     pdf_file_name = 'invoice_AG42923.pdf'

    # case 6:
    #     model_name = 'CombivanV2'
    #     pdf_file_name = 'invoice_6.pdf'

# Number of the demo case to run; cases 1-6 are listed in the comments above.
demostep = 1

### Parameters

In [7]:
match demostep:
    case 1: 
        model_name = 'Contoso'
        pdf_file_name = 'invoice-logic-apps-test 2.pdf'
    case 2:
        model_name = 'Contoso'
        pdf_file_name = 'invoice_AG42923.pdf'
    case 3:
        model_name = 'Alton'
        pdf_file_name = 'invoice_AG42923.pdf'
    case 4:
        model_name = 'Combivan'
        pdf_file_name = 'invoice-logic-apps-test 2.pdf'
    case 5:
        model_name = 'Combivan'
        pdf_file_name = 'invoice_AG42923.pdf'
    case 6:
        model_name = 'CombivanV2'
        pdf_file_name = 'invoice_6.pdf'

In [8]:
# The version of the model; used below to name the feedback analysis file.
initial_model_version = '1.0.0'

# The ID of the model used for the initial analysis: the model name chosen by
# the demo case.
initial_model_id = model_name

# pdf_file_name (the PDF file the user is providing) is set by the demo-case
# selection.

# The file name without its extension. os.path.splitext is used instead of
# slicing off a fixed four characters, so a name whose extension is not exactly
# four characters long (or that has none) is not mangled.
base_file_name = os.path.splitext(pdf_file_name)[0]

# The directory containing the PDF file.
pdf_dir = os.path.join(working_dir, 'pdfs')

# The file path to the PDF file for loading.
pdf_path = os.path.join(pdf_dir, pdf_file_name)

# The file path to where the required JSON result from Azure AI Document Intelligence layout analysis will be stored.
pdf_ocr_path = os.path.join(pdf_dir, f"{pdf_file_name}.ocr.json")

# The file path to where the initial analysis of the user feedback document will be stored.
pdf_feedback_path = os.path.join(pdf_dir, f"{pdf_file_name}.ocr_{initial_model_version}.json")

# The file path to where the required JSON result for Azure AI Document Intelligence labels will be stored after user feedback.
pdf_labels_path = os.path.join(pdf_dir, f"{pdf_file_name}.labels.json")

# The file path to where the required document fields are, based on the original model training data.
document_fields_path = os.path.join(working_dir, 'model_training', 'fields.json')

In [9]:
# For providing the feedback, the user would perform their analysis using your initial model.
# The call receives the PDF path, the feedback-result path and the initial model ID.
# NOTE(review): presumably the analysis result is also written to pdf_feedback_path —
# confirm in ModelTrainingClient.run_layout_analysis.
# As the last expression of the cell, the returned value is rendered as the cell output.
model_training_client.run_layout_analysis(pdf_path, pdf_feedback_path, initial_model_id)

{'status': 'succeeded',
 'createdDateTime': '2024-09-19T23:47:23Z',
 'lastUpdatedDateTime': '2024-09-19T23:47:23Z',
 'analyzeResult': {'apiVersion': '2023-07-31',
  'modelId': 'Contoso',
  'content': 'CONTOSO LTD.\nContoso Headquarters 123 456th St New York, NY, 10001\nMicrosoft Corp 123 Other St, Redmond WA, 98052\nBILL TO:\nMicrosoft Finance\n123 Bill St,\nRedmond WA, 98052\nSHIP TO:\nMicrosoft Delivery\n123 Ship St,\nRedmond WA, 98052\nINVOICE\nINVOICE: INV-100 INVOICE DATE: 11/15/2019 DUE DATE: 12/15/2019 CUSTOMER NAME: MICROSOFT CORPORATION SERVICE PERIOD: 10/14/2019 - 11/14/2019 CUSTOMER ID: CID-12345\nSERVICE ADDRESS: Microsoft Services 123 Service St, Redmond WA, 98052\nSALESPERSON\nP.O. NUMBER\nREQUISITIONER\nSHIPPED VIA\nF.O.B. POINT\nTERMS\nPO-3333\nDATE\nITEM CODE\nDESCRIPTION\nQTY\nUM\nPRICE\nTAX\nAMOUNT\n3/4/2021\nA123\nConsulting Services\n2\nhours\n$30.00\n10%\n$60.00\n3/5/2021\nB456\nDocument Fee\n3\n$10.00\n5%\n$30.00\n3/6/2021\nC789\nPrinting Fee\n10\npages\n$1.00\n2

In [10]:
# Create the canvas helper, handing it the working directory.
doc_canvas = DocumentCanvas(working_dir)

# Build the canvases from the PDF, the document field definitions and the
# stored feedback analysis result.
# NOTE(review): presumably one canvas per PDF page — confirm in DocumentCanvas.load_pdf.
canvases = doc_canvas.load_pdf(pdf_path, document_fields_path, pdf_feedback_path)

### Display Doc-Intel Results

In [11]:
# Render every canvas as rich notebook output, one after another.
for canvas in canvases:
    display(canvas)

BBoxWidget(bboxes=[{'x': 115.56, 'y': 129.1, 'width': 351.56, 'height': 50.70000000000001, 'label': 'Vendor', …