# Importing Azure Form Recognizer Python modules


In [1]:
import os
from azure.ai.formrecognizer import FormRecognizerClient
from azure.ai.formrecognizer import FormTrainingClient
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
from pathlib import Path

dotenv_path = Path("../.env")
load_dotenv(dotenv_path=dotenv_path)

True

# Form Recognizer endpoint and key and instantiate object


In [2]:
# Get the environment variable
AZURE_FORM_RECOGNIZER_ENDPOINT = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
AZURE_FORM_RECOGNIZER_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY")

endpoint = AZURE_FORM_RECOGNIZER_ENDPOINT
key = AZURE_FORM_RECOGNIZER_KEY


form_recognizer_client = FormRecognizerClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

# ID Card detection


In [3]:
id_card_file_path = "ca-dl-avkash-chauhan.png"

with open(id_card_file_path, "rb") as c:
    poller = form_recognizer_client.begin_recognize_identity_documents(
        identity_document=c
    )
id_documents = poller.result()

In [4]:
for idx, id_document in enumerate(id_documents):
    FIELD_KEYS = [
        "FirstName",
        "LastName",
        "DocumentNumber",
        "DateOfBirth",
        "DateOfExpiration",
        "Sex",
        "Address",
        "CountryRegion",
        "Region",
    ]

for idx, id_document in enumerate(id_documents):
    print(f"--------Recognizing ID Card document #{idx + 1}--------")

    for field_key in FIELD_KEYS:
        field = id_document.fields.get(field_key)

        if field:
            # Special handling for the 'Sex' field
            if field_key == "Sex":
                value = field.value[1:]
            else:
                value = field.value

            print(f"{field_key}: {value} has confidence: {field.confidence}")

--------Recognizing ID Card document #1--------
FirstName: AVKASH CHAUHAN has confidence: 0.76
LastName: CHAUHAN has confidence: 0.883
DocumentNumber: D1234578 has confidence: 0.995
DateOfBirth: 1990-01-01 has confidence: 0.995
DateOfExpiration: 2025-01-01 has confidence: 0.992
Sex:  has confidence: 0.161
Address: 1234 Circle Ave, Apt 123 San Mateo, CA, 94401 has confidence: 0.585
CountryRegion: USA has confidence: 0.99
Region: California has confidence: 0.984


# Train Custom Boarding Pass Recognition


In [5]:
form_training_client = FormTrainingClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

saved_model_list = form_training_client.list_custom_models()

In [6]:
trainingDataUrl = os.getenv("AZURE_BLOB_CONTAINER_SAS_URI")

In [7]:
trainingDataUrl

'https://boardingkioskstorageacc.blob.core.windows.net/boardingkioskstorageacc?sp=racwdli&st=2024-04-22T09:59:52Z&se=2024-04-22T17:59:52Z&spr=https&sv=2022-11-02&sr=c&sig=S3mI9mCr4Ud3v%2FXPa9HKwD6iUO6H9M%2BjhZRmeiMPoCc%3D'

In [8]:
training_process = form_training_client.begin_training(
    trainingDataUrl, use_training_labels=False
)
custom_model = training_process.result()

In [9]:
custom_model

CustomFormModel(model_id=ee10241d-0159-45ab-983d-2a5c97491ad8, status=ready, training_started_on=2024-04-22 12:50:00+00:00, training_completed_on=2024-04-22 12:50:13+00:00, submodels=[CustomFormSubmodel(accuracy=None, model_id=ee10241d-0159-45ab-983d-2a5c97491ad8, fields={'field-0': CustomFormModelField(label=Baggage, name=field-0, accuracy=None), 'field-1': CustomFormModelField(label=Boarding Time, name=field-1, accuracy=None), 'field-2': CustomFormModelField(label=Carrier, name=field-2, accuracy=None), 'field-3': CustomFormModelField(label=Chicago, name=field-3, accuracy=None), 'field-4': CustomFormModelField(label=Class, name=field-4, accuracy=None), 'field-5': CustomFormModelField(label=Date, name=field-5, accuracy=None), 'field-6': CustomFormModelField(label=Flight No., name=field-6, accuracy=None), 'field-7': CustomFormModelField(label=From, name=field-7, accuracy=None), 'field-8': CustomFormModelField(label=From:, name=field-8, accuracy=None), 'field-9': CustomFormModelField(lab

In [10]:
custom_model.model_id

'ee10241d-0159-45ab-983d-2a5c97491ad8'

In [11]:
custom_model.status

'ready'

In [12]:
custom_model.training_started_on

datetime.datetime(2024, 4, 22, 12, 50, tzinfo=<isodate.tzinfo.Utc object at 0x00000291C771B910>)

In [13]:
custom_model.training_completed_on

datetime.datetime(2024, 4, 22, 12, 50, 13, tzinfo=<isodate.tzinfo.Utc object at 0x00000291C771B910>)

In [14]:
custom_model.training_documents

[TrainingDocumentInfo(name=boarding-james-webb.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding-james.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding-libby.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding-radha-s-kumar.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding-sameer.pdf, status=succeeded, page_count=1, errors=[], model_id=None)]

In [15]:
custom_model.properties

CustomFormModelProperties(is_composed_model=False)

# Extract Information Using the custom model


In [16]:
file_path = "boarding-avkash.pdf"

In [17]:
with open(file_path, "rb") as f:
    poller = form_recognizer_client.begin_recognize_custom_forms(
        model_id=custom_model.model_id,
        form=f,
        include_field_elements=True,
    )
forms = poller.result()

In [18]:
for idx, form in enumerate(forms):
    print("--------Recognizing Form #{}--------".format(idx + 1))
    print("Form was analyzed with model with ID {}".format(form.model_id))
    for name, field in form.fields.items():
        print(
            "Field '{}' has label '{}' with value '{}' and a confidence score of {}".format(
                name,
                field.label_data.text if field.label_data else name,
                field.value,
                field.confidence,
            )
        )

--------Recognizing Form #1--------
Form was analyzed with model with ID ee10241d-0159-45ab-983d-2a5c97491ad8
Field 'field-0' has label 'Passenger Name' with value 'Avkash Chauhan' and a confidence score of 1.0
Field 'field-1' has label 'Carrier' with value 'UA' and a confidence score of 1.0
Field 'field-2' has label 'Flight No.' with value '234' and a confidence score of 1.0
Field 'field-3' has label 'Class' with value 'E' and a confidence score of 1.0
Field 'field-4' has label 'Passenger Name' with value 'Avkash Chauhan' and a confidence score of 0.36
Field 'field-5' has label 'From:' with value 'San Francisco' and a confidence score of 1.0
Field 'field-6' has label 'Date' with value 'April 20, 2022' and a confidence score of 1.0
Field 'field-7' has label 'Baggage' with value 'NO' and a confidence score of 0.36
Field 'field-8' has label 'Seat' with value '20A' and a confidence score of 0.86
Field 'field-9' has label 'Seat' with value '20A' and a confidence score of 0.36
Field 'field-

In [19]:
import pandas as pd

# Create an empty DataFrame
df = pd.DataFrame(
    columns=[
        "Form Number",
        "Model ID",
        "Field Name",
        "Label",
        "Value",
        "Confidence Score",
    ]
)

for idx, form in enumerate(forms):
    print("--------Recognizing Form #{}--------".format(idx + 1))
    print("Form was analyzed with model with ID {}".format(form.model_id))
    for name, field in form.fields.items():
        label = field.label_data.text if field.label_data else name
        value = field.value
        confidence = field.confidence
        # Create a temporary DataFrame to hold the current row
        temp_df = pd.DataFrame(
            {
                "Form Number": [idx + 1],
                "Model ID": [form.model_id],
                "Field Name": [name],
                "Label": [label],
                "Value": [value],
                "Confidence Score": [confidence],
            }
        )
        # Concatenate the current DataFrame with the temporary one
        df = pd.concat([df, temp_df], ignore_index=True)

display(df)

--------Recognizing Form #1--------
Form was analyzed with model with ID ee10241d-0159-45ab-983d-2a5c97491ad8


Unnamed: 0,Form Number,Model ID,Field Name,Label,Value,Confidence Score
0,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-0,Passenger Name,Avkash Chauhan,1.0
1,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-1,Carrier,UA,1.0
2,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-2,Flight No.,234,1.0
3,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-3,Class,E,1.0
4,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-4,Passenger Name,Avkash Chauhan,0.36
5,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-5,From:,San Francisco,1.0
6,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-6,Date,"April 20, 2022",1.0
7,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-7,Baggage,NO,0.36
8,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-8,Seat,20A,0.86
9,1,ee10241d-0159-45ab-983d-2a5c97491ad8,field-9,Seat,20A,0.36


In [20]:
df.to_csv("forms_data.csv", index=False)