## Part 1 - Extract information from the Digital ID

In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv

In [2]:
# Load settings from a local .env file (python-dotenv) so the os.getenv
# lookups in the following cells can find the Form Recognizer settings.
load_dotenv()

True

In [3]:
# Form Recognizer connection settings, read from the environment.
# Either value is None when the corresponding variable is not set.
AZURE_FORM_RECOGNIZER_ENDPOINT = os.environ.get('AZURE_FORM_RECOGNIZER_ENDPOINT')
AZURE_FORM_RECOGNIZER_KEY = os.environ.get('AZURE_FORM_RECOGNIZER_KEY')

In [4]:
# Client used for recognition calls; authenticated with the endpoint and key
# read from the environment in the previous cell.
form_recognizer_client = FormRecognizerClient(endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT, credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY))

In [5]:
# Submit the digital ID image for identity-document recognition.
digital_id_path = '../material_preparation_step/Digital IDs/Digital_ID_Cristopher_Coronado.png'
with open(digital_id_path, "rb") as id_image:
    poller = form_recognizer_client.begin_recognize_identity_documents(
        identity_document=id_image
    )

# Wait for the operation to finish and keep the recognized documents.
id_documents = poller.result()

In [6]:
# Exploratory check: show the type of the first recognized document.
type(id_documents[0])

azure.ai.formrecognizer._models.RecognizedForm

In [7]:
def get_digital_id_details(digital_id):
    """Display selected identity fields of a recognized digital ID as a table.

    Parameters
    ----------
    digital_id
        Recognized document exposing a ``fields`` mapping keyed by field name.
        Each present entry is read through its ``value`` and ``confidence``
        attributes.

    Returns
    -------
    None
        The resulting DataFrame (columns ``Field``, ``Value``,
        ``% Confidence``) is rendered with ``display``; nothing is returned.
    """
    fields_list = [
        {"field": "FirstName", "name": "First Name"},
        {"field": "LastName", "name": "Last Name"},
        {"field": "DocumentNumber", "name": "Document Number"},
        {"field": "DateOfBirth", "name": "Date of Birth"},
        {"field": "DateOfExpiration", "name": "Date of Expiration"},
        {"field": "Sex", "name": "Sex"},
        {"field": "CountryRegion", "name": "Country/Region"},
        {"field": "Region", "name": "Region"}
    ]

    rows = []
    for x in fields_list:
        element = digital_id.fields.get(x["field"])
        if element:
            confidence = element.confidence
            rows.append([
                x["name"],
                element.value,
                # Guard against a missing confidence instead of failing on None * 100.
                round(confidence * 100, 2) if confidence is not None else None,
            ])

    # Build the frame straight from the row list: routing it through np.array
    # would coerce mixed str/float rows to strings and, when no field matched,
    # produce a 1-D empty array that cannot be given three columns.
    df = pd.DataFrame(rows, columns=['Field', 'Value', '% Confidence'])
    display(df)

In [8]:
# Show the extracted fields for the first recognized document.
get_digital_id_details(id_documents[0])

Unnamed: 0,Field,Value,% Confidence
0,First Name,Cristopher,31.6
1,Last Name,Coronado,56.5
2,Document Number,D1549301,99.5
3,Date of Birth,1993-10-08,99.5
4,Date of Expiration,2024-06-09,99.5
5,Sex,M,98.6
6,Country/Region,USA,99.0
7,Region,California,99.0


## Part 2 - Train the custom boarding pass recognition model

In [9]:
from azure.ai.formrecognizer import FormTrainingClient

In [10]:
form_training_client = FormTrainingClient(endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT, credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY))

In [11]:
# Request the custom models known to the training client.
# NOTE(review): saved_model_list is not read by any cell visible here - confirm it is still needed.
saved_model_list = form_training_client.list_custom_models()

In [12]:
# Location of the boarding-pass training data, read from the environment
# (None when PROJECT1_BOARDING_PASSES_DATA_URL is not set).
trainingDataUrl = os.getenv('PROJECT1_BOARDING_PASSES_DATA_URL')

In [27]:
# Start training a custom model on the data at trainingDataUrl, using the
# training labels (use_training_labels=True), then wait for the trained model.
# NOTE(review): this starts a new training run every time the cell is executed.
training_process = form_training_client.begin_training(trainingDataUrl, use_training_labels=True)
custom_model = training_process.result()

In [28]:
# Display the trained model object returned by the training run.
custom_model

CustomFormModel(model_id=975601b5-810e-4d7d-83d9-f4daf5db419e, status=ready, training_started_on=2022-05-17 02:54:45+00:00, training_completed_on=2022-05-17 02:54:46+00:00, submodels=[CustomFormSubmodel(accuracy=0.982, model_id=975601b5-810e-4d7d-83d9-f4daf5db419e, fields={'Baggage': CustomFormModelField(label=None, name=Baggage, accuracy=0.995), 'Boarding Time': CustomFormModelField(label=None, name=Boarding Time, accuracy=0.995), 'Carrier': CustomFormModelField(label=None, name=Carrier, accuracy=0.995), 'Class': CustomFormModelField(label=None, name=Class, accuracy=0.995), 'Date': CustomFormModelField(label=None, name=Date, accuracy=0.995), 'Flight No.': CustomFormModelField(label=None, name=Flight No., accuracy=0.995), 'From': CustomFormModelField(label=None, name=From, accuracy=0.8), 'Gate': CustomFormModelField(label=None, name=Gate, accuracy=0.995), 'Passenger Name': CustomFormModelField(label=None, name=Passenger Name, accuracy=0.995), 'Seat': CustomFormModelField(label=None, na

In [31]:
# Display the per-document training information attached to the model.
custom_model.training_documents

[TrainingDocumentInfo(name=boarding pass 1.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding pass 10.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding pass 2.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding pass 3.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=boarding pass 4.pdf, status=succeeded, page_count=1, errors=[], model_id=None)]

In [32]:
# Print name, status, page count and errors for each training document,
# one attribute per line.
for doc in custom_model.training_documents:
    print(
        "Document name: {}".format(doc.name),
        "Document status: {}".format(doc.status),
        "Document page count: {}".format(doc.page_count),
        "Document errors: {}".format(doc.errors),
        sep="\n",
    )

Document name: boarding pass 1.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: boarding pass 10.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: boarding pass 2.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: boarding pass 3.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: boarding pass 4.pdf
Document status: succeeded
Document page count: 1
Document errors: []


In [33]:
# Fetch the model again by id and print its id, status and training timestamps.
custom_model_info = form_training_client.get_custom_model(model_id=custom_model.model_id)
print(
    "Model ID: {}".format(custom_model_info.model_id),
    "Status: {}".format(custom_model_info.status),
    "Training started on: {}".format(custom_model_info.training_started_on),
    "Training completed on: {}".format(custom_model_info.training_completed_on),
    sep="\n",
)

Model ID: 975601b5-810e-4d7d-83d9-f4daf5db419e
Status: ready
Training started on: 2022-05-17 02:54:45+00:00
Training completed on: 2022-05-17 02:54:46+00:00


In [34]:
# For each submodel, print its form type and the fields it recognizes.
# A field is shown by its label when it has one, otherwise by its name.
for submodel in custom_model.submodels:
    recognized_fields = ", ".join(
        model_field.label or field_name
        for field_name, model_field in submodel.fields.items()
    )
    print(
        "The submodel with form type '{}' has recognized the following fields: {}".format(
            submodel.form_type, recognized_fields
        )
    )

The submodel with form type 'custom:975601b5-810e-4d7d-83d9-f4daf5db419e' has recognized the following fields: Baggage, Boarding Time, Carrier, Class, Date, Flight No., From, Gate, Passenger Name, Seat, To


In [35]:
# Recognition client used to test the trained model.
# NOTE(review): this repeats the construction from the Part 1 cell with the
# same endpoint and key; the earlier client could presumably be reused.
form_recognizer_client = FormRecognizerClient(endpoint=AZURE_FORM_RECOGNIZER_ENDPOINT, credential=AzureKeyCredential(AZURE_FORM_RECOGNIZER_KEY))

In [46]:
# Sanity check: print the model id from the training result and from the
# re-fetched model info so they can be compared.
print("model_id from custom_model: "+custom_model.model_id)
print("model_id from custom_model_info: "+custom_model_info.model_id)

model_id from custom_model: 975601b5-810e-4d7d-83d9-f4daf5db419e
model_id from custom_model_info: 975601b5-810e-4d7d-83d9-f4daf5db419e


## Part 3 - Extract information from boarding pass

In [38]:
# Submit the boarding pass PDF for recognition with the custom model.
boarding_pass_path = '../material_preparation_step/Boarding passes/Boarding_Pass_Cristopher_Coronado.pdf'
with open(boarding_pass_path, "rb") as boarding_pass_file:
    custom_test_action = form_recognizer_client.begin_recognize_custom_forms(
        model_id=custom_model_info.model_id,
        form=boarding_pass_file,
    )

In [44]:
# Wait for the boarding-pass recognition to finish, then tabulate every
# extracted field with its label, value and confidence percentage.
# NOTE(review): id_documents is reused here for boarding-pass results; the name
# is kept so any later cell relying on it keeps working.
id_documents = custom_test_action.result()
array: list = []

for recognized_content in id_documents:
    print("Form type: {}".format(recognized_content.form_type))
    for name, field in recognized_content.fields.items():
        confidence = field.confidence
        array.append([
            name,
            # Fall back to the field name when no label data was returned.
            field.label_data.text if field.label_data else name,
            field.value,
            # A field without a confidence would otherwise fail on None * 100.
            round(confidence * 100, 2) if confidence is not None else None,
        ])

# Build the frame directly from the row list: going through np.array would
# coerce mixed str/float rows to strings (making '% Confidence' non-numeric)
# and fail with a shape mismatch when no fields were recognized.
df = pd.DataFrame(array, columns=['Field', 'Label', 'Value', '% Confidence'])
df

Form type: custom:975601b5-810e-4d7d-83d9-f4daf5db419e


Unnamed: 0,Field,Label,Value,% Confidence
0,Passenger Name,Passenger Name,Cristopher Coronado,99.0
1,From,From,Miami,77.1
2,Seat,Seat,15A,11.9
3,Class,Class,E,99.0
4,To,To,New York,99.0
5,Baggage,Baggage,0,99.0
6,Flight No.,Flight No.,128,99.0
7,Gate,Gate,G2,99.0
8,Date,Date,"May 30, 2022",99.0
9,Carrier,Carrier,UA,99.0
