Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Form Recognizer Scoring Pipeline

This notebook covers the process of setting up and running an AML pipeline that will score a custom form recognizer model.

Steps in this notebook include:

- Clapperboard Selection Step using OCR
- Form Extraction step (scoring)
- Postprocessing Step

## Import Dependencies

In [1]:
#Load dotenv extension

%load_ext dotenv
%dotenv

import os
from os.path import join
import sys

sys.path.append("../")

import pandas as pd
import numpy as np

from azureml.core import Environment, Datastore, Workspace, Experiment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration


from mlops.common.attach_compute import get_compute
from mlops.common.get_datastores import get_blob_datastore

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cryptography 3.2 (c:\miniconda\envs\py37_default\lib\site-packages), Requirement.parse('cryptography<4.0.0,>=3.3.1; extra == "crypto"'), {'PyJWT'}).


## Configure Workspace and Set Compute Target

In [2]:
# Restore AML workspace from config.json file (can be downloaded through the portal)

ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')



Workspace name: sandbox
Azure region: westus
Subscription id: 5eb4ea0a-0c5d-4ed1-8553-b0ee0f5215b7
Resource group: iota-sharing-rg


In [3]:
# Set compute target

compute_target = get_compute(
    workspace=ws,
    compute_name=os.getenv("AML_CLUSTER_NAME"),
    vm_size=os.getenv("AML_CLUSTER_CPU_SKU"),
    vm_priority=os.environ.get("AML_CLUSTER_PRIORITY", 'lowpriority'), 
    min_nodes=int(os.environ.get("AML_CLUSTER_MIN_NODES", 0)),
    max_nodes=int(os.environ.get("AML_CLUSTER_MAX_NODES", 4)),
    scale_down=int(os.environ.get("AML_CLUSTER_SCALE_DOWN", 600)),
)

## Configure Datastores 

In [4]:
#create root datastore
#datastore should represent storage container where data will be accessed
root_datastore = get_blob_datastore(ws, os.getenv("BLOB_DATASTORE_NAME"), os.getenv("STORAGE_NAME"),
                                    os.getenv("STORAGE_KEY"), os.getenv("STORAGE_CONTAINER"))

# Create input and output data references
# WARNING! DataReference works up to 12x times faster than Dataset
root_dir = DataReference(
    datastore=root_datastore, 
    data_reference_name="form_data_ref", 
    mode="mount"
)

## Set Keyvault Secrets

In [5]:
#Set pipeline secrets using defauly keyvault every AML workspace comes with
print("Setting Pipeline Secrets in Azure Key Vault")

key_vault = ws.get_default_keyvault()
key_vault.set_secret(name="formkey", value=os.getenv('FORM_RECOGNIZER_KEY'))
key_vault.set_secret(name="formendpoint", value=os.getenv('FORM_RECOGNIZER_ENDPOINT'))
key_vault.set_secret(name="formmodelid", value=os.getenv('FORM_RECOGNIZER_CUSTOM_MODEL_ID'))
key_vault.set_secret(name="ocrkey", value=os.getenv('OCR_KEY'))
key_vault.set_secret(name="ocrendpoint", value=os.getenv('OCR_ENDPOINT'))

Setting Pipeline Secrets in Azure Key Vault


## Define Pipeline Parameters

In [6]:
# Just an example how we can use parameters to provide different input folders and values

input_dir = PipelineParameter(name="input_dir", default_value="val/clapperboard")
ocr_output_dir = PipelineParameter(name="ocr_output_dir", default_value="selected_clapperboards")
form_output_dir = PipelineParameter(name="form_output_dir", default_value="form_scoring_test_run")
form_labels = PipelineParameter(name="form_labels",
                                default_value="filename, roll, scene, take, title, director, camera, description")

## Build and set up dependencies for task-specific environment

In [7]:
# Build task-specific environment

from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Create Pipeline run configuration 

run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=[
        'argparse==1.4.0',
        'azureml-sdk==1.18.0',
        'azure-storage-blob==12.5.0',
        'azure-identity==1.4.1',
        'azure-mgmt-resource==10.2.0',
        'azure-mgmt-network==16.0.0',
        'azure-mgmt-compute==17.0.0',
        'pyjwt==1.7.1',
        'numpy==1.18.5',
        'pandas==1.1.3',
        'pillow==7.2.0',
        'pyarrow==1.0.1',
        'scikit-image==0.17.2',
        'scikit-learn==0.23.2',
        'scipy==1.5.2',
        'tqdm==4.48.2',
        'opencv-python-headless',
        'tensorflow==2.3.0',
        'azure-cognitiveservices-vision-customvision==3.0.0',
        'PyYAML==5.3.1',
        'ipywidgets==7.5.1',
        'click==7.1.2',
        'python-dotenv==0.10.3'
    ]
)

#specify docker image that will be used

dockerfile = r"""
FROM mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20200821.v1
"""

run_config.environment.docker.base_image = None
run_config.environment.docker.base_dockerfile = dockerfile

## Configure and instantiate pipeline steps

In [8]:
# Create and configure Form Recognizer Training pipeline steps

source_directory = "../"

clapperboard_selection_step = PythonScriptStep(
    name="clapperboard selection",
    script_name="mlops/form_scoring_pipeline/steps/select_clapperboards.py",
    arguments=[
        "--root_dir",
        root_dir,
        "--input_dir",
        input_dir,
        "--output_dir",
        ocr_output_dir,
        "--force",
        True,
    ],
    inputs=[root_dir],
    outputs=[],
    compute_target=compute_target,
    source_directory=source_directory,
    runconfig=run_config,
    allow_reuse=False,
)

form_extraction_step = PythonScriptStep(
    name="form_recognizer",
    script_name="mlops/form_scoring_pipeline/steps/extract_forms.py",
    arguments=[
        "--root_dir",
        root_dir,
        "--input_dir",
        input_dir,
        "--clapperboard_dir",
        ocr_output_dir,
        "--output_dir",
        form_output_dir,
        "--labels",
        form_labels,
        "--force",
        True,
    ],
    inputs=[root_dir],
    outputs=[],
    compute_target=compute_target,
    source_directory=source_directory,
    runconfig=run_config,
    allow_reuse=False,
)

form_extraction_step.run_after(clapperboard_selection_step)

form_postprocessing_step = PythonScriptStep(
    name="postprocessing_form_recognizer",
    script_name="mlops/form_scoring_pipeline/steps/postprocess.py",
    arguments=[
        "--root_dir",
        root_dir,
        "--input_dir",
        form_output_dir,
        "--output_dir",
        form_output_dir,
        "--force",
        True,
    ],
    inputs=[root_dir],
    compute_target=compute_target,
    source_directory=source_directory,
    outputs=[],
    runconfig=run_config,
    allow_reuse=False,
)

form_postprocessing_step.run_after(form_extraction_step)

print("Pipeline Steps Created")

Pipeline Steps Created


## Configure and publish pipeline to AML

In [9]:
# Create pipeline using existing steps
scoring_pipeline = Pipeline(workspace=ws, steps=[clapperboard_selection_step,
                                                 form_extraction_step,
                                                 form_postprocessing_step])

# Check if the pipeline is consistent 
scoring_pipeline.validate()

# Publish pipeline
published_pipeline = scoring_pipeline.publish(
    name = "form_scoring_pipeline",
    description = "Pipeline to score a Custom Form Recognizer model"
)

Step clapperboard selection is ready to be created [0caa62b2]
Step form_recognizer is ready to be created [10f4d039]
Step postprocessing_form_recognizer is ready to be created [513be94d]
Created step clapperboard selection [0caa62b2][3f67f608-12e2-43aa-bd79-e6a7defa27ef], (This step will run and generate new outputs)
Created step form_recognizer [10f4d039][9306495c-2e45-4c1e-9b5b-9c37e329c998], (This step will run and generate new outputs)
Created step postprocessing_form_recognizer [513be94d][6b84e4c1-f813-41b4-abc1-a2f5db814dab], (This step will run and generate new outputs)
Using data reference form_data_ref for StepId [8ac11295][eb3974e4-60a3-418c-a118-01c1fa392c27], (Consumers of this data are eligible to reuse prior runs.)
Using data reference form_data_ref for StepId [b3a31671][eb3974e4-60a3-418c-a118-01c1fa392c27], (Consumers of this data are eligible to reuse prior runs.)Using data reference form_data_ref for StepId [2b010cff][eb3974e4-60a3-418c-a118-01c1fa392c27], (Consumers 

## Submit and run pipeline in AML

In [10]:
# Submit the pipeline
pipeline_run = Experiment(ws, 'form-score-pipeline').submit(scoring_pipeline)
pipeline_run.wait_for_completion()

Submitted PipelineRun 7fba0a93-f04c-4f33-b637-40657d8072ab
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/form-score-pipeline/runs/7fba0a93-f04c-4f33-b637-40657d8072ab?wsid=/subscriptions/5eb4ea0a-0c5d-4ed1-8553-b0ee0f5215b7/resourcegroups/iota-sharing-rg/workspaces/sandbox
PipelineRunId: 7fba0a93-f04c-4f33-b637-40657d8072ab
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/form-score-pipeline/runs/7fba0a93-f04c-4f33-b637-40657d8072ab?wsid=/subscriptions/5eb4ea0a-0c5d-4ed1-8553-b0ee0f5215b7/resourcegroups/iota-sharing-rg/workspaces/sandbox
PipelineRun Status: Running


StepRunId: 9a24aefd-d99e-42f7-932c-1bb88438c4bf
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/form-score-pipeline/runs/9a24aefd-d99e-42f7-932c-1bb88438c4bf?wsid=/subscriptions/5eb4ea0a-0c5d-4ed1-8553-b0ee0f5215b7/resourcegroups/iota-sharing-rg/workspaces/sandbox
StepRun( clapperboard selection ) Status: Running

Streaming azureml-logs/55_azureml-exe


Streaming azureml-logs/65_job_prep-tvmps_29fbfdbb3f13dbdbc53e133906f29358baa2666ed5b0fea4c9cd14e6b0558a86_p.txt
[2021-04-20T04:19:27.719868] Entering job preparation.
[2021-04-20T04:19:29.065713] Starting job preparation.
[2021-04-20T04:19:29.065748] Extracting the control code.
[2021-04-20T04:19:29.093367] fetching and extracting the control code on master node.
[2021-04-20T04:19:29.093396] Starting extract_project.
[2021-04-20T04:19:29.093437] Starting to extract zip file.
[2021-04-20T04:19:29.817630] Finished extracting zip file.
[2021-04-20T04:19:30.061164] Using urllib.request Python 3.0 or later
[2021-04-20T04:19:30.061203] Start fetching snapshots.
[2021-04-20T04:19:30.061247] Start fetching snapshot.
[2021-04-20T04:19:30.061261] Retrieving project from snapshot: 984783ca-e006-48c1-9f42-eaf3d55fd30e
Starting the daemon thread to refresh tokens in background for process with pid = 49
[2021-04-20T04:19:38.972324] Finished fetching snapshot.
[2021-04-20T04:19:38.972357] Finished f




StepRunId: 7cc1b284-1c23-4c91-8e54-56b5a5cd5828
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/form-score-pipeline/runs/7cc1b284-1c23-4c91-8e54-56b5a5cd5828?wsid=/subscriptions/5eb4ea0a-0c5d-4ed1-8553-b0ee0f5215b7/resourcegroups/iota-sharing-rg/workspaces/sandbox
StepRun( form_recognizer ) Status: NotStarted
StepRun( form_recognizer ) Status: Queued
StepRun( form_recognizer ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_29fbfdbb3f13dbdbc53e133906f29358baa2666ed5b0fea4c9cd14e6b0558a86_p.txt
2021-04-20T04:20:20Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/sandbox/azureml/7cc1b284-1c23-4c91-8e54-56b5a5cd5828/mounts/workspaceblobstore
2021-04-20T04:20:21Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/sandbox/azureml/7cc1b284-1c23-4c91-8e54-56b5a5cd5828/mounts/form_pipe_ds
2021-04-20T04:20:21Z Starting output-watcher...
2021-04-20T04:20:21Z IsDedicatedCompute


Streaming azureml-logs/70_driver_log.txt
2021/04/20 04:20:37 Attempt 1 of http call to http://10.0.0.4:16384/sendlogstoartifacts/info
2021/04/20 04:20:37 Attempt 1 of http call to http://10.0.0.4:16384/sendlogstoartifacts/status
[2021-04-20T04:20:38.511945] Entering context manager injector.
[context_manager_injector.py] Command line Options: Namespace(inject=['ProjectPythonPath:context_managers.ProjectPythonPath', 'RunHistory:context_managers.RunHistory', 'TrackUserError:context_managers.TrackUserError', 'UserExceptions:context_managers.UserExceptions'], invocation=['mlops/form_scoring_pipeline/steps/extract_forms.py', '--root_dir', '/mnt/batch/tasks/shared/LS_root/jobs/sandbox/azureml/7cc1b284-1c23-4c91-8e54-56b5a5cd5828/mounts/form_pipe_ds', '--input_dir', 'val/clapperboard', '--clapperboard_dir', 'selected_clapperboards', '--output_dir', 'form_scoring_test_run', '--labels', 'filename, roll, scene, take, title, director, camera, description', '--force', 'True'])
Script type = None



StepRun(form_recognizer) Execution Summary
StepRun( form_recognizer ) Status: Finished
{'runId': '7cc1b284-1c23-4c91-8e54-56b5a5cd5828', 'target': 'train-test-clust', 'status': 'Completed', 'startTimeUtc': '2021-04-20T04:20:23.11468Z', 'endTimeUtc': '2021-04-20T04:21:06.991029Z', 'properties': {'ContentSnapshotId': '984783ca-e006-48c1-9f42-eaf3d55fd30e', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': '9306495c-2e45-4c1e-9b5b-9c37e329c998', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '10f4d039', 'azureml.pipelinerunid': '7fba0a93-f04c-4f33-b637-40657d8072ab', '_azureml.ComputeTargetType': 'amlcompute', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json'}, 'inputDatasets': [], 'outputDatasets': [], 'runDefinition': {'script': 'mlops/form_scoring_pipeline/steps/extract_forms.py', 'command': '', 'useAbsolutePath': False, 'arguments': ['--root_dir', '$AZUREML_DATAREFERENCE_form_da




StepRunId: bdb0de07-81b3-4a13-ab89-4ead1dfbcd6c
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/form-score-pipeline/runs/bdb0de07-81b3-4a13-ab89-4ead1dfbcd6c?wsid=/subscriptions/5eb4ea0a-0c5d-4ed1-8553-b0ee0f5215b7/resourcegroups/iota-sharing-rg/workspaces/sandbox
StepRun( postprocessing_form_recognizer ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_29fbfdbb3f13dbdbc53e133906f29358baa2666ed5b0fea4c9cd14e6b0558a86_p.txt
2021-04-20T04:21:23Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/sandbox/azureml/bdb0de07-81b3-4a13-ab89-4ead1dfbcd6c/mounts/workspaceblobstore
2021-04-20T04:21:23Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/sandbox/azureml/bdb0de07-81b3-4a13-ab89-4ead1dfbcd6c/mounts/form_pipe_ds
2021-04-20T04:21:24Z Starting output-watcher...
2021-04-20T04:21:24Z IsDedicatedCompute == False, starting polling for Low-Pri Preemption
2021-04-20T04:21:24Z E


Streaming azureml-logs/75_job_post-tvmps_29fbfdbb3f13dbdbc53e133906f29358baa2666ed5b0fea4c9cd14e6b0558a86_p.txt
[2021-04-20T04:21:49.950987] Entering job release
Failure while loading azureml_run_type_providers. Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception (cryptography 3.4.7 (/azureml-envs/azureml_87c43497c4d4f72ed3fa76b80566d1b6/lib/python3.6/site-packages), Requirement.parse('cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*,<=3.2'), {'azureml-core'}).
Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cryptography 3.4.7 (/azureml-envs/azureml_87c43497c4d4f72ed3fa76b80566d1b6/lib/python3.6/site-packages), Requirement.parse('cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*,<=3.2'), {'azureml-core'}).
Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.PipelineRun = azureml.pipeline.core.run:PipelineRun._from



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '7fba0a93-f04c-4f33-b637-40657d8072ab', 'status': 'Completed', 'startTimeUtc': '2021-04-20T04:19:08.674563Z', 'endTimeUtc': '2021-04-20T04:22:04.148091Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{"input_dir":"val/clapperboard","ocr_output_dir":"selected_clapperboards","form_output_dir":"form_scoring_test_run","form_labels":"filename, roll, scene, take, title, director, camera, description"}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://sandbox4790804641.blob.core.windows.net/azureml/ExperimentRun/dcid.7fba0a93-f04c-4f33-b637-40657d8072ab/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=2bmKYSRlsIoXzcaPIAOYzwlmawyqg%2Fs8kNV%2BGMsge0A%3D&st=2021-04-20T04%3A12%3A06Z&se=2021-04-20T12%3A22%3A06Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://sandbox4790804641.blob.core.windows.net/a

'Finished'