# Generating Azure OCR annotations for the FUNSD+ dataset

## Import modules and load environment variables

In [1]:
import glob
import os
import sys

from dotenv import load_dotenv

# Make the project's helper modules in ../src importable before the star imports below.
sys.path.append('../src')
from utils import *
from azure_utils import *

# Load the environment variables (python-dotenv reads a local .env file, if present)
load_dotenv()
endpoint = os.getenv('AZURE_OCR_ENDPOINT')
key = os.getenv('AZURE_OCR_KEY')

# Fail fast with an actionable message: os.getenv returns None for unset variables,
# which would otherwise only surface later as an opaque error when the client is used.
_missing_env_vars = [
    name
    for name, value in (('AZURE_OCR_ENDPOINT', endpoint), ('AZURE_OCR_KEY', key))
    if not value
]
if _missing_env_vars:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env_vars)}. "
        "Define them in the shell or in a .env file before running this notebook."
    )

## Parameters

In [2]:
# Define the dataset URL and root directory.
# DATASET_ROOT can be overridden with the FUNSD_PLUS_ROOT environment variable so the
# notebook runs on machines other than the original author's; the default is unchanged.
DATASET_URL = 'https://www.crc.nd.edu/~akuehlka/funsd_plus/images'
DATASET_ROOT = os.getenv('FUNSD_PLUS_ROOT', '/Users/akuehlka/work/crc/FUNSD/datasets/funsd_plus/')

# Training data directories
TRAIN_DIR = os.path.join(DATASET_ROOT, 'training_data')
TRAIN_ANNOT_DIR = os.path.join(TRAIN_DIR, 'annotations')
TRAIN_IMG_DIR = os.path.join(TRAIN_DIR, 'images')
TRAIN_IMG_URL = f"{DATASET_URL}/training_data/images/"

# Testing data directories
TEST_DIR = os.path.join(DATASET_ROOT, 'testing_data')
TEST_ANNOT_DIR = os.path.join(TEST_DIR, 'annotations')
TEST_IMG_DIR = os.path.join(TEST_DIR, 'images')
TEST_IMG_URL = f"{DATASET_URL}/testing_data/images/"

## Generate OCR annotations for the FUNSD+ training dataset

In [3]:
# Main execution: run the Azure OCR model over every training image, store one JSON
# result per image, then post-process the JSON directory and report the output path.

# Define the dataset directories
dataset_dir = TRAIN_DIR
annot_dir = TRAIN_ANNOT_DIR
img_dir = TRAIN_IMG_DIR
img_url = TRAIN_IMG_URL

# Define Prebuilt model ID
model_id = "prebuilt-read"

# Create the directory to save the annotations
azure_annot_dir = os.path.join(dataset_dir, f"annotations_azure_model__{model_id.replace('-','_')}")
os.makedirs(azure_annot_dir, exist_ok=True)

# Initialize the client
client = initialize_client(endpoint, key)

# Generate the image URLs to analyze.
# - The URL is joined with an explicit '/' rather than os.path.join, which would
#   insert a backslash on Windows and produce an invalid URL.
# - The file list is sorted so the processing order is the same on every run.
# NOTE(review): URLs are built from DATASET_URL (flat layout), not from img_url
# (TRAIN_IMG_URL) -- confirm this matches how the images are hosted.
form_urls = [
    f"{DATASET_URL.rstrip('/')}/{os.path.basename(img_path)}"
    for img_path in sorted(glob.glob(os.path.join(img_dir, 'train*.png')))
]

# Analyze the documents and save annotations
for form_url in form_urls:
    # splitext strips only the final extension, so a name such as "a.b.png" maps to
    # "a.b.json" instead of being truncated to "a.json" (which could collide).
    form_name = os.path.splitext(os.path.basename(form_url))[0]
    annotations_fpath = os.path.join(azure_annot_dir, f"{form_name}.json")

    # Existing JSON files act as a cache: already-analyzed images are not re-submitted.
    if os.path.exists(annotations_fpath):
        continue

    # Analyze the document
    # NOTE(review): `status` is not inspected; if analyze_document can return a failed
    # or empty result, it would be written to disk and never retried -- confirm.
    result, status = analyze_document(form_url, client, model_id)

    # Save the result to a JSON file
    save_result_to_json(result, annotations_fpath)

output_csv, _ = process_and_save_annotations(azure_annot_dir)
print(f"Annotations saved to: {output_csv}")


Annotations saved to: /Users/akuehlka/work/crc/FUNSD/datasets/funsd_plus/training_data/annotations_azure_model__prebuilt_read/hw_annotations.csv


## Generate OCR annotations for the FUNSD+ testing dataset

In [5]:
# Main execution: run the Azure OCR model over every testing image, store one JSON
# result per image, then post-process the JSON directory and report the output path.

# Define the dataset directories
dataset_dir = TEST_DIR
annot_dir = TEST_ANNOT_DIR
img_dir = TEST_IMG_DIR
img_url = TEST_IMG_URL

# Define Prebuilt model ID
model_id = "prebuilt-read"

# Create the directory to save the annotations
azure_annot_dir = os.path.join(dataset_dir, f"annotations_azure_model__{model_id.replace('-','_')}")
os.makedirs(azure_annot_dir, exist_ok=True)

# Initialize the client
client = initialize_client(endpoint, key)

# Generate the image URLs to analyze.
# - The URL is joined with an explicit '/' rather than os.path.join, which would
#   insert a backslash on Windows and produce an invalid URL.
# - The file list is sorted so the processing order is the same on every run.
# NOTE(review): URLs are built from DATASET_URL (flat layout), not from img_url
# (TEST_IMG_URL) -- confirm this matches how the images are hosted.
form_urls = [
    f"{DATASET_URL.rstrip('/')}/{os.path.basename(img_path)}"
    for img_path in sorted(glob.glob(os.path.join(img_dir, '*.png')))
]

# Analyze the documents and save annotations
for form_url in form_urls:
    # splitext strips only the final extension, so a name such as "a.b.png" maps to
    # "a.b.json" instead of being truncated to "a.json" (which could collide).
    form_name = os.path.splitext(os.path.basename(form_url))[0]
    annotations_fpath = os.path.join(azure_annot_dir, f"{form_name}.json")

    # Existing JSON files act as a cache: already-analyzed images are not re-submitted.
    if os.path.exists(annotations_fpath):
        continue

    # Analyze the document
    # NOTE(review): `status` is not inspected; if analyze_document can return a failed
    # or empty result, it would be written to disk and never retried -- confirm.
    result, status = analyze_document(form_url, client, model_id)

    # Save the result to a JSON file
    save_result_to_json(result, annotations_fpath)

output_csv, df = process_and_save_annotations(azure_annot_dir)
print(f"Annotations saved to: {output_csv}")


Annotations saved to: /Users/akuehlka/work/crc/FUNSD/datasets/funsd_plus/testing_data/annotations_azure_model__prebuilt_read/hw_annotations.csv
