# Extracting text with Azure DocAI

In [99]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentAnalysisFeature, AnalyzeResult, AnalyzeDocumentRequest
from PIL import Image
import os
import mimetypes
import fitz #install pymupdf

In [2]:
# Initializing Document Intelligence endpoint
endpoint = "https://azr-hjm-intell-dev.cognitiveservices.azure.com/"

with open('azure_docai_key.txt', 'r') as file:
    API_KEY = file.read()
    
credential = AzureKeyCredential(API_KEY)
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential)

## Getting document information

https://github.com/azure-samples/document-intelligence-code-samples/blob/main/Python(v4.0)/Layout_model/sample_analyze_layout.py

This sample demonstrates how to extract text, tables, figures, selection marks and document structure (e.g., sections) information from a document given through a file.

Selection marks returned from begin_analyze_document(model_id="prebuilt-layout") do not return the text associated with the checkbox. We need to manually process that data

In [3]:
def _in_span(word, spans):
    # Make sure the word is on the specified span (bounding box)
    for span in spans:
        if word.span.offset >= span.offset and (word.span.offset + word.span.length) <= (span.offset + span.length):
            return True
    
    return False

In [4]:
def get_words(page, line):
    result = []

    for word in page.words:
        if _in_span(word, line.spans):
            result.append(word)
    
    return result

In [100]:
# Analyze the document
base_path = 'C:/Users/GReyes15/OneDrive - JNJ/Documents/Documents/DCMT/data/'
filename = '1817854.pdf' #sample_pdf.pdf'
file_path = ''.join([base_path, filename])

with open(file_path, "rb") as file:
    poller = document_intelligence_client.begin_analyze_document("prebuilt-layout",
                                                                 analyze_request = file,
                                                                 features = [DocumentAnalysisFeature.KEY_VALUE_PAIRS],
                                                                 content_type = "application/octet-stream")

result = poller.result()
result

{'apiVersion': '2024-02-29-preview', 'modelId': 'prebuilt-layout', 'stringIndexType': 'textElements', 'content': 'Lithor del Istmo, S.A.\nJanssen\nPHARMACEUTICAL COMPANIES OF Johnson & Johnson\nAGREEMENT FOR THE NON-EXCLUSIVE DISTRIBUTION AND NON-EXCLUSIVE MARKETING OF PRODUCTS FOR THE HEALTH CARE AND GOOD BUSINESS PRACTICES, ENTERED BETWEEN ETHNOR DEL ISTMO, S.A. ON THE ONE HAND (HEREINAFTER REFERRED TO AS "ETHNOR") REPRESENTED IN THIS ACT BY GABRIELA LEÓN MARCANO, BEARER OF THE VENEZUELAN PASSPORT NUMBER 101340069, AND BY THE OTHER HAND, BERMUDA GENERAL AGENCY LTD., REPRESENTED IN THIS ACT BY ALLAN FITZSIMMONS, MALE, ADULT, HOLDER OF THE PERSONAL IDENTITY CARD NUMBER 216866,WHO ACT ON HIS CAPACITY OF LEGAL REPRESNETATIVE OF THE CORPORATION, DULY AUTHORIZED TO SIGN THIS AGREEMENT (HEREINAFTER REFERRED TO AS THE "DISTRIBUTOR"), JOINTLY REFERRED TO AS THE PARTIES, IN ACCORDANCE WITH THE FOLLOWING RECITALS, TERMS AND CONDITIONS.\nRECITALS\nI. ETHNOR through its representative hereby stat

In [101]:
# Check if the document has handwritten content
if result.styles and any([style.is_handwritten for style in result.styles]):
    print("The document has handwritten content.")
else:
    print("The document does not have handwritten content.")

The document has handwritten content.


Code to get the text data from a document
https://github.com/azure-samples/document-intelligence-code-samples/blob/main/Python(v4.0)/Layout_model/sample_analyze_layout.py

`# Analyze pages.`
`    # To learn the detailed concept of "bounding polygon" in the following content, visit: https://aka.ms/bounding-region `
`    for page in result.pages:`
`        print(f"----Analyzing layout from page #{page.page_number}----")`
`        print(f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}")`

`        # Analyze lines.`
`        if page.lines:`
`            for line_idx, line in enumerate(page.lines):`
`                words = get_words(page, line)`
`                print(`
`                    f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "`
`                    f"within bounding polygon '{line.polygon}'"`
`                )`

`                # Analyze words.`
`                for word in words:`
`                    print(f"......Word '{word.content}' has a confidence of {word.confidence}")`

`       # Analyze selection marks.`
`        if page.selection_marks:`
`            for selection_mark in page.selection_marks:`
`                print(`
`                    f"Selection mark is '{selection_mark.state}' within bounding polygon "`
`                    f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"`
`                )`
`        # Note that selection marks returned from begin_analyze_document(model_id="prebuilt-layout") do not return the text associated with the checkbox. `
`        # For the API to return this information, build a custom model to analyze the checkbox and its text. For detailed steps, visit: https://aka.ms/train-your-custom-model`

In [102]:
# Get the document text by section
def get_page_text(result):
    doc_text = {}
    content = []
    current_role = str(result.paragraphs[0].role)

    for paragraph in result.paragraphs:
        if paragraph.role in [None, 'pageHeader']:
            # Add the paragraph into the current section
            content.append(paragraph.content)
        elif (paragraph.role != 'pageNumber'):
            # Store the previous section text
            doc_text[f"Section_{len(doc_text)}"] = {'role': current_role,
                                                    'content': '\n'.join(content)}
            current_role = paragraph.role
            content = [paragraph.content]

    doc_text[f"Section_{len(doc_text)}"] = {'role': current_role,
                                            'content': '\n'.join(content)}
    
    return doc_text

In [103]:
get_page_text(result)

{'Section_0': {'role': 'None',
  'content': 'Lithor del Istmo, S.A.\nJanssen\nPHARMACEUTICAL COMPANIES OF Johnson & Johnson\nAGREEMENT FOR THE NON-EXCLUSIVE DISTRIBUTION AND NON-EXCLUSIVE MARKETING OF PRODUCTS FOR THE HEALTH CARE AND GOOD BUSINESS PRACTICES, ENTERED BETWEEN ETHNOR DEL ISTMO, S.A. ON THE ONE HAND (HEREINAFTER REFERRED TO AS "ETHNOR") REPRESENTED IN THIS ACT BY GABRIELA LEÓN MARCANO, BEARER OF THE VENEZUELAN PASSPORT NUMBER 101340069, AND BY THE OTHER HAND, BERMUDA GENERAL AGENCY LTD., REPRESENTED IN THIS ACT BY ALLAN FITZSIMMONS, MALE, ADULT, HOLDER OF THE PERSONAL IDENTITY CARD NUMBER 216866,WHO ACT ON HIS CAPACITY OF LEGAL REPRESNETATIVE OF THE CORPORATION, DULY AUTHORIZED TO SIGN THIS AGREEMENT (HEREINAFTER REFERRED TO AS THE "DISTRIBUTOR"), JOINTLY REFERRED TO AS THE PARTIES, IN ACCORDANCE WITH THE FOLLOWING RECITALS, TERMS AND CONDITIONS.'},
 'Section_1': {'role': 'sectionHeading',
  'content': 'RECITALS\nI. ETHNOR through its representative hereby states:\na) to b

In [104]:
# Analyze tables
def analyze_table(result):
    tables = {f'table_{i}': '' for i in range(len(result.tables))}

    if result.tables:

        for table_idx, table in enumerate(result.tables):
            rows = {f'row_{i}': '' for i in range(table.row_count)}
            print(f"Table #{table_idx} has {table.row_count} rows and {table.column_count} columns")

            if table.bounding_regions:
                for region in table.bounding_regions:
                    print(f"\t>>> Table #{table_idx} location on page: {region.page_number} is {region.polygon}")

            # Evaluate the cells
            for cell in table.cells:
                rows[f'row_{cell.row_index}'] = {f'col_{cell.column_index}': cell.content}
                print(f"\t>>> Cell [{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
                if cell.bounding_regions:
                    print(f"\t>>> Content on page {region.page_number} is within bounding polygon {region.polygon}")

            tables[f'table_{table_idx}'] = rows
    
    return tables

In [105]:
analyze_table(result)

Table #0 has 3 rows and 2 columns
	>>> Table #0 location on page: 17 is [1.9788, 6.3239, 7.4243, 6.329, 7.4186, 9.5397, 1.9741, 9.5343]
	>>> Cell [0][0] has text 'INCOTERM'
	>>> Content on page 17 is within bounding polygon [1.9788, 6.3239, 7.4243, 6.329, 7.4186, 9.5397, 1.9741, 9.5343]
	>>> Cell [0][1] has text 'Acceptance of merchandise and transfer of ownership'
	>>> Content on page 17 is within bounding polygon [1.9788, 6.3239, 7.4243, 6.329, 7.4186, 9.5397, 1.9741, 9.5343]
	>>> Cell [1][0] has text 'EXW - [Ex Works] Local Sales (warehouse APP)'
	>>> Content on page 17 is within bounding polygon [1.9788, 6.3239, 7.4243, 6.329, 7.4186, 9.5397, 1.9741, 9.5343]
	>>> Cell [1][1] has text 'For local sales in which the DISTRIBUTOR withdraws the merchandise in the warehouse, THE PARTIES agree that the transfer and acceptance of ownership of the products will take effect when said products leave ETHNOR's warehouse. ETHNOR will send the DISTRIBUTOR a copy of the dispatch documentation by em

{'table_0': {'row_0': {'col_1': 'Acceptance of merchandise and transfer of ownership'},
  'row_1': {'col_1': "For local sales in which the DISTRIBUTOR withdraws the merchandise in the warehouse, THE PARTIES agree that the transfer and acceptance of ownership of the products will take effect when said products leave ETHNOR's warehouse. ETHNOR will send the DISTRIBUTOR a copy of the dispatch documentation by email."},
  'row_2': {'col_1': 'For local sales in which the merchandise is delivered to the DISTRIBUTOR within the relevant territory, THE PARTIES agree that the transfer and acceptance of ownership of the products will take effect once the export procedures have been completed and when the products have been delivered to the main carrier. ETHNOR will send the DISTRIBUTOR a copy of the dispatch documentation by email.'}},
 'table_1': {'row_0': {'col_1': 'Acceptance of merchandise and transfer of ownership'},
  'row_1': {'col_1': 'For export sales for which shipments are made by air,

In [106]:
def crop_image_from_image(image_path, page_number, bounding_box):
    with Image.open(image_path) as img:
        if img.format == "TIFF":
            img.seek(page_number)
            img = img.copy()

        return img.crop(bounding_box)

In [107]:
def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_number)

    bbx = [x * 72 for x in bounding_box]
    rect = fitz.Rect(bbx)
    pix = page.get_pixmap(matrix = fitz.Matrix(300/72, 300/72),
                          clip = rect)
    img = Image.frombytes("RGB",
                          [pix.width, pix.height],
                          pix.samples)
    doc.close()

    return img

In [108]:
def crop_image_from_file(file_path, page_number, bounding_box):
    mime_type = mimetypes.guess_type(file_path)[0]

    if mime_type == "application/pdf":
        return crop_image_from_pdf_page(file_path, page_number, bounding_box)
    else:
        return crop_image_from_image(file_path, page_number, bounding_box)

In [109]:
# Analyze the text from figures
# https://github.com/Azure-Samples/document-intelligence-code-samples/blob/main/Python(v4.0)/Retrieval_Augmented_Generation_(RAG)_samples/sample_figure_understanding.ipynb
def get_figures(result, file_path):
    images = {f"img_{i}": '' for i in range(len(result.figures))}
    content = result.content

    if result.figures:
        print(f"Found {len(result.figures)} figures.")

        for figures_idx, figures in enumerate(result.figures):
            fig_content = ''
            img_desc = ''
            print(f"\t>>> Figure #{figures_idx} has the following spans: {figures.spans}")

            for i, span in enumerate(figures.spans):
                print(f"\t>>> Span #{i}: {span}")
                fig_content += content[span.offset:span.offset + span.length]
            print(f"\t>>> Original figure content in markdown: {fig_content}")

            # The figure bounding boxes includes the coordinates of both the image and the caption
            if figures.caption:
                caption_region = figures.caption.bounding_regions
                print(f"\t>>> Caption: {figures.caption.content}")
                print(f"\t>>> Caption bounding region: {caption_region}")

                for region in figures.bounding_regions:
                    if region not in caption_region:
                        print(f"\t>>> Figure body bounding regions: {region}")
                        bounding_box = (region.polygon[0], #x0 (left)
                                        region.polygon[1], #y0 (top)
                                        region.polygon[4], #x1 (right)
                                        region.polygon[5]) #y1 (bottom)
                        print(f"\t>>> Figure body bounding box in: {bounding_box}")
                        cropped_image = crop_image_from_file(file_path, region.page_number-1, bounding_box)

                        images[f"img_{figures_idx}"] = cropped_image
            else:
                print("\t>>> No caption found for this figure")

                for region in figures.bounding_regions:
                    print(f"\t>>> Figure body bounding regions: {region}")
                    bounding_box = (region.polygon[0], #x0 (left)
                                    region.polygon[1], #y0 (top)
                                    region.polygon[4], #x1 (right)
                                    region.polygon[5]) #y1 (bottom)
                    print(f"\t>>> Figure body bounding box in: {bounding_box}")
                    cropped_image = crop_image_from_file(file_path, region.page_number-1, bounding_box)

                    images[f"img_{figures_idx}"] = cropped_image
                    
    print(images)
    return images

In [110]:
def save_images(images, filename, save_path):
    for key, value in images.items():
        base_name = key
        output_file = base_name + '.png'
        img_path = os.path.join(save_path, '_'.join([filename, output_file]))

        value.save(img_path)
        print(f"{img_path} image saved!")

In [111]:
images = get_figures(result, file_path)
save_images(images, filename.rsplit('.pdf', 1)[0], base_path)

Found 60 figures.
	>>> Figure #0 has the following spans: [{'offset': 0, 'length': 22}]
	>>> Span #0: {'offset': 0, 'length': 22}
	>>> Original figure content in markdown: Lithor del Istmo, S.A.
	>>> No caption found for this figure
	>>> Figure body bounding regions: {'pageNumber': 1, 'polygon': [1.1112, 1.117, 3.7053, 1.1166, 3.7058, 1.4212, 1.1117, 1.4216]}
	>>> Figure body bounding box in: (1.1112, 1.117, 3.7058, 1.4212)
	>>> Figure #1 has the following spans: [{'offset': 23, 'length': 53}]
	>>> Span #0: {'offset': 23, 'length': 53}
	>>> Original figure content in markdown: Janssen
PHARMACEUTICAL COMPANIES OF Johnson & Johnson
	>>> No caption found for this figure
	>>> Figure body bounding regions: {'pageNumber': 1, 'polygon': [4.5649, 0.7136, 7.3022, 0.7136, 7.3019, 1.2798, 4.5649, 1.2796]}
	>>> Figure body bounding box in: (4.5649, 0.7136, 7.3019, 1.2798)
	>>> Figure #2 has the following spans: [{'offset': 3301, 'length': 53}]
	>>> Span #0: {'offset': 3301, 'length': 53}
	>>> Orig