Skip to content

Commit

Permalink
fix: Escape html special characters in hocr_document_template.xml.j2 (#279)
Browse files Browse the repository at this point in the history

* fix: Escape html special characters in hocr_document_template.xml.j2
* test: Add Unit test for hOCR XML validity.
  • Loading branch information
holtskinner authored Mar 11, 2024
1 parent 71191ab commit 2d9f05b
Show file tree
Hide file tree
Showing 4 changed files with 330 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
{% set paridx = loop.index0 -%}
<p class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
{% set lidx = loop.index0 -%}
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}{% for token in line.tokens -%}
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text|escape }}{% for token in line.tokens -%}
{% set tidx = loop.index0 -%}
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text|escape }}</span>{% endfor -%}
</span>{% endfor -%}
</p>{% endfor -%}
</span>{% endfor -%}
Expand Down
284 changes: 284 additions & 0 deletions tests/unit/resources/toolbox_invoice_test-0-hocr-escape.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
{
"text": "<Invoice>",
"pages": [
{
"pageNumber": 1,
"dimension": {
"width": 1758.0,
"height": 2275.0,
"unit": "pixels"
},
"layout": {
"textAnchor": {
"textSegments": [
{
"endIndex": "435"
}
]
},
"boundingPoly": {
"vertices": [
{},
{
"x": 1758
},
{
"x": 1758,
"y": 2275
},
{
"y": 2275
}
],
"normalizedVertices": [
{},
{
"x": 1.0
},
{
"x": 1.0,
"y": 1.0
},
{
"y": 1.0
}
]
},
"orientation": 1
},
"detectedLanguages": [
{
"languageCode": "en"
},
{
"languageCode": "und"
}
],
"blocks": [
{
"layout": {
"textAnchor": {
"textSegments": [
{
"endIndex": "8"
}
]
},
"confidence": 0.99258333,
"boundingPoly": {
"vertices": [
{
"x": 1310,
"y": 220
},
{
"x": 1534,
"y": 220
},
{
"x": 1534,
"y": 282
},
{
"x": 1310,
"y": 282
}
],
"normalizedVertices": [
{
"x": 0.74516493,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.12395605
},
{
"x": 0.74516493,
"y": 0.12395605
}
]
},
"orientation": 1
}
}
],
"paragraphs": [
{
"layout": {
"textAnchor": {
"textSegments": [
{
"endIndex": "8"
}
]
},
"confidence": 0.99258333,
"boundingPoly": {
"vertices": [
{
"x": 1310,
"y": 220
},
{
"x": 1534,
"y": 220
},
{
"x": 1534,
"y": 282
},
{
"x": 1310,
"y": 282
}
],
"normalizedVertices": [
{
"x": 0.74516493,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.12395605
},
{
"x": 0.74516493,
"y": 0.12395605
}
]
},
"orientation": 1
}
}
],
"lines": [
{
"layout": {
"textAnchor": {
"textSegments": [
{
"endIndex": "8"
}
]
},
"confidence": 0.99258333,
"boundingPoly": {
"vertices": [
{
"x": 1310,
"y": 220
},
{
"x": 1534,
"y": 220
},
{
"x": 1534,
"y": 282
},
{
"x": 1310,
"y": 282
}
],
"normalizedVertices": [
{
"x": 0.74516493,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.12395605
},
{
"x": 0.74516493,
"y": 0.12395605
}
]
},
"orientation": 1
},
"detectedLanguages": [
{
"languageCode": "en"
}
]
}
],
"tokens": [
{
"layout": {
"textAnchor": {
"textSegments": [
{
"endIndex": "8"
}
]
},
"confidence": 0.99258333,
"boundingPoly": {
"vertices": [
{
"x": 1310,
"y": 220
},
{
"x": 1534,
"y": 220
},
{
"x": 1534,
"y": 282
},
{
"x": 1310,
"y": 282
}
],
"normalizedVertices": [
{
"x": 0.74516493,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.0967033
},
{
"x": 0.8725825,
"y": 0.12395605
},
{
"x": 0.74516493,
"y": 0.12395605
}
]
},
"orientation": 1
},
"detectedLanguages": [
{
"languageCode": "en"
}
]
}
]
}
],
"shardInfo": {
"shardCount": "1"
}
}
16 changes: 16 additions & 0 deletions tests/unit/resources/toolbox_invoice_test-0-hocr-escape.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="unknown" lang="unknown">
<head>
<title>hocr-escape</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="ocr-system" content="Document AI OCR" />
<meta name="ocr-langs" content="unknown" />
<meta name="ocr-scripts" content="unknown" />
<meta name="ocr-number-of-pages" content="1" />
<meta name="ocr-capabilities" content="ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
<div class='ocr_page' lang='unknown' title='bbox 0 0 1758 2275'><span class='ocr_carea' id='block_1_0' title='bbox 1310 220 1534 282'><p class='ocr_par' id='par_1_0_0' title='bbox 1310 220 1534 282'><span class='ocr_line' id='line_1_0_0_0' title='bbox 1310 220 1534 282'>&lt;Invoice<span class='ocrx_word' id='word_1_0_0_0_0' title='bbox 1310 220 1534 282'>&lt;Invoice</span></span></p></span></div>
</body>
</html>
28 changes: 28 additions & 0 deletions tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import json
import os
import shutil
from xml.etree import ElementTree

# try/except added for compatibility with python < 3.8
try:
Expand Down Expand Up @@ -791,6 +792,9 @@ def test_export_hocr_str():
actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0")
assert actual_hocr

element = ElementTree.fromstring(actual_hocr)
assert element is not None

with open(
"tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8"
) as f:
Expand All @@ -808,6 +812,30 @@ def test_export_hocr_str_with_blank_document():

assert actual_hocr

element = ElementTree.fromstring(actual_hocr)
assert element is not None


def test_export_hocr_str_with_escape_characters():
    """Verify that hOCR export stays valid XML when the OCR text has special characters.

    The fixture document's text contains a ``<`` character. The exported
    hOCR string must parse as XML and must equal the stored expected file.
    """
    fixture_json_path = "tests/unit/resources/toolbox_invoice_test-0-hocr-escape.json"
    expected_xml_path = "tests/unit/resources/toolbox_invoice_test-0-hocr-escape.xml"

    escaped_document = document.Document.from_document_path(
        document_path=fixture_json_path
    )

    hocr_output = escaped_document.export_hocr_str(title="hocr-escape")
    assert hocr_output

    # Parsing is the real validity check: fromstring raises on malformed XML.
    root = ElementTree.fromstring(hocr_output)
    assert root is not None

    with open(expected_xml_path, "r", encoding="utf-8") as expected_file:
        expected_hocr = expected_file.read()

    assert hocr_output == expected_hocr


def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock):
wrapped_document = document.Document.from_gcs(
Expand Down

0 comments on commit 2d9f05b

Please sign in to comment.