-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Escape html special characters in hocr_document_template.xml.j2 (#279)
* fix: Escape html special characters in hocr_document_template.xml.j2
* test: Add Unit test for hOCR XML validity.
- Loading branch information
1 parent
71191ab
commit 2d9f05b
Showing
4 changed files
with
330 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
284 changes: 284 additions & 0 deletions
284
tests/unit/resources/toolbox_invoice_test-0-hocr-escape.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
{ | ||
"text": "<Invoice>", | ||
"pages": [ | ||
{ | ||
"pageNumber": 1, | ||
"dimension": { | ||
"width": 1758.0, | ||
"height": 2275.0, | ||
"unit": "pixels" | ||
}, | ||
"layout": { | ||
"textAnchor": { | ||
"textSegments": [ | ||
{ | ||
"endIndex": "435" | ||
} | ||
] | ||
}, | ||
"boundingPoly": { | ||
"vertices": [ | ||
{}, | ||
{ | ||
"x": 1758 | ||
}, | ||
{ | ||
"x": 1758, | ||
"y": 2275 | ||
}, | ||
{ | ||
"y": 2275 | ||
} | ||
], | ||
"normalizedVertices": [ | ||
{}, | ||
{ | ||
"x": 1.0 | ||
}, | ||
{ | ||
"x": 1.0, | ||
"y": 1.0 | ||
}, | ||
{ | ||
"y": 1.0 | ||
} | ||
] | ||
}, | ||
"orientation": 1 | ||
}, | ||
"detectedLanguages": [ | ||
{ | ||
"languageCode": "en" | ||
}, | ||
{ | ||
"languageCode": "und" | ||
} | ||
], | ||
"blocks": [ | ||
{ | ||
"layout": { | ||
"textAnchor": { | ||
"textSegments": [ | ||
{ | ||
"endIndex": "8" | ||
} | ||
] | ||
}, | ||
"confidence": 0.99258333, | ||
"boundingPoly": { | ||
"vertices": [ | ||
{ | ||
"x": 1310, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 282 | ||
}, | ||
{ | ||
"x": 1310, | ||
"y": 282 | ||
} | ||
], | ||
"normalizedVertices": [ | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.12395605 | ||
}, | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.12395605 | ||
} | ||
] | ||
}, | ||
"orientation": 1 | ||
} | ||
} | ||
], | ||
"paragraphs": [ | ||
{ | ||
"layout": { | ||
"textAnchor": { | ||
"textSegments": [ | ||
{ | ||
"endIndex": "8" | ||
} | ||
] | ||
}, | ||
"confidence": 0.99258333, | ||
"boundingPoly": { | ||
"vertices": [ | ||
{ | ||
"x": 1310, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 282 | ||
}, | ||
{ | ||
"x": 1310, | ||
"y": 282 | ||
} | ||
], | ||
"normalizedVertices": [ | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.12395605 | ||
}, | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.12395605 | ||
} | ||
] | ||
}, | ||
"orientation": 1 | ||
} | ||
} | ||
], | ||
"lines": [ | ||
{ | ||
"layout": { | ||
"textAnchor": { | ||
"textSegments": [ | ||
{ | ||
"endIndex": "8" | ||
} | ||
] | ||
}, | ||
"confidence": 0.99258333, | ||
"boundingPoly": { | ||
"vertices": [ | ||
{ | ||
"x": 1310, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 282 | ||
}, | ||
{ | ||
"x": 1310, | ||
"y": 282 | ||
} | ||
], | ||
"normalizedVertices": [ | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.12395605 | ||
}, | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.12395605 | ||
} | ||
] | ||
}, | ||
"orientation": 1 | ||
}, | ||
"detectedLanguages": [ | ||
{ | ||
"languageCode": "en" | ||
} | ||
] | ||
} | ||
], | ||
"tokens": [ | ||
{ | ||
"layout": { | ||
"textAnchor": { | ||
"textSegments": [ | ||
{ | ||
"endIndex": "8" | ||
} | ||
] | ||
}, | ||
"confidence": 0.99258333, | ||
"boundingPoly": { | ||
"vertices": [ | ||
{ | ||
"x": 1310, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 220 | ||
}, | ||
{ | ||
"x": 1534, | ||
"y": 282 | ||
}, | ||
{ | ||
"x": 1310, | ||
"y": 282 | ||
} | ||
], | ||
"normalizedVertices": [ | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.0967033 | ||
}, | ||
{ | ||
"x": 0.8725825, | ||
"y": 0.12395605 | ||
}, | ||
{ | ||
"x": 0.74516493, | ||
"y": 0.12395605 | ||
} | ||
] | ||
}, | ||
"orientation": 1 | ||
}, | ||
"detectedLanguages": [ | ||
{ | ||
"languageCode": "en" | ||
} | ||
] | ||
} | ||
] | ||
} | ||
], | ||
"shardInfo": { | ||
"shardCount": "1" | ||
} | ||
} |
16 changes: 16 additions & 0 deletions
16
tests/unit/resources/toolbox_invoice_test-0-hocr-escape.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | ||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="unknown" lang="unknown"> | ||
<head> | ||
<title>hocr-escape</title> | ||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" /> | ||
<meta name="ocr-system" content="Document AI OCR" /> | ||
<meta name="ocr-langs" content="unknown" /> | ||
<meta name="ocr-scripts" content="unknown" /> | ||
<meta name="ocr-number-of-pages" content="1" /> | ||
<meta name="ocr-capabilities" content="ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" /> | ||
</head> | ||
<body> | ||
<div class='ocr_page' lang='unknown' title='bbox 0 0 1758 2275'><span class='ocr_carea' id='block_1_0' title='bbox 1310 220 1534 282'><p class='ocr_par' id='par_1_0_0' title='bbox 1310 220 1534 282'><span class='ocr_line' id='line_1_0_0_0' title='bbox 1310 220 1534 282'>&lt;Invoice<span class='ocrx_word' id='word_1_0_0_0_0' title='bbox 1310 220 1534 282'>&lt;Invoice</span></span></p></span></div> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters