From 408a15ccfb7c9aaabfe2ba5cbbacc3a2b6a08c07 Mon Sep 17 00:00:00 2001 From: Johannes Baiter Date: Mon, 11 May 2020 11:52:15 +0200 Subject: [PATCH] Filter out empty/whitespace-only regions (fixes #105) --- docs/changes.md | 1 + .../digitalcollections/solrocr/formats/OcrPassageFormatter.java | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/changes.md b/docs/changes.md index 01cc098f..bf3fd01a 100644 --- a/docs/changes.md +++ b/docs/changes.md @@ -33,6 +33,7 @@ This is a major release with a focus on compatibility and performance. - Log warnings during source pointer parsing - Filter out empty files during indexing - Add new documentation section on performance tuning +- Empty regions or regions with only whitespace are no longer included in the output ## 0.3.1 (2019-07-26) diff --git a/src/main/java/de/digitalcollections/solrocr/formats/OcrPassageFormatter.java b/src/main/java/de/digitalcollections/solrocr/formats/OcrPassageFormatter.java index 7e1045fc..6985b138 100644 --- a/src/main/java/de/digitalcollections/solrocr/formats/OcrPassageFormatter.java +++ b/src/main/java/de/digitalcollections/solrocr/formats/OcrPassageFormatter.java @@ -198,6 +198,7 @@ protected OcrSnippet parseFragment(String ocrFragment, OcrPage page) { String highlightedText = getTextFromXml(ocrFragment); List snippetRegions = byColumns.stream() .map(this::determineSnippetRegion) + .filter(r -> !r.getText().isEmpty() && !r.getText().trim().isEmpty()) .collect(Collectors.toList()); Set snippetPageIds = snippetRegions.stream() .map(OcrBox::getPageId).collect(Collectors.toSet());