Skip to content
Permalink
Browse files

Add support for hl.ocr.absoluteHighlights option (implements #6)

  • Loading branch information...
jbaiter authored and bitzl committed Apr 17, 2019
1 parent 8d7c22b commit 9f24bacda93296da01a91747c4989e12efaf920a
@@ -259,6 +259,8 @@ These parameters can be changed at query time:
`block` or `page` and defaults to `page`.
- `hl.ocr.pageId`: Only show passages from the page with this identifier. Useful if you want to implement a
"Search on this page" feature (e.g. for the [IIIF Content Search API](https://iiif.io/api/search/1.0/)).
- `hl.ocr.absoluteHighlights`: Return the coordinates of highlighted regions as absolute coordinates (i.e. relative to
the page, not the snippet region). Defaults to `false`.


## The MiniOCR format
@@ -30,5 +30,6 @@
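* @param prehHighlightTag the tag to put in the snippet text before a highlighted region, e.g. {@code <em>}
* @param postHighlightTag the tag to put in the snippet text after a highlighted region, e.g. {@code </em>}
* @param absoluteHighlights whether highlight coordinates should be relative to the page instead of the snippet region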
*/
OcrPassageFormatter getPassageFormatter(OcrBlock limitBlock, String prehHighlightTag, String postHighlightTag);
OcrPassageFormatter getPassageFormatter(OcrBlock limitBlock, String prehHighlightTag, String postHighlightTag,
boolean absoluteHighlights);
}
@@ -26,10 +26,12 @@
public abstract class OcrPassageFormatter extends PassageFormatter {
protected final String startHlTag;
protected final String endHlTag;
protected final boolean absoluteHighlights;

protected OcrPassageFormatter(String startHlTag, String endHlTag) {
/**
 * Store the settings shared by all OCR passage formatters.
 *
 * @param startHlTag the tag to put in the snippet text before a highlighted region
 * @param endHlTag the tag to put in the snippet text after a highlighted region
 * @param absoluteHighlights if {@code true}, {@code addHighlightsToSnippet} keeps the highlight box coordinates as
 *                           they are; otherwise it subtracts the upper-left corner of the snippet region from them
 */
protected OcrPassageFormatter(String startHlTag, String endHlTag, boolean absoluteHighlights) {
this.startHlTag = startHlTag;
this.endHlTag = endHlTag;
this.absoluteHighlights = absoluteHighlights;
}

/** Merge overlapping matches. **/
@@ -133,6 +135,18 @@ private String truncateFragment(String ocrFragment, BreakIterator breakIter) {
/** Parse an {@link OcrSnippet} from an OCR fragment. */
protected abstract OcrSnippet parseFragment(String ocrFragment, String pageId);

/**
 * Register every group of highlight boxes on the snippet as a highlight region.
 *
 * Unless absolute highlights were requested, each box is first shifted by the upper-left corner of the
 * snippet region, so that its coordinates become relative to the snippet. Each group is merged with
 * {@link #mergeBoxes(List)} before it is added.
 *
 * @param hlBoxes one list of boxes per highlighted match
 * @param snippet the snippet that receives the highlight regions
 */
protected void addHighlightsToSnippet(List<List<OcrBox>> hlBoxes, OcrSnippet snippet) {
  float xOffset = 0;
  float yOffset = 0;
  if (!this.absoluteHighlights) {
    xOffset = snippet.getSnippetRegion().ulx;
    yOffset = snippet.getSnippetRegion().uly;
  }
  for (List<OcrBox> region : hlBoxes) {
    List<OcrBox> shifted = new ArrayList<>();
    for (OcrBox box : region) {
      shifted.add(new OcrBox(
          box.text,
          box.ulx - xOffset, box.uly - yOffset,
          box.lrx - xOffset, box.lry - yOffset));
    }
    snippet.addHighlightRegion(this.mergeBoxes(shifted));
  }
}


/** Merge adjacent OCR boxes into a single one, taking line breaks into account **/
protected List<OcrBox> mergeBoxes(List<OcrBox> boxes) {
List<OcrBox> out = new ArrayList<>();
@@ -33,7 +33,8 @@ public BreakIterator getBreakIterator() {

/**
 * Create an {@link AltoPassageFormatter} configured from the given arguments.
 *
 * The limit tag handed to the formatter is looked up in {@code blockTagMapping} using {@code limitBlock}; the
 * highlight tags and the {@code absoluteHighlights} flag are passed through unchanged.
 */
@Override
public OcrPassageFormatter getPassageFormatter(OcrBlock limitBlock, String prehHighlightTag,
String postHighlightTag) {
return new AltoPassageFormatter(breakTag, blockTagMapping.get(limitBlock), prehHighlightTag, postHighlightTag);
String postHighlightTag, boolean absoluteHighlights) {
return new AltoPassageFormatter(breakTag, blockTagMapping.get(limitBlock), prehHighlightTag, postHighlightTag,
absoluteHighlights);
}
}
@@ -7,7 +7,6 @@
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.text.StringEscapeUtils;
import org.mdz.search.solrocr.formats.OcrPassageFormatter;
import org.mdz.search.solrocr.formats.OcrSnippet;
@@ -23,8 +22,9 @@
private final TagBreakIterator pageIter = new TagBreakIterator("Page");
private final TagBreakIterator limitIter;

protected AltoPassageFormatter(String contextTag, String limitTag, String startHlTag, String endHlTag) {
super(startHlTag, endHlTag);
/**
 * Create a passage formatter for ALTO fragments.
 *
 * @param contextTag not referenced anywhere in this constructor body
 * @param limitTag tag name used to build the {@code limitIter} break iterator
 * @param startHlTag forwarded to the superclass constructor
 * @param endHlTag forwarded to the superclass constructor
 * @param absoluteHighlights forwarded to the superclass constructor
 */
protected AltoPassageFormatter(String contextTag, String limitTag, String startHlTag, String endHlTag,
boolean absoluteHighlights) {
super(startHlTag, endHlTag, absoluteHighlights);
this.limitIter = new TagBreakIterator(limitTag);
}

@@ -118,12 +118,7 @@ protected OcrSnippet parseFragment(String ocrFragment, String pageId) {
extractText(ocrFragment).replaceAll("@@STARTHLTAG@@", startHlTag)
.replaceAll("@@ENDHLTAG@@", endHlTag)).trim();
OcrSnippet snip = new OcrSnippet(text, pageId, snippetRegion);
hlBoxes.stream()
.map(bs -> bs.stream()
.map(b -> new OcrBox(b.text, b.ulx - snipX, b.uly - snipY,
b.lrx - snipX, b.lry - snipY))
.collect(Collectors.toList()))
.forEach(bs -> snip.addHighlightRegion(this.mergeBoxes(bs)));
this.addHighlightsToSnippet(hlBoxes, snip);
return snip;
}

@@ -32,7 +32,8 @@ public BreakIterator getBreakIterator() {

/**
 * Create a {@link HocrPassageFormatter} configured from the given arguments.
 *
 * The limit class handed to the formatter is looked up in {@code blockClassMapping} using {@code limitBlock}; the
 * highlight tags and the {@code absoluteHighlights} flag are passed through unchanged.
 */
@Override
public OcrPassageFormatter getPassageFormatter(OcrBlock limitBlock, String prehHighlightTag,
String postHighlightTag) {
return new HocrPassageFormatter(breakClass, blockClassMapping.get(limitBlock), prehHighlightTag, postHighlightTag);
String postHighlightTag, boolean absoluteHighlights) {
return new HocrPassageFormatter(breakClass, blockClassMapping.get(limitBlock), prehHighlightTag, postHighlightTag,
absoluteHighlights);
}
}
@@ -5,7 +5,6 @@
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.mdz.search.solrocr.formats.OcrPassageFormatter;
import org.mdz.search.solrocr.formats.OcrSnippet;
import org.mdz.search.solrocr.util.IterableCharSequence;
@@ -23,8 +22,9 @@
private final String startHlTag;
private final String endHlTag;

public HocrPassageFormatter(String contextClass, String limitClass, String startHlTag, String endHlTag) {
super(startHlTag, endHlTag);
public HocrPassageFormatter(String contextClass, String limitClass, String startHlTag, String endHlTag,
boolean absoluteHighlights) {
super(startHlTag, endHlTag, absoluteHighlights);
this.pageIter = new HocrClassBreakIterator("ocr_page");
this.limitIter = new HocrClassBreakIterator(limitClass);
this.startHlTag = startHlTag;
@@ -90,12 +90,7 @@ protected OcrSnippet parseFragment(String ocrFragment, String pageId) {
int snipY = uly;
OcrBox snippetRegion = new OcrBox(null, ulx, uly, lrx, lry);
OcrSnippet snip = new OcrSnippet(getTextFromXml(ocrFragment), pageId, snippetRegion);
hlBoxes.stream()
.map(cs -> cs.stream()
.map(b -> new OcrBox(b.text, b.ulx - snipX, b.uly - snipY,
b.lrx - snipX, b.lry - snipY))
.collect(Collectors.toList()))
.forEach(bs -> snip.addHighlightRegion(this.mergeBoxes(bs)));
this.addHighlightsToSnippet(hlBoxes, snip);
return snip;
}

@@ -32,7 +32,9 @@ public BreakIterator getBreakIterator() {
}

@Override
public OcrPassageFormatter getPassageFormatter(OcrBlock limitBlock, String prehHighlightTag, String postHighlightTag) {
return new MiniOcrPassageFormatter(breakTag, blockTagMapping.get(limitBlock), prehHighlightTag, postHighlightTag);
public OcrPassageFormatter getPassageFormatter(OcrBlock limitBlock, String prehHighlightTag, String postHighlightTag,
    boolean absoluteHighlights) {
  // Look up the tag mapped to the requested limit block, then hand everything to the MiniOCR formatter.
  final String limitTag = blockTagMapping.get(limitBlock);
  return new MiniOcrPassageFormatter(breakTag, limitTag, prehHighlightTag, postHighlightTag, absoluteHighlights);
}
}
@@ -21,8 +21,9 @@
private final TagBreakIterator pageIter = new TagBreakIterator("p");
private final TagBreakIterator limitIter;

public MiniOcrPassageFormatter(String contextTag, String limitTag, String startHlTag, String endHlTag) {
super(startHlTag, endHlTag);
/**
 * Create a passage formatter for MiniOCR fragments.
 *
 * @param contextTag not referenced anywhere in this constructor body
 * @param limitTag tag name used to build the {@code limitIter} break iterator
 * @param startHlTag forwarded to the superclass constructor
 * @param endHlTag forwarded to the superclass constructor
 * @param absoluteHighlights forwarded to the superclass constructor
 */
public MiniOcrPassageFormatter(String contextTag, String limitTag, String startHlTag, String endHlTag,
boolean absoluteHighlights) {
super(startHlTag, endHlTag, absoluteHighlights);
this.limitIter = new TagBreakIterator(limitTag);
}

@@ -79,8 +80,8 @@ protected OcrSnippet parseFragment(String xmlFragment, String pageId) {
}
}
OcrBox snippetRegion;
final float snipX = ulx;
final float snipY = uly;
final float xOffset = this.absoluteHighlights ? 0 : ulx;
final float yOffset = this.absoluteHighlights ? 0 : uly;
final float snipWidth = lrx - ulx;
final float snipHeight = lry - uly;
if (lrx < 1) {
@@ -89,10 +90,10 @@ protected OcrSnippet parseFragment(String xmlFragment, String pageId) {
hlBoxes = hlBoxes.stream()
.map(cs -> cs.stream()
.map(b -> new OcrBox(b.text,
truncateFloat((b.ulx - snipX) / snipWidth),
truncateFloat((float) ((b.uly - snipY) / snipHeight)),
truncateFloat((b.lrx - snipX) / snipWidth),
truncateFloat((b.lry - snipY) / snipHeight)))
truncateFloat((b.ulx - xOffset) / snipWidth),
truncateFloat((float) ((b.uly - yOffset) / snipHeight)),
truncateFloat((b.lrx - xOffset) / snipWidth),
truncateFloat((b.lry - yOffset) / snipHeight)))
.collect(Collectors.toList()))
.map(this::mergeBoxes)
.collect(Collectors.toList());
@@ -101,8 +102,8 @@ protected OcrSnippet parseFragment(String xmlFragment, String pageId) {
hlBoxes = hlBoxes.stream()
.map(cs -> cs.stream()
.map(b -> new OcrBox(b.text,
(b.ulx - snipX), (b.uly - snipY),
(b.lrx - snipX), (b.lry - snipY)))
(b.ulx - xOffset), (b.uly - yOffset),
(b.lrx - xOffset), (b.lry - yOffset)))
.collect(Collectors.toList()))
.map(this::mergeBoxes)
.collect(Collectors.toList());
@@ -8,4 +8,5 @@
String LIMIT_BLOCK = "hl.ocr.limitBlock";
String PAGE_ID = "hl.ocr.pageId";
String SCORE_BOOST_EARLY = "hl.score.boostEarly";
String ABSOLUTE_HIGHLIGHTS = "hl.ocr.absoluteHighlights";
}
@@ -83,7 +83,8 @@ public SolrOcrHighlighter(ExternalFieldLoader fieldLoader, OcrFormat ocrFormat,
OcrPassageFormatter ocrFormatter = ocrFormat.getPassageFormatter(
OcrBlock.valueOf(params.get(OcrHighlightParams.LIMIT_BLOCK, "block").toUpperCase()),
params.get(HighlightParams.TAG_PRE, "<em>"),
params.get(HighlightParams.TAG_POST, "</em>"));
params.get(HighlightParams.TAG_POST, "</em>"),
params.getBool(OcrHighlightParams.ABSOLUTE_HIGHLIGHTS, false));
ocrSnippets = ocrHighlighter.highlightOcrFields(
ocrFieldNames, query, docIDs, maxPassagesOcr, ocrBreakIterator, ocrFormatter,
params.get(OcrHighlightParams.PAGE_ID, null));
@@ -116,4 +116,12 @@ public void testOverlappingMatches() throws Exception {
"//lst[@name='ocrHighlighting']//arr[@name='highlights']//str[@name='text']/text()='pirates hove their vessel that the other pirates'");
}

/**
 * With {@code hl.ocr.absoluteHighlights=true}, checks the {@code ulx} value of the first snippet region and of
 * the first highlight box in the response.
 */
@Test
public void testAbsoluteHighlightRegions() throws Exception {
  final String regionUlx = "//lst[@name='region'][1]/int[@name='ulx']/text()=146";
  final String highlightUlx = "//arr[@name='highlights']/arr/lst[1]/int[@name='ulx']/text()=229";
  assertQ(
      xmlQ("q", "Verführung", "hl.ocr.absoluteHighlights", "true"),
      regionUlx,
      highlightUlx);
}

}

0 comments on commit 9f24bac

Please sign in to comment.
You can’t perform that action at this time.