Skip to content

Commit

Permalink
#1381 - Annotations starting/ending in inter-token space cause exception
Browse files Browse the repository at this point in the history
- Ported fix and unit tests
  • Loading branch information
reckart committed Jun 6, 2019
1 parent 1aae3ea commit 575f18c
Show file tree
Hide file tree
Showing 10 changed files with 173 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,16 @@ public static TsvDocument of(TsvSchema aSchema, JCas aJCas)
}

// Look up the token in which the annotation begins: the closest entry of the begin index
// whose key is at or before the annotation's begin offset.
Entry<Integer, TsvToken> beginTokenEntry = tokenBeginIndex.floorEntry(begin);
// If the current annotation has leading whitespace, floorEntry(begin) has wrongly fetched the
// token before the start token (or no token at all) - so let's try to correct this
if (
    // didn't find a begin token because the annotation starts before the first token
    beginTokenEntry == null ||
    // found a begin token but it is the wrong one: it already ends before the annotation
    beginTokenEntry.getValue().getEnd() < begin
) {
    // Move forward to the next token after the annotation begin. The begin token is looked
    // up in the begin index, consistent with the initial floorEntry(begin) lookup above.
    beginTokenEntry = tokenBeginIndex.higherEntry(begin);
}
if (beginTokenEntry == null) {
throw new IllegalStateException(
"Unable to find begin token starting at or before " + begin
Expand All @@ -144,6 +154,16 @@ public static TsvDocument of(TsvSchema aSchema, JCas aJCas)
}

// Look up the token in which the annotation ends: the closest entry of the end index whose
// key is at or after the annotation's end offset.
Entry<Integer, TsvToken> endTokenEntry = tokenEndIndex.ceilingEntry(end);
// Trailing whitespace in the annotation makes ceilingEntry(end) return the token *after* the
// real end token, and an annotation ending beyond the last token yields no entry at all.
// In both situations fall back to the closest entry strictly before the end offset.
if (endTokenEntry == null || endTokenEntry.getValue().getBegin() > end) {
    endTokenEntry = tokenEndIndex.lowerEntry(end);
}
if (endTokenEntry == null) {
throw new IllegalStateException("Unable to find end token ending at or after "
+ end + " (last token ends at " + tokenEndIndex.pollLastEntry().getKey()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem;
Expand Down Expand Up @@ -1666,7 +1667,91 @@ public void testTwoSentencesWithNoSpaceInBetween() throws Exception

writeAndAssertEquals(jcas);
}

/*
 * An annotation ending in the whitespace between two tokens. This is something that cannot be
 * done through the editor UI but can happen when working with externally created data.
 */
@Test
public void testAnnotationWithTrailingWhitespace() throws Exception
{
    JCas jcas = JCasFactory.createJCas();

    DocumentMetaData.create(jcas).setDocumentId("doc");
    // Two spaces between the words: the text must be 8 characters long so that the second
    // token (5-8) and the sentence (0-8) lie within the document text.
    jcas.setDocumentText("one  two");
    new Token(jcas, 0, 3).addToIndexes();
    new Token(jcas, 5, 8).addToIndexes();
    new Sentence(jcas, 0, 8).addToIndexes();

    // NE has trailing whitespace - on export this should be silently dropped
    new NamedEntity(jcas, 0, 4).addToIndexes();

    writeAndAssertEquals(jcas);
}

/*
 * An annotation that extends past the last token into whitespace at the end of the document.
 * The editor UI cannot produce such an annotation, but externally created data may contain it.
 */
@Test
public void testAnnotationWithTrailingWhitespaceAtEnd() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData.create(cas).setDocumentId("doc");
    cas.setDocumentText("one two ");

    // Feature structures are created in the same order as before: token, token, sentence,
    // named entity.
    Token firstToken = new Token(cas, 0, 3);
    firstToken.addToIndexes();
    Token secondToken = new Token(cas, 4, 7);
    secondToken.addToIndexes();
    Sentence sentence = new Sentence(cas, 0, 7);
    sentence.addToIndexes();

    // The named entity covers "two" plus the trailing blank; the export is expected to
    // silently drop that whitespace.
    NamedEntity entity = new NamedEntity(cas, 4, 8);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}

/*
 * An annotation that starts before the first token, in whitespace at the start of the
 * document. The editor UI cannot produce such an annotation, but externally created data may
 * contain it.
 */
@Test
public void testAnnotationWithLeadingWhitespaceAtStart() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData.create(cas).setDocumentId("doc");
    cas.setDocumentText(" one two");

    // Feature structures are created in the same order as before: token, token, sentence,
    // named entity.
    Token firstToken = new Token(cas, 1, 4);
    firstToken.addToIndexes();
    Token secondToken = new Token(cas, 5, 8);
    secondToken.addToIndexes();
    Sentence sentence = new Sentence(cas, 1, 8);
    sentence.addToIndexes();

    // The named entity covers the leading blank plus "one"; the export is expected to
    // silently drop that whitespace.
    NamedEntity entity = new NamedEntity(cas, 0, 4);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}

/*
 * An annotation starting in the whitespace between two tokens. This is something that cannot
 * be done through the editor UI but can happen when working with externally created data.
 */
@Test
public void testAnnotationWithLeadingWhitespace() throws Exception
{
    JCas jcas = JCasFactory.createJCas();

    DocumentMetaData.create(jcas).setDocumentId("doc");
    // Two spaces between the words: the text must be 8 characters long so that the second
    // token (5-8) and the sentence (0-8) lie within the document text.
    jcas.setDocumentText("one  two");
    new Token(jcas, 0, 3).addToIndexes();
    new Token(jcas, 5, 8).addToIndexes();
    new Sentence(jcas, 0, 8).addToIndexes();

    // NE has leading whitespace - on export this should be silently dropped
    new NamedEntity(jcas, 4, 8).addToIndexes();

    writeAndAssertEquals(jcas);
}

private void writeAndAssertEquals(JCas aJCas, Object... aParams)
throws IOException, ResourceInitializationException, AnalysisEngineProcessException
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one two
1-1 0-3 one _ _
1-2 5-8 two * *
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="0" end="3" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="5" end="8" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="0" end="8"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="4" end="8"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="one two"/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one two
1-1 1-4 one * *
1-2 5-8 two _ _
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="1" end="4" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="5" end="8" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="1" end="8"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="0" end="4"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString=" one two"/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one two
1-1 0-3 one * *
1-2 5-8 two _ _
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="0" end="3" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="5" end="8" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="0" end="8"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="0" end="4"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="one two"/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one two
1-1 0-3 one _ _
1-2 4-7 two * *
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="0" end="3" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="4" end="7" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="0" end="7"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="4" end="8"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="one two "/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>

0 comments on commit 575f18c

Please sign in to comment.