Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1381 - Annotations starting/ending in inter-token space cause exception #1383

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,16 @@ public static TsvDocument of(TsvSchema aSchema, JCas aJCas)
}

Entry<Integer, TsvToken> beginTokenEntry = tokenBeginIndex.floorEntry(begin);
// If the current annotation has leading whitespace, we have wrongly fetched the
// token before the start token using floorEntry(begin) - so let's try to correct this
if (
// found begin token but found the wrong one
(beginTokenEntry != null && beginTokenEntry.getValue().getEnd() < begin) ||
// didn't find begin token because annotation starts before the first token
beginTokenEntry == null
) {
beginTokenEntry = tokenEndIndex.higherEntry(begin);
}
if (beginTokenEntry == null) {
throw new IllegalStateException(
"Unable to find begin token starting at or before " + begin
Expand All @@ -144,6 +154,16 @@ public static TsvDocument of(TsvSchema aSchema, JCas aJCas)
}

Entry<Integer, TsvToken> endTokenEntry = tokenEndIndex.ceilingEntry(end);
// If the current annotation has trailing whitespace, we have wrongly fetched the
// token after the end token using ceilingEntry(end) - so let's try to correct this
if (
// found end token but found the wrong one
(endTokenEntry != null && endTokenEntry.getValue().getBegin() > end) ||
// didn't find end token because annotation ends beyond the last token
endTokenEntry == null
) {
endTokenEntry = tokenEndIndex.lowerEntry(end);
}
if (endTokenEntry == null) {
throw new IllegalStateException("Unable to find end token ending at or after "
+ end + " (last token ends at " + tokenEndIndex.pollLastEntry().getKey()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem;
Expand Down Expand Up @@ -1666,7 +1667,91 @@ public void testTwoSentencesWithNoSpaceInBetween() throws Exception

writeAndAssertEquals(jcas);
}

/*
 * An annotation that ends in the whitespace between two tokens. This is something that cannot
 * be done through the editor UI but can happen when working with externally created data.
 */
@Test
public void testAnnotationWithTrailingWhitespace() throws Exception
{
    JCas jcas = JCasFactory.createJCas();

    DocumentMetaData.create(jcas).setDocumentId("doc");
    // Two spaces separate the words: the token offsets below (0-3 and 5-8) require a text of
    // eight characters with the inter-token gap at offsets 3-5.
    jcas.setDocumentText("one  two");
    new Token(jcas, 0, 3).addToIndexes();
    new Token(jcas, 5, 8).addToIndexes();
    new Sentence(jcas, 0, 8).addToIndexes();

    // NE covers the first token plus one character of the gap (offset 3-4), i.e. it has
    // trailing whitespace - on export this should be silently dropped
    new NamedEntity(jcas, 0, 4).addToIndexes();

    writeAndAssertEquals(jcas);
}

/*
 * An annotation that extends past the last token into whitespace at the end of the document.
 * This cannot be produced through the editor UI but can occur in externally created data.
 */
@Test
public void testAnnotationWithTrailingWhitespaceAtEnd() throws Exception
{
    JCas document = JCasFactory.createJCas();
    DocumentMetaData.create(document).setDocumentId("doc");
    document.setDocumentText("one two ");

    Token firstToken = new Token(document, 0, 3);
    firstToken.addToIndexes();
    Token secondToken = new Token(document, 4, 7);
    secondToken.addToIndexes();
    Sentence sentence = new Sentence(document, 0, 7);
    sentence.addToIndexes();

    // The entity ends at offset 8, one character after the last token (which ends at 7), so
    // it carries trailing whitespace - on export this should be silently dropped
    NamedEntity entity = new NamedEntity(document, 4, 8);
    entity.addToIndexes();

    writeAndAssertEquals(document);
}

/*
 * An annotation that starts in whitespace before the first token of the document. This cannot
 * be produced through the editor UI but can occur in externally created data.
 */
@Test
public void testAnnotationWithLeadingWhitespaceAtStart() throws Exception
{
    JCas document = JCasFactory.createJCas();
    DocumentMetaData.create(document).setDocumentId("doc");
    document.setDocumentText(" one two");

    Token firstToken = new Token(document, 1, 4);
    firstToken.addToIndexes();
    Token secondToken = new Token(document, 5, 8);
    secondToken.addToIndexes();
    Sentence sentence = new Sentence(document, 1, 8);
    sentence.addToIndexes();

    // The entity starts at offset 0, one character before the first token (which starts at
    // 1), so it carries leading whitespace - on export this should be silently dropped
    NamedEntity entity = new NamedEntity(document, 0, 4);
    entity.addToIndexes();

    writeAndAssertEquals(document);
}

/*
 * An annotation that starts in the whitespace between two tokens. This is something that
 * cannot be done through the editor UI but can happen when working with externally created
 * data.
 */
@Test
public void testAnnotationWithLeadingWhitespace() throws Exception
{
    JCas jcas = JCasFactory.createJCas();

    DocumentMetaData.create(jcas).setDocumentId("doc");
    // Two spaces separate the words: the token offsets below (0-3 and 5-8) require a text of
    // eight characters with the inter-token gap at offsets 3-5.
    jcas.setDocumentText("one  two");
    new Token(jcas, 0, 3).addToIndexes();
    new Token(jcas, 5, 8).addToIndexes();
    new Sentence(jcas, 0, 8).addToIndexes();

    // NE starts at offset 4, inside the gap and one character before the second token, i.e.
    // it has leading whitespace - on export this should be silently dropped
    new NamedEntity(jcas, 4, 8).addToIndexes();

    writeAndAssertEquals(jcas);
}

private void writeAndAssertEquals(JCas aJCas, Object... aParams)
throws IOException, ResourceInitializationException, AnalysisEngineProcessException
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one  two
1-1 0-3 one _ _
1-2 5-8 two * *
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="0" end="3" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="5" end="8" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="0" end="8"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="4" end="8"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="one  two"/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one two
1-1 1-4 one * *
1-2 5-8 two _ _
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="1" end="4" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="5" end="8" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="1" end="8"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="0" end="4"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString=" one two"/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one  two
1-1 0-3 one * *
1-2 5-8 two _ _
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="0" end="3" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="5" end="8" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="0" end="8"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="0" end="4"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="one  two"/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#FORMAT=WebAnno TSV 3.2
#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value


#Text=one two
1-1 0-3 one _ _
1-2 4-7 two * *
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?><xmi:XMI xmlns:pos="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos.ecore" xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:tweet="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/pos/tweet.ecore" xmlns:morph="http:///de/tudarmstadt/ukp/dkpro/core/api/lexmorph/type/morph.ecore" xmlns:dependency="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/dependency.ecore" xmlns:type6="http:///de/tudarmstadt/ukp/dkpro/core/api/semantics/type.ecore" xmlns:type="http:///de/tudarmstadt/ukp/dkpro/core/api/anomaly/type.ecore" xmlns:type7="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type.ecore" xmlns:type3="http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore" xmlns:type4="http:///de/tudarmstadt/ukp/dkpro/core/api/ner/type.ecore" xmlns:type5="http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore" xmlns:type2="http:///de/tudarmstadt/ukp/dkpro/core/api/coref/type.ecore" xmlns:constituent="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/constituent.ecore" xmlns:chunk="http:///de/tudarmstadt/ukp/dkpro/core/api/syntax/type/chunk.ecore" xmi:version="2.0">
<cas:NULL xmi:id="0"/>
<type3:DocumentMetaData xmi:id="1" sofa="12" begin="0" end="8" documentId="doc" isLastSegment="false"/>
<type5:Token xmi:id="19" sofa="12" begin="0" end="3" order="0"/>
<type5:Token xmi:id="32" sofa="12" begin="4" end="7" order="0"/>
<type5:Sentence xmi:id="45" sofa="12" begin="0" end="7"/>
<type4:NamedEntity xmi:id="50" sofa="12" begin="4" end="8"/>
<cas:Sofa xmi:id="12" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="one two "/>
<cas:View sofa="12" members="1 19 32 45 50"/>
</xmi:XMI>