From 27ffd3bd1fd62e003926e92ce6ea5b0d45d1c348 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 11 Apr 2018 18:09:41 +0200 Subject: [PATCH] updating grobidAnalisers to consider break line in tokenizeToLayoutToken, when a \n is encountered #180 --- .../grobid/core/analyzers/GrobidAnalyzer.java | 11 +- .../core/analyzers/GrobidDefaultAnalyzer.java | 106 ++++++++++-------- .../core/analyzers/GrobidAnalyzerTest.java | 48 ++++++++ .../analyzers/GrobidDefaultAnalyzerTest.java | 48 ++++++++ 4 files changed, 164 insertions(+), 49 deletions(-) create mode 100644 grobid-core/src/test/java/org/grobid/core/analyzers/GrobidAnalyzerTest.java create mode 100644 grobid-core/src/test/java/org/grobid/core/analyzers/GrobidDefaultAnalyzerTest.java diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java index e963ce4398..a721009982 100644 --- a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java +++ b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java @@ -3,6 +3,7 @@ import org.grobid.core.lang.Language; import org.grobid.core.layout.LayoutToken; +import org.grobid.core.utilities.UnicodeUtil; import org.wipo.nlp.textboundaries.ReTokenizer; import org.wipo.nlp.textboundaries.ReTokenizerFactory; @@ -158,15 +159,21 @@ public List tokenizeWithLayoutToken(String text) { public List tokenizeWithLayoutToken(String text, Language lang) { List result = new ArrayList<>(); + text = UnicodeUtil.normaliseText(text); List tokens = tokenize(text, lang); int pos = 0; - for(String tok : tokens) { - LayoutToken layoutToken = new LayoutToken(); + for (int i = 0; i < tokens.size(); i++) { + String tok = tokens.get(i); + LayoutToken layoutToken = new LayoutToken(); layoutToken.setText(tok); layoutToken.setOffset(pos); result.add(layoutToken); pos += tok.length(); + if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) { + layoutToken.setNewLineAfter(true); + } } + return result; } } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java index 407874c3ef..4a3bf6b63c 100644 --- a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java +++ b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,7 +25,7 @@ import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; - + /** * Default tokenizer adequate for all Indo-European languages. * @@ -33,14 +33,14 @@ */ public class GrobidDefaultAnalyzer implements Analyzer { - private static volatile GrobidDefaultAnalyzer instance; + private static volatile GrobidDefaultAnalyzer instance; - public static GrobidDefaultAnalyzer getInstance() { + public static GrobidDefaultAnalyzer getInstance() { if (instance == null) { //double check idiom // synchronized (instanceController) { - if (instance == null) - getNewInstance(); + if (instance == null) + getNewInstance(); // } } return instance; @@ -49,63 +49,75 @@ public static GrobidDefaultAnalyzer getInstance() { /** * Creates a new instance. */ - private static synchronized void getNewInstance() { - instance = new GrobidDefaultAnalyzer(); - } + private static synchronized void getNewInstance() { + instance = new GrobidDefaultAnalyzer(); + } /** * Hidden constructor */ private GrobidDefaultAnalyzer() { - } + } public static final String delimiters = TextUtilities.delimiters; //" \n\r\t([,:;?.!/)-–−\"“”‘’'`$]*\u2666\u2665\u2663\u2660\u00A0"; public String getName() { - return "DefaultGrobidAnalyzer"; - } - - public List tokenize(String text) { - // as a default analyzer, language is not considered - return tokenize(text, null); - } - - public List tokenize(String text, Language lang) { - List result = new ArrayList(); - text = UnicodeUtil.normaliseText(text); - StringTokenizer st = new StringTokenizer(text, delimiters, true); - while(st.hasMoreTokens()) { - result.add(st.nextToken()); - } - return result; - } - - public List retokenize(List chunks) { - StringTokenizer st = null; - List result = new ArrayList(); - for(String chunk : chunks) { - chunk = UnicodeUtil.normaliseText(chunk); - st = new StringTokenizer(chunk, delimiters, true); - while(st.hasMoreTokens()) { - result.add(st.nextToken()); - } - } - return result; - } - - public List tokenizeWithLayoutToken(String text) { + return "DefaultGrobidAnalyzer"; + } + + public List tokenize(String text) { + // as a default analyzer, language is not considered + return tokenize(text, null); + } + + public List tokenize(String text, Language lang) { + List result = new ArrayList<>(); + text = UnicodeUtil.normaliseText(text); + StringTokenizer st = new StringTokenizer(text, delimiters, true); + while (st.hasMoreTokens()) { + result.add(st.nextToken()); + } + return result; + } + + public List retokenize(List chunks) { + StringTokenizer st = null; + List result = new ArrayList<>(); + for (String chunk : chunks) { + chunk = UnicodeUtil.normaliseText(chunk); + st = new StringTokenizer(chunk, delimiters, true); + while (st.hasMoreTokens()) { + result.add(st.nextToken()); + } + } + return result; + } + + public List tokenizeWithLayoutToken(String text) { + return tokenizeWithLayoutToken(text, null); + } + + /** + * Tokenize text returning list of LayoutTokens. + */ + public List tokenizeWithLayoutToken(String text, Language language) { List result = new ArrayList<>(); text = UnicodeUtil.normaliseText(text); - List tokens = tokenize(text); + List tokens = tokenize(text, language); int pos = 0; - for(String tok : tokens) { - LayoutToken layoutToken = new LayoutToken(); + for (int i = 0; i < tokens.size(); i++) { + String tok = tokens.get(i); + LayoutToken layoutToken = new LayoutToken(); layoutToken.setText(tok); layoutToken.setOffset(pos); result.add(layoutToken); pos += tok.length(); + if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) { + layoutToken.setNewLineAfter(true); + } } + return result; } } \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidAnalyzerTest.java b/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidAnalyzerTest.java new file mode 100644 index 0000000000..7b854716b5 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidAnalyzerTest.java @@ -0,0 +1,48 @@ +package org.grobid.core.analyzers; + +import org.grobid.core.layout.LayoutToken; +import org.junit.Before; +import org.junit.Test; + +import java.util.List; + +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.*; + +public class GrobidAnalyzerTest { + GrobidAnalyzer target; + + @Before + public void setUp() throws Exception { + target = GrobidAnalyzer.getInstance(); + } + + @Test + public void testTokenizeWithLayoutToken() { + final List layoutTokens = target.tokenizeWithLayoutToken("This is a normal \ntext,\n\n\n on several lines.\n"); + + assertThat(layoutTokens, hasSize(22)); + assertThat(layoutTokens.get(0).getText(), is("This")); + assertThat(layoutTokens.get(1).getText(), is(" ")); + assertThat(layoutTokens.get(6).getText(), is("normal")); + assertThat(layoutTokens.get(7).getText(), is(" ")); + assertThat(layoutTokens.get(7).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(8).getText(), is("\n")); + assertThat(layoutTokens.get(8).isNewLineAfter(), is(false)); + assertThat(layoutTokens.get(10).getText(), is(",")); + assertThat(layoutTokens.get(10).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(11).getText(), is("\n")); + assertThat(layoutTokens.get(11).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(12).getText(), is("\n")); + assertThat(layoutTokens.get(12).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(13).getText(), is("\n")); + assertThat(layoutTokens.get(13).isNewLineAfter(), is(false)); + } + + @Test + public void testTokenizeWithLayoutToken_emptyText() { + assertThat(target.tokenizeWithLayoutToken(""), hasSize(0)); + } + +} \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidDefaultAnalyzerTest.java b/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidDefaultAnalyzerTest.java new file mode 100644 index 0000000000..5bdc7be743 --- /dev/null +++ b/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidDefaultAnalyzerTest.java @@ -0,0 +1,48 @@ +package org.grobid.core.analyzers; + +import org.grobid.core.layout.LayoutToken; +import org.junit.Before; +import org.junit.Test; + +import java.util.List; + +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.*; + +public class GrobidDefaultAnalyzerTest { + + GrobidDefaultAnalyzer target; + + @Before + public void setUp() throws Exception { + target = GrobidDefaultAnalyzer.getInstance(); + } + + @Test + public void testTokenizeWithLayoutToken() { + final List layoutTokens = target.tokenizeWithLayoutToken("This is a normal \ntext,\n\n\n on several lines.\n"); + + assertThat(layoutTokens, hasSize(22)); + assertThat(layoutTokens.get(0).getText(), is("This")); + assertThat(layoutTokens.get(1).getText(), is(" ")); + assertThat(layoutTokens.get(6).getText(), is("normal")); + assertThat(layoutTokens.get(7).getText(), is(" ")); + assertThat(layoutTokens.get(7).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(8).getText(), is("\n")); + assertThat(layoutTokens.get(8).isNewLineAfter(), is(false)); + assertThat(layoutTokens.get(10).getText(), is(",")); + assertThat(layoutTokens.get(10).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(11).getText(), is("\n")); + assertThat(layoutTokens.get(11).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(12).getText(), is("\n")); + assertThat(layoutTokens.get(12).isNewLineAfter(), is(true)); + assertThat(layoutTokens.get(13).getText(), is("\n")); + assertThat(layoutTokens.get(13).isNewLineAfter(), is(false)); + } + + @Test + public void testTokenizeWithLayoutToken_emptyText() { + assertThat(target.tokenizeWithLayoutToken(""), hasSize(0)); + } +} \ No newline at end of file