Skip to content
This repository has been archived by the owner on Apr 20, 2022. It is now read-only.

Commit

Permalink
updating grobidAnalisers to consider break line in tokenizeToLayoutTo…
Browse files Browse the repository at this point in the history
…ken, when a \n is encountered kermitt2#180
  • Loading branch information
lfoppiano committed Apr 14, 2018
1 parent 34fba0a commit 27ffd3b
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.grobid.core.lang.Language;
import org.grobid.core.layout.LayoutToken;

import org.grobid.core.utilities.UnicodeUtil;
import org.wipo.nlp.textboundaries.ReTokenizer;
import org.wipo.nlp.textboundaries.ReTokenizerFactory;

Expand Down Expand Up @@ -158,15 +159,21 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) {

public List<LayoutToken> tokenizeWithLayoutToken(String text, Language lang) {
List<LayoutToken> result = new ArrayList<>();
text = UnicodeUtil.normaliseText(text);
List<String> tokens = tokenize(text, lang);
int pos = 0;
for(String tok : tokens) {
LayoutToken layoutToken = new LayoutToken();
for (int i = 0; i < tokens.size(); i++) {
String tok = tokens.get(i);
LayoutToken layoutToken = new LayoutToken();
layoutToken.setText(tok);
layoutToken.setOffset(pos);
result.add(layoutToken);
pos += tok.length();
if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) {
layoutToken.setNewLineAfter(true);
}
}

return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -25,22 +25,22 @@
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
* Default tokenizer adequate for all Indo-European languages.
*
* @author Patrice Lopez
*/
public class GrobidDefaultAnalyzer implements Analyzer {

private static volatile GrobidDefaultAnalyzer instance;
private static volatile GrobidDefaultAnalyzer instance;

public static GrobidDefaultAnalyzer getInstance() {
public static GrobidDefaultAnalyzer getInstance() {
if (instance == null) {
//double check idiom
// synchronized (instanceController) {
if (instance == null)
getNewInstance();
if (instance == null)
getNewInstance();
// }
}
return instance;
Expand All @@ -49,63 +49,75 @@ public static GrobidDefaultAnalyzer getInstance() {
/**
 * Creates the shared instance if it has not been created yet.
 * <p>
 * The field is re-checked while holding the class lock: the null checks in
 * {@code getInstance()} run without synchronization, so several threads may reach this
 * method at the same time, and only the first one must perform the assignment.
 */
private static synchronized void getNewInstance() {
    if (instance == null) {
        instance = new GrobidDefaultAnalyzer();
    }
}

/**
 * Hidden constructor: the class is instantiated only from within, through
 * {@code getNewInstance()}.
 */
private GrobidDefaultAnalyzer() {
}

public static final String delimiters = TextUtilities.delimiters;
//" \n\r\t([,:;?.!/)-–−\"“”‘’'`$]*\u2666\u2665\u2663\u2660\u00A0";

/**
 * Returns the name identifying this analyzer.
 *
 * @return the constant string {@code "DefaultGrobidAnalyzer"}
 */
public String getName() {
    return "DefaultGrobidAnalyzer";
}

/**
 * Tokenizes the given text without language information.
 * Delegates to {@code tokenize(text, null)}.
 *
 * @param text the text to tokenize
 * @return the list of tokens produced by {@code tokenize(text, null)}
 */
public List<String> tokenize(String text) {
// as a default analyzer, language is not considered
return tokenize(text, null);
}

/**
 * Splits the text on the characters listed in {@code delimiters}.
 * <p>
 * The text is first passed through {@link UnicodeUtil#normaliseText}. Delimiter characters
 * are returned as tokens of their own (the tokenizer is created with
 * {@code returnDelims = true}).
 *
 * @param text the text to tokenize
 * @param lang not used by this implementation
 * @return the tokens, delimiters included, in their order of appearance
 */
public List<String> tokenize(String text, Language lang) {
    final List<String> pieces = new ArrayList<>();
    final String normalised = UnicodeUtil.normaliseText(text);
    for (StringTokenizer splitter = new StringTokenizer(normalised, delimiters, true);
         splitter.hasMoreTokens(); ) {
        pieces.add(splitter.nextToken());
    }
    return pieces;
}

/**
 * Tokenizes each chunk in turn and concatenates the resulting tokens into one list.
 * <p>
 * Every chunk is normalised with {@link UnicodeUtil#normaliseText} and then split on the
 * characters listed in {@code delimiters}; delimiter characters are kept as tokens.
 *
 * @param chunks the text chunks to split further
 * @return the tokens of all chunks, in chunk order
 */
public List<String> retokenize(List<String> chunks) {
    final List<String> pieces = new ArrayList<>();
    for (final String chunk : chunks) {
        final String normalised = UnicodeUtil.normaliseText(chunk);
        final StringTokenizer splitter = new StringTokenizer(normalised, delimiters, true);
        while (splitter.hasMoreTokens()) {
            pieces.add(splitter.nextToken());
        }
    }
    return pieces;
}

/**
 * Tokenizes the given text into LayoutTokens without language information.
 * Delegates to {@code tokenizeWithLayoutToken(text, null)}.
 *
 * @param text the text to tokenize
 * @return the LayoutTokens produced by {@code tokenizeWithLayoutToken(text, null)}
 */
public List<LayoutToken> tokenizeWithLayoutToken(String text) {
return tokenizeWithLayoutToken(text, null);
}

/**
* Tokenize text returning list of LayoutTokens.
*/
public List<LayoutToken> tokenizeWithLayoutToken(String text, Language language) {
List<LayoutToken> result = new ArrayList<>();
text = UnicodeUtil.normaliseText(text);
List<String> tokens = tokenize(text);
List<String> tokens = tokenize(text, language);
int pos = 0;
for(String tok : tokens) {
LayoutToken layoutToken = new LayoutToken();
for (int i = 0; i < tokens.size(); i++) {
String tok = tokens.get(i);
LayoutToken layoutToken = new LayoutToken();
layoutToken.setText(tok);
layoutToken.setOffset(pos);
result.add(layoutToken);
pos += tok.length();
if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) {
layoutToken.setNewLineAfter(true);
}
}

return result;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.grobid.core.analyzers;

import org.grobid.core.layout.LayoutToken;
import org.junit.Before;
import org.junit.Test;

import java.util.List;

import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.*;

/**
 * Tests for the LayoutToken tokenization of {@link GrobidAnalyzer}, in particular the
 * {@code newLineAfter} flag of tokens that are followed by a line break.
 */
public class GrobidAnalyzerTest {
    GrobidAnalyzer target;

    @Before
    public void setUp() throws Exception {
        target = GrobidAnalyzer.getInstance();
    }

    /**
     * A token must be flagged with newLineAfter exactly when the next token is "\n";
     * this also holds for the "\n" tokens themselves within a run of line breaks.
     */
    @Test
    public void testTokenizeWithLayoutToken() {
        final String input = "This is a normal \ntext,\n\n\n on several lines.\n";
        final List<LayoutToken> tokens = target.tokenizeWithLayoutToken(input);

        assertThat(tokens, hasSize(22));

        // Tokens whose text alone is checked.
        expectText(tokens, 0, "This");
        expectText(tokens, 1, " ");
        expectText(tokens, 6, "normal");

        // Space before the first line break, then the line break itself.
        expectToken(tokens, 7, " ", true);
        expectToken(tokens, 8, "\n", false);

        // Comma followed by a run of three line breaks; only the last one is not flagged.
        expectToken(tokens, 10, ",", true);
        expectToken(tokens, 11, "\n", true);
        expectToken(tokens, 12, "\n", true);
        expectToken(tokens, 13, "\n", false);
    }

    @Test
    public void testTokenizeWithLayoutToken_emptyText() {
        final List<LayoutToken> tokens = target.tokenizeWithLayoutToken("");

        assertThat(tokens, hasSize(0));
    }

    /** Asserts the text of the token at the given index. */
    private static void expectText(List<LayoutToken> tokens, int index, String text) {
        assertThat(tokens.get(index).getText(), is(text));
    }

    /** Asserts the text, then the newLineAfter flag, of the token at the given index. */
    private static void expectToken(List<LayoutToken> tokens, int index, String text,
                                    boolean newLineAfter) {
        expectText(tokens, index, text);
        assertThat(tokens.get(index).isNewLineAfter(), is(newLineAfter));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.grobid.core.analyzers;

import org.grobid.core.layout.LayoutToken;
import org.junit.Before;
import org.junit.Test;

import java.util.List;

import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.*;

/**
 * Tests for the LayoutToken tokenization of {@link GrobidDefaultAnalyzer}, in particular the
 * {@code newLineAfter} flag of tokens that are followed by a line break.
 */
public class GrobidDefaultAnalyzerTest {

    GrobidDefaultAnalyzer target;

    @Before
    public void setUp() throws Exception {
        target = GrobidDefaultAnalyzer.getInstance();
    }

    /**
     * A token must be flagged with newLineAfter exactly when the next token is "\n";
     * this also holds for the "\n" tokens themselves within a run of line breaks.
     */
    @Test
    public void testTokenizeWithLayoutToken() {
        final String input = "This is a normal \ntext,\n\n\n on several lines.\n";
        final List<LayoutToken> tokens = target.tokenizeWithLayoutToken(input);

        assertThat(tokens, hasSize(22));

        // Tokens whose text alone is checked.
        expectText(tokens, 0, "This");
        expectText(tokens, 1, " ");
        expectText(tokens, 6, "normal");

        // Space before the first line break, then the line break itself.
        expectToken(tokens, 7, " ", true);
        expectToken(tokens, 8, "\n", false);

        // Comma followed by a run of three line breaks; only the last one is not flagged.
        expectToken(tokens, 10, ",", true);
        expectToken(tokens, 11, "\n", true);
        expectToken(tokens, 12, "\n", true);
        expectToken(tokens, 13, "\n", false);
    }

    @Test
    public void testTokenizeWithLayoutToken_emptyText() {
        final List<LayoutToken> tokens = target.tokenizeWithLayoutToken("");

        assertThat(tokens, hasSize(0));
    }

    /** Asserts the text of the token at the given index. */
    private static void expectText(List<LayoutToken> tokens, int index, String text) {
        assertThat(tokens.get(index).getText(), is(text));
    }

    /** Asserts the text, then the newLineAfter flag, of the token at the given index. */
    private static void expectToken(List<LayoutToken> tokens, int index, String text,
                                    boolean newLineAfter) {
        expectText(tokens, index, text);
        assertThat(tokens.get(index).isNewLineAfter(), is(newLineAfter));
    }
}

0 comments on commit 27ffd3b

Please sign in to comment.