Skip to content

Commit

Permalink
New annotator to extract paragraphs using a naive regex approach
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaker-dstl committed Apr 25, 2019
1 parent 7fd1f4a commit 68932b0
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package uk.gov.dstl.baleen.annotators.regex;

import com.google.common.collect.ImmutableSet;
import org.apache.uima.jcas.JCas;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.Paragraph;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Annotate paragraphs in a document by looking for multiple new lines
*
* @baleen.javadoc
*/
public class NaiveParagraph extends BaleenAnnotator {
private static final Pattern PARAGRAPH_REGEX = Pattern.compile("[^\\r\\n]+((\\r|\\n|\\r\\n)[^\\r\\n]+)*");

protected void doProcess(JCas jCas) {
Matcher m = PARAGRAPH_REGEX.matcher(jCas.getDocumentText());

while(m.find()) {
Paragraph p = new Paragraph(jCas);
p.setBegin(m.start());
p.setEnd(m.end());
addToJCasIndex(p);
}

}

public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Paragraph.class));
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package uk.gov.dstl.baleen.annotators.regex;

import org.apache.uima.fit.util.JCasUtil;
import org.junit.Test;
import uk.gov.dstl.baleen.annotators.testing.AbstractAnnotatorTest;
import uk.gov.dstl.baleen.types.language.Paragraph;

import static org.junit.Assert.assertEquals;

/**
 * Checks that {@link NaiveParagraph} splits document text into paragraphs on blank lines while
 * keeping single line breaks inside a paragraph.
 */
public class NaiveParagraphTest extends AbstractAnnotatorTest {

  public NaiveParagraphTest() {
    super(NaiveParagraph.class);
  }

  @Test
  public void test() throws Exception {
    // Two paragraphs separated by a blank line; the first one spans two lines
    jCas.setDocumentText("Hello\nWorld\n\nThis is the second paragraph!");
    processJCas();

    int paragraphCount = JCasUtil.select(jCas, Paragraph.class).size();
    assertEquals(2, paragraphCount);

    Paragraph first = JCasUtil.selectByIndex(jCas, Paragraph.class, 0);
    assertEquals("Hello\nWorld", first.getCoveredText());

    Paragraph second = JCasUtil.selectByIndex(jCas, Paragraph.class, 1);
    assertEquals("This is the second paragraph!", second.getCoveredText());
  }
}

0 comments on commit 68932b0

Please sign in to comment.