Skip to content

Commit

Permalink
Baleen Triage (#72)
Browse files Browse the repository at this point in the history
* Baleen Triage

We collect some current annotators associated with document triage
into the `triage` namespace and contribute annotators to assign a
document date from the title, create document summaries and compute the
Shannon Entropy of the document to inform prioritisation.

We also integrate the Mallet library for document classification.
This supports a number of different algorithms for learning from labelled
documents, learning by suggestion and a Latent Dirichlet approach for when
no labelled data exists.

* Change WordDistributionDocumentSummaryTest text for baleen README

To remove any possible copyright issues.

* Simplify topic model tests.

Extract single generated dataset.
Generate test models from this data.

Tests verify that valid models are created and can be read by the annotator.
This is sufficient to show correct integration with Mallet
and we do not need to use the same data as in the Mallet tests.
  • Loading branch information
stuarthendren authored and JohnDaws committed May 10, 2018
1 parent 3a4d380 commit 8781ec9
Show file tree
Hide file tree
Showing 89 changed files with 10,346 additions and 1,067 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (c) Committed Software 2018, opensource@committed.io
package uk.gov.dstl.baleen.annotators.helpers;

/** Helper functions for mathematical calculations. */
public class MathUtils {

  /**
   * Computes the logarithm of {@code argument} in the given {@code base} using the change-of-base
   * identity, i.e. the natural log of the argument divided by the natural log of the base.
   *
   * @param base The base of the logarithm to be calculated
   * @param argument The argument of the logarithm to be calculated
   * @return The result of the calculation
   * @throws ArithmeticException if the base or the argument is less than or equal to zero, or if
   *     the base is equal to 1
   */
  public static double logarithm(double base, double argument) {
    // Both values have to be strictly positive for the logarithm to be defined.
    boolean nonPositiveInput = base <= 0 || argument <= 0;
    if (nonPositiveInput) {
      throw new ArithmeticException("Base and argument of logarithms must be greater than 0");
    }

    // ln(1) is zero, so a base of 1 would mean dividing by zero below.
    boolean degenerateBase = base == 1;
    if (degenerateBase) {
      throw new ArithmeticException("Base of logarithm must be positive and not equal to 1");
    }

    double numerator = Math.log(argument);
    double denominator = Math.log(base);
    return numerator / denominator;
  }
}
Original file line number Diff line number Diff line change
@@ -1,204 +1,10 @@
// Dstl (c) Crown Copyright 2017
// Copyright (c) Committed Software 2018, opensource@committed.io
package uk.gov.dstl.baleen.annotators.misc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringJoiner;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.base.Strings;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import uk.gov.dstl.baleen.annotators.misc.helpers.AbstractKeywordsAnnotator;
import uk.gov.dstl.baleen.annotators.misc.helpers.NoOpStemmer;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.metadata.Metadata;

/**
 * This annotator attempts to identify keywords using the following process:
 *
 * <ol>
 *   <li>Split document by stop words
 *   <li>For each remaining word and/or phrase produce n-grams up to a maximum length
 *   <li>Stem each n-gram
 *   <li>Count the occurrences of each stemmed n-gram, weighting the count based on n-gram length
 *   <li>Select the most commonly occurring n-grams
 *   <li>Convert back to the original words
 * </ol>
 *
 * @baleen.javadoc
 * @deprecated Annotator moved to triage {@link
 *     uk.gov.dstl.baleen.annotators.triage.CommonKeywords}.
 */
@Deprecated
public class CommonKeywords extends AbstractKeywordsAnnotator {
  /**
   * The maximum n-gram length
   *
   * @baleen.config 3
   */
  public static final String PARAM_NGRAM_LENGTH = "ngram";

  @ConfigurationParameter(name = PARAM_NGRAM_LENGTH, defaultValue = "3")
  protected Integer maxLength;

  /**
   * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum, e.g.
   * ENGLISH. If not set, or set to an undefined value, then no stemming will be used
   *
   * @baleen.config ENGLISH
   */
  public static final String PARAM_STEMMING = "stemming";

  @ConfigurationParameter(name = PARAM_STEMMING, defaultValue = "ENGLISH")
  protected String stemming;

  // The regular expressions below are applied to every term of every phrase, so they are compiled
  // once here rather than on each String.split/replaceAll call.

  /** Separator between the terms of a phrase. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  /** Punctuation removed from the start of a term. */
  private static final Pattern LEADING_PUNCTUATION = Pattern.compile("^[-,\"\\(\\)':;]+");

  /** Punctuation removed from the end of a term. */
  private static final Pattern TRAILING_PUNCTUATION = Pattern.compile("[-,\"\\(\\)':;]+$");

  /** Everything except the lower case letters a-z; removed before a term is stemmed. */
  private static final Pattern NON_LOWERCASE_LETTERS = Pattern.compile("[^a-z]");

  /** Stemmer selected in {@link #doInitialize(UimaContext)}. */
  private Stemmer stemmer;

  /** Regular expression (as a string) on which the document text is split into phrases. */
  private String stopwordPattern;

  /**
   * Selects the stemmer and builds the stop word pattern.
   *
   * <p>A {@link SnowballStemmer} is used when {@code stemming} names a constant of {@link
   * ALGORITHM}; a {@link NoOpStemmer} is used when it is null, empty or not a known constant (the
   * last case is also logged as a warning).
   *
   * @param aContext the UIMA context, passed on to the superclass
   * @throws ResourceInitializationException declared by the overridden method
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);

    if (!Strings.isNullOrEmpty(stemming)) {
      try {
        ALGORITHM algo = ALGORITHM.valueOf(stemming);
        stemmer = new SnowballStemmer(algo);
      } catch (IllegalArgumentException iae) {
        getMonitor()
            .warn(
                "Value of {} does not match pre-defined list, no stemming will be used.",
                PARAM_STEMMING,
                iae);
        stemmer = new NoOpStemmer();
      }
    } else {
      stemmer = new NoOpStemmer();
    }

    stopwordPattern = StopwordUtils.buildStopwordPattern(stopwords, true, "[-.!?0-9]").pattern();
  }

  /**
   * Extracts keywords from the document text.
   *
   * <p>The lower-cased text is split into phrases on the stop word pattern. For every start
   * position in a phrase, the n-grams of length 1 up to {@code maxLength} beginning there are
   * stemmed and counted; the counts are then ranked and the best stems are mapped back to the
   * words that produced them.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    // Locale.ROOT keeps lower-casing independent of the JVM default locale. With e.g. a Turkish
    // default, "I" would become a dotless i, which the a-z filter below would then discard.
    List<String> phrases =
        Arrays.stream(getTextInTextBlocks(jCas).toLowerCase(Locale.ROOT).split(stopwordPattern))
            .filter(s -> !s.isEmpty())
            .collect(Collectors.toList());

    Map<String, Double> stemCount = new HashMap<>();
    Map<String, Integer> wordCount = new HashMap<>();
    Multimap<String, String> stemToWord = HashMultimap.create();

    for (String phrase : phrases) {
      String[] terms = WHITESPACE.split(phrase);

      for (int i = 0; i < terms.length; i++) {
        // The joiners grow by one term per iteration of the inner loop, so after step j they hold
        // the (j + 1)-gram that starts at term i.
        StringJoiner sjStem = new StringJoiner(" ");
        StringJoiner sjOrig = new StringJoiner(" ");
        for (int j = 0; j < maxLength && i + j < terms.length; j++) {
          String origTerm = stripSurroundingPunctuation(terms[i + j]);
          String term =
              stemmer.stem(NON_LOWERCASE_LETTERS.matcher(origTerm.trim()).replaceAll("")).toString();

          // A term with nothing left after cleaning and stemming ends the n-grams from this start
          if (term.isEmpty()) {
            break;
          }

          sjStem.add(term);
          sjOrig.add(origTerm);

          // Boost the score of longer n-grams: 1.0 for a single term, rising linearly to 2.0 for
          // an n-gram of maxLength terms
          double weight = 1.0 + j / Math.max(1.0, maxLength - 1.0);

          String key = sjStem.toString();
          stemCount.merge(key, weight, Double::sum);

          String origKey = sjOrig.toString();
          wordCount.merge(origKey, 1, Integer::sum);

          stemToWord.put(key, origKey);
        }
      }
    }

    stemCount.remove("");

    unstemAndAddKeywords(jCas, rankStemsByScore(stemCount), stemToWord, wordCount);
  }

  /**
   * Reports the annotation types this annotator produces: always {@link Metadata}, plus {@link
   * Buzzword} when {@code addBuzzwords} is set. No input types are declared.
   *
   * @return the action describing the inputs and outputs
   */
  @Override
  public AnalysisEngineAction getAction() {
    Set<Class<? extends Annotation>> outputs = new HashSet<>();
    outputs.add(Metadata.class);
    if (addBuzzwords) {
      outputs.add(Buzzword.class);
    }

    return new AnalysisEngineAction(Collections.emptySet(), outputs);
  }

  /** Removes the leading and then the trailing run of punctuation from a term. */
  private static String stripSurroundingPunctuation(String term) {
    String withoutLeading = LEADING_PUNCTUATION.matcher(term).replaceAll("");
    return TRAILING_PUNCTUATION.matcher(withoutLeading).replaceAll("");
  }

  /**
   * Orders the stems by descending score and returns the highest scoring ones.
   *
   * <p>Stems sharing a score are added together, so the result can hold more than {@code
   * maxKeywords} entries when there is a tie at the cut-off.
   *
   * @param stemCount weighted count for each stemmed n-gram
   * @return the selected stems, best scores first
   */
  private List<String> rankStemsByScore(Map<String, Double> stemCount) {
    Multimap<Double, String> countToStem = HashMultimap.create();
    Set<Double> countValues = new TreeSet<>(Collections.reverseOrder());

    for (Entry<String, Double> e : stemCount.entrySet()) {
      countToStem.put(e.getValue(), e.getKey()); // (Count, Key)
      countValues.add(e.getValue());
    }

    List<String> stemmedKeywords = new ArrayList<>();
    for (Double d : countValues) {
      stemmedKeywords.addAll(countToStem.get(d));

      if (stemmedKeywords.size() >= maxKeywords) {
        break;
      }
    }

    return stemmedKeywords;
  }

  /**
   * Maps each selected stem back to original text and passes the result to {@code
   * addKeywordsToJCas}.
   *
   * <p>For every stem the most frequent original form becomes the selected keyword; the other
   * original forms of that stem are passed on as additional keywords.
   *
   * @param jCas the document being processed
   * @param stemmedKeywords the stems chosen as keywords
   * @param stemToWord the original n-grams seen for each stem
   * @param wordCount the number of occurrences of each original n-gram
   */
  private void unstemAndAddKeywords(
      JCas jCas,
      List<String> stemmedKeywords,
      Multimap<String, String> stemToWord,
      Map<String, Integer> wordCount) {
    List<String> selectedKeywords = new ArrayList<>();
    List<String> additionalKeywords = new ArrayList<>();

    for (String stemmed : stemmedKeywords) {
      Collection<String> keywords = stemToWord.get(stemmed);
      String bestKeyword = selectBestUnstemmedWord(keywords, wordCount);

      additionalKeywords.addAll(keywords);

      selectedKeywords.add(bestKeyword);
      additionalKeywords.remove(bestKeyword);
    }

    addKeywordsToJCas(jCas, selectedKeywords, additionalKeywords);
  }

  /**
   * Picks the candidate with the highest occurrence count.
   *
   * @param keywords the candidate original n-grams
   * @param wordCount the number of occurrences of each original n-gram
   * @return the most frequent candidate, or an empty string if no candidate has a positive count
   */
  private String selectBestUnstemmedWord(
      Collection<String> keywords, Map<String, Integer> wordCount) {
    String bestKeyword = "";
    int bestCount = 0;

    for (String keyword : keywords) {
      // A candidate without a recorded count is treated as never seen instead of failing on
      // unboxing a null
      int count = wordCount.getOrDefault(keyword, 0);
      if (count > bestCount) {
        bestCount = count;
        bestKeyword = keyword;
      }
    }

    return bestKeyword;
  }
}
/**
 * Empty subclass that keeps the {@code CommonKeywords} name available in the {@code misc} package.
 * It declares no members of its own, so all behaviour is that of the superclass.
 *
 * @deprecated Use {@link uk.gov.dstl.baleen.annotators.triage.CommonKeywords} instead.
 */
@SuppressWarnings({"squid:S2176", "squid:S1133"}) // Suppress duplicate class name warnings
@Deprecated
public class CommonKeywords extends uk.gov.dstl.baleen.annotators.triage.CommonKeywords {
  // Intentionally empty: everything is inherited from the triage implementation.
}
Loading

0 comments on commit 8781ec9

Please sign in to comment.