Baleen Triage (#72)

* Baleen Triage We collect some current annotators associated with document triage into the `triage` namespace and contribute annotators to assign a document date from the title, create document summaries and compute the Shannon Entropy of the document to inform prioritisation. We also integrate the Mallet library for document classification. This supports a number of different algorithms for learning from labelled documents, learning by suggestion and a Latent Dirichlet approach for when no labelled data exists. * Change WordDistributionDocumentSummaryTest text for baleen README To remove any possible copyright issues. * Simplify topic model tests. Extract single generated dataset. Generate test models from this data. Tests verify valid models are created and can be read by the annotator. This is sufficient to show correct integration with Mallet and we do not need to use the same data as in Mallet tests.
dstl · May 10, 2018 · 8781ec9 · 8781ec9
1 parent 3a4d380
commit 8781ec9
Show file tree

Hide file tree

Showing 89 changed files with 10,346 additions and 1,067 deletions.
diff --git a/baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/helpers/MathUtils.java b/baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/helpers/MathUtils.java
@@ -0,0 +1,22 @@
+// Copyright (c) Committed Software 2018, opensource@committed.io
+package uk.gov.dstl.baleen.annotators.helpers;
+
+/** Utilities for mathematical operations */
+public class MathUtils {
+
+  /**
+   * Calculates the logarithm of {@code argument} to the given {@code base}, using the change of
+   * base formula {@code Math.log(argument) / Math.log(base)}.
+   *
+   * @param base The base of the logarithm to be calculated; must be greater than 0 and not equal
+   *     to 1
+   * @param argument The argument of the logarithm to be calculated; must be greater than 0
+   * @return The result of the calculation
+   * @throws ArithmeticException if {@code base} or {@code argument} is less than or equal to 0,
+   *     or if {@code base} is equal to 1
+   */
+  public static double logarithm(double base, double argument) {
+    if (base <= 0 || argument <= 0) {
+      throw new ArithmeticException("Base and argument of logarithms must be greater than 0");
+    }
+    // Math.log(1) is 0, so a base of 1 would make the division below a division by zero
+    if (base == 1) {
+      throw new ArithmeticException("Base of logarithm must be positive and not equal to 1");
+    }
+
+    return Math.log(argument) / Math.log(base);
+  }
+}
diff --git a/baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/misc/CommonKeywords.java b/baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/misc/CommonKeywords.java
@@ -1,204 +1,10 @@
-// Dstl (c) Crown Copyright 2017
+// Copyright (c) Committed Software 2018, opensource@committed.io
 package uk.gov.dstl.baleen.annotators.misc;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.StringJoiner;
-import java.util.TreeSet;
-import java.util.stream.Collectors;
-
-import opennlp.tools.stemmer.Stemmer;
-import opennlp.tools.stemmer.snowball.SnowballStemmer;
-import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM;
-
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.fit.descriptor.ConfigurationParameter;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceInitializationException;
-
-import com.google.common.base.Strings;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-import uk.gov.dstl.baleen.annotators.misc.helpers.AbstractKeywordsAnnotator;
-import uk.gov.dstl.baleen.annotators.misc.helpers.NoOpStemmer;
-import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
-import uk.gov.dstl.baleen.resources.utils.StopwordUtils;
-import uk.gov.dstl.baleen.types.common.Buzzword;
-import uk.gov.dstl.baleen.types.metadata.Metadata;
-
 /**
- * This annotator attempts to identify keywords using the following process: 1) Split document by
- * stop words 2) For each remaining word and/or phrase produce n-grams up to a maximum length 3)
- * Stem each n-gram 4) Count the occurrences of each stemmed n-gram, weighting the count based on
- * n-gram length 5) Select the most commonly occurring n-grams 6) Convert back to the original words
- *
- * @baleen.javadoc
+ * @deprecated Annotator moved to triage {@link
+ *     uk.gov.dstl.baleen.annotators.triage.CommonKeywords}.
  */
-public class CommonKeywords extends AbstractKeywordsAnnotator {
-  /**
-   * The maximum n-gram length
-   *
-   * @baleen.config 3
-   */
-  public static final String PARAM_NGRAM_LENGTH = "ngram";
-
-  @ConfigurationParameter(name = PARAM_NGRAM_LENGTH, defaultValue = "3")
-  protected Integer maxLength;
-
-  /**
-   * The stemming algorithm to use, as defined in OpenNLP's SnowballStemmer.ALGORITHM enum, e.g.
-   * ENGLISH. If not set, or set to an undefined value, then no stemming will be used
-   *
-   * @baleen.config ENGLISH
-   */
-  public static final String PARAM_STEMMING = "stemming";
-
-  @ConfigurationParameter(name = PARAM_STEMMING, defaultValue = "ENGLISH")
-  protected String stemming;
-
-  private Stemmer stemmer;
-  private String stopwordPattern;
-
-  @Override
-  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
-    super.doInitialize(aContext);
-
-    if (!Strings.isNullOrEmpty(stemming)) {
-      try {
-        ALGORITHM algo = ALGORITHM.valueOf(stemming);
-        stemmer = new SnowballStemmer(algo);
-      } catch (IllegalArgumentException iae) {
-        getMonitor()
-            .warn(
-                "Value of {} does not match pre-defined list, no stemming will be used.",
-                PARAM_STEMMING,
-                iae);
-        stemmer = new NoOpStemmer();
-      }
-    } else {
-      stemmer = new NoOpStemmer();
-    }
-
-    stopwordPattern = StopwordUtils.buildStopwordPattern(stopwords, true, "[-.!?0-9]").pattern();
-  }
-
-  @Override
-  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
-    List<String> phrases =
-        Arrays.asList(getTextInTextBlocks(jCas).toLowerCase().split(stopwordPattern));
-
-    phrases = phrases.stream().filter(s -> s.length() > 0).collect(Collectors.toList());
-
-    Map<String, Double> stemCount = new HashMap<>();
-    Map<String, Integer> wordCount = new HashMap<>();
-    Multimap<String, String> stemToWord = HashMultimap.create();
-
-    for (String phrase : phrases) {
-      String[] terms = phrase.split("\\s+");
-
-      for (int i = 0; i < terms.length; i++) {
-        StringJoiner sjStem = new StringJoiner(" ");
-        StringJoiner sjOrig = new StringJoiner(" ");
-        for (int j = 0; j < maxLength && i + j < terms.length; j++) {
-          String origTerm =
-              terms[i + j].replaceAll("^[-,\"\\(\\)':;]+", "").replaceAll("[-,\"\\(\\)':;]+$", "");
-          String term = stemmer.stem(origTerm.trim().replaceAll("[^a-z]", "")).toString();
-
-          if (term.length() == 0) break;
-
-          sjStem.add(term);
-          sjOrig.add(origTerm);
-
-          Double weight =
-              1.0 + j / Math.max(1.0, maxLength - 1.0); // Boost the score of longer words
-
-          String key = sjStem.toString();
-          Double dVal = stemCount.getOrDefault(key, 0.0);
-          stemCount.put(key, dVal + weight);
-
-          String origKey = sjOrig.toString();
-          Integer iVal = wordCount.getOrDefault(origKey, 0);
-          wordCount.put(origKey, iVal + 1);
-
-          stemToWord.put(key, origKey);
-        }
-      }
-    }
-
-    stemCount.remove("");
-
-    Multimap<Double, String> countToStem = HashMultimap.create();
-    Set<Double> countValues = new TreeSet<>(Collections.reverseOrder());
-
-    for (Entry<String, Double> e : stemCount.entrySet()) {
-      countToStem.put(e.getValue(), e.getKey()); // (Count, Key)
-      countValues.add(e.getValue());
-    }
-
-    List<String> stemmedKeywords = new ArrayList<>();
-    for (Double d : countValues) {
-      stemmedKeywords.addAll(countToStem.get(d));
-
-      if (stemmedKeywords.size() >= maxKeywords) break;
-    }
-
-    unstemAndAddKeywords(jCas, stemmedKeywords, stemToWord, wordCount);
-  }
-
-  @Override
-  public AnalysisEngineAction getAction() {
-    Set<Class<? extends Annotation>> outputs = new HashSet<>();
-    outputs.add(Metadata.class);
-    if (addBuzzwords) outputs.add(Buzzword.class);
-
-    return new AnalysisEngineAction(Collections.emptySet(), outputs);
-  }
-
-  private void unstemAndAddKeywords(
-      JCas jCas,
-      List<String> stemmedKeywords,
-      Multimap<String, String> stemToWord,
-      Map<String, Integer> wordCount) {
-    List<String> selectedKeywords = new ArrayList<>();
-    List<String> additionalKeywords = new ArrayList<>();
-
-    for (String stemmed : stemmedKeywords) {
-      Collection<String> keywords = stemToWord.get(stemmed);
-      String bestKeyword = selectBestUnstemmedWord(keywords, wordCount);
-
-      additionalKeywords.addAll(keywords);
-
-      selectedKeywords.add(bestKeyword);
-      additionalKeywords.remove(bestKeyword);
-    }
-
-    addKeywordsToJCas(jCas, selectedKeywords, additionalKeywords);
-  }
-
-  private String selectBestUnstemmedWord(
-      Collection<String> keywords, Map<String, Integer> wordCount) {
-    String bestKeyword = "";
-    Integer bestCount = 0;
-
-    for (String keyword : keywords) {
-      Integer count = wordCount.get(keyword);
-      if (count > bestCount) {
-        bestCount = count;
-        bestKeyword = keyword;
-      }
-    }
-
-    return bestKeyword;
-  }
-}
+@Deprecated
+@SuppressWarnings({"squid:S2176", "squid:S1133"}) // Shim reuses its superclass name; deprecated
+public class CommonKeywords extends uk.gov.dstl.baleen.annotators.triage.CommonKeywords {}