Skip to content

Commit

Permalink
#504 - Refactor Ngram Features for quicker processing
Browse files Browse the repository at this point in the history
  • Loading branch information
Horsmann committed Sep 23, 2018
1 parent 8fc10ae commit 1ba97ee
Show file tree
Hide file tree
Showing 21 changed files with 141 additions and 173 deletions.
Expand Up @@ -31,7 +31,7 @@
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.maxnormalization.MaxNrOfSentencesOverAllDocumentsMC;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
Expand Down
Expand Up @@ -30,7 +30,7 @@
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.maxnormalization.MaxTokenLenMC;

/**
Expand Down
Expand Up @@ -32,7 +32,7 @@
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.maxnormalization.MaxNrOfTokensOverAllDocumentsMC;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
Expand Down
@@ -0,0 +1,74 @@
/*******************************************************************************
* Copyright 2018
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.features.ngram;

import java.util.HashSet;
import java.util.Set;

import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.features.ngram.meta.base.LuceneFeatureExtractorBase;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;

/**
 * Shared base for Lucene-backed ngram feature extractors. Pre-builds the full
 * set of default-valued (absent) features once, so that per-document extraction
 * only has to flip the entries whose ngram actually occurs in the document.
 */
public abstract class AbstractNgram extends LuceneFeatureExtractorBase
    implements FeatureExtractor
{
    // Template set: one zero-valued (default) feature per top-k ngram.
    // Built lazily by prepare(); subclasses copy it via getFeatureSet().
    protected Set<Feature> prepFeatSet;

    /**
     * Produces the feature set for one document/target from its ngram
     * frequency distribution.
     *
     * @param fd the ngrams observed in the current document
     * @return a copy of the prepared default features, with every ngram that
     *         occurs in both {@code fd} and the top-k set switched to value 1
     * @throws TextClassificationException on feature-construction failure
     */
    protected Set<Feature> getFeatureSet(FrequencyDistribution<String> fd)
        throws TextClassificationException
    {
        /*
         * Instead of comparing every top-k ngram against every document ngram
         * on each call (expensive for large top-k), copy the pre-built feature
         * set and update only the entries for ngrams that were actually found.
         * (TH 2018-09-23)
         */
        Set<Feature> result = new HashSet<>(prepFeatSet);

        for (String ngram : fd.getKeys()) {
            if (!topKSet.contains(ngram)) {
                continue;
            }
            String featureName = getFeaturePrefix() + "_" + ngram;
            /*
             * A feature's name AND value are part of its identity, so the
             * zero-valued default entry has to be removed before the
             * one-valued, non-default replacement is added; merely adding a
             * feature with the same name but a new value would NOT override
             * the existing entry.
             */
            result.remove(new Feature(featureName, 0, true, FeatureType.BOOLEAN));
            result.add(new Feature(featureName, 1, false, FeatureType.BOOLEAN));
        }
        return result;
    }

    /**
     * Initializes {@link #prepFeatSet}: one pass over the top-k ngrams,
     * creating each feature once with its default value (0 = not present).
     *
     * @throws TextClassificationException on feature-construction failure
     */
    protected void prepare() throws TextClassificationException
    {
        prepFeatSet = new HashSet<>(1024);
        for (String ngram : topKSet.getKeys()) {
            prepFeatSet.add(
                    new Feature(getFeaturePrefix() + "_" + ngram, 0, true, FeatureType.BOOLEAN));
        }
    }
}
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.tc.features.ngram;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -29,11 +28,8 @@
import org.apache.uima.util.Level;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.CharacterNGramMC;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
Expand All @@ -43,8 +39,7 @@
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class CharacterNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{

private Set<Feature> prepFeatSet;
Expand All @@ -65,43 +60,8 @@ public Set<Feature> extract(JCas aJCas, TextClassificationTarget aTarget)
CharacterNGramMC.CHAR_WORD_BEGIN,
CharacterNGramMC.CHAR_WORD_END);

/*
* Instead of iterating all top-k ngrams comparing them to all document ngrams for each
* iteration (expensive for large top-Ks),we build all features that might be created only once.
* We copy this feature map then for each call, which is cheaper and update only the values of those ngrams that are found.
* (TH 2018-09-23)
*/
Set<Feature> features = new HashSet<>(prepFeatSet);

for (String docNgram : documentCharNgrams.getKeys()) {
if (topKSet.contains(docNgram)) {
// remove default value from set, i.e. feature name and value are part of the
// features identity. Thus, remove feature with value 0 and add new one with value
// 1. Just adding the same feature with a new value will NOT override the existing
// entry.
Feature feature = new Feature(getFeaturePrefix() + "_" + docNgram, 0, true, FeatureType.BOOLEAN);
features.remove(feature);

//Set value to 1, i.e. feature found and mark the feature value as non-default value
feature.setValue(1);
feature.setDefault(false);

//add to set
features.add(feature);
}
}

return features;
}

private void prepare() throws TextClassificationException
{
prepFeatSet = new HashSet<>(1024);
//Iterate once all topK and init features
for(String topNgram : topKSet.getKeys()) {
Feature feature = new Feature(getFeaturePrefix() + "_" + topNgram, 0, true, FeatureType.BOOLEAN);
prepFeatSet.add(feature);
}
return getFeatureSet(documentCharNgrams);
}

@Override
Expand Down
Expand Up @@ -19,7 +19,6 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -31,12 +30,9 @@
import org.apache.uima.resource.ResourceSpecifier;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.features.util.FeatureUtil;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.KeywordNGramMC;
import org.dkpro.tc.features.ngram.util.KeywordNGramUtils;

Expand All @@ -45,8 +41,7 @@
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class KeywordNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{
public static final String PARAM_NGRAM_KEYWORDS_FILE = "keywordsFile";
@ConfigurationParameter(name = PARAM_NGRAM_KEYWORDS_FILE, mandatory = true)
Expand All @@ -70,23 +65,17 @@ public class KeywordNGram
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget)
throws TextClassificationException
{
Set<Feature> features = new HashSet<Feature>();


if (prepFeatSet == null) {
prepare();
}

FrequencyDistribution<String> documentNgrams = KeywordNGramUtils.getDocumentKeywordNgrams(
jcas, aTarget, ngramMaxN, ngramMaxN, markSentenceBoundary, markSentenceLocation,
includeCommas, keywords);

for (String topNgram : topKSet.getKeys()) {
if (documentNgrams.getKeys().contains(topNgram)) {
features.add(
new Feature(getFeaturePrefix() + "_" + topNgram, 1, FeatureType.BOOLEAN));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true,
FeatureType.BOOLEAN));
}
}
return features;

return getFeatureSet(documentNgrams);
}

@Override
Expand Down
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.tc.features.ngram;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -28,11 +27,8 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.PhoneticNGramMC;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
Expand All @@ -44,30 +40,22 @@
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class PhoneticNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{

@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget)
throws TextClassificationException
{

if (prepFeatSet == null) {
prepare();
}

Set<Feature> features = new HashSet<Feature>();
FrequencyDistribution<String> documentNgrams = PhoneticNGramMC
.getDocumentPhoneticNgrams(jcas, aTarget, ngramMinN, ngramMaxN);

for (String topNgram : topKSet.getKeys()) {
if (documentNgrams.getKeys().contains(topNgram)) {
features.add(
new Feature(getFeaturePrefix() + "_" + topNgram, 1, FeatureType.BOOLEAN));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true,
FeatureType.BOOLEAN));
}
}
return features;
return getFeatureSet(documentNgrams);
}

@Override
Expand Down
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.tc.features.ngram;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -29,11 +28,8 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.PosNGramMC;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
Expand All @@ -45,8 +41,7 @@
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" })
public class PosNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{

public static final String PARAM_USE_CANONICAL_POS = "useCanonicalPos";
Expand All @@ -57,23 +52,17 @@ public class PosNGram
public Set<Feature> extract(JCas view, TextClassificationTarget classificationUnit)
throws TextClassificationException
{

if (prepFeatSet == null) {
prepare();
}

Set<Feature> features = new HashSet<Feature>();
FrequencyDistribution<String> documentPOSNgrams = null;
documentPOSNgrams = PosNGramMC.getDocumentPosNgrams(view, classificationUnit, ngramMinN,
ngramMaxN, useCanonicalTags);

for (String topNgram : topKSet.getKeys()) {
if (documentPOSNgrams.getKeys().contains(topNgram)) {
features.add(
new Feature(getFeaturePrefix() + "_" + topNgram, 1, FeatureType.BOOLEAN));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true,
FeatureType.BOOLEAN));
}
}
return features;

return getFeatureSet(documentPOSNgrams);
}

@Override
Expand Down

0 comments on commit 1ba97ee

Please sign in to comment.