Skip to content

Commit

Permalink
#504 - Refactor Ngram Features for quicker processing
Browse files Browse the repository at this point in the history
  • Loading branch information
Horsmann committed Sep 23, 2018
1 parent 8fc10ae commit 1ba97ee
Show file tree
Hide file tree
Showing 21 changed files with 141 additions and 173 deletions.
Expand Up @@ -31,7 +31,7 @@
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.maxnormalization.MaxNrOfSentencesOverAllDocumentsMC;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
Expand Down
Expand Up @@ -30,7 +30,7 @@
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.maxnormalization.MaxTokenLenMC;

/**
Expand Down
Expand Up @@ -32,7 +32,7 @@
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.base.MaximumNormalizationExtractorBase;
import org.dkpro.tc.features.ngram.meta.maxnormalization.MaxNrOfTokensOverAllDocumentsMC;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
Expand Down
@@ -0,0 +1,74 @@
/*******************************************************************************
* Copyright 2018
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.features.ngram;

import java.util.HashSet;
import java.util.Set;

import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.features.ngram.meta.base.LuceneFeatureExtractorBase;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;

/**
 * Shared base for Lucene-backed ngram feature extractors. Pre-builds the full
 * set of default-valued (absent) features once, so that per-document extraction
 * only has to flip the entries whose ngram actually occurs in the document.
 */
public abstract class AbstractNgram extends LuceneFeatureExtractorBase
    implements FeatureExtractor
{
    // Template set: one zero-valued (default) feature per top-k ngram.
    // Built lazily by prepare(); subclasses copy it via getFeatureSet().
    protected Set<Feature> prepFeatSet;

    /**
     * Produces the feature set for one document/target from its ngram
     * frequency distribution.
     *
     * @param fd the ngrams observed in the current document
     * @return a copy of the prepared default features, with every ngram that
     *         occurs in both {@code fd} and the top-k set switched to value 1
     * @throws TextClassificationException on feature-construction failure
     */
    protected Set<Feature> getFeatureSet(FrequencyDistribution<String> fd)
        throws TextClassificationException
    {
        /*
         * Instead of comparing every top-k ngram against every document ngram
         * on each call (expensive for large top-k), copy the pre-built feature
         * set and update only the entries for ngrams that were actually found.
         * (TH 2018-09-23)
         */
        Set<Feature> result = new HashSet<>(prepFeatSet);

        for (String ngram : fd.getKeys()) {
            if (!topKSet.contains(ngram)) {
                continue;
            }
            String featureName = getFeaturePrefix() + "_" + ngram;
            /*
             * A feature's name AND value are part of its identity, so the
             * zero-valued default entry has to be removed before the
             * one-valued, non-default replacement is added; merely adding a
             * feature with the same name but a new value would NOT override
             * the existing entry.
             */
            result.remove(new Feature(featureName, 0, true, FeatureType.BOOLEAN));
            result.add(new Feature(featureName, 1, false, FeatureType.BOOLEAN));
        }
        return result;
    }

    /**
     * Initializes {@link #prepFeatSet}: one pass over the top-k ngrams,
     * creating each feature once with its default value (0 = not present).
     *
     * @throws TextClassificationException on feature-construction failure
     */
    protected void prepare() throws TextClassificationException
    {
        prepFeatSet = new HashSet<>(1024);
        for (String ngram : topKSet.getKeys()) {
            prepFeatSet.add(
                    new Feature(getFeaturePrefix() + "_" + ngram, 0, true, FeatureType.BOOLEAN));
        }
    }
}
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.tc.features.ngram;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -29,11 +28,8 @@
import org.apache.uima.util.Level;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.CharacterNGramMC;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
Expand All @@ -43,8 +39,7 @@
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class CharacterNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{

private Set<Feature> prepFeatSet;
Expand All @@ -65,43 +60,8 @@ public Set<Feature> extract(JCas aJCas, TextClassificationTarget aTarget)
CharacterNGramMC.CHAR_WORD_BEGIN,
CharacterNGramMC.CHAR_WORD_END);

/*
* Instead of iterating all top-k ngrams comparing them to all document ngrams for each
* iteration (expensive for large top-Ks),we build all features that might be created only once.
* We copy this feature map then for each call, which is cheaper and update only the values of those ngrams that are found.
* (TH 2018-09-23)
*/
Set<Feature> features = new HashSet<>(prepFeatSet);

for (String docNgram : documentCharNgrams.getKeys()) {
if (topKSet.contains(docNgram)) {
// remove default value from set, i.e. feature name and value are part of the
// features identity. Thus, remove feature with value 0 and add new one with value
// 1. Just adding the same feature with a new value will NOT override the existing
// entry.
Feature feature = new Feature(getFeaturePrefix() + "_" + docNgram, 0, true, FeatureType.BOOLEAN);
features.remove(feature);

//Set value to 1, i.e. feature found and mark the feature value as non-default value
feature.setValue(1);
feature.setDefault(false);

//add to set
features.add(feature);
}
}

return features;
}

private void prepare() throws TextClassificationException
{
prepFeatSet = new HashSet<>(1024);
//Iterate once all topK and init features
for(String topNgram : topKSet.getKeys()) {
Feature feature = new Feature(getFeaturePrefix() + "_" + topNgram, 0, true, FeatureType.BOOLEAN);
prepFeatSet.add(feature);
}
return getFeatureSet(documentCharNgrams);
}

@Override
Expand Down
Expand Up @@ -19,7 +19,6 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -31,12 +30,9 @@
import org.apache.uima.resource.ResourceSpecifier;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.features.util.FeatureUtil;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.KeywordNGramMC;
import org.dkpro.tc.features.ngram.util.KeywordNGramUtils;

Expand All @@ -45,8 +41,7 @@
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class KeywordNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{
public static final String PARAM_NGRAM_KEYWORDS_FILE = "keywordsFile";
@ConfigurationParameter(name = PARAM_NGRAM_KEYWORDS_FILE, mandatory = true)
Expand All @@ -70,23 +65,17 @@ public class KeywordNGram
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget)
throws TextClassificationException
{
Set<Feature> features = new HashSet<Feature>();


if (prepFeatSet == null) {
prepare();
}

FrequencyDistribution<String> documentNgrams = KeywordNGramUtils.getDocumentKeywordNgrams(
jcas, aTarget, ngramMaxN, ngramMaxN, markSentenceBoundary, markSentenceLocation,
includeCommas, keywords);

for (String topNgram : topKSet.getKeys()) {
if (documentNgrams.getKeys().contains(topNgram)) {
features.add(
new Feature(getFeaturePrefix() + "_" + topNgram, 1, FeatureType.BOOLEAN));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true,
FeatureType.BOOLEAN));
}
}
return features;

return getFeatureSet(documentNgrams);
}

@Override
Expand Down
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.tc.features.ngram;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -28,11 +27,8 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.PhoneticNGramMC;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
Expand All @@ -44,30 +40,22 @@
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class PhoneticNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{

@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget)
throws TextClassificationException
{

if (prepFeatSet == null) {
prepare();
}

Set<Feature> features = new HashSet<Feature>();
FrequencyDistribution<String> documentNgrams = PhoneticNGramMC
.getDocumentPhoneticNgrams(jcas, aTarget, ngramMinN, ngramMaxN);

for (String topNgram : topKSet.getKeys()) {
if (documentNgrams.getKeys().contains(topNgram)) {
features.add(
new Feature(getFeaturePrefix() + "_" + topNgram, 1, FeatureType.BOOLEAN));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true,
FeatureType.BOOLEAN));
}
}
return features;
return getFeatureSet(documentNgrams);
}

@Override
Expand Down
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.tc.features.ngram;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -29,11 +28,8 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.PosNGramMC;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
Expand All @@ -45,8 +41,7 @@
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" })
public class PosNGram
extends LuceneFeatureExtractorBase
implements FeatureExtractor
extends AbstractNgram
{

public static final String PARAM_USE_CANONICAL_POS = "useCanonicalPos";
Expand All @@ -57,23 +52,17 @@ public class PosNGram
public Set<Feature> extract(JCas view, TextClassificationTarget classificationUnit)
throws TextClassificationException
{

if (prepFeatSet == null) {
prepare();
}

Set<Feature> features = new HashSet<Feature>();
FrequencyDistribution<String> documentPOSNgrams = null;
documentPOSNgrams = PosNGramMC.getDocumentPosNgrams(view, classificationUnit, ngramMinN,
ngramMaxN, useCanonicalTags);

for (String topNgram : topKSet.getKeys()) {
if (documentPOSNgrams.getKeys().contains(topNgram)) {
features.add(
new Feature(getFeaturePrefix() + "_" + topNgram, 1, FeatureType.BOOLEAN));
}
else {
features.add(new Feature(getFeaturePrefix() + "_" + topNgram, 0, true,
FeatureType.BOOLEAN));
}
}
return features;

return getFeatureSet(documentPOSNgrams);
}

@Override
Expand Down

0 comments on commit 1ba97ee

Please sign in to comment.