From b0de4042159883f6a6fc46ad2403dd4cece48f62 Mon Sep 17 00:00:00 2001 From: Carsten Schnober Date: Tue, 17 May 2016 12:56:10 +0200 Subject: [PATCH 1/6] Closes #848 Remove module ditop-asl from deps-not-on-maven-central profile - dkpro-core-asl/pom.xml: move module and managed dependency ditop-asl to default profile - dkpro-core-io-ditop-asl/pom.xml: upgrade Mallet dependency to version 2.0.8 --- dkpro-core-asl/pom.xml | 13 +++++++------ dkpro-core-io-ditop-asl/pom.xml | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dkpro-core-asl/pom.xml b/dkpro-core-asl/pom.xml index 6f085820e5..cf131273a2 100644 --- a/dkpro-core-asl/pom.xml +++ b/dkpro-core-asl/pom.xml @@ -210,6 +210,11 @@ de.tudarmstadt.ukp.dkpro.core.io.conll-asl 1.9.0-SNAPSHOT + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.io.ditop-asl + 1.9.0-SNAPSHOT + de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.io.imscwb-asl @@ -458,6 +463,7 @@ ../dkpro-core-io-brat-asl ../dkpro-core-io-combination-asl ../dkpro-core-io-conll-asl + ../dkpro-core-io-ditop-asl ../dkpro-core-io-imscwb-asl ../dkpro-core-io-html-asl ../dkpro-core-io-json-asl @@ -566,7 +572,6 @@ - ../dkpro-core-io-ditop-asl ../dkpro-core-io-fangorn-asl ../dkpro-core-io-graf-asl @@ -575,11 +580,7 @@ - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.io.ditop-asl - 1.9.0-SNAPSHOT - + de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.flextag-asl diff --git a/dkpro-core-io-ditop-asl/pom.xml b/dkpro-core-io-ditop-asl/pom.xml index a044cbfb97..7115c15048 100644 --- a/dkpro-core-io-ditop-asl/pom.xml +++ b/dkpro-core-io-ditop-asl/pom.xml @@ -31,7 +31,7 @@ cc.mallet mallet - 2.0.7 + 2.0.8 de.tudarmstadt.ukp.dkpro.core From 50bb4980e2f1a8567ef1d6a9935741ccc7a068a0 Mon Sep 17 00:00:00 2001 From: Carsten Schnober Date: Tue, 17 May 2016 12:58:18 +0200 Subject: [PATCH 2/6] Downgrade Mallet to v2.0.8 release #839 - dkpro-core-asl/pom.xml move managed dependency mallet-asl to default profile ---
dkpro-core-asl/pom.xml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dkpro-core-asl/pom.xml b/dkpro-core-asl/pom.xml index cf131273a2..06e985f27c 100644 --- a/dkpro-core-asl/pom.xml +++ b/dkpro-core-asl/pom.xml @@ -335,7 +335,12 @@ de.tudarmstadt.ukp.dkpro.core.ldweb1t-asl 1.9.0-SNAPSHOT - + + de.tudarmstadt.ukp.dkpro.core + de.tudarmstadt.ukp.dkpro.core.mallet-asl + 1.9.0-SNAPSHOT + + de.tudarmstadt.ukp.dkpro.core de.tudarmstadt.ukp.dkpro.core.maltparser-asl 1.9.0-SNAPSHOT @@ -601,11 +606,7 @@ de.tudarmstadt.ukp.dkpro.core.lbj-asl 1.9.0-SNAPSHOT - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.mallet-asl - 1.9.0-SNAPSHOT - + From c02e3c60a05cda823d13741dcd5865492df87bcc Mon Sep 17 00:00:00 2001 From: maxxkia Date: Thu, 19 May 2016 16:56:35 +0200 Subject: [PATCH 3/6] No issue. Removed the redundant/unused attribute Row.id Removed the redundant/unused attribute Row.id and converted the for loop to foreach loop for efficiency. --- .../dkpro/core/io/conll/Conll2000Writer.java | 26 +++++++++---------- .../dkpro/core/io/conll/Conll2002Writer.java | 24 ++++++++--------- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java index bcbdbdfa02..2557c436bb 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2000Writer.java @@ -45,7 +45,7 @@ /** *

Writes the CoNLL 2000 chunking format. The columns are separated by spaces.

- * + * *

  * He        PRP  B-NP
  * reckons   VBZ  B-VP
@@ -64,15 +64,15 @@
  * September NNP  B-NP
  * .         .    O
  * 
- * + * *
    *
  1. FORM - token
  2. *
  3. POSTAG - part-of-speech tag
  4. *
  5. CHUNK - chunk (BIO encoded)
  6. *
- * + * *

Sentences are separated by a blank new line.

- * + * * @see CoNLL 2000 shared task */ @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", @@ -131,18 +131,17 @@ private void convert(JCas aJCas, PrintWriter aOut) // Tokens List tokens = selectCovered(Token.class, sentence); - + // Chunks IobEncoder encoder = new IobEncoder(aJCas.getCas(), chunkType, chunkValue); - - for (int i = 0; i < tokens.size(); i++) { + + for (Token token:tokens) { Row row = new Row(); - row.id = i+1; - row.token = tokens.get(i); - row.chunk = encoder.encode(tokens.get(i)); + row.token = token; + row.chunk = encoder.encode(token); ctokens.put(row.token, row); } - + // Write sentence in CONLL 2006 format for (Row row : ctokens.values()) { String pos = UNUSED; @@ -150,12 +149,12 @@ private void convert(JCas aJCas, PrintWriter aOut) POS posAnno = row.token.getPos(); pos = posAnno.getPosValue(); } - + String chunk = UNUSED; if (writeChunk && (row.chunk != null)) { chunk = encoder.encode(row.token); } - + aOut.printf("%s %s %s\n", row.token.getCoveredText(), pos, chunk); } @@ -165,7 +164,6 @@ private void convert(JCas aJCas, PrintWriter aOut) private static final class Row { - int id; Token token; String chunk; } diff --git a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java index 023cac13f0..4354913c44 100644 --- a/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java +++ b/dkpro-core-io-conll-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2002Writer.java @@ -45,7 +45,7 @@ /** *

Writes the CoNLL 2002 named entity format. The columns are separated by a single space, unlike * illustrated below.

- * + * *

  * Wolff      B-PER
  * ,          O
@@ -71,14 +71,14 @@
  * Madrid     I-ORG
  * .          O
  * 
- * + * *
    *
  1. FORM - token
  2. *
  3. NER - named entity (BIO encoded)
  4. *
- * + * *

Sentences are separated by a blank new line.

- * + * * @see CoNLL 2002 shared task */ @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", @@ -133,25 +133,24 @@ private void convert(JCas aJCas, PrintWriter aOut) // Tokens List tokens = selectCovered(Token.class, sentence); - + // Chunks IobEncoder encoder = new IobEncoder(aJCas.getCas(), neType, neValue); - - for (int i = 0; i < tokens.size(); i++) { + + for (Token token:tokens) { Row row = new Row(); - row.id = i+1; - row.token = tokens.get(i); - row.ne = encoder.encode(tokens.get(i)); + row.token = token; + row.ne = encoder.encode(token); ctokens.put(row.token, row); } - + // Write sentence in CONLL 2006 format for (Row row : ctokens.values()) { String chunk = UNUSED; if (writeNamedEntity && (row.ne != null)) { chunk = encoder.encode(row.token); } - + aOut.printf("%s %s\n", row.token.getCoveredText(), chunk); } @@ -161,7 +160,6 @@ private void convert(JCas aJCas, PrintWriter aOut) private static final class Row { - int id; Token token; String ne; } From ea76ba3adf9aea364dc5172ecf3657fa0de91b98 Mon Sep 17 00:00:00 2001 From: Carsten Schnober Date: Mon, 23 May 2016 16:17:29 +0200 Subject: [PATCH 4/6] No issue. Javadoc improved. --- .../core/mallet/MalletModelEstimator.java | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelEstimator.java b/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelEstimator.java index 4ee01a1421..ad213115d1 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelEstimator.java +++ b/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/MalletModelEstimator.java @@ -37,7 +37,14 @@ import java.util.OptionalInt; /** - * This class defines parameters and methods that are common for Mallet model estimators. 
+ * This abstract class defines parameters and methods that are common for Mallet model estimators. + *

+ * It creates a Mallet {@link InstanceList} from the input documents so that inheriting estimators + * can create a model, typically implemented by overriding the {@link JCasFileWriter_ImplBase#collectionProcessComplete()} + * method. + * + * @see de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator + * @see de.tudarmstadt.ukp.dkpro.core.mallet.lda.LdaTopicModelEstimator * @since 1.9.0 */ public abstract class MalletModelEstimator @@ -46,37 +53,40 @@ public abstract class MalletModelEstimator private static final Locale locale = Locale.US; /** - * The annotation type to use for the model. Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}. - * For lemmas, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} + * The annotation type to use as input tokens for the model estimation. + * Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}. + * For lemmas, for instance, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} */ public static final String PARAM_TOKEN_FEATURE_PATH = "tokenFeaturePath"; @ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String tokenFeaturePath; /** - * The number of threads to use during model estimation. If not set, the number of threads is determined automatically. + * The number of threads to use during model estimation. + * If not set, the number of threads is automatically set by {@link ComponentParameters#computeNumThreads(int)}. *

- * Warning: do not set this to more than 1 when using very small (test) data sets on {@link de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator} - * because the process might then run infinitely! + * Warning: do not set this to more than 1 when using very small (test) data sets on {@link de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator}! + * This might prevent the process from terminating. */ public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS; @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS) private int numThreads; /** - * Ignore tokens (or lemmas, respectively) that are shorter than the given value. Default: 3. + * Ignore tokens (or any other annotation type, as specified by {@link #PARAM_TOKEN_FEATURE_PATH}) + * that are shorter than the given value. Default: 3. */ public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength"; @ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "3") private int minTokenLength; /** - * If specific, the text contained in the given segmentation type annotations are fed as - * separate units to the topic model estimator e.g. + * If specified, the text contained in the given segmentation type annotations is fed as + * separate units ("documents") to the topic model estimator e.g. * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.sentence}. Text that is not within * such annotations is ignored. *

- * By default, the full document text is used as a document. + * By default, the full text is used as a document. */ public static final String PARAM_COVERING_ANNOTATION_TYPE = "coveringAnnotationType"; @ConfigurationParameter(name = PARAM_COVERING_ANNOTATION_TYPE, mandatory = false) From 855ecc4effe6ffd3436d38cace57a73695d7355d Mon Sep 17 00:00:00 2001 From: Carsten Schnober Date: Mon, 23 May 2016 16:21:28 +0200 Subject: [PATCH 5/6] No issue. Javadoc fixed. --- .../wordembeddings/WordEmbeddingsEstimator.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/WordEmbeddingsEstimator.java b/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/WordEmbeddingsEstimator.java index e618ba2787..e540035760 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/WordEmbeddingsEstimator.java +++ b/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/wordembeddings/WordEmbeddingsEstimator.java @@ -29,8 +29,10 @@ /** * Compute word embeddings from the given collection using skip-grams. *

- * Set {@link #PARAM_TOKEN_FEATURE_PATH} to determine what is considered as token (Tokens, Lemmas, etc.) - * and {@link #PARAM_COVERING_ANNOTATION_TYPE} to determine what is considered a document (sentences, paragraphs, etc.). + * Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.). + *

+ * Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.). + * * @since 1.9.0 */ public class WordEmbeddingsEstimator @@ -58,15 +60,14 @@ public class WordEmbeddingsEstimator private int windowSize; /** - * An example word that is output with its nearest neighbours once in a while (FIXME: currently - * not working, see {@link #collectionProcessComplete()}). (default: null, i.e. none). + * An example word that is output with its nearest neighbours once in a while (default: null, i.e. none). */ public static final String PARAM_EXAMPLE_WORD = "exampleWord"; @ConfigurationParameter(name = PARAM_EXAMPLE_WORD, mandatory = false) private String exampleWord; /** - * All documents with fewer tokens than this (default: 10) are omitted. + * Ignore documents with fewer tokens than this value (default: 10). */ public static final String PARAM_MIN_DOCUMENT_LENGTH = "minDocumentLength"; @ConfigurationParameter(name = PARAM_MIN_DOCUMENT_LENGTH, mandatory = true, defaultValue = "10") From f72baa65a847b9e5bbfe1391b9feb615bcf17aba Mon Sep 17 00:00:00 2001 From: Carsten Schnober Date: Mon, 23 May 2016 16:24:59 +0200 Subject: [PATCH 6/6] No issue. Javadoc fixed and TypeCapability removed (depends on parameterization). 
--- .../mallet/lda/LdaTopicModelEstimator.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/LdaTopicModelEstimator.java b/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/LdaTopicModelEstimator.java index 0b9b8bee6d..c07efabfeb 100644 --- a/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/LdaTopicModelEstimator.java +++ b/dkpro-core-mallet-asl/src/main/java/de/tudarmstadt/ukp/dkpro/core/mallet/lda/LdaTopicModelEstimator.java @@ -22,7 +22,6 @@ import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelEstimator; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.TypeCapability; import java.io.File; import java.io.IOException; @@ -30,15 +29,17 @@ /** * Estimate an LDA topic model using Mallet and write it to a file. It stores all incoming CAS' to * Mallet {@link Instance}s before estimating the model, using a {@link ParallelTopicModel}. + *

+ * Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.). + *

+ * Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.). */ -@TypeCapability( - inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" } -) + public class LdaTopicModelEstimator extends MalletModelEstimator { /** - * The number of topics to estimate for the topic model. + * The number of topics to estimate. */ public static final String PARAM_N_TOPICS = "nTopics"; @ConfigurationParameter(name = PARAM_N_TOPICS, mandatory = true, defaultValue = "10") @@ -52,14 +53,14 @@ public class LdaTopicModelEstimator private int nIterations; /** - * The number of iterations before hyperparameter optimization begins. Default: 100 + * The number of iterations before hyper-parameter optimization begins. Default: 100 */ public static final String PARAM_BURNIN_PERIOD = "burninPeriod"; @ConfigurationParameter(name = PARAM_BURNIN_PERIOD, mandatory = true, defaultValue = "100") private int burninPeriod; /** - * Interval for optimizing Dirichlet hyperparameters. Default: 50 + * Interval for optimizing Dirichlet hyper-parameters. Default: 50 */ public static final String PARAM_OPTIMIZE_INTERVAL = "optimizeInterval"; @ConfigurationParameter(name = PARAM_OPTIMIZE_INTERVAL, mandatory = true, defaultValue = "50") @@ -73,7 +74,7 @@ public class LdaTopicModelEstimator private int randomSeed; /** - * Define how often to save a serialized model during estimation. Default: 0 (only save when + * Define how frequently a serialized model is saved to disk during estimation. Default: 0 (only save when * estimation is done). */ public static final String PARAM_SAVE_INTERVAL = "saveInterval"; @@ -81,7 +82,7 @@ public class LdaTopicModelEstimator private int saveInterval; /** - * Use a symmatric alpha value during model estimation? Default: false. + * Use a symmetric alpha value during model estimation? Default: false. 
*/ public static final String PARAM_USE_SYMMETRIC_ALPHA = "useSymmetricAlpha"; @ConfigurationParameter(name = PARAM_USE_SYMMETRIC_ALPHA, mandatory = true, defaultValue = "false")