Skip to content

Commit

Permalink
Merge branch 'master' of github.com:dkpro/dkpro-core
Browse files Browse the repository at this point in the history
* 'master' of github.com:dkpro/dkpro-core:
  No issue. Javadoc fixed and TypeCapability removed (depends on parameterization).
  No issue. Javadoc fixed.
  No issue. Javadoc improved.
  No issue. Removed the redundant/unused attribute Row.id
  Downgrade Mallet to v2.0.8 release #839
  Closes #848 Remove module ditop-asl from deps-not-on-maven-central profile
  • Loading branch information
reckart committed May 26, 2016
2 parents 0a7f36d + f72baa6 commit 2dd9fe1
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 64 deletions.
26 changes: 14 additions & 12 deletions dkpro-core-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.conll-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.ditop-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.imscwb-asl</artifactId>
Expand Down Expand Up @@ -335,7 +340,12 @@
<artifactId>de.tudarmstadt.ukp.dkpro.core.ldweb1t-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.mallet-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.maltparser-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
Expand Down Expand Up @@ -463,6 +473,7 @@
<module>../dkpro-core-io-brat-asl</module>
<module>../dkpro-core-io-combination-asl</module>
<module>../dkpro-core-io-conll-asl</module>
<module>../dkpro-core-io-ditop-asl</module>
<module>../dkpro-core-io-imscwb-asl</module>
<module>../dkpro-core-io-html-asl</module>
<module>../dkpro-core-io-json-asl</module>
Expand Down Expand Up @@ -572,7 +583,6 @@
</repositories>
<modules>
<!-- IO modules -->
<module>../dkpro-core-io-ditop-asl</module>
<module>../dkpro-core-io-fangorn-asl</module>
<module>../dkpro-core-io-graf-asl</module>
<!-- Processing modules -->
Expand All @@ -581,11 +591,7 @@
</modules>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.ditop-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.flextag-asl</artifactId>
Expand All @@ -606,11 +612,7 @@
<artifactId>de.tudarmstadt.ukp.dkpro.core.lbj-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.mallet-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>

</dependencies>
</dependencyManagement>
</profile>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

/**
* <p>Writes the CoNLL 2000 chunking format. The columns are separated by spaces.</p>
*
*
* <pre><code>
* He PRP B-NP
* reckons VBZ B-VP
Expand All @@ -64,15 +64,15 @@
* September NNP B-NP
* . . O
* </code></pre>
*
*
* <ol>
* <li>FORM - token</li>
* <li>POSTAG - part-of-speech tag</li>
* <li>CHUNK - chunk (BIO encoded)</li>
* </ol>
*
*
* <p>Sentences are separated by a blank new line.</p>
*
*
* @see <a href="http://www.cnts.ua.ac.be/conll2000/chunking/">CoNLL 2000 shared task</a>
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
Expand Down Expand Up @@ -131,31 +131,30 @@ private void convert(JCas aJCas, PrintWriter aOut)

// Tokens
List<Token> tokens = selectCovered(Token.class, sentence);

// Chunks
IobEncoder encoder = new IobEncoder(aJCas.getCas(), chunkType, chunkValue);
for (int i = 0; i < tokens.size(); i++) {

for (Token token:tokens) {
Row row = new Row();
row.id = i+1;
row.token = tokens.get(i);
row.chunk = encoder.encode(tokens.get(i));
row.token = token;
row.chunk = encoder.encode(token);
ctokens.put(row.token, row);
}

// Write sentence in CONLL 2006 format
for (Row row : ctokens.values()) {
String pos = UNUSED;
if (writePos && (row.token.getPos() != null)) {
POS posAnno = row.token.getPos();
pos = posAnno.getPosValue();
}

String chunk = UNUSED;
if (writeChunk && (row.chunk != null)) {
chunk = encoder.encode(row.token);
}

aOut.printf("%s %s %s\n", row.token.getCoveredText(), pos, chunk);
}

Expand All @@ -165,7 +164,6 @@ private void convert(JCas aJCas, PrintWriter aOut)

private static final class Row
{
int id;
Token token;
String chunk;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
/**
* <p>Writes the CoNLL 2002 named entity format. The columns are separated by a single space, unlike
* illustrated below.</p>
*
*
* <pre><code>
* Wolff B-PER
* , O
Expand All @@ -71,14 +71,14 @@
* Madrid I-ORG
* . O
* </code></pre>
*
*
* <ol>
* <li>FORM - token</li>
* <li>NER - named entity (BIO encoded)</li>
* </ol>
*
*
* <p>Sentences are separated by a blank new line.</p>
*
*
* @see <a href="http://www.clips.ua.ac.be/conll2002/ner/">CoNLL 2002 shared task</a>
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
Expand Down Expand Up @@ -133,25 +133,24 @@ private void convert(JCas aJCas, PrintWriter aOut)

// Tokens
List<Token> tokens = selectCovered(Token.class, sentence);

// Chunks
IobEncoder encoder = new IobEncoder(aJCas.getCas(), neType, neValue);
for (int i = 0; i < tokens.size(); i++) {

for (Token token:tokens) {
Row row = new Row();
row.id = i+1;
row.token = tokens.get(i);
row.ne = encoder.encode(tokens.get(i));
row.token = token;
row.ne = encoder.encode(token);
ctokens.put(row.token, row);
}

// Write sentence in CONLL 2006 format
for (Row row : ctokens.values()) {
String chunk = UNUSED;
if (writeNamedEntity && (row.ne != null)) {
chunk = encoder.encode(row.token);
}

aOut.printf("%s %s\n", row.token.getCoveredText(), chunk);
}

Expand All @@ -161,7 +160,6 @@ private void convert(JCas aJCas, PrintWriter aOut)

private static final class Row
{
int id;
Token token;
String ne;
}
Expand Down
2 changes: 1 addition & 1 deletion dkpro-core-io-ditop-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
<dependency>
<groupId>cc.mallet</groupId>
<artifactId>mallet</artifactId>
<version>2.0.7</version>
<version>2.0.8</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,14 @@
import java.util.OptionalInt;

/**
* This class defines parameters and methods that are common for Mallet model estimators.
* This abstract class defines parameters and methods that are common for Mallet model estimators.
* <p>
* It creates a Mallet {@link InstanceList} from the input documents so that inheriting estimators
* can create a model, typically implemented by overriding the {@link JCasFileWriter_ImplBase#collectionProcessComplete()}
* method.
*
* @see de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator
* @see de.tudarmstadt.ukp.dkpro.core.mallet.lda.LdaTopicModelEstimator
* @since 1.9.0
*/
public abstract class MalletModelEstimator
Expand All @@ -46,37 +53,40 @@ public abstract class MalletModelEstimator
private static final Locale locale = Locale.US;

/**
* The annotation type to use for the model. Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}.
* For lemmas, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value}
* The annotation type to use as input tokens for the model estimation.
* Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}.
* For lemmas, for instance, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value}
*/
public static final String PARAM_TOKEN_FEATURE_PATH = "tokenFeaturePath";
@ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token")
private String tokenFeaturePath;

/**
* The number of threads to use during model estimation. If not set, the number of threads is determined automatically.
* The number of threads to use during model estimation.
* If not set, the number of threads is automatically set by {@link ComponentParameters#computeNumThreads(int)}.
* <p>
* Warning: do not set this to more than 1 when using very small (test) data sets on {@link de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator}
* because the process might then run infinitely!
* Warning: do not set this to more than 1 when using very small (test) data sets on {@link de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator}!
* This might prevent the process from terminating.
*/
public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS;
@ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS)
private int numThreads;

/**
* Ignore tokens (or lemmas, respectively) that are shorter than the given value. Default: 3.
* Ignore tokens (or any other annotation type, as specified by {@link #PARAM_TOKEN_FEATURE_PATH})
* that are shorter than the given value. Default: 3.
*/
public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength";
@ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "3")
private int minTokenLength;

/**
* If specific, the text contained in the given segmentation type annotations are fed as
* separate units to the topic model estimator e.g.
* If specified, the text contained in the given segmentation type annotations are fed as
* separate units ("documents") to the topic model estimator e.g.
* {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.sentence}. Text that is not within
* such annotations is ignored.
* <p>
* By default, the full document text is used as a document.
* By default, the full text is used as a document.
*/
public static final String PARAM_COVERING_ANNOTATION_TYPE = "coveringAnnotationType";
@ConfigurationParameter(name = PARAM_COVERING_ANNOTATION_TYPE, mandatory = false)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,24 @@
import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelEstimator;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;

import java.io.File;
import java.io.IOException;

/**
* Estimate an LDA topic model using Mallet and write it to a file. It stores all incoming CAS' to
* Mallet {@link Instance}s before estimating the model, using a {@link ParallelTopicModel}.
* <p>
* Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.).
* <p>
* Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.).
*/
@TypeCapability(
inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }
)

public class LdaTopicModelEstimator
extends MalletModelEstimator
{
/**
* The number of topics to estimate for the topic model.
* The number of topics to estimate.
*/
public static final String PARAM_N_TOPICS = "nTopics";
@ConfigurationParameter(name = PARAM_N_TOPICS, mandatory = true, defaultValue = "10")
Expand All @@ -52,14 +53,14 @@ public class LdaTopicModelEstimator
private int nIterations;

/**
* The number of iterations before hyperparameter optimization begins. Default: 100
* The number of iterations before hyper-parameter optimization begins. Default: 100
*/
public static final String PARAM_BURNIN_PERIOD = "burninPeriod";
@ConfigurationParameter(name = PARAM_BURNIN_PERIOD, mandatory = true, defaultValue = "100")
private int burninPeriod;

/**
* Interval for optimizing Dirichlet hyperparameters. Default: 50
* Interval for optimizing Dirichlet hyper-parameters. Default: 50
*/
public static final String PARAM_OPTIMIZE_INTERVAL = "optimizeInterval";
@ConfigurationParameter(name = PARAM_OPTIMIZE_INTERVAL, mandatory = true, defaultValue = "50")
Expand All @@ -73,15 +74,15 @@ public class LdaTopicModelEstimator
private int randomSeed;

/**
* Define how often to save a serialized model during estimation. Default: 0 (only save when
* Define how frequently a serialized model is saved to disk during estimation. Default: 0 (only save when
* estimation is done).
*/
public static final String PARAM_SAVE_INTERVAL = "saveInterval";
@ConfigurationParameter(name = PARAM_SAVE_INTERVAL, mandatory = true, defaultValue = "0")
private int saveInterval;

/**
* Use a symmatric alpha value during model estimation? Default: false.
* Use a symmetric alpha value during model estimation? Default: false.
*/
public static final String PARAM_USE_SYMMETRIC_ALPHA = "useSymmetricAlpha";
@ConfigurationParameter(name = PARAM_USE_SYMMETRIC_ALPHA, mandatory = true, defaultValue = "false")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
/**
* Compute word embeddings from the given collection using skip-grams.
* <p>
* Set {@link #PARAM_TOKEN_FEATURE_PATH} to determine what is considered as token (Tokens, Lemmas, etc.)
* and {@link #PARAM_COVERING_ANNOTATION_TYPE} to determine what is considered a document (sentences, paragraphs, etc.).
* Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.).
* <p>
* Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.).
*
* @since 1.9.0
*/
public class WordEmbeddingsEstimator
Expand Down Expand Up @@ -58,15 +60,14 @@ public class WordEmbeddingsEstimator
private int windowSize;

/**
* An example word that is output with its nearest neighbours once in a while (FIXME: currently
* not working, see {@link #collectionProcessComplete()}). (default: null, i.e. none).
* An example word that is output with its nearest neighbours once in a while (default: null, i.e. none).
*/
public static final String PARAM_EXAMPLE_WORD = "exampleWord";
@ConfigurationParameter(name = PARAM_EXAMPLE_WORD, mandatory = false)
private String exampleWord;

/**
* All documents with fewer tokens than this (default: 10) are omitted.
* Ignore documents with fewer tokens than this value (default: 10).
*/
public static final String PARAM_MIN_DOCUMENT_LENGTH = "minDocumentLength";
@ConfigurationParameter(name = PARAM_MIN_DOCUMENT_LENGTH, mandatory = true, defaultValue = "10")
Expand Down

0 comments on commit 2dd9fe1

Please sign in to comment.