Skip to content

Commit

Permalink
Merge branch 'master' of github.com:dkpro/dkpro-core
Browse files Browse the repository at this point in the history
* 'master' of github.com:dkpro/dkpro-core:
  No issue. Javadoc fixed and TypeCapability removed (depends on parameterization).
  No issue. Javadoc fixed.
  No issue. Javadoc improved.
  No issue. Removed the redundant/unused attribute Row.id
  Downgrade Mallet to v2.0.8 release #839
  Closes #848 Remove module ditop-asl from deps-not-on-maven-central profile
  • Loading branch information
reckart committed May 26, 2016
2 parents 0a7f36d + f72baa6 commit 2dd9fe1
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 64 deletions.
26 changes: 14 additions & 12 deletions dkpro-core-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.conll-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.ditop-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.imscwb-asl</artifactId>
Expand Down Expand Up @@ -335,7 +340,12 @@
<artifactId>de.tudarmstadt.ukp.dkpro.core.ldweb1t-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.mallet-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.maltparser-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
Expand Down Expand Up @@ -463,6 +473,7 @@
<module>../dkpro-core-io-brat-asl</module>
<module>../dkpro-core-io-combination-asl</module>
<module>../dkpro-core-io-conll-asl</module>
<module>../dkpro-core-io-ditop-asl</module>
<module>../dkpro-core-io-imscwb-asl</module>
<module>../dkpro-core-io-html-asl</module>
<module>../dkpro-core-io-json-asl</module>
Expand Down Expand Up @@ -572,7 +583,6 @@
</repositories>
<modules>
<!-- IO modules -->
<module>../dkpro-core-io-ditop-asl</module>
<module>../dkpro-core-io-fangorn-asl</module>
<module>../dkpro-core-io-graf-asl</module>
<!-- Processing modules -->
Expand All @@ -581,11 +591,7 @@
</modules>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.ditop-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.flextag-asl</artifactId>
Expand All @@ -606,11 +612,7 @@
<artifactId>de.tudarmstadt.ukp.dkpro.core.lbj-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.mallet-asl</artifactId>
<version>1.9.0-SNAPSHOT</version>
</dependency>

</dependencies>
</dependencyManagement>
</profile>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

/**
* <p>Writes the CoNLL 2000 chunking format. The columns are separated by spaces.</p>
*
*
* <pre><code>
* He PRP B-NP
* reckons VBZ B-VP
Expand All @@ -64,15 +64,15 @@
* September NNP B-NP
* . . O
* </code></pre>
*
*
* <ol>
* <li>FORM - token</li>
* <li>POSTAG - part-of-speech tag</li>
* <li>CHUNK - chunk (BIO encoded)</li>
* </ol>
*
*
* <p>Sentences are separated by a blank new line.</p>
*
*
* @see <a href="http://www.cnts.ua.ac.be/conll2000/chunking/">CoNLL 2000 shared task</a>
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
Expand Down Expand Up @@ -131,31 +131,30 @@ private void convert(JCas aJCas, PrintWriter aOut)

// Tokens
List<Token> tokens = selectCovered(Token.class, sentence);

// Chunks
IobEncoder encoder = new IobEncoder(aJCas.getCas(), chunkType, chunkValue);
for (int i = 0; i < tokens.size(); i++) {

for (Token token:tokens) {
Row row = new Row();
row.id = i+1;
row.token = tokens.get(i);
row.chunk = encoder.encode(tokens.get(i));
row.token = token;
row.chunk = encoder.encode(token);
ctokens.put(row.token, row);
}

// Write sentence in CONLL 2006 format
for (Row row : ctokens.values()) {
String pos = UNUSED;
if (writePos && (row.token.getPos() != null)) {
POS posAnno = row.token.getPos();
pos = posAnno.getPosValue();
}

String chunk = UNUSED;
if (writeChunk && (row.chunk != null)) {
chunk = encoder.encode(row.token);
}

aOut.printf("%s %s %s\n", row.token.getCoveredText(), pos, chunk);
}

Expand All @@ -165,7 +164,6 @@ private void convert(JCas aJCas, PrintWriter aOut)

private static final class Row
{
int id;
Token token;
String chunk;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
/**
* <p>Writes the CoNLL 2002 named entity format. The columns are separated by a single space, unlike
* illustrated below.</p>
*
*
* <pre><code>
* Wolff B-PER
* , O
Expand All @@ -71,14 +71,14 @@
* Madrid I-ORG
* . O
* </code></pre>
*
*
* <ol>
* <li>FORM - token</li>
* <li>NER - named entity (BIO encoded)</li>
* </ol>
*
*
* <p>Sentences are separated by a blank new line.</p>
*
*
* @see <a href="http://www.clips.ua.ac.be/conll2002/ner/">CoNLL 2002 shared task</a>
*/
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
Expand Down Expand Up @@ -133,25 +133,24 @@ private void convert(JCas aJCas, PrintWriter aOut)

// Tokens
List<Token> tokens = selectCovered(Token.class, sentence);

// Chunks
IobEncoder encoder = new IobEncoder(aJCas.getCas(), neType, neValue);
for (int i = 0; i < tokens.size(); i++) {

for (Token token:tokens) {
Row row = new Row();
row.id = i+1;
row.token = tokens.get(i);
row.ne = encoder.encode(tokens.get(i));
row.token = token;
row.ne = encoder.encode(token);
ctokens.put(row.token, row);
}

// Write sentence in CONLL 2006 format
for (Row row : ctokens.values()) {
String chunk = UNUSED;
if (writeNamedEntity && (row.ne != null)) {
chunk = encoder.encode(row.token);
}

aOut.printf("%s %s\n", row.token.getCoveredText(), chunk);
}

Expand All @@ -161,7 +160,6 @@ private void convert(JCas aJCas, PrintWriter aOut)

private static final class Row
{
int id;
Token token;
String ne;
}
Expand Down
2 changes: 1 addition & 1 deletion dkpro-core-io-ditop-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
<dependency>
<groupId>cc.mallet</groupId>
<artifactId>mallet</artifactId>
<version>2.0.7</version>
<version>2.0.8</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,14 @@
import java.util.OptionalInt;

/**
* This class defines parameters and methods that are common for Mallet model estimators.
* This abstract class defines parameters and methods that are common for Mallet model estimators.
* <p>
* It creates a Mallet {@link InstanceList} from the input documents so that inheriting estimators
* can create a model, typically implemented by overriding the {@link JCasFileWriter_ImplBase#collectionProcessComplete()}
* method.
*
* @see de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator
* @see de.tudarmstadt.ukp.dkpro.core.mallet.lda.LdaTopicModelEstimator
* @since 1.9.0
*/
public abstract class MalletModelEstimator
Expand All @@ -46,37 +53,40 @@ public abstract class MalletModelEstimator
private static final Locale locale = Locale.US;

/**
* The annotation type to use for the model. Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}.
* For lemmas, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value}
* The annotation type to use as input tokens for the model estimation.
* Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}.
* For lemmas, for instance, use {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value}
*/
public static final String PARAM_TOKEN_FEATURE_PATH = "tokenFeaturePath";
@ConfigurationParameter(name = PARAM_TOKEN_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token")
private String tokenFeaturePath;

/**
* The number of threads to use during model estimation. If not set, the number of threads is determined automatically.
* The number of threads to use during model estimation.
* If not set, the number of threads is automatically set by {@link ComponentParameters#computeNumThreads(int)}.
* <p>
* Warning: do not set this to more than 1 when using very small (test) data sets on {@link de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator}
* because the process might then run infinitely!
* Warning: do not set this to more than 1 when using very small (test) data sets on {@link de.tudarmstadt.ukp.dkpro.core.mallet.wordembeddings.WordEmbeddingsEstimator}!
* This might prevent the process from terminating.
*/
public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS;
@ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS)
private int numThreads;

/**
* Ignore tokens (or lemmas, respectively) that are shorter than the given value. Default: 3.
* Ignore tokens (or any other annotation type, as specified by {@link #PARAM_TOKEN_FEATURE_PATH})
* that are shorter than the given value. Default: 3.
*/
public static final String PARAM_MIN_TOKEN_LENGTH = "minTokenLength";
@ConfigurationParameter(name = PARAM_MIN_TOKEN_LENGTH, mandatory = true, defaultValue = "3")
private int minTokenLength;

/**
* If specific, the text contained in the given segmentation type annotations are fed as
* separate units to the topic model estimator e.g.
* If specified, the text contained in the given segmentation type annotations are fed as
* separate units ("documents") to the topic model estimator e.g.
* {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.sentence}. Text that is not within
* such annotations is ignored.
* <p>
* By default, the full document text is used as a document.
* By default, the full text is used as a document.
*/
public static final String PARAM_COVERING_ANNOTATION_TYPE = "coveringAnnotationType";
@ConfigurationParameter(name = PARAM_COVERING_ANNOTATION_TYPE, mandatory = false)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,24 @@
import de.tudarmstadt.ukp.dkpro.core.mallet.MalletModelEstimator;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;

import java.io.File;
import java.io.IOException;

/**
* Estimate an LDA topic model using Mallet and write it to a file. It stores all incoming CAS' to
* Mallet {@link Instance}s before estimating the model, using a {@link ParallelTopicModel}.
* <p>
* Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.).
* <p>
* Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.).
*/
@TypeCapability(
inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }
)

public class LdaTopicModelEstimator
extends MalletModelEstimator
{
/**
* The number of topics to estimate for the topic model.
* The number of topics to estimate.
*/
public static final String PARAM_N_TOPICS = "nTopics";
@ConfigurationParameter(name = PARAM_N_TOPICS, mandatory = true, defaultValue = "10")
Expand All @@ -52,14 +53,14 @@ public class LdaTopicModelEstimator
private int nIterations;

/**
* The number of iterations before hyperparameter optimization begins. Default: 100
* The number of iterations before hyper-parameter optimization begins. Default: 100
*/
public static final String PARAM_BURNIN_PERIOD = "burninPeriod";
@ConfigurationParameter(name = PARAM_BURNIN_PERIOD, mandatory = true, defaultValue = "100")
private int burninPeriod;

/**
* Interval for optimizing Dirichlet hyperparameters. Default: 50
* Interval for optimizing Dirichlet hyper-parameters. Default: 50
*/
public static final String PARAM_OPTIMIZE_INTERVAL = "optimizeInterval";
@ConfigurationParameter(name = PARAM_OPTIMIZE_INTERVAL, mandatory = true, defaultValue = "50")
Expand All @@ -73,15 +74,15 @@ public class LdaTopicModelEstimator
private int randomSeed;

/**
* Define how often to save a serialized model during estimation. Default: 0 (only save when
* Define how frequently a serialized model is saved to disk during estimation. Default: 0 (only save when
* estimation is done).
*/
public static final String PARAM_SAVE_INTERVAL = "saveInterval";
@ConfigurationParameter(name = PARAM_SAVE_INTERVAL, mandatory = true, defaultValue = "0")
private int saveInterval;

/**
* Use a symmatric alpha value during model estimation? Default: false.
* Use a symmetric alpha value during model estimation? Default: false.
*/
public static final String PARAM_USE_SYMMETRIC_ALPHA = "useSymmetricAlpha";
@ConfigurationParameter(name = PARAM_USE_SYMMETRIC_ALPHA, mandatory = true, defaultValue = "false")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
/**
* Compute word embeddings from the given collection using skip-grams.
* <p>
* Set {@link #PARAM_TOKEN_FEATURE_PATH} to determine what is considered as token (Tokens, Lemmas, etc.)
* and {@link #PARAM_COVERING_ANNOTATION_TYPE} to determine what is considered a document (sentences, paragraphs, etc.).
* Set {@link #PARAM_TOKEN_FEATURE_PATH} to define what is considered as a token (Tokens, Lemmas, etc.).
* <p>
* Set {@link #PARAM_COVERING_ANNOTATION_TYPE} to define what is considered a document (sentences, paragraphs, etc.).
*
* @since 1.9.0
*/
public class WordEmbeddingsEstimator
Expand Down Expand Up @@ -58,15 +60,14 @@ public class WordEmbeddingsEstimator
private int windowSize;

/**
* An example word that is output with its nearest neighbours once in a while (FIXME: currently
* not working, see {@link #collectionProcessComplete()}). (default: null, i.e. none).
* An example word that is output with its nearest neighbours once in a while (default: null, i.e. none).
*/
public static final String PARAM_EXAMPLE_WORD = "exampleWord";
@ConfigurationParameter(name = PARAM_EXAMPLE_WORD, mandatory = false)
private String exampleWord;

/**
* All documents with fewer tokens than this (default: 10) are omitted.
* Ignore documents with fewer tokens than this value (default: 10).
*/
public static final String PARAM_MIN_DOCUMENT_LENGTH = "minDocumentLength";
@ConfigurationParameter(name = PARAM_MIN_DOCUMENT_LENGTH, mandatory = true, defaultValue = "10")
Expand Down

0 comments on commit 2dd9fe1

Please sign in to comment.