Permalink
Browse files

Updated NER.

  • Loading branch information...
jdchoi77 committed Apr 18, 2015
1 parent 147e0df commit 23b79c51f0e32eb5c5219f8c9de171537217439a
Showing with 1,110 additions and 3,317 deletions.
  1. +1 −1 LICENSE.txt
  2. +4 −5 src/main/java/edu/emory/clir/clearnlp/bin/C2DConvert.java
  3. +5 −5 src/main/java/edu/emory/clir/clearnlp/bin/NLPDecode.java
  4. +18 −164 src/main/java/edu/emory/clir/clearnlp/bin/NLPTrain.java
  5. +1 −1 src/main/java/edu/emory/clir/clearnlp/bin/{classify → helper}/AbstractClassify.java
  6. +1 −1 src/main/java/edu/emory/clir/clearnlp/bin/{classify → helper}/AbstractClassifyOneVsAll.java
  7. +1 −1 src/main/java/edu/emory/clir/clearnlp/bin/{classify → helper}/AbstractClassifyOnline.java
  8. +199 −0 src/main/java/edu/emory/clir/clearnlp/bin/helper/AbstractNLPTrain.java
  9. +1 −1 src/main/java/edu/emory/clir/clearnlp/bin/{classify → helper}/AdaGradClassify.java
  10. +1 −1 src/main/java/edu/emory/clir/clearnlp/bin/{classify → helper}/LiblinearClassify.java
  11. +138 −0 src/main/java/edu/emory/clir/clearnlp/classification/trainer/RRM.java
  12. +16 −3 src/main/java/edu/emory/clir/clearnlp/collection/tree/PrefixTree.java
  13. +26 −18 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/AbstractDEPParser.java
  14. +6 −14 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/DEPConfiguration.java
  15. +3 −2 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/DEPFeatureExtractor.java
  16. +0 −543 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/DEPStateBranch.java
  17. +20 −4 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/DEPTrainer.java
  18. +27 −22 ...ain/java/edu/emory/clir/clearnlp/component/mode/dep/{DEPState.java → state/AbstractDEPState.java}
  19. +229 −0 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/state/DEPStateBranch.java
  20. +58 −0 src/main/java/edu/emory/clir/clearnlp/component/mode/dep/state/DEPStateGreedy.java
  21. +0 −200 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/AbstractNERecognizer.java
  22. +0 −58 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/DefaultNERecognizer.java
  23. +0 −60 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/EnglishNERecognizer.java
  24. +0 −80 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/NERConfiguration.java
  25. +0 −86 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/NEREval.java
  26. +0 −46 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/NERFeatureExtractor.java
  27. +0 −104 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/NERLexicon.java
  28. +0 −175 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/NERState.java
  29. +0 −79 src/main/java/edu/emory/clir/clearnlp/component/mode/ner/NERTrainer.java
  30. +15 −56 src/main/java/edu/emory/clir/clearnlp/component/utils/NLPUtils.java
  31. +98 −44 src/main/java/edu/emory/clir/clearnlp/conversion/EnglishC2DConverter.java
  32. +3 −0 src/main/java/edu/emory/clir/clearnlp/dependency/DEPLib.java
  33. +2 −2 src/main/java/edu/emory/clir/clearnlp/dependency/DEPLibEn.java
  34. +33 −0 src/main/java/edu/emory/clir/clearnlp/dependency/DEPNode.java
  35. +19 −19 src/main/java/edu/emory/clir/clearnlp/dependency/DEPTagEn.java
  36. +13 −7 src/main/java/edu/emory/clir/clearnlp/dependency/DEPTree.java
  37. +59 −0 src/main/java/edu/emory/clir/clearnlp/experiment/NERExtract.java
  38. +14 −2 src/main/java/edu/emory/clir/clearnlp/experiment/NLPMerge.java
  39. +8 −13 src/main/java/edu/emory/clir/clearnlp/experiment/Z.java
  40. +32 −0 src/main/java/edu/emory/clir/clearnlp/feature/AbstractFeatureExtractor.java
  41. +3 −2 src/main/java/edu/emory/clir/clearnlp/feature/type/FieldType.java
  42. +0 −83 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaInfo.java
  43. +0 −141 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaInfoExtractor.java
  44. +0 −27 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaInfoMap.java
  45. +0 −75 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaOntologyExtractor.java
  46. +0 −768 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaType.java
  47. +0 −119 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaTypeMap.java
  48. +0 −30 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/DBPediaXML.java
  49. +0 −149 src/main/java/edu/emory/clir/clearnlp/lexicon/dbpedia/PrefixTreeGenerator.java
  50. +9 −2 src/main/java/edu/emory/clir/clearnlp/ner/NERInfoList.java
  51. +0 −55 src/main/java/edu/emory/clir/clearnlp/ner/NERTag.java
  52. +1 −7 src/main/java/edu/emory/clir/clearnlp/util/Joiner.java
  53. +2 −1 src/main/java/edu/emory/clir/clearnlp/util/XmlUtils.java
  54. +1 −1 src/main/java/edu/emory/clir/clearnlp/util/lang/ENUtils.java
  55. +1 −1 src/main/resources/configure/config_train_ner.xml
  56. +32 −31 src/main/resources/samples/wsj_0001.parse.dep
  57. +4 −6 src/test/java/edu/emory/clir/clearnlp/component/configuration/DEPConfigurationTest.java
  58. +1 −1 src/test/java/edu/emory/clir/clearnlp/dependency/DEPTreeTest.java
  59. +5 −1 src/test/resources/nlp/configuration/configure.xml
View
@@ -1,4 +1,4 @@
Copyright 2014, Emory University
Copyright 2014-2015, Emory University
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -38,7 +38,6 @@
import edu.emory.clir.clearnlp.lexicon.propbank.PBInstance;
import edu.emory.clir.clearnlp.lexicon.propbank.PBReader;
import edu.emory.clir.clearnlp.ner.BILOU;
import edu.emory.clir.clearnlp.ner.NERTag;
import edu.emory.clir.clearnlp.pos.POSLibEn;
import edu.emory.clir.clearnlp.util.BinUtils;
import edu.emory.clir.clearnlp.util.FileUtils;
@@ -246,14 +245,14 @@ private void initNamedEntities(CTTree cTree, List<ObjectIntIntTriple<String>> na
for (ObjectIntIntTriple<String> t : names)
{
if (t.i1 == t.i2)
cTree.getTerminal(t.i1).setNamedEntityTag(NERTag.toBILOUTag(BILOU.U, t.o));
cTree.getTerminal(t.i1).setNamedEntityTag(BILOU.U+"-"+t.o);
else
{
cTree.getTerminal(t.i1).setNamedEntityTag(NERTag.toBILOUTag(BILOU.B, t.o));
cTree.getTerminal(t.i2).setNamedEntityTag(NERTag.toBILOUTag(BILOU.L, t.o));
cTree.getTerminal(t.i1).setNamedEntityTag(BILOU.B+"-"+t.o);
cTree.getTerminal(t.i2).setNamedEntityTag(BILOU.L+"-"+t.o);
for (i=t.i1+1; i<t.i2; i++)
cTree.getTerminal(i).setNamedEntityTag(NERTag.toBILOUTag(BILOU.I, t.o));
cTree.getTerminal(i).setNamedEntityTag(BILOU.I+"-"+t.o);
}
}
}
@@ -154,7 +154,7 @@ public void process(DEPTree tree, PrintStream fout, NLPMode mode, AbstractCompon
switch (mode)
{
case srl :
case ner : list.add(NLPUtils.getNERecognizer(language, config.getModelPath(NLPMode.ner)));
// case ner : list.add(NLPUtils.getNERecognizer(language, config.getModelPath(NLPMode.ner)));
case dep : list.add(NLPUtils.getDEPParser(language, config.getModelPath(NLPMode.dep), new DEPConfiguration(IOUtils.createFileInputStream(s_configurationFile))));
case morph: list.add(NLPUtils.getMPAnalyzer(language));
case pos : list.add(NLPUtils.getPOSTagger(language, config.getModelPath(NLPMode.pos)));
@@ -170,9 +170,9 @@ public void process(DEPTree tree, PrintStream fout, NLPMode mode, AbstractCompon
switch (mode)
{
case srl:
case ner:
if (!reader.hasNamedEntityTags())
list.add(NLPUtils.getNERecognizer(language, config.getModelPath(NLPMode.ner)));
// case ner:
// if (!reader.hasNamedEntityTags())
// list.add(NLPUtils.getNERecognizer(language, config.getModelPath(NLPMode.ner)));
case dep:
if (!reader.hasDependencyHeads())
list.add(NLPUtils.getDEPParser(language, config.getModelPath(NLPMode.dep), new DEPConfiguration(IOUtils.createFileInputStream(s_configurationFile))));
@@ -199,7 +199,7 @@ private String toString(DEPTree tree, NLPMode mode)
switch (mode)
{
case srl : return tree.toString(DEPNode::toStringSRL);
case ner : return tree.toString(DEPNode::toStringNER);
// case ner : return tree.toString(DEPNode::toStringNER);
case dep : return tree.toString(DEPNode::toStringDEP);
case morph: return tree.toString(DEPNode::toStringMorph);
case pos : return tree.toString(DEPNode::toStringPOS);
@@ -1,5 +1,5 @@
/**
* Copyright 2014, Emory University
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,191 +15,45 @@
*/
package edu.emory.clir.clearnlp.bin;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.util.List;
import java.util.concurrent.ExecutionException;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.spi.StringArrayOptionHandler;
import org.tukaani.xz.LZMA2Options;
import org.tukaani.xz.XZOutputStream;
import edu.emory.clir.clearnlp.collection.pair.ObjectDoublePair;
import edu.emory.clir.clearnlp.component.AbstractStatisticalComponent;
import edu.emory.clir.clearnlp.bin.helper.AbstractNLPTrain;
import edu.emory.clir.clearnlp.component.mode.dep.DEPTrainer;
import edu.emory.clir.clearnlp.component.mode.pos.POSTrainer;
import edu.emory.clir.clearnlp.component.trainer.AbstractNLPTrainer;
import edu.emory.clir.clearnlp.component.utils.NLPMode;
import edu.emory.clir.clearnlp.component.utils.NLPUtils;
import edu.emory.clir.clearnlp.util.BinUtils;
import edu.emory.clir.clearnlp.util.FileUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
/**
* @since 3.0.0
* @since 3.0.3
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class NLPTrain
public class NLPTrain extends AbstractNLPTrain
{
@Option(name="-c", usage="confinguration file (required)", required=true, metaVar="<filename>")
protected String s_configurationFile;
@Option(name="-f", usage="feature template files (required)", required=true, metaVar="<filename>", handler=StringArrayOptionHandler.class)
protected String[] s_featureFiles;
@Option(name="-m", usage="model filename (optional)", required=false, metaVar="<filename>")
protected String s_modelPath = null;
@Option(name="-t", usage="training path (required)", required=true, metaVar="<filepath>")
protected String s_trainPath;
@Option(name="-d", usage="development path (required)", required=true, metaVar="<filepath>")
protected String s_developPath;
@Option(name="-te", usage="training file extension (default: *)", required=false, metaVar="<string>")
protected String s_trainExt = "*";
@Option(name="-de", usage="development file extension (default: *)", required=false, metaVar="<string>")
protected String s_developExt = "*";
@Option(name="-mode", usage="pos|dep|ner|srl", required=true, metaVar="<mode>")
protected String s_mode = ".*";
// @Option(name="-threads", usage="number of threads (default: 1)", required=false, metaVar="<Integer>")
// protected int n_threads = 1;
public NLPTrain() {}
public NLPTrain(String[] args) throws InterruptedException, ExecutionException
{
BinUtils.initArgs(args, this);
List<String> trainFiles = FileUtils.getFileList(s_trainPath , s_trainExt , false);
List<String> developFiles = FileUtils.getFileList(s_developPath, s_developExt, false);
NLPMode mode = NLPMode.valueOf(s_mode);
ObjectDoublePair<AbstractStatisticalComponent<?,?,?,?>> p = train(trainFiles, developFiles, s_featureFiles, s_configurationFile, mode);
BinUtils.LOG.info(String.format("Final score: %4.2f\n", p.d));
if (s_modelPath != null) saveModel(p.o, s_modelPath);
}
// public NLPTrain(String[] args) throws InterruptedException, ExecutionException
// {
// BinUtils.initArgs(args, this);
//
// List<String> trainFiles = FileUtils.getFileList(s_trainPath , s_trainExt , false);
// List<String> developFiles = FileUtils.getFileList(s_developPath, s_developExt, false);
// NLPMode mode = NLPMode.valueOf(s_mode);
//
// List<Callable<ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String>>> tasks = new ArrayList<>();
// Callable<ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String>> c;
// ExecutorService executor = Executors.newFixedThreadPool(n_threads);
//
// for (String configurationFile : s_configurationFiles)
// {
// for (String featureFile : s_featureFiles)
// {
// System.out.println(featureFile);
//
// c = new Callable<ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String>>()
// {
// @Override
// public ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String> call() throws Exception
// {
// final ObjectDoublePair<AbstractStatisticalComponent<?,?,?,?>> p = train(trainFiles, developFiles, Splitter.splitColons(featureFile), configurationFile, mode);
// return new ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String>(p.o, FileUtils.getBaseName(configurationFile)+", "+FileUtils.getBaseName(featureFile), p.d);
// }
// };
//
// tasks.add(c);
// }
// }
//
// List<Future<ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String>>> futures = executor.invokeAll(tasks);
// ObjectObjectDoubleTriple<AbstractStatisticalComponent<?,?,?,?>,String> max = null, t;
// int i, size = futures.size();
//
// for (i=0; i<size; i++)
// {
// t = futures.get(i).get();
// System.out.printf("%s: %5.2f\n", t.o2, t.d);
// if (max == null || max.compareTo(t) < 0) max = t;
// }
//
// executor.shutdown();
// if (size > 1) BinUtils.LOG.info(String.format("Best\n%s: %5.2f\n", max.o2, max.d));
// if (s_modelPath != null) saveModel(max.o1, s_modelPath);
// }
public ObjectDoublePair<AbstractStatisticalComponent<?,?,?,?>> train(List<String> trainFiles, List<String> developFiles, String[] featureFiles, String configurationFile, NLPMode mode)
public NLPTrain(String[] args) throws Exception
{
InputStream configuration = IOUtils.createFileInputStream(configurationFile);
InputStream[] features = IOUtils.createFileInputStreams(featureFiles);
AbstractNLPTrainer trainer = NLPUtils.getTrainer(mode, configuration, features);
return trainer.train(trainFiles, developFiles);
super(args);
}
public void saveModel(AbstractStatisticalComponent<?,?,?,?> component, String modelPath)
static public void main(String[] args)
{
ObjectOutputStream out;
try
try
{
out = new ObjectOutputStream(new XZOutputStream(new BufferedOutputStream(new FileOutputStream(modelPath)), new LZMA2Options()));
component.save(out);
out.close();
new NLPTrain(args);
}
catch (Exception e) {e.printStackTrace();}
}
// void onlineTrain()
// {
// try
// {
// DefaultPOSTagger tagger = new DefaultPOSTagger(new ObjectInputStream(new XZInputStream(new BufferedInputStream(new FileInputStream(s_modelPath)))));
// for (DEPTree tree : getTrees())
// {
// tagger.process(tree);
// System.out.println(tree.toStringPOS()+"\n");
// }
// tagger.onlineTrain(getTrees());
// System.out.println("---------------------------\n");
// for (DEPTree tree : getTrees())
// {
// tagger.process(tree);
// System.out.println(tree.toStringPOS()+"\n");
// }
// }
// catch (Exception e) {e.printStackTrace();}
// }
//
// private List<DEPTree> getTrees()
// {
// List<DEPTree> list = Lists.newArrayList();
// DEPTree tree;
//
// tree = new DEPTree(5);
// tree.add(new DEPNode(1, "mr.", "NNP", new DEPFeat()));
// tree.add(new DEPNode(2, "boom", "NNP", new DEPFeat()));
// tree.add(new DEPNode(3, "toissed", "VBD", new DEPFeat()));
// tree.add(new DEPNode(4, "paat", "JJ", new DEPFeat()));
// tree.add(new DEPNode(5, "balll", "NN", new DEPFeat()));
// list.add(tree);
//
// tree = new DEPTree(4);
// tree.add(new DEPNode(1, "John", "NNP", new DEPFeat()));
// tree.add(new DEPNode(2, "bought", "VBD", new DEPFeat()));
// tree.add(new DEPNode(3, "a", "DT", new DEPFeat()));
// tree.add(new DEPNode(4, "car", "NN", new DEPFeat()));
// list.add(tree);
//
// return list;
// }
static public void main(String[] args)
@Override
protected AbstractNLPTrainer getTrainer(NLPMode mode, InputStream configuration, InputStream[] features)
{
try
switch (mode)
{
new NLPTrain(args);
case pos: return new POSTrainer(configuration, features);
case dep: return new DEPTrainer(configuration, features);
case srl: return null;
default : throw new IllegalArgumentException("Invalid mode: "+mode.toString());
}
catch (InterruptedException | ExecutionException e) {e.printStackTrace();}
}
}
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.bin.classify;
package edu.emory.clir.clearnlp.bin.helper;
import java.io.InputStream;
import java.io.ObjectInputStream;
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.bin.classify;
package edu.emory.clir.clearnlp.bin.helper;
import edu.emory.clir.clearnlp.classification.configuration.AbstractTrainerConfiguration;
import edu.emory.clir.clearnlp.classification.model.AbstractModel;
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.bin.classify;
package edu.emory.clir.clearnlp.bin.helper;
import org.kohsuke.args4j.Option;
Oops, something went wrong.

0 comments on commit 23b79c5

Please sign in to comment.