
Commit

Updated dependency parser.
jdchoi77 committed Jan 19, 2016
1 parent ba747d8 commit c3b8753
Showing 67 changed files with 1,144 additions and 2,586 deletions.
6 changes: 3 additions & 3 deletions core/pom.xml
@@ -142,9 +142,9 @@
<version>0.4</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>18.0</version>
<groupId>net.jpountz.lz4</groupId>
<artifactId>lz4</artifactId>
<version>1.3.0</version>
</dependency>
</dependencies>
</project>
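The pom.xml hunk above swaps the Guava dependency for lz4-java 1.3.0. A minimal round-trip sketch of the net.jpountz.lz4 block API that the new artifact provides (the class name and payload below are illustrative, not part of this commit):

import java.nio.charset.StandardCharsets;

import net.jpountz.lz4.LZ4Compressor;
import net.jpountz.lz4.LZ4Factory;
import net.jpountz.lz4.LZ4FastDecompressor;

public class LZ4RoundTrip
{
    public static void main(String[] args)
    {
        byte[] original = "example payload to compress".getBytes(StandardCharsets.UTF_8);
        LZ4Factory factory = LZ4Factory.fastestInstance();

        // compress with the block format; it does not store the uncompressed length
        LZ4Compressor compressor = factory.fastCompressor();
        byte[] compressed = compressor.compress(original);

        // so the caller must supply the original length when decompressing
        LZ4FastDecompressor decompressor = factory.fastDecompressor();
        byte[] restored = decompressor.decompress(compressed, original.length);

        System.out.println(new String(restored, StandardCharsets.UTF_8));
    }
}

Because the block format does not record the uncompressed length, original.length has to be carried alongside the compressed bytes.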
@@ -13,12 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.zzz;
package edu.emory.mathcs.nlp.bin;

import java.io.BufferedReader;
import java.io.PrintStream;
import java.io.ObjectOutputStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringJoiner;
import java.util.stream.Collectors;

import edu.emory.mathcs.nlp.common.collection.ngram.Bigram;
import edu.emory.mathcs.nlp.common.collection.ngram.Unigram;
@@ -31,13 +35,12 @@
/**
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class Coverage
public class AmbiguityClassGenerator
{
public void readTrain(Bigram<String,String> bigram, String dirpath) throws Exception
public void readTrain(Bigram<String,String> bigram, String dirpath, boolean uncaptialized) throws Exception
{
BufferedReader reader;
String line;
String f, p;
String line, form;
String[] t;

for (String filename : FileUtils.getFileList(dirpath, "*"))
@@ -49,51 +52,71 @@ public void readTrain(Bigram<String,String> bigram, String dirpath) throws Excep
{
if (line.trim().isEmpty()) continue;
t = Splitter.splitTabs(line);
f = StringUtils.toSimplifiedForm(t[1]);
p = t[3];
bigram.add(f, p);
form = StringUtils.toSimplifiedForm(t[1], uncaptialized);
if (!skip(form)) bigram.add(form, t[3]);
}

reader.close();
}
}

public void printVocab(Bigram<String,String> bigram, String outputFile, int cutoff)
private boolean skip(String form)
{
PrintStream fout = IOUtils.createBufferedPrintStream(outputFile);
char[] cs = form.toCharArray();

for (int i=0; i<cs.length; i++)
{
if (cs[i] == '_' || cs[i] >= 128)
return true;
}

return false;
}

public void printVocab(Bigram<String,String> bigram, String outputFile, int cutoff, double threshold) throws Exception
{
Map<String,List<String>> map = new HashMap<>();
List<ObjectDoublePair<String>> list;
Unigram<String> unigram;
int count = 0, tags = 0;
StringJoiner build;

for (Entry<String, Unigram<String>> e : bigram.entrySet())
{
unigram = e.getValue();
if (unigram.getTotalCount() < cutoff) continue;
build = new StringJoiner(" ");
build.add(e.getKey());
list = unigram.toList(threshold);
if (list.isEmpty()) continue;
if (list.size() == 1 && (list.get(0).o.equals("NNP") || list.get(0).o.equals("NNPS"))) continue;

for (ObjectDoublePair<String> v : unigram.toList(0d))
{
build.add(v.o+":"+v.d);
tags++;
}

fout.println(build.toString());
Collections.sort(list, Collections.reverseOrder());
map.put(e.getKey(), list.stream().map(p -> p.o).collect(Collectors.toList()));
tags += list.size();
count++;
}

fout.close();
System.out.println("Avg tags: "+((double)tags/count));
System.out.println("Words: "+count);

ObjectOutputStream fout = IOUtils.createObjectXZBufferedOutputStream(outputFile);
fout.writeObject(map);
fout.close();
}

static public void main(String[] args) throws Exception
{
Coverage cov = new Coverage();
Bigram<String,String> bigram = new Bigram<>();
cov.readTrain (bigram, "/mnt/ainos-research/henryyhc/dat/nytimes/tree");
cov.readTrain (bigram, "/mnt/ainos-research/henryyhc/dat/wikipedia2015/tree");
// cov.readTrain (bigram, "/home/jdchoi/dat/general-en/trn-pos");
cov.printVocab(bigram, "/mnt/ainos-research/data/word_classes/nytimes-wiki-ambiguity-classes.txt", 0);
AmbiguityClassGenerator cov = new AmbiguityClassGenerator();

final String outputFile = args[0];
final int cutoff = Integer.parseInt(args[1]);
final double threshold = Double.parseDouble(args[2]);
final boolean uncaptialized = Boolean.parseBoolean(args[3]);

cov.readTrain (bigram, "/mnt/ainos-research/henryyhc/dat/nytimes/tree", uncaptialized);
cov.readTrain (bigram, "/mnt/ainos-research/henryyhc/dat/wikipedia2015/tree", uncaptialized);
cov.readTrain (bigram, "/home/jdchoi/dat/en-general/trn-pos", uncaptialized);
cov.readTrain (bigram, "/home/jdchoi/dat/en-medical/trn-pos", uncaptialized);
cov.readTrain (bigram, "/home/jdchoi/dat/en-bioinformatics/trn-pos", uncaptialized);
cov.printVocab(bigram, outputFile, cutoff, threshold);
}
}
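AmbiguityClassGenerator serializes its result, a Map<String,List<String>> from simplified word forms to their ambiguity classes, through an XZ-compressed object stream. A minimal sketch of reading that file back, assuming the same edu.emory.mathcs.nlp.common.util.IOUtils class offers a matching createObjectXZBufferedInputStream helper (if it does not, wrapping an ObjectInputStream around the corresponding XZ input stream achieves the same):

import java.io.ObjectInputStream;
import java.util.List;
import java.util.Map;

import edu.emory.mathcs.nlp.common.util.IOUtils;

public class AmbiguityClassReader
{
    @SuppressWarnings("unchecked")
    public static Map<String,List<String>> read(String inputFile) throws Exception
    {
        // counterpart of createObjectXZBufferedOutputStream used by the generator; the helper name is an assumption
        ObjectInputStream fin = IOUtils.createObjectXZBufferedInputStream(inputFile);
        Map<String,List<String>> map = (Map<String,List<String>>)fin.readObject();
        fin.close();
        return map;
    }

    public static void main(String[] args) throws Exception
    {
        Map<String,List<String>> classes = read(args[0]);
        System.out.println("Word types: " + classes.size());
    }
}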
24 changes: 15 additions & 9 deletions core/src/main/java/edu/emory/mathcs/nlp/bin/ModelShrink.java
@@ -16,6 +16,7 @@
package edu.emory.mathcs.nlp.bin;

import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.List;

import org.kohsuke.args4j.Option;
@@ -26,10 +27,10 @@
import edu.emory.mathcs.nlp.component.template.OnlineComponent;
import edu.emory.mathcs.nlp.component.template.eval.Eval;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;
import edu.emory.mathcs.nlp.component.template.reader.TSVReader;
import edu.emory.mathcs.nlp.component.template.state.NLPState;
import edu.emory.mathcs.nlp.component.template.util.GlobalLexica;
import edu.emory.mathcs.nlp.component.template.util.NLPFlag;
import edu.emory.mathcs.nlp.component.template.util.TSVReader;
import edu.emory.mathcs.nlp.learning.optimization.OnlineOptimizer;

/**
@@ -66,28 +67,29 @@ public class ModelShrink
component.setConfiguration(IOUtils.createFileInputStream(configuration_file));
List<String> inputFiles = FileUtils.getFileList(input_path, input_ext);
OnlineOptimizer optimizer = component.getOptimizer();
byte[] prevComponent;
double currScore;

evaluate(inputFiles, component, 0f);

for (float f=start; ; f+=increment)
{
prevComponent = IOUtils.toByteArray(component);
component.getFeatureTemplate().reduce(optimizer.getWeightVector(), f);
currScore = evaluate(inputFiles, component, f);

if (lower_bound >= currScore)
break;
if (lower_bound >= currScore) break;
}

// ObjectOutputStream fout = IOUtils.createObjectXZBufferedOutputStream(model_file+"."+output_ext);
// model.fromByteArray(prevModel);
// fout.writeObject(component);
// fout.close();
ObjectOutputStream fout = IOUtils.createObjectXZBufferedOutputStream(model_file+"."+output_ext);
component = (OnlineComponent<S>)IOUtils.fromByteArray(prevComponent);
fout.writeObject(component);
fout.close();
}

public <S extends NLPState>double evaluate(List<String> inputFiles, OnlineComponent<S> component, float rate) throws Exception
{
TSVReader reader = component.getConfiguration().getTSVReader();
long st, et, ttime = 0, tnode = 0;
NLPNode[] nodes;

component.setFlag(NLPFlag.EVALUATE);
@@ -101,13 +103,17 @@ public <S extends NLPState>double evaluate(List<String> inputFiles, OnlineCompon
while ((nodes = reader.next()) != null)
{
GlobalLexica.assignGlobalLexica(nodes);
st = System.currentTimeMillis();
component.process(nodes);
et = System.currentTimeMillis();
ttime += et - st;
tnode += nodes.length - 1;
}

reader.close();
}

System.out.println(String.format("%5.4f: %s -> %d", rate, eval.toString(), component.getFeatureTemplate().getSparseFeatureSize()));
System.out.println(String.format("%5.4f: %s -> %7d, N/S = %d", rate, eval.toString(), component.getFeatureTemplate().getSparseFeatureSize(), (int)Math.round(1000d * tnode / ttime)));
return eval.score();
}

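ModelShrink now snapshots the component to a byte array before each reduction step and, once the score drops below the lower bound, restores the last snapshot and writes it out; the evaluation loop also times component.process to report nodes per second. A self-contained sketch of the snapshot-and-roll-back pattern on a stand-in model (the score and shrink functions below are placeholders, not the component's API):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

public class ShrinkWithRollback
{
    // stand-in "model": a list of feature weights
    static double score(List<Double> model) { return model.size() / 100d; }
    static void shrink(List<Double> model, float rate) { model.removeIf(w -> Math.abs(w) < rate); }

    static byte[] snapshot(Serializable model) throws Exception
    {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bos)) { out.writeObject(model); }
        return bos.toByteArray();
    }

    @SuppressWarnings("unchecked")
    static List<Double> restore(byte[] bytes) throws Exception
    {
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes))) { return (List<Double>)in.readObject(); }
    }

    public static void main(String[] args) throws Exception
    {
        List<Double> model = new ArrayList<>();
        for (int i=0; i<100; i++) model.add(Math.random());

        double lowerBound = 0.5;
        byte[] prev = null;

        for (float rate=0.01f; ; rate+=0.01f)
        {
            prev = snapshot((Serializable)model);    // keep the model as it was before this step
            shrink(model, rate);
            if (lowerBound >= score(model)) break;   // this step shrank too far
        }

        model = restore(prev);                       // roll back to the last model that still cleared the bound
        System.out.println("kept "+model.size()+" weights");
    }
}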
128 changes: 0 additions & 128 deletions core/src/main/java/edu/emory/mathcs/nlp/bin/NLPEval.java

This file was deleted.

18 changes: 3 additions & 15 deletions core/src/main/java/edu/emory/mathcs/nlp/bin/NLPTrain.java
@@ -22,11 +22,8 @@

import edu.emory.mathcs.nlp.common.util.BinUtils;
import edu.emory.mathcs.nlp.common.util.FileUtils;
import edu.emory.mathcs.nlp.component.dep.DEPTrainer;
import edu.emory.mathcs.nlp.component.doc.DOCTrainer;
import edu.emory.mathcs.nlp.component.ner.NERTrainer;
import edu.emory.mathcs.nlp.component.pos.POSTrainer;
import edu.emory.mathcs.nlp.component.template.train.OnlineTrainer;
import edu.emory.mathcs.nlp.component.template.util.NLPMode;

/**
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
@@ -55,21 +52,12 @@ public NLPTrain(String[] args)
BinUtils.initArgs(args, this);
List<String> trainFiles = FileUtils.getFileList(train_path , train_ext);
List<String> developFiles = FileUtils.getFileList(develop_path, develop_ext);
OnlineTrainer<?> trainer;
OnlineTrainer<?> trainer = new OnlineTrainer<>();

Collections.sort(trainFiles);
Collections.sort(developFiles);

switch (mode)
{
case "pos": trainer = new POSTrainer(); break;
case "ner": trainer = new NERTrainer(); break;
case "dep": trainer = new DEPTrainer(); break;
case "doc": trainer = new DOCTrainer(); break;
default : throw new IllegalArgumentException("Unsupported mode: "+mode);
}

trainer.train(trainFiles, developFiles, configuration_file, model_file, previous_model_file);
trainer.train(NLPMode.valueOf(mode), trainFiles, developFiles, configuration_file, model_file, previous_model_file);
}

static public void main(String[] args)
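NLPTrain drops the per-component trainer switch in favor of a single generic OnlineTrainer that receives the mode as an NLPMode value. A small sketch of the enum dispatch; the constants below mirror the removed string modes and are an assumption about NLPMode, not copied from it:

public class ModeDispatch
{
    // assumed to mirror the old "pos" / "ner" / "dep" / "doc" modes
    enum NLPMode { pos, ner, dep, doc }

    public static void main(String[] args)
    {
        String mode = args.length > 0 ? args[0] : "dep";

        // valueOf rejects unsupported modes with an IllegalArgumentException,
        // which is what the removed switch statement did explicitly
        NLPMode m = NLPMode.valueOf(mode);
        System.out.println("training mode: "+m);
    }
}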
@@ -46,6 +46,13 @@ public Unigram()
best = null;
}

public void clear()
{
count_map.clear();
total_count = 0;
best = null;
}

public void add(T key)
{
add(key, 1);
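The new Unigram.clear() resets the count map, the running total, and the cached best entry, so a single instance can be reused across documents. A small usage sketch (add and getTotalCount are taken from elsewhere in this commit):

import edu.emory.mathcs.nlp.common.collection.ngram.Unigram;

public class UnigramReuse
{
    public static void main(String[] args)
    {
        Unigram<String> tags = new Unigram<>();

        // counts from the first document
        tags.add("NN");
        tags.add("VB");
        System.out.println("doc 1 total: "+tags.getTotalCount());

        // reset and reuse the same instance for the next document
        tags.clear();
        tags.add("JJ");
        System.out.println("doc 2 total: "+tags.getTotalCount());
    }
}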
