Permalink
Browse files

Use one model representation.

Also, serialize the model directly to a file. I am still not sure whether
this is what I want. The advantage is that the model is read more quickly.
The disadvantage is that the storage format is no longer simple.
  • Loading branch information...
danieldk committed Oct 30, 2013
1 parent ddd6487 commit b419ec43a010eaade890c7104cfa4aad3fd0f6d5
@@ -27,25 +27,21 @@
import eu.danieldk.nlp.jitar.wordhandler.SuffixWordHandler;
import eu.danieldk.nlp.jitar.wordhandler.WordHandler;
-import java.io.BufferedReader;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
+import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class Evaluate {
public static void main(String[] args) throws IOException {
- if (args.length != 4) {
- System.out.println("Evaluate [brown/conll] lexicon ngrams corpus");
+ if (args.length != 3) {
+ System.out.println("Evaluate [brown/conll] model corpus");
System.exit(1);
}
Model model = null;
try {
- model = Model.readModel(new BufferedReader(new FileReader(args[1])),
- new BufferedReader(new FileReader(args[2])));
+ model = Model.readModel(new File(args[1]));
} catch (IOException e) {
System.out.println("Unable to read training data!");
e.printStackTrace();
@@ -68,7 +64,7 @@ public static void main(String[] args) throws IOException {
BufferedReader reader = null;
try {
- reader = new BufferedReader(new FileReader(args[3]));
+ reader = new BufferedReader(new FileReader(args[2]));
} catch (FileNotFoundException e) {
System.err.println(String.format("Could not open corpus for reading:", e.getMessage()));
System.exit(1);
@@ -14,10 +14,7 @@
package eu.danieldk.nlp.jitar.cli;
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -32,37 +29,24 @@
import eu.danieldk.nlp.jitar.wordhandler.KnownWordHandler;
import eu.danieldk.nlp.jitar.wordhandler.SuffixWordHandler;
import eu.danieldk.nlp.jitar.wordhandler.WordHandler;
+import org.apache.commons.lang3.StringUtils;
public class Tag {
- private static String join(Collection<String> strings, String delimiter) {
- StringBuilder sb = new StringBuilder();
-
- Iterator<String> iter = strings.iterator();
- while (iter.hasNext()) {
- sb.append(iter.next());
- if (iter.hasNext())
- sb.append(delimiter);
- }
-
- return sb.toString();
- }
-
public static void main(String[] args) {
if (args.length != 2) {
System.out.println("Tag lexicon ngrams");
System.exit(1);
}
// Load the model.
- Model model = null;
- try {
- model = Model.readModel(new BufferedReader(new FileReader(args[0])),
- new BufferedReader(new FileReader(args[1])));
- } catch (IOException e) {
- System.out.println("Unable to read the model!");
- e.printStackTrace();
- System.exit(1);
- }
+ Model model = null;
+ try {
+ model = Model.readModel(new File(args[1]));
+ } catch (IOException e) {
+ System.out.println("Unable to read training data!");
+ e.printStackTrace();
+ System.exit(1);
+ }
// Set up word handlers. The suffix word handler is used as a fallback of the
// known word handler.
@@ -95,7 +79,7 @@ public static void main(String[] args) {
HMMTagger.highestProbabilitySequence(tagger.viterbi(tokenList),
model).sequence();
- System.out.println(join(tags.subList(2, tags.size() - 1), " "));
+ System.out.println(StringUtils.join(tags.subList(2, tags.size() - 1), ' '));
}
} catch (IOException e) {
}
@@ -18,6 +18,10 @@
import eu.danieldk.nlp.jitar.corpus.CONLLCorpusReader;
import eu.danieldk.nlp.jitar.corpus.CorpusReader;
import eu.danieldk.nlp.jitar.corpus.TaggedToken;
+import eu.danieldk.nlp.jitar.data.BiGram;
+import eu.danieldk.nlp.jitar.data.Model;
+import eu.danieldk.nlp.jitar.data.TriGram;
+import eu.danieldk.nlp.jitar.data.UniGram;
import eu.danieldk.nlp.jitar.training.FrequenciesCollector;
import java.io.*;
@@ -27,44 +31,9 @@
import java.util.Map.Entry;
public class Train {
- private static void writeNGrams(Map<String, Integer> uniGrams,
- Map<String, Integer> biGrams, Map<String, Integer> triGrams,
- BufferedWriter writer) throws IOException {
- for (Entry<String, Integer> entry : uniGrams.entrySet())
- writer.write(entry.getKey() + " " + entry.getValue() + "\n");
-
- for (Entry<String, Integer> entry : biGrams.entrySet())
- writer.write(entry.getKey() + " " + entry.getValue() + "\n");
-
- for (Entry<String, Integer> entry : triGrams.entrySet())
- writer.write(entry.getKey() + " " + entry.getValue() + "\n");
-
- writer.flush();
- }
-
- private static void writeLexicon(Map<String, Map<String, Integer>> lexicon,
- BufferedWriter writer) throws IOException {
- for (Entry<String, Map<String, Integer>> wordEntry : lexicon.entrySet()) {
- String word = wordEntry.getKey();
-
- writer.write(word);
-
- for (Entry<String, Integer> tagEntry : lexicon.get(word).entrySet()) {
- writer.write(" ");
- writer.write(tagEntry.getKey());
- writer.write(" ");
- writer.write(tagEntry.getValue().toString());
- }
-
- writer.newLine();
- }
-
- writer.flush();
- }
-
public static void main(String[] args) throws IOException {
- if (args.length != 4) {
- System.out.println("Train [brown/conll] corpus lexicon ngrams");
+ if (args.length != 3) {
+ System.out.println("Train [brown/conll] corpus model");
System.exit(1);
}
@@ -103,14 +72,18 @@ else if (args[0].equals("conll"))
corpusReader.close();
}
+ Model model = frequenciesCollector.model();
+
+ ObjectOutputStream oos = null;
try {
- writeLexicon(frequenciesCollector.lexicon(), new BufferedWriter(new FileWriter(args[2])));
- writeNGrams(frequenciesCollector.uniGrams(), frequenciesCollector.biGrams(),
- frequenciesCollector.triGrams(), new BufferedWriter(new FileWriter(args[3])));
+ oos = new ObjectOutputStream(new FileOutputStream(args[2]));
+ oos.writeObject(model);
} catch (IOException e) {
- System.out.println("Could not write training data!");
+ System.out.println("Could not write model!");
e.printStackTrace();
System.exit(1);
+ } finally {
+ oos.close();
}
}
@@ -14,10 +14,14 @@
package eu.danieldk.nlp.jitar.data;
+import java.io.Serializable;
+
/**
* This class represents a word bi-gram.
*/
-public class BiGram {
+public class BiGram implements Serializable {
+ private static final long serialVersionUID = 1L;
+
private final int d_t1;
private final int d_t2;
@@ -14,16 +14,15 @@
package eu.danieldk.nlp.jitar.data;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.*;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
-public class Model {
+public class Model implements Serializable {
+ private static final long serialVersionUID = 1L;
+
private final Map<String, Map<Integer, Integer>> d_wordTagFreqs;
private final Map<String, Integer> d_tagNumbers;
@@ -68,6 +67,23 @@ public static Model readModel(InputStream lexiconStream,
}
+ public static Model readModel(File modelFile) throws IOException {
+ return readModel(new FileInputStream(modelFile));
+ }
+
+ /**
+ * Read a model that was stored with Java object serialization. The
+ * stream is closed before this method returns, also when reading fails.
+ *
+ * NOTE(review): Java deserialization executes code chosen by the data
+ * being read; only load model files that come from a trusted source.
+ *
+ * @param modelStream stream positioned at the start of a serialized Model
+ * @return the deserialized model
+ * @throws IOException if the stream cannot be read, or if a class in the
+ * serialized object graph cannot be found
+ */
+ public static Model readModel(InputStream modelStream) throws IOException {
+ ObjectInputStream ois = null;
+ try {
+ ois = new ObjectInputStream(modelStream);
+ return (Model) ois.readObject();
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e);
+ } finally {
+ // Closing the ObjectInputStream also closes modelStream. If the
+ // wrapper could not be constructed (e.g. invalid stream header),
+ // close modelStream directly so that it does not leak.
+ if (ois != null)
+ ois.close();
+ else if (modelStream != null)
+ modelStream.close();
+ }
+ }
+
/**
* Read a model from files, and construct a <i>Model</i> instance. The
* model should be stored in two text files. The first text file should
@@ -14,10 +14,14 @@
package eu.danieldk.nlp.jitar.data;
+import java.io.Serializable;
+
/**
* This class represents a word tri-gram.
*/
-public class TriGram {
+public class TriGram implements Serializable {
+ private static final long serialVersionUID = 1L;
+
private final int d_t1;
private final int d_t2;
private final int d_t3;
@@ -14,10 +14,14 @@
package eu.danieldk.nlp.jitar.data;
+import java.io.Serializable;
+
/**
* This class represents a word uni-gram.
*/
-public class UniGram {
+public class UniGram implements Serializable {
+ private static final long serialVersionUID = 1L;
+
private final int d_t1;
public UniGram(int t1) {
Oops, something went wrong.

0 comments on commit b419ec4

Please sign in to comment.