Permalink
Browse files

committing code

  • Loading branch information...
1 parent 9c1a8bf commit 70f3dce84927bda51a2c38a05e564d0cad16cf15 @dhgarrette committed May 23, 2012
Showing with 9,882 additions and 3 deletions.
  1. +6 −1 .gitignore
  2. +42 −2 README.md
  3. +11 −0 build.sbt
  4. +700 −0 data/tut-23tags-raw.pos
  5. +659 −0 data/tut-23tags-test.pos
  6. +1,500 −0 data/tut-23tags-train.pos
  7. +700 −0 data/tut-73tags-raw.pos
  8. +659 −0 data/tut-73tags-test.pos
  9. +1,500 −0 data/tut-73tags-train.pos
  10. +4 −0 run.sh
  11. BIN sbt-launch-0.11.2.jar
  12. +6 −0 src/main/resources/log4j.properties
  13. +102 −0 src/main/scala/dhgarrette/typesupervisedtagging/data/ExtractPostags.scala
  14. +140 −0 src/main/scala/dhgarrette/typesupervisedtagging/data/ExtractPostagsTut.scala
  15. +359 −0 src/main/scala/dhgarrette/typesupervisedtagging/minmodel/ModelMinimizer.scala
  16. +117 −0 src/main/scala/dhgarrette/typesupervisedtagging/minmodel/TagBigramSelector.scala
  17. +279 −0 src/main/scala/dhgarrette/typesupervisedtagging/run/MinGreedy.scala
  18. +318 −0 src/main/scala/dhgarrette/typesupervisedtagging/run/PosExperiments.scala
  19. +37 −0 src/main/scala/dhgarrette/typesupervisedtagging/run/Run.scala
  20. +126 −0 src/main/scala/dhgarrette/typesupervisedtagging/util/FileUtils.scala
  21. +43 −0 src/main/scala/dhgarrette/typesupervisedtagging/util/Time.scala
  22. +89 −0 src/main/scala/opennlp/scalabha/tag/Tagger.scala
  23. +67 −0 src/main/scala/opennlp/scalabha/tag/TaggerEvaluator.scala
  24. +86 −0 src/main/scala/opennlp/scalabha/tag/hmm/HmmTagger.scala
  25. +77 −0 src/main/scala/opennlp/scalabha/tag/hmm/SupervisedHmmTaggerTrainer.scala
  26. +454 −0 src/main/scala/opennlp/scalabha/tag/hmm/UnsupervisedHmmTaggerTrainer.scala
  27. +31 −0 src/main/scala/opennlp/scalabha/tag/hmm/support/HmmSmoothingFreqCounter.scala
  28. +108 −0 src/main/scala/opennlp/scalabha/tag/hmm/support/UnsupervisedEmissionDist.scala
  29. +253 −0 src/main/scala/opennlp/scalabha/tag/support/CondCountsTransformer.scala
  30. +160 −0 src/main/scala/opennlp/scalabha/tag/support/CountsTransformer.scala
  31. +65 −0 src/main/scala/opennlp/scalabha/tag/support/DefaultedFreqCounts.scala
  32. +57 −0 src/main/scala/opennlp/scalabha/tag/support/FreqCounts.scala
  33. +93 −0 src/main/scala/opennlp/scalabha/tag/support/FreqDist.scala
  34. +111 −0 src/main/scala/opennlp/scalabha/tag/support/TagDictFactory.scala
  35. +758 −0 src/main/scala/opennlp/scalabha/util/CollectionUtils.scala
  36. +85 −0 src/main/scala/opennlp/scalabha/util/LogNum.scala
  37. +80 −0 src/main/scala/opennlp/scalabha/util/Pattern.scala
View
@@ -10,4 +10,9 @@ project/boot/
project/plugins/project/
# Scala-IDE specific
-.scala_dependencies
+.scala_dependencies
+project
+.project
+.classpath
+.cache
+
View
@@ -1,2 +1,42 @@
-type-supervised-tagging-emnlp2012
-=================================
+[Dan Garrette]: http://cs.utexas.edu/~dhg
+[Jason Baldridge]: http://www.jasonbaldridge.com
+
+
+Type-supervised tagging: EMNLP 2012
+===================================
+
+This repository contains the code, scripts, and instructions needed to reproduce the results in the paper
+
+ > Type-supervised Hidden Markov Models for POS Tagging with Incomplete Tag Dictionaries
+ > [Dan Garrette] and [Jason Baldridge]
+ > In Proceedings of EMNLP 2012
+
+This code is outdated and will not be maintained.
+To see the most up-to-date version of the HMM code, visit the [scalabha](https://github.com/utcompling/Scalabha) repository.
+
+Running the experiments
+-------
+
+**Set up English data**
+
+    sh run.sh "en-data /path/to/treebank"
+
+The treebank directory should contain a folder named `combined` that holds the files
+`wsj_0000.mrg` through `wsj_2454.mrg`.
+
+**Run English experiments on sections 00-15**
+
+    sh run.sh en-run16
+
+**Run English experiments on sections 00-07**
+
+    sh run.sh en-run8
+
+**Run Italian experiments**
+
+The Italian data is already located in the `data` directory, so this
+experiment can be launched immediately.
+
+    sh run.sh it-run
+
+If you find any problems with these instructions, please contact Dan Garrette (dhg@cs.utexas.edu).
View
@@ -0,0 +1,11 @@
+// sbt build definition for the type-supervised tagging experiment code.
+name := "type-supervised-tagging-2012emnlp"
+
+version := "0.0.1"
+
+scalaVersion := "2.9.1"
+
+// Logging libraries; log4j is configured by src/main/resources/log4j.properties.
+libraryDependencies ++= Seq(
+ "commons-logging" % "commons-logging" % "1.1.1",
+ "log4j" % "log4j" % "1.2.16")
+
+// Main class started by the sbt `run` task, which is what run.sh invokes.
+mainClass in (Compile, run) := Some("dhgarrette.typesupervisedtagging.run.Run")
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
4 run.sh
@@ -0,0 +1,4 @@
+#/bin/bash
+
+java -Dfile.encoding=UTF8 -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=256m -jar sbt-launch-*.jar "run $@"
+
View
Binary file not shown.
@@ -0,0 +1,6 @@
+# Route every log event at DEBUG level or above to the appender named A1.
+log4j.rootLogger=DEBUG, A1
+
+# A1 prints to the console and formats events with a pattern layout.
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+
+# Output format: time (HH:mm:ss), level left-justified to 5 characters,
+# last component of the logger name, the message, then a line separator.
+log4j.appender.A1.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c{1} %m%n
@@ -0,0 +1,102 @@
+package dhgarrette.typesupervisedtagging.data
+
+import java.io.File
+import java.io.BufferedReader
+import dhgarrette.typesupervisedtagging.util.FileUtils._
+import java.io.FileReader
+import scala.io.Source
+import java.io.BufferedWriter
+import java.io.FileWriter
+import java.io.Writer
+import scala.collection.mutable.Buffer
+import dhgarrette.typesupervisedtagging.util.FileUtils
+
+/**
+ * Reads bracketed treebank files (`wsj_SSNN.mrg`) and writes their terminals
+ * as `word|POS` tokens, one sentence per line, to `data/sXX-YY.pos` files.
+ */
+object ExtractPostags {
+
+  /** Matches a terminal node `(TAG word)`: group 1 is the tag, group 2 the word. */
+  val TERMINAL_RE = """\((\S+) (\S+)\)""".r
+
+  /**
+   * Iterates over the top-level parenthesized trees contained in a sequence
+   * of files. Files are opened lazily, one at a time, as the iterator advances.
+   *
+   * @param files paths of the files to read, in the order they should be consumed
+   */
+  class SentenceIterator(private val files: Iterator[String]) extends Iterator[String] {
+    // Trees of the file currently being consumed.
+    private var linesIterator: Iterator[String] = Iterator.empty
+
+    /**
+     * Returns the next tree.
+     *
+     * @throws java.util.NoSuchElementException if no trees remain
+     */
+    override def next(): String = {
+      if (!hasNext)
+        throw new java.util.NoSuchElementException("no more sentences")
+      linesIterator.next
+    }
+
+    /** True if another tree is available; loads further files as needed. */
+    override def hasNext(): Boolean = {
+      // Skip over files that contribute no trees at all.
+      while (!linesIterator.hasNext && files.hasNext)
+        linesIterator = readTrees(files.next)
+      linesIterator.hasNext
+    }
+
+    /**
+     * Reads one file and splits its content into top-level balanced
+     * parenthesized spans. The file handle is closed before returning.
+     */
+    private def readTrees(path: String): Iterator[String] = {
+      val source = Source.fromFile(path)
+      val text = try source.getLines.mkString(" ") finally source.close()
+
+      var depth = 0
+      var start = 0
+      val trees = Buffer[String]()
+      for (i <- 0 until text.length) {
+        val c = text.charAt(i)
+        if (c == '(') {
+          if (depth == 0)
+            start = i
+          depth += 1
+        } else if (c == ')') {
+          depth -= 1
+          if (depth == 0) {
+            val tree = text.substring(start, i + 1)
+            // Unbalanced input can yield a span that does not open with '(';
+            // drop it here so that hasNext and next always agree.
+            if (tree.trim.startsWith("("))
+              trees.append(tree)
+          }
+        }
+      }
+      trees.iterator
+    }
+  }
+
+  /**
+   * Entry point.
+   *
+   * @param args `args(0)` is the treebank directory, which must contain a
+   *             `combined` sub-directory holding the `wsj_SSNN.mrg` files
+   */
+  def main(args: Array[String]): Unit = {
+    require(args.length >= 1, "Usage: ExtractPostags <treebank-directory>")
+    val combDirPath = FileUtils.pathjoin(args(0), "combined")
+    val combDir = new File(combDirPath)
+    // isDirectory (not just exists) so that listFiles below cannot return null
+    // because the path names a regular file.
+    require(combDir.isDirectory, "Directory '%s' does not exist".format(combDirPath))
+
+    // Group 1 is the two-digit section number, group 2 the file number within it.
+    val FnRe = """^wsj_(\d\d)(\d\d)\.mrg$""".r
+
+    /** Trees of all files whose section number lies in `sections`, in path order. */
+    def readSections(sections: Range) = {
+      val paths =
+        combDir.listFiles
+          .filter { f =>
+            f.getName match {
+              case FnRe(sec, _) => sections.contains(sec.toInt)
+              case _ => false
+            }
+          }
+          .map(_.getAbsolutePath)
+          .sorted
+      new SentenceIterator(paths.iterator)
+    }
+
+    def posFormatter(word: String, pos: String) = "%s|%s".format(word, pos)
+
+    /** Writes the sentences of the given sections to `data/sXX-YY.pos`. */
+    def writeSentences(sec: Range, formatter: (String, String) => String) = {
+      writeUsing("data/s%02d-%02d.pos".format(sec.head, sec.last)) { w =>
+        for (tree <- readSections(sec)) {
+          val tokens =
+            TERMINAL_RE.findAllIn(tree).matchData
+              .map(_.subgroups)
+              .flatMap {
+                // Terminals tagged -NONE- are left out of the output.
+                case Seq("-NONE-", word) => None
+                case Seq(pos, word) => Some(formatter(word, pos))
+              }
+          w.write(tokens.mkString(" ") + "\n")
+        }
+      }
+    }
+
+    // Section ranges written out; 00-07 and 00-15 are the two ranges the
+    // README's English experiments refer to.
+    writeSentences(0 to 7, posFormatter)
+    writeSentences(0 to 15, posFormatter)
+    writeSentences(16 to 18, posFormatter)
+    writeSentences(19 to 21, posFormatter)
+    writeSentences(22 to 24, posFormatter)
+
+  }
+}
@@ -0,0 +1,140 @@
+package dhgarrette.typesupervisedtagging.data
+
+import java.io.File
+import java.io.BufferedReader
+import dhgarrette.typesupervisedtagging.util.FileUtils._
+import opennlp.scalabha.util.CollectionUtils._
+import java.io.FileReader
+import scala.io.Source
+import java.io.BufferedWriter
+import java.io.FileWriter
+import java.io.Writer
+import scala.collection.mutable.Buffer
+
+/**
+ * Extracts `word|POS` sentences from the TUT section files
+ * (`data/tut/<section>.penn`) and writes train/raw/test splits.
+ *
+ * Sentence counts per section and split:
+ * {{{
+ * TUT Split      total  train   raw  test
+ * CODICECIVILE    1100    600   250   250
+ * NEWS             700    400   150   150
+ * VEDCH            400    200   100   100
+ * EUDIR            200    100    50    50
+ * WIKI             459    200   150   109
+ *                 2859   1500   700   659
+ * }}}
+ */
+
+object ExtractPostagsTut {
+
+  /** Matches a terminal node `(TAG word)`: group 1 is the tag, group 2 the word. */
+  val TERMINAL_RE = """\((\S+) (\S+)\)""".r
+
+  /**
+   * Iterates over the top-level parenthesized trees contained in a sequence
+   * of files. Files are opened lazily, one at a time, as the iterator advances.
+   *
+   * @param files paths of the files to read, in the order they should be consumed
+   */
+  class SentenceIterator(private val files: Iterator[String]) extends Iterator[String] {
+    // Trees of the file currently being consumed.
+    private var linesIterator: Iterator[String] = Iterator.empty
+
+    /**
+     * Returns the next tree.
+     *
+     * @throws java.util.NoSuchElementException if no trees remain
+     */
+    override def next(): String = {
+      if (!hasNext)
+        throw new java.util.NoSuchElementException("no more sentences")
+      linesIterator.next
+    }
+
+    /** True if another tree is available; loads further files as needed. */
+    override def hasNext(): Boolean = {
+      // Skip over files that contribute no trees at all.
+      while (!linesIterator.hasNext && files.hasNext)
+        linesIterator = readTrees(files.next)
+      linesIterator.hasNext
+    }
+
+    /**
+     * Reads one file and splits its content into top-level balanced
+     * parenthesized spans. The file handle is closed before returning.
+     */
+    private def readTrees(path: String): Iterator[String] = {
+      val source = Source.fromFile(path)
+      val text = try source.getLines.mkString(" ") finally source.close()
+
+      var depth = 0
+      var start = 0
+      val trees = Buffer[String]()
+      for (i <- 0 until text.length) {
+        val c = text.charAt(i)
+        if (c == '(') {
+          if (depth == 0)
+            start = i
+          depth += 1
+        } else if (c == ')') {
+          depth -= 1
+          if (depth == 0) {
+            val tree = text.substring(start, i + 1)
+            // Unbalanced input can yield a span that does not open with '(';
+            // drop it here so that hasNext and next always agree.
+            if (tree.trim.startsWith("("))
+              trees.append(tree)
+          }
+        }
+      }
+      trees.iterator
+    }
+  }
+
+  /** Entry point; takes no arguments and reads from / writes to the `data` directory. */
+  def main(args: Array[String]): Unit = {
+
+    /**
+     * Reads one section and splits its sentences, in file order, into
+     * (first `trainNum`, next `rawNum`, remainder).
+     */
+    def read(sec: String, trainNum: Int, rawNum: Int) = {
+      val Pos1Re = """^(.+)-[.0-9]+$""".r
+      val Pos2Re = """^(.+)~[^~]*$""".r
+
+      // Strips a trailing "-<digits/dots>" suffix from a tag, if present.
+      def cleanPosFine(p: String) = {
+        p match { case Pos1Re(t) => t; case t => t }
+      }
+
+      // Additionally strips the last "~..." suffix. Not called below;
+      // NOTE(review): presumably swapped in for cleanPosFine to produce the
+      // smaller tag set -- confirm before removing.
+      def cleanPosCoarse(p: String) = {
+        cleanPosFine(p) match { case Pos2Re(t) => t; case t => t }
+      }
+
+      val sentences =
+        new SentenceIterator(Iterator(new File("data/tut/%s.penn".format(sec)).getAbsolutePath))
+          .map(tree =>
+            TERMINAL_RE.findAllIn(tree).matchData
+              .map(_.subgroups)
+              .flatMap {
+                // Terminals tagged -NONE- are left out of the output.
+                case Seq("-NONE-", word) => None
+                case Seq(pos, word) => Some((word.replaceAll("\\)", ""), cleanPosFine(pos)))
+              }
+              .map {
+                case (word, tag) =>
+                  if (word.trim.isEmpty)
+                    throw new RuntimeException(
+                      "Empty word with tag '%s' in section '%s'".format(tag, sec))
+                  (word, tag)
+              }
+              .toList)
+          .toList
+
+      val (train, rawTest) = sentences.splitAt(trainNum)
+      val (raw, test) = rawTest.splitAt(rawNum)
+
+      println(sec)
+      println("Num sentences: " + sentences.size)
+      println("Num tokens: " + sentences.flatten.size)
+      println("train sentences: " + train.size)
+      println("raw sentences: " + raw.size)
+      println("test sentences: " + test.size)
+
+      (train, raw, test)
+    }
+
+    val (trains, raws, tests) =
+      List(
+        read("codicecivile", 600, 250),
+        read("news", 400, 150),
+        read("vedch", 200, 100),
+        read("eudir", 100, 50),
+        read("wiki", 200, 150))
+        .unzip3
+
+    val train = trains.flatten
+    val raw = raws.flatten
+    val test = tests.flatten
+
+    // Print the observed tag set as a Scala `Set(...)` literal of quoted strings.
+    val allTags = (train ++ raw ++ test).flatten.map(_._2).toSet
+    println("allTags = " + allTags.map(t => "\"" + t.replace("\"", "\\\"") + "\"").mkString("Set(", ",", ")"))
+
+    /** Writes the sentences to `data/tut-<name>.pos`, one sentence per line. */
+    def write(name: String, stuff: Iterable[List[(String, String)]]) = {
+      println("%s: %s sentences, %s tokens".format(name, stuff.size, stuff.flatten.size))
+      writeUsing("data/tut-%s.pos".format(name)) { out =>
+        for (sentence <- stuff)
+          out.write(sentence.map { case (word, pos) => "%s|%s".format(word, pos) }.mkString(" ") + "\n")
+      }
+    }
+
+    write("train", train)
+    write("raw", raw)
+    write("test", test)
+
+  }
+}
Oops, something went wrong.

0 comments on commit 70f3dce

Please sign in to comment.