Skip to content

Commit

Permalink
transition treebank chunking to 1.5
Browse files Browse the repository at this point in the history
  • Loading branch information
dakrone committed Nov 10, 2010
1 parent d01af27 commit 0797581
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 16 deletions.
Binary file added models/en-chunker.bin
Binary file not shown.
23 changes: 12 additions & 11 deletions src/opennlp/nlp.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
(:import [opennlp.tools.namefind TokenNameFinderModel NameFinderME])
#_(:import [opennlp.tools.chunker ChunkerME])
(:import [opennlp.tools.chunker ChunkerModel ChunkerME])
#_(:import [opennlp.tools.coref LinkerMode])
#_(:import [opennlp.tools.coref.mention Mention DefaultParse])
#_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
Expand Down Expand Up @@ -133,23 +133,24 @@

(defstruct treebank-phrase :phrase :tag)

(defn make-treebank-chunker
  "Return a function for chunking phrases from pos-tagged tokens based on
  a given model file.

  modelfile is the path to a serialized chunker model. It is checked with
  file-exist? and read exactly once, when this function is called; the
  resulting ChunkerModel is shared by every invocation of the returned
  function instead of being re-read from disk on each call.

  The returned function takes pos-tagged-tokens (split into tokens and tags
  with de-interleave; presumably a sequence of token/tag pairs -- confirm
  against the pos tagger's output) and returns a lazy sequence of
  treebank-phrase structs, each with a vector of tokens under :phrase and
  the chunk type under :tag.

  Throws FileNotFoundException if modelfile does not exist."
  [modelfile]
  (when-not (file-exist? modelfile)
    (throw (FileNotFoundException. "Model file does not exist.")))
  ;; Deserialize the model up front; the stream is only needed while the
  ;; ChunkerModel constructor runs, so it is closed immediately afterwards.
  (let [model (with-open [modelstream (FileInputStream. modelfile)]
                (ChunkerModel. modelstream))]
    (fn treebank-chunker
      [pos-tagged-tokens]
      ;; A fresh ChunkerME is still built per call (as before) so that the
      ;; value of *beam-size* in effect at call time is the one used.
      (let [chunker (ChunkerME. model *beam-size*)
            [tokens tags] (de-interleave pos-tagged-tokens)
            chunks (into [] (seq (.chunk chunker tokens tags)))
            sized-chunks (map size-chunk (split-chunks chunks))
            [types sizes] (de-interleave sized-chunks)
            token-chunks (split-with-size sizes tokens)]
        (map #(struct treebank-phrase (into [] (last %)) (first %))
             (partition 2 (interleave types token-chunks)))))))


(defn phrases
Expand Down
10 changes: 5 additions & 5 deletions test/opennlp/test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
(def name-find (make-name-finder "models/namefind/en-ner-person.bin"))
#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
(def chunker (make-treebank-chunker "models/en-chunker.bin"))


(deftest sentence-split-test
Expand Down Expand Up @@ -41,8 +41,6 @@
(is (thrown? java.lang.AssertionError (name-find "asdf"))))


(comment

(deftest chunker-test
(is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
'({:phrase ["The" "override" "system"] :tag "NP"}
Expand All @@ -52,6 +50,8 @@
{:phrase ["the" "brake" "pedal"] :tag "NP"}
{:phrase ["is" "pressed"] :tag "VP"}))))

(comment

(try
(do
(def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
Expand Down Expand Up @@ -82,8 +82,8 @@
(is (= (type (lazy-chunk s tokenize pos-tag chunker))
clojure.lang.LazySeq))
(is (= (first (lazy-chunk s tokenize pos-tag chunker))
'({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))
'({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"}))))))

)



0 comments on commit 0797581

Please sign in to comment.