Permalink
Browse files

Transition treebank chunking to the OpenNLP 1.5 API

  • Loading branch information...
1 parent d01af27 commit 0797581ea1885142411447033216c92182502390 @dakrone committed Nov 10, 2010
Showing 3 changed files with 17 additions and 16 deletions.
  1. BIN models/en-chunker.bin
  2. +12 −11 src/opennlp/nlp.clj
  3. +5 −5 test/opennlp/test.clj
View
Binary file not shown.
View
@@ -10,7 +10,7 @@
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
(:import [opennlp.tools.namefind TokenNameFinderModel NameFinderME])
- #_(:import [opennlp.tools.chunker ChunkerME])
+ (:import [opennlp.tools.chunker ChunkerModel ChunkerME])
#_(:import [opennlp.tools.coref LinkerMode])
#_(:import [opennlp.tools.coref.mention Mention DefaultParse])
#_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
@@ -133,23 +133,24 @@
(defstruct treebank-phrase :phrase :tag)
-#_(defn make-treebank-chunker
+(defn make-treebank-chunker
"Return a function for chunking phrases from pos-tagged tokens based on
a given model file."
[modelfile]
(if-not (file-exist? modelfile)
(throw (FileNotFoundException. "Model file does not exist."))
(fn treebank-chunker
[pos-tagged-tokens]
- (let [model (.getModel (SuffixSensitiveGISModelReader. (File. modelfile)))
- chunker (ChunkerME. model)
- [tokens tags] (de-interleave pos-tagged-tokens)
- chunks (into [] (seq (.chunk chunker tokens tags)))
- sized-chunks (map size-chunk (split-chunks chunks))
- [types sizes] (de-interleave sized-chunks)
- token-chunks (split-with-size sizes tokens)]
- (map #(struct treebank-phrase (into [] (last %)) (first %))
- (partition 2 (interleave types token-chunks)))))))
+ (with-open [modelstream (FileInputStream. modelfile)]
+ (let [model (ChunkerModel. modelstream)
+ chunker (ChunkerME. model *beam-size*)
+ [tokens tags] (de-interleave pos-tagged-tokens)
+ chunks (into [] (seq (.chunk chunker tokens tags)))
+ sized-chunks (map size-chunk (split-chunks chunks))
+ [types sizes] (de-interleave sized-chunks)
+ token-chunks (split-with-size sizes tokens)]
+ (map #(struct treebank-phrase (into [] (last %)) (first %))
+ (partition 2 (interleave types token-chunks))))))))
(defn phrases
View
@@ -8,7 +8,7 @@
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
(def name-find (make-name-finder "models/namefind/en-ner-person.bin"))
-#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
+(def chunker (make-treebank-chunker "models/en-chunker.bin"))
(deftest sentence-split-test
@@ -41,8 +41,6 @@
(is (thrown? java.lang.AssertionError (name-find "asdf"))))
-(comment
-
(deftest chunker-test
(is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
'({:phrase ["The" "override" "system"] :tag "NP"}
@@ -52,6 +50,8 @@
{:phrase ["the" "brake" "pedal"] :tag "NP"}
{:phrase ["is" "pressed"] :tag "VP"}))))
+(comment
+
(try
(do
(def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
@@ -82,8 +82,8 @@
(is (= (type (lazy-chunk s tokenize pos-tag chunker))
clojure.lang.LazySeq))
(is (= (first (lazy-chunk s tokenize pos-tag chunker))
- '({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))
+ '({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"}))))))
- )
+

0 comments on commit 0797581

Please sign in to comment.