Permalink
Browse files

update treebank parsing for 1.5.

add additional test for make-tree
add model file for name finding
  • Loading branch information...
1 parent 0797581 commit c9f577322c4d17e61d903afdf6c9cb3b8f9995d6 @dakrone committed Nov 10, 2010
Binary file not shown.
View
@@ -0,0 +1,4 @@
+This directory is for the model file used for treebank parsing.
+
+You can download the file from:
+http://opennlp.sourceforge.net/models-1.5/
View
@@ -1,4 +0,0 @@
-This directory is for the model files used to treebank-parsing.
-
-You can download the files from:
-http://opennlp.sourceforge.net/models/english/parser/
View
@@ -3,9 +3,7 @@
(:use [clojure.contrib.seq-utils :only [indexed]])
(:use [clojure.contrib.pprint :only [pprint]])
(:import [java.io File FileNotFoundException FileInputStream])
- #_(:import [opennlp.maxent DataStream GISModel])
- #_(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
- #_(:import [opennlp.tools.util Span])
+ (:import [opennlp.tools.util Span])
#_(:import [opennlp.tools.dictionary Dictionary])
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
@@ -15,7 +13,8 @@
#_(:import [opennlp.tools.coref.mention Mention DefaultParse])
#_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
#_(:import [opennlp.tools.parser.chunking Parser])
- #_(:import [opennlp.tools.parser AbstractBottomUpParser Parse])
+ (:import [opennlp.tools.parser Parse ParserModel ParserFactory AbstractBottomUpParser])
+ (:import [opennlp.tools.cmdline.parser ParserTool])
(:import [opennlp.tools.postag POSModel POSTaggerME]))
@@ -197,56 +196,29 @@
(.replaceAll "\\}" "-RCB-")))
-#_(defn- parse-line
+(defn- parse-line
"Given a line and Parser object, return a list of Parses."
[line parser]
(let [line (strip-parens line)
results (StringBuffer.)
- words (.split line " ")
- p (Parse. line (Span. 0 (count line)) AbstractBottomUpParser/INC_NODE (double 1) (int 0))]
- (loop [parse-index 0 start-index 0]
- (if (> (+ parse-index 1) (count words))
- nil
- (let [token (get words parse-index)]
- ;(println "inserting " token " at " i " pidx " parse-index " sidx " start-index)
- ; Mutable state, but contained only in the parse-line function
- (.insert p (Parse. line
- (Span. start-index (+ start-index (count token)))
- AbstractBottomUpParser/TOK_NODE
- (double 0)
- (int parse-index)))
- (recur (inc parse-index) (+ 1 start-index (count token))))))
- (.show (.parse parser p) results)
+ parse-num 1]
+ (.show (first (ParserTool/parseLine line parser parse-num)) results)
(.toString results)))
-#_(defn make-treebank-parser
+(defn make-treebank-parser
"Return a function for treebank parsing a sequence of sentences, based on
- given build, check, tag, chunk models and a set of head rules."
- [buildmodel checkmodel tagmodel chunkmodel headrules & opts]
- (if-not (files-exist? [buildmodel checkmodel tagmodel chunkmodel headrules])
- (throw (FileNotFoundException. "One or more of the model or rule files does not exist"))
+ a given model file."
+ [modelfile]
+ (if-not (file-exist? modelfile)
+ (throw (FileNotFoundException. "The model file does not exist."))
(fn treebank-parser
[text]
- (let [builder (-> (File. buildmodel) SuffixSensitiveGISModelReader. .getModel)
- checker (-> (File. checkmodel) SuffixSensitiveGISModelReader. .getModel)
- opt-map (apply hash-map opts)
- parsetagger (if (and (:tagdict opt-map) (file-exist? (:tagdict opt-map)))
- (if (:case-sensitive opt-map)
- (ParserTagger. tagmodel (:tagdict opt-map) true)
- (ParserTagger. tagmodel (:tagdict opt-map) false))
- (ParserTagger. tagmodel nil))
- parsechunker (ParserChunker. chunkmodel)
- headrules (HeadRules. headrules)
- parser (Parser. builder
- checker
- parsetagger
- parsechunker
- headrules
- (int *beam-size*)
- (double *advance-percentage*))
- parses (map #(parse-line % parser) text)]
- (vec parses)))))
+ (with-open [modelstream (FileInputStream. modelfile)]
+ (let [model (ParserModel. modelstream)
+ parser (ParserFactory/create model)
+ parses (map #(parse-line % parser) text)]
+ (vec parses))))))
(defn- strip-funny-chars
View
@@ -41,48 +41,51 @@
(is (thrown? java.lang.AssertionError (name-find "asdf"))))
- (deftest chunker-test
- (is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
- '({:phrase ["The" "override" "system"] :tag "NP"}
- {:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
- {:phrase ["the" "accelerator"] :tag "NP"}
- {:phrase ["when"] :tag "ADVP"}
- {:phrase ["the" "brake" "pedal"] :tag "NP"}
- {:phrase ["is" "pressed"] :tag "VP"}))))
-
-(comment
-
- (try
- (do
- (def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
- (deftest parser-test
- (is (= (parser ["This is a sentence ."])
- ["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))))
- (catch FileNotFoundException e
- (println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))
-
- (deftest no-model-file-test
- (is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-name-finder "nonexistantfile" "anotherfilethatdoesnotexist")))
- (is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile" "asdf" "fdsa" "qwer" "rewq"))))
-
- (deftest laziness-test
- (let [s (get-sentences "First sentence. Second sentence?")]
- (is (= (type (lazy-tokenize s tokenize))
- clojure.lang.LazySeq))
- (is (= (first (lazy-tokenize s tokenize))
- ["First" "sentence" "."]))
- (is (= (type (lazy-tag s tokenize pos-tag))
- clojure.lang.LazySeq))
- (is (= (first (lazy-tag s tokenize pos-tag))
- '(["First" "RB"] ["sentence" "NN"] ["." "."])))
- (is (= (type (lazy-chunk s tokenize pos-tag chunker))
- clojure.lang.LazySeq))
- (is (= (first (lazy-chunk s tokenize pos-tag chunker))
- '({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"}))))))
+(deftest chunker-test
+ (is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
+ '({:phrase ["The" "override" "system"] :tag "NP"}
+ {:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
+ {:phrase ["the" "accelerator"] :tag "NP"}
+ {:phrase ["when"] :tag "ADVP"}
+ {:phrase ["the" "brake" "pedal"] :tag "NP"}
+ {:phrase ["is" "pressed"] :tag "VP"}))))
+
+(deftest no-model-file-test
+ (is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-name-finder "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile"))))
+
+
+(try
+ (do
+ (def parser (make-treebank-parser "parser-model/en-parser-chunking.bin"))
+ (deftest parser-test
+ (is (= (parser ["This is a sentence ."])
+ ["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))
+ (is (= (make-tree (first (parser ["This is a sentence ."])))
+ '{:chunk {:chunk ({:chunk {:chunk "This", :tag DT}, :tag NP} {:chunk ({:chunk "is", :tag VBZ} {:chunk ({:chunk "a", :tag DT} {:chunk "sentence", :tag NN}), :tag NP}), :tag VP} {:chunk ".", :tag .}), :tag S}, :tag TOP}))))
+ (catch FileNotFoundException e
+ (println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))
+
+
+
+(deftest laziness-test
+ (let [s (get-sentences "First sentence. Second sentence?")]
+ (is (= (type (lazy-tokenize s tokenize))
+ clojure.lang.LazySeq))
+ (is (= (first (lazy-tokenize s tokenize))
+ ["First" "sentence" "."]))
+ (is (= (type (lazy-tag s tokenize pos-tag))
+ clojure.lang.LazySeq))
+ (is (= (first (lazy-tag s tokenize pos-tag))
+ '(["First" "RB"] ["sentence" "NN"] ["." "."])))
+ (is (= (type (lazy-chunk s tokenize pos-tag chunker))
+ clojure.lang.LazySeq))
+ (is (= (first (lazy-chunk s tokenize pos-tag chunker))
+ '({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))

0 comments on commit c9f5773

Please sign in to comment.