Skip to content

Commit

Permalink
update treebank parsing for 1.5.
Browse files Browse the repository at this point in the history
add additional test for make-tree
add model file for name finding
  • Loading branch information
dakrone committed Nov 10, 2010
1 parent 0797581 commit c9f5773
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 90 deletions.
Binary file added models/namefind/en-ner-person.bin
Binary file not shown.
4 changes: 4 additions & 0 deletions parser-model/README.txt
@@ -0,0 +1,4 @@
This directory is for the model file used for treebank parsing.

You can download the file from:
http://opennlp.sourceforge.net/models-1.5/
4 changes: 0 additions & 4 deletions parser-models/README.txt

This file was deleted.

60 changes: 16 additions & 44 deletions src/opennlp/nlp.clj
Expand Up @@ -3,9 +3,7 @@
(:use [clojure.contrib.seq-utils :only [indexed]]) (:use [clojure.contrib.seq-utils :only [indexed]])
(:use [clojure.contrib.pprint :only [pprint]]) (:use [clojure.contrib.pprint :only [pprint]])
(:import [java.io File FileNotFoundException FileInputStream]) (:import [java.io File FileNotFoundException FileInputStream])
#_(:import [opennlp.maxent DataStream GISModel]) (:import [opennlp.tools.util Span])
#_(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
#_(:import [opennlp.tools.util Span])
#_(:import [opennlp.tools.dictionary Dictionary]) #_(:import [opennlp.tools.dictionary Dictionary])
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME]) (:import [opennlp.tools.tokenize TokenizerModel TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME]) (:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
Expand All @@ -15,7 +13,8 @@
#_(:import [opennlp.tools.coref.mention Mention DefaultParse]) #_(:import [opennlp.tools.coref.mention Mention DefaultParse])
#_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse]) #_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
#_(:import [opennlp.tools.parser.chunking Parser]) #_(:import [opennlp.tools.parser.chunking Parser])
#_(:import [opennlp.tools.parser AbstractBottomUpParser Parse]) (:import [opennlp.tools.parser Parse ParserModel ParserFactory AbstractBottomUpParser])
(:import [opennlp.tools.cmdline.parser ParserTool])
(:import [opennlp.tools.postag POSModel POSTaggerME])) (:import [opennlp.tools.postag POSModel POSTaggerME]))




Expand Down Expand Up @@ -197,56 +196,29 @@
(.replaceAll "\\}" "-RCB-"))) (.replaceAll "\\}" "-RCB-")))




#_(defn- parse-line (defn- parse-line
"Given a line and Parser object, return a list of Parses." "Given a line and Parser object, return a list of Parses."
[line parser] [line parser]
(let [line (strip-parens line) (let [line (strip-parens line)
results (StringBuffer.) results (StringBuffer.)
words (.split line " ") parse-num 1]
p (Parse. line (Span. 0 (count line)) AbstractBottomUpParser/INC_NODE (double 1) (int 0))] (.show (first (ParserTool/parseLine line parser parse-num)) results)
(loop [parse-index 0 start-index 0]
(if (> (+ parse-index 1) (count words))
nil
(let [token (get words parse-index)]
;(println "inserting " token " at " i " pidx " parse-index " sidx " start-index)
; Mutable state, but contained only in the parse-line function
(.insert p (Parse. line
(Span. start-index (+ start-index (count token)))
AbstractBottomUpParser/TOK_NODE
(double 0)
(int parse-index)))
(recur (inc parse-index) (+ 1 start-index (count token))))))
(.show (.parse parser p) results)
(.toString results))) (.toString results)))




#_(defn make-treebank-parser (defn make-treebank-parser
"Return a function for treebank parsing a sequence of sentences, based on "Return a function for treebank parsing a sequence of sentences, based on
given build, check, tag, chunk models and a set of head rules." a given model file."
[buildmodel checkmodel tagmodel chunkmodel headrules & opts] [modelfile]
(if-not (files-exist? [buildmodel checkmodel tagmodel chunkmodel headrules]) (if-not (file-exist? modelfile)
(throw (FileNotFoundException. "One or more of the model or rule files does not exist")) (throw (FileNotFoundException. "The model file does not exist."))
(fn treebank-parser (fn treebank-parser
[text] [text]
(let [builder (-> (File. buildmodel) SuffixSensitiveGISModelReader. .getModel) (with-open [modelstream (FileInputStream. modelfile)]
checker (-> (File. checkmodel) SuffixSensitiveGISModelReader. .getModel) (let [model (ParserModel. modelstream)
opt-map (apply hash-map opts) parser (ParserFactory/create model)
parsetagger (if (and (:tagdict opt-map) (file-exist? (:tagdict opt-map))) parses (map #(parse-line % parser) text)]
(if (:case-sensitive opt-map) (vec parses))))))
(ParserTagger. tagmodel (:tagdict opt-map) true)
(ParserTagger. tagmodel (:tagdict opt-map) false))
(ParserTagger. tagmodel nil))
parsechunker (ParserChunker. chunkmodel)
headrules (HeadRules. headrules)
parser (Parser. builder
checker
parsetagger
parsechunker
headrules
(int *beam-size*)
(double *advance-percentage*))
parses (map #(parse-line % parser) text)]
(vec parses)))))




(defn- strip-funny-chars (defn- strip-funny-chars
Expand Down
87 changes: 45 additions & 42 deletions test/opennlp/test.clj
Expand Up @@ -41,48 +41,51 @@
(is (thrown? java.lang.AssertionError (name-find "asdf")))) (is (thrown? java.lang.AssertionError (name-find "asdf"))))




(deftest chunker-test (deftest chunker-test
(is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed."))) (is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
'({:phrase ["The" "override" "system"] :tag "NP"} '({:phrase ["The" "override" "system"] :tag "NP"}
{:phrase ["is" "meant" "to" "deactivate"] :tag "VP"} {:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
{:phrase ["the" "accelerator"] :tag "NP"} {:phrase ["the" "accelerator"] :tag "NP"}
{:phrase ["when"] :tag "ADVP"} {:phrase ["when"] :tag "ADVP"}
{:phrase ["the" "brake" "pedal"] :tag "NP"} {:phrase ["the" "brake" "pedal"] :tag "NP"}
{:phrase ["is" "pressed"] :tag "VP"})))) {:phrase ["is" "pressed"] :tag "VP"}))))


(comment (deftest no-model-file-test

(is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
(try (is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
(do (is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
(def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules")) (is (thrown? FileNotFoundException (make-name-finder "nonexistantfile")))
(deftest parser-test (is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
(is (= (parser ["This is a sentence ."]) (is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile"))))
["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))))
(catch FileNotFoundException e
(println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models."))) (try

(do
(deftest no-model-file-test (def parser (make-treebank-parser "parser-model/en-parser-chunking.bin"))
(is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile"))) (deftest parser-test
(is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile"))) (is (= (parser ["This is a sentence ."])
(is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile"))) ["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))
(is (thrown? FileNotFoundException (make-name-finder "nonexistantfile" "anotherfilethatdoesnotexist"))) (is (= (make-tree (first (parser ["This is a sentence ."])))
(is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile"))) '{:chunk {:chunk ({:chunk {:chunk "This", :tag DT}, :tag NP} {:chunk ({:chunk "is", :tag VBZ} {:chunk ({:chunk "a", :tag DT} {:chunk "sentence", :tag NN}), :tag NP}), :tag VP} {:chunk ".", :tag .}), :tag S}, :tag TOP}))))
(is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile" "asdf" "fdsa" "qwer" "rewq")))) (catch FileNotFoundException e

(println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))
(deftest laziness-test
(let [s (get-sentences "First sentence. Second sentence?")]
(is (= (type (lazy-tokenize s tokenize))
clojure.lang.LazySeq)) (deftest laziness-test
(is (= (first (lazy-tokenize s tokenize)) (let [s (get-sentences "First sentence. Second sentence?")]
["First" "sentence" "."])) (is (= (type (lazy-tokenize s tokenize))
(is (= (type (lazy-tag s tokenize pos-tag)) clojure.lang.LazySeq))
clojure.lang.LazySeq)) (is (= (first (lazy-tokenize s tokenize))
(is (= (first (lazy-tag s tokenize pos-tag)) ["First" "sentence" "."]))
'(["First" "RB"] ["sentence" "NN"] ["." "."]))) (is (= (type (lazy-tag s tokenize pos-tag))
(is (= (type (lazy-chunk s tokenize pos-tag chunker)) clojure.lang.LazySeq))
clojure.lang.LazySeq)) (is (= (first (lazy-tag s tokenize pos-tag))
(is (= (first (lazy-chunk s tokenize pos-tag chunker)) '(["First" "RB"] ["sentence" "NN"] ["." "."])))
'({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))) (is (= (type (lazy-chunk s tokenize pos-tag chunker))
clojure.lang.LazySeq))
(is (= (first (lazy-chunk s tokenize pos-tag chunker))
'({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))






Expand Down

0 comments on commit c9f5773

Please sign in to comment.