Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

start work on moving to opennlp 1.5

  • Loading branch information...
commit f33c953e60b02465667a58ed1bc81910e3712794 1 parent a3dbd6a
@dakrone authored
View
2  README.markdown
@@ -94,7 +94,7 @@ Treebank-parsing
----------------
<b>Note: Treebank parsing is very memory intensive, make sure your JVM has
-a sufficient amount of memory available (using something like -Xmx1024m)
+a sufficient amount of memory available (using something like -Xmx512m)
or you will run out of heap space when using a treebank parser.</b>
Treebank parsing gets its own section due to how complex it is. One difference
View
BIN  models/EnglishChunk.bin.gz
Binary file not shown
View
BIN  models/EnglishSD.bin.gz
Binary file not shown
View
BIN  models/EnglishTok.bin.gz
Binary file not shown
View
BIN  models/en-sent.bin
Binary file not shown
View
1  models/namefind/README
@@ -1 +0,0 @@
-There are more namefind models than the two here. Check http://opennlp.sourceforge.net/models/ for more.
View
BIN  models/namefind/organization.bin.gz
Binary file not shown
View
BIN  models/namefind/person.bin.gz
Binary file not shown
View
BIN  models/tag.bin.gz
Binary file not shown
View
6 project.clj
@@ -2,6 +2,8 @@
:description "Natural Language Processing with Clojure, library for opennlp. http://github.com/dakrone/clojure-opennlp"
:dependencies [[org.clojure/clojure "1.2.0"]
[org.clojure/clojure-contrib "1.2.0"]
- [org.clojars.thnetos/opennlp-tools "1.4.3"]]
- :dev-dependencies [[lein-clojars "0.5.0-SNAPSHOT"]])
+ ;; [org.clojars.thnetos/opennlp-tools "1.4.3"]
+ [opennlp/tools "1.5.0"]]
+ :dev-dependencies [[lein-clojars "0.5.0-SNAPSHOT"]]
+ :repositories {"opennlp.sf.net" "http://opennlp.sourceforge.net/maven2"})
View
64 src/opennlp/nlp.clj
@@ -2,21 +2,21 @@
(ns opennlp.nlp
(:use [clojure.contrib.seq-utils :only [indexed]])
(:use [clojure.contrib.pprint :only [pprint]])
- (:import [java.io File FileNotFoundException])
- (:import [opennlp.maxent DataStream GISModel])
- (:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
- (:import [opennlp.tools.util Span])
- (:import [opennlp.tools.dictionary Dictionary])
- (:import [opennlp.tools.tokenize TokenizerME])
- (:import [opennlp.tools.sentdetect SentenceDetectorME])
- (:import [opennlp.tools.namefind NameFinderME])
- (:import [opennlp.tools.chunker ChunkerME])
- (:import [opennlp.tools.coref LinkerMode])
- (:import [opennlp.tools.coref.mention Mention DefaultParse])
- (:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
- (:import [opennlp.tools.parser.chunking Parser])
- (:import [opennlp.tools.parser AbstractBottomUpParser Parse])
- (:import [opennlp.tools.postag POSTaggerME DefaultPOSContextGenerator POSContextGenerator]))
+ (:import [java.io File FileNotFoundException FileInputStream])
+ #_(:import [opennlp.maxent DataStream GISModel])
+ #_(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
+ #_(:import [opennlp.tools.util Span])
+ #_(:import [opennlp.tools.dictionary Dictionary])
+ #_(:import [opennlp.tools.tokenize TokenizerME])
+ (:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
+ #_(:import [opennlp.tools.namefind NameFinderME])
+ #_(:import [opennlp.tools.chunker ChunkerME])
+ #_(:import [opennlp.tools.coref LinkerMode])
+ #_(:import [opennlp.tools.coref.mention Mention DefaultParse])
+ #_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
+ #_(:import [opennlp.tools.parser.chunking Parser])
+ #_(:import [opennlp.tools.parser AbstractBottomUpParser Parse])
+ #_(:import [opennlp.tools.postag POSTaggerME DefaultPOSContextGenerator POSContextGenerator]))
;;; OpenNLP property for pos-tagging. Meant to be rebound before
@@ -27,26 +27,24 @@
[filename]
(.exists (File. filename)))
-
-(defn files-exist?
+(defn- files-exist?
[filenames]
(reduce 'and (map file-exist? filenames)))
-
(defn make-sentence-detector
- "Return a function for detecting sentences based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
- (throw (FileNotFoundException. "Model file does not exist."))
- (fn sentenizer
+ (throw (FileNotFoundException.))
+ (fn sentencizer
[text]
- (let [model (.getModel (SuffixSensitiveGISModelReader. (File. modelfile)))
- detector (SentenceDetectorME. model)
+ (let [model-stream (FileInputStream. modelfile)
+ model (SentenceModel. model-stream)
+ detector (SentenceDetectorME. model)
sentences (.sentDetect detector text)]
(into [] sentences)))))
-(defn make-tokenizer
+#_(defn make-tokenizer
"Return a function for tokenizing a sentence based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
@@ -59,7 +57,7 @@
(into [] tokens)))))
-(defn make-pos-tagger
+#_(defn make-pos-tagger
"Return a function for tagging tokens based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
@@ -74,7 +72,7 @@
(map #(vector %1 %2) tokens (first tags))))))
-(defn make-name-finder
+#_(defn make-name-finder
"Return a function for finding names from tokens based on given model file(s)."
[& modelfiles]
(if-not (files-exist? modelfiles)
@@ -130,7 +128,7 @@
(defstruct treebank-phrase :phrase :tag)
-(defn make-treebank-chunker
+#_(defn make-treebank-chunker
"Return a function for chunking phrases from pos-tagged tokens based on
a given model file."
[modelfile]
@@ -193,7 +191,7 @@
(.replaceAll "\\}" "-RCB-")))
-(defn- parse-line
+#_(defn- parse-line
"Given a line and Parser object, return a list of Parses."
[line parser]
(let [line (strip-parens line)
@@ -216,7 +214,7 @@
(.toString results)))
-(defn make-treebank-parser
+#_(defn make-treebank-parser
"Return a function for treebank parsing a sequence of sentences, based on
given build, check, tag, chunk models and a set of head rules."
[buildmodel checkmodel tagmodel chunkmodel headrules & opts]
@@ -297,7 +295,7 @@
(reset! start (.getEnd s))))
;;; This is broken, don't use this.
-(defn print-parse
+#_(defn print-parse
"Given a parse and the EntityMentions-to-parse map, print out the parse."
[p parse-map]
(let [start (atom (.getStart (.getSpan p)))
@@ -346,7 +344,7 @@
(map #(print-parse % parse-map) parses)))
-(defn coref-extent
+#_(defn coref-extent
[extent p index]
(if (nil? extent)
(let [snp (Parse. (.getText p) (.getSpan extent) "NML" 1.0 0)]
@@ -355,7 +353,7 @@
nil))
-(defn coref-sentence
+#_(defn coref-sentence
[sentence parses index tblinker]
(let [p (Parse/parseParse sentence)
extents (.getMentions (.getMentionFinder tblinker) (DefaultParse. p index))]
@@ -365,7 +363,7 @@
; Second Attempt
-(defn make-treebank-linker
+#_(defn make-treebank-linker
"Make a TreebankLinker, given a model directory."
[modeldir]
(let [tblinker (TreebankLinker. modeldir LinkerMode/TEST)]
View
113 test/opennlp/test.clj
@@ -4,72 +4,75 @@
(:use [clojure.test])
(:import [java.io File FileNotFoundException]))
-(def get-sentences (make-sentence-detector "models/EnglishSD.bin.gz"))
-(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
-(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
-(def name-find (make-name-finder "models/namefind/person.bin.gz"))
-(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
+(def get-sentences (make-sentence-detector "models/en-sent.bin"))
+#_(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
+#_(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
+#_(def name-find (make-name-finder "models/namefind/person.bin.gz"))
+#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
(deftest sentence-split-test
(is (= (get-sentences "First sentence. Second sentence? Here is another one. And so on and so forth - you get the idea...")
- ["First sentence. " "Second sentence? " "Here is another one. " "And so on and so forth - you get the idea..."]))
+ ["First sentence." "Second sentence?" "Here is another one." "And so on and so forth - you get the idea..."]))
(is (= (get-sentences "'Hmmm.... now what?' Mr. Green said to H.A.L.")
- ["'Hmmm.... now what?' Mr. Green said to H.A.L."])))
+ ["'Hmmm.... now what?'" "Mr. Green said to H.A.L."])))
-(deftest tokenizer-test
- (is (= (tokenize "First sentence.")
- ["First" "sentence" "."]))
- (is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
- ["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))
+(comment
+ (deftest tokenizer-test
+ (is (= (tokenize "First sentence.")
+ ["First" "sentence" "."]))
+ (is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
+ ["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))
-(deftest pos-tag-test
- (is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
- '(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))
+ (deftest pos-tag-test
+ (is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
+ '(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))
-(deftest name-finder-test
- (is (= (name-find (tokenize "My name is Lee, not John"))
- '("Lee" "John"))))
+ (deftest name-finder-test
+ (is (= (name-find (tokenize "My name is Lee, not John"))
+ '("Lee" "John"))))
-(deftest chunker-test
- (is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
- '({:phrase ["The" "override" "system"] :tag "NP"}
- {:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
- {:phrase ["the" "accelerator"] :tag "NP"}
- {:phrase ["when"] :tag "ADVP"}
- {:phrase ["the" "brake" "pedal"] :tag "NP"}
- {:phrase ["is" "pressed"] :tag "VP"}))))
+ (deftest chunker-test
+ (is (= (chunker (pos-tag (tokenize "The override system is meant to deactivate the accelerator when the brake pedal is pressed.")))
+ '({:phrase ["The" "override" "system"] :tag "NP"}
+ {:phrase ["is" "meant" "to" "deactivate"] :tag "VP"}
+ {:phrase ["the" "accelerator"] :tag "NP"}
+ {:phrase ["when"] :tag "ADVP"}
+ {:phrase ["the" "brake" "pedal"] :tag "NP"}
+ {:phrase ["is" "pressed"] :tag "VP"}))))
-(try
- (do
- (def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
- (deftest parser-test
- (is (= (parser ["This is a sentence ."])
- ["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))))
- (catch FileNotFoundException e
- (println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))
+ (try
+ (do
+ (def parser (make-treebank-parser "parser-models/build.bin.gz" "parser-models/check.bin.gz" "parser-models/tag.bin.gz" "parser-models/chunk.bin.gz" "parser-models/head_rules"))
+ (deftest parser-test
+ (is (= (parser ["This is a sentence ."])
+ ["(TOP (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))"]))))
+ (catch FileNotFoundException e
+ (println "Unable to execute treebank-parser tests. Download the model files to $PROJECT_ROOT/parser-models.")))
-(deftest no-model-file-test
- (is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-name-finder "nonexistantfile" "anotherfilethatdoesnotexist")))
- (is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
- (is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile" "asdf" "fdsa" "qwer" "rewq"))))
+ (deftest no-model-file-test
+ (is (thrown? FileNotFoundException (make-sentence-detector "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-tokenizer "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-pos-tagger "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-name-finder "nonexistantfile" "anotherfilethatdoesnotexist")))
+ (is (thrown? FileNotFoundException (make-treebank-chunker "nonexistantfile")))
+ (is (thrown? FileNotFoundException (make-treebank-parser "nonexistantfile" "asdf" "fdsa" "qwer" "rewq"))))
-(deftest laziness-test
- (let [s (get-sentences "First sentence. Second sentence?")]
- (is (= (type (lazy-tokenize s tokenize))
- clojure.lang.LazySeq))
- (is (= (first (lazy-tokenize s tokenize))
- ["First" "sentence" "."]))
- (is (= (type (lazy-tag s tokenize pos-tag))
- clojure.lang.LazySeq))
- (is (= (first (lazy-tag s tokenize pos-tag))
- '(["First" "RB"] ["sentence" "NN"] ["." "."])))
- (is (= (type (lazy-chunk s tokenize pos-tag chunker))
- clojure.lang.LazySeq))
- (is (= (first (lazy-chunk s tokenize pos-tag chunker))
- '({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))
+ (deftest laziness-test
+ (let [s (get-sentences "First sentence. Second sentence?")]
+ (is (= (type (lazy-tokenize s tokenize))
+ clojure.lang.LazySeq))
+ (is (= (first (lazy-tokenize s tokenize))
+ ["First" "sentence" "."]))
+ (is (= (type (lazy-tag s tokenize pos-tag))
+ clojure.lang.LazySeq))
+ (is (= (first (lazy-tag s tokenize pos-tag))
+ '(["First" "RB"] ["sentence" "NN"] ["." "."])))
+ (is (= (type (lazy-chunk s tokenize pos-tag chunker))
+ clojure.lang.LazySeq))
+ (is (= (first (lazy-chunk s tokenize pos-tag chunker))
+ '({:phrase ["First"], :tag "ADVP"} {:phrase ["sentence"], :tag "NP"})))))
+
+ )
Please sign in to comment.
Something went wrong with that request. Please try again.