Skip to content

Commit

Permalink
Updated to the latest Apache OpenNLP and updated tests for API changes.
Browse files Browse the repository at this point in the history
  • Loading branch information
oubiwann committed Aug 7, 2018
1 parent c8b1ecb commit 93c10f3
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -7,3 +7,4 @@ multi-lib/*
.lein-deps-sum
target/*
pom.xml*
+ .lein-repl-history
4 changes: 2 additions & 2 deletions project.clj
Expand Up @@ -4,8 +4,8 @@
:min-lein-version "2.0.0"
:license {:name "Eclipse Public License"
:url "http://www.eclipse.org/legal/epl-v10.html"}
- :dependencies [[org.apache.opennlp/opennlp-tools "1.7.2"]
- [instaparse "1.4.5"]]
+ :dependencies [[org.apache.opennlp/opennlp-tools "1.9.0"]
+ [instaparse "1.4.9"]]
:profiles {:dev {:dependencies [[org.clojure/clojure "1.6.0"]]
:plugins [[lein-marginalia "0.8.0"]]}
:1.5 {:dependencies [[org.clojure/clojure "1.5.1"]]}
Expand Down
5 changes: 2 additions & 3 deletions src/opennlp/nlp.clj
Expand Up @@ -250,10 +250,9 @@ start and end positions of the span."
(defmethod make-document-categorizer DoccatModel
[^DoccatModel model]
(fn document-categorizer
- [text]
- {:pre [(string? text)]}
+ [tokens]
(let [categorizer (DocumentCategorizerME. model)
- outcomes (.categorize categorizer ^String text)]
+ outcomes (.categorize categorizer (into-array tokens))]
(with-meta
{:best-category (.getBestCategory categorizer outcomes)}
{:probabilities (parse-categories
Expand Down
4 changes: 2 additions & 2 deletions test/opennlp/test/sample.clj
Expand Up @@ -11,11 +11,11 @@
(f))))

(deftest test-samples-round-trip
- (let [d (DocumentSample. "foo" "bar")]
+ (let [d (DocumentSample. "foo" (into-array ["bar"]))]
(is (= d (read-string (pr-str d))))))

(deftest test-clojure-document-sample-stream
- (let [d (DocumentSample. "foo" "bar")
+ (let [d (DocumentSample. "foo" (into-array ["bar"]))
x (java.io.ByteArrayInputStream.
(.getBytes
(with-out-str
Expand Down
34 changes: 21 additions & 13 deletions test/opennlp/test/tools/train.clj
Expand Up @@ -31,9 +31,9 @@
pos-tagger (nlp/make-pos-tagger pos-model)]
(is (= (pos-tagger ["Being" "at" "the" "polls" "was" "just" "like"
"being" "at" "church."])
- '(["Being" "VBD"] ["at" "IN"] ["the" "NN"] ["polls" ","]
- ["was" "VBD"] ["just" "RB"] ["like" "IN"] ["being" "NN"]
- ["at" "IN"] ["church." "NN"])))))
+ '(["Being" "VBG"] ["at" "IN"] ["the" "NN"] ["polls" "IN"]
+ ["was" "NN"] ["just" "IN"] ["like" "NN"] ["being" "IN"]
+ ["at" "DT"] ["church." "."])))))

(deftest chunker-training-test
(let [chunk-model (train/train-treebank-chunker "training/chunker.train")
Expand Down Expand Up @@ -64,8 +64,8 @@
"training/head_rules")
parser (tb/make-treebank-parser tb-parser-model)]
(is (= (parser ["This is a sentence ."])
- [(str "(INC (NP (DT This)) (NP (DT is)) (NP (DT a))"
- " (DT sentence) (. .) )")]))))
+ [(str "(INC (NP (. This)) (NP (VBP is)) (NP (VBP a))"
+ " (VBP sentence) (. .))")]))))

(deftest write-out-training-model-test
(let [token-model (train/train-tokenizer "training/tokenizer.train")
Expand All @@ -82,24 +82,32 @@

(deftest categorization-training-test
(let [cat-model (train/train-document-categorization "training/doccat.train")
- get-category (nlp/make-document-categorizer cat-model)]
+ get-category (nlp/make-document-categorizer cat-model)
+ tokenize (nlp/make-tokenizer "models/en-token.bin")]
  (let [test-cat (get-category
- "The third verse of the song was quite upbeat.")]
+ (tokenize
+ "The third verse of the song was quite upbeat."))]
(is (= (:best-category test-cat) "Happy"))
(is (= (count (:probabilities (meta test-cat))) 2)))

  (let [test-cat (get-category
- "There was a sense of foreboding at the outset.")]
+ (tokenize
+ "There was a sense of foreboding at the outset."))]
(is (= (:best-category test-cat) "Unhappy"))
(is (= (count (:probabilities (meta test-cat))) 2)))
(is (= (:best-category
- (get-category "The sun was shining, smiles everywhere."))
+ (get-category
+ (tokenize "The sun was shining, smiles everywhere.")))
"Happy"))
(is (= (:best-category
- (get-category (str "The confused prisoner could not figure "
- "out which way to go.")))
+ (get-category
+ (tokenize
+ (str "The confused prisoner could not figure "
+ "out which way to go."))))
"Unhappy"))
(is (= (:best-category
- (get-category (str "The frowning man chastized his son for "
- "not divulging the truth.")))
+ (get-category
+ (tokenize
+ (str "The frowning man chastized his son for "
+ "not divulging the truth."))))
"Unhappy"))))

0 comments on commit 93c10f3

Please sign in to comment.