
fix tokenizer for opennlp 1.5

1 parent f33c953 commit 53f6efe705c723f836dc956c0d91ebb61aa51937 @dakrone committed Oct 27, 2010
Showing with 18 additions and 18 deletions.
  1. BIN models/en-token.bin
  2. +15 −15 src/opennlp/nlp.clj
  3. +3 −3 test/opennlp/test.clj
models/en-token.bin
Binary file not shown.

src/opennlp/nlp.clj
@@ -7,7 +7,7 @@
#_(:import [opennlp.maxent.io PooledGISModelReader SuffixSensitiveGISModelReader])
#_(:import [opennlp.tools.util Span])
#_(:import [opennlp.tools.dictionary Dictionary])
- #_(:import [opennlp.tools.tokenize TokenizerME])
+ (:import [opennlp.tools.tokenize TokenizerModel TokenizerME])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
#_(:import [opennlp.tools.namefind NameFinderME])
#_(:import [opennlp.tools.chunker ChunkerME])
@@ -32,30 +32,30 @@
(reduce 'and (map file-exist? filenames)))
(defn make-sentence-detector
+ "Return a function for splitting sentences given a model file."
[modelfile]
(if-not (file-exist? modelfile)
(throw (FileNotFoundException.))
- (fn sentencizer
+ (fn sentence-detector
[text]
- (let [model-stream (FileInputStream. modelfile)
- model (SentenceModel. model-stream)
- detector (SentenceDetectorME. model)
- sentences (.sentDetect detector text)]
- (into [] sentences)))))
+ (with-open [model-stream (FileInputStream. modelfile)]
+ (let [model (SentenceModel. model-stream)
+ detector (SentenceDetectorME. model)
+ sentences (.sentDetect detector text)]
+ (into [] sentences))))))
-
-#_(defn make-tokenizer
+(defn make-tokenizer
"Return a function for tokenizing a sentence based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
- (throw (FileNotFoundException. "Model file does not exist."))
+ (throw (FileNotFoundException.))
(fn tokenizer
[sentence]
- (let [model (.getModel (SuffixSensitiveGISModelReader. (File. modelfile)))
- tokenizer (TokenizerME. model)
- tokens (.tokenize tokenizer sentence)]
- (into [] tokens)))))
-
+ (with-open [model-stream (FileInputStream. modelfile)]
+ (let [model (TokenizerModel. model-stream)
+ tokenizer (TokenizerME. model)
+ tokens (.tokenize tokenizer sentence)]
+ (into [] tokens))))))
#_(defn make-pos-tagger
"Return a function for tagging tokens based on a given model file."

test/opennlp/test.clj
@@ -5,7 +5,7 @@
(:import [java.io File FileNotFoundException]))
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
-#_(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
+(def tokenize (make-tokenizer "models/en-token.bin"))
#_(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
#_(def name-find (make-name-finder "models/namefind/person.bin.gz"))
#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
@@ -17,13 +17,13 @@
(is (= (get-sentences "'Hmmm.... now what?' Mr. Green said to H.A.L.")
["'Hmmm.... now what?'" "Mr. Green said to H.A.L."])))
-(comment
+
(deftest tokenizer-test
(is (= (tokenize "First sentence.")
["First" "sentence" "."]))
(is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))
-
+(comment
(deftest pos-tag-test
(is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
'(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))
