Skip to content
Browse files

fix pos-tagger to work with opennlp 1.5

  • Loading branch information...
1 parent 53f6efe commit 3a1f417ea495b87ff2286821e4f31069c1c6f273 @dakrone committed Oct 26, 2010
Showing with 18 additions and 14 deletions.
  1. BIN models/en-pos-maxent.bin
  2. +15 −12 src/opennlp/nlp.clj
  3. +3 −2 test/opennlp/test.clj
View
BIN models/en-pos-maxent.bin
Binary file not shown.
View
27 src/opennlp/nlp.clj
@@ -16,13 +16,16 @@
#_(:import [opennlp.tools.lang.english ParserTagger ParserChunker HeadRules TreebankLinker CorefParse])
#_(:import [opennlp.tools.parser.chunking Parser])
#_(:import [opennlp.tools.parser AbstractBottomUpParser Parse])
- #_(:import [opennlp.tools.postag POSTaggerME DefaultPOSContextGenerator POSContextGenerator]))
+ (:import [opennlp.tools.postag POSModel POSTaggerME]))
;;; OpenNLP property for pos-tagging. Meant to be rebound before
;;; calling the tagging creators
(def #^{:dynamic true} *beam-size* 3)
+;;; Cache size to use for pos-tagging
+(def #^{:dynamic true} *cache-size* 1024)
+
;; Private helper: returns the result of calling .exists on a File built from
;; `filename` (a path string). The model-based factory functions in this file
;; call it to validate the model path before returning their worker function.
;; NOTE(review): File is presumably java.io.File -- the import is not visible here.
(defn- file-exist?
[filename]
(.exists (File. filename)))
@@ -35,7 +38,7 @@
"Return a function for splitting sentences given a model file."
[modelfile]
(if-not (file-exist? modelfile)
- (throw (FileNotFoundException.))
+ (throw (FileNotFoundException. "Model file does not exist."))
(fn sentence-detector
[text]
(with-open [model-stream (FileInputStream. modelfile)]
@@ -48,7 +51,7 @@
"Return a function for tokenizing a sentence based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
- (throw (FileNotFoundException.))
+ (throw (FileNotFoundException. "Model file does not exist."))
(fn tokenizer
[sentence]
(with-open [model-stream (FileInputStream. modelfile)]
@@ -57,20 +60,20 @@
tokens (.tokenize tokenizer sentence)]
(into [] tokens))))))
-#_(defn make-pos-tagger
- "Return a function for tagging tokens based on a given model file."
+(defn make-pos-tagger
+ "Return a function for tagging tokens based on a given model file."
[modelfile]
(if-not (file-exist? modelfile)
(throw (FileNotFoundException. "Model file does not exist."))
(fn pos-tagger
[tokens]
- (let [token-array (if (vector? tokens) (into-array tokens) tokens)
- #^POSContextGenerator cg (DefaultPOSContextGenerator. nil)
- model (.getModel (SuffixSensitiveGISModelReader. (File. modelfile)))
- tagger (POSTaggerME. *beam-size* model cg nil)
- tags (.tag tagger 1 token-array)]
- (map #(vector %1 %2) tokens (first tags))))))
-
+ {:pre [(vector? tokens)]}
+ (with-open [model-stream (FileInputStream. modelfile)]
+ (let [token-array (into-array tokens)
+ model (POSModel. model-stream)
+ tagger (POSTaggerME. model *beam-size* *cache-size*)
+ tags (.tag tagger token-array)]
+ (partition 2 (interleave tokens tags)))))))
#_(defn make-name-finder
"Return a function for finding names from tokens based on given model file(s)."
View
5 test/opennlp/test.clj
@@ -6,7 +6,7 @@
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
-#_(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
+(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
#_(def name-find (make-name-finder "models/namefind/person.bin.gz"))
#_(def chunker (make-treebank-chunker "models/EnglishChunk.bin.gz"))
@@ -23,11 +23,12 @@
["First" "sentence" "."]))
(is (= (tokenize "Mr. Smith gave a car to his son on Friday.")
["Mr." "Smith" "gave" "a" "car" "to" "his" "son" "on" "Friday" "."])))
-(comment
+
;; Tokenizes a sample sentence, runs the result through pos-tag, and checks that
;; the output equals the expected sequence of [token tag] pairs, in order.
(deftest pos-tag-test
(is (= (pos-tag (tokenize "Mr. Smith gave a car to his son on Friday."))
'(["Mr." "NNP"] ["Smith" "NNP"] ["gave" "VBD"] ["a" "DT"] ["car" "NN"] ["to" "TO"] ["his" "PRP$"] ["son" "NN"] ["on" "IN"] ["Friday" "NNP"] ["." "."]))))
+(comment
(deftest name-finder-test
(is (= (name-find (tokenize "My name is Lee, not John"))
'("Lee" "John"))))

0 comments on commit 3a1f417

Please sign in to comment.
Something went wrong with that request. Please try again.