Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

much work on detokenization to finally get parentheses working

  • Loading branch information...
commit 79277f2dde5ac407c1691f509437ca7d9bb6be10 1 parent 48797fe
@dakrone authored
View
30 models/english-detokenizer.xml
@@ -20,6 +20,12 @@ under the License.
-->
<dictionary>
+ <entry operation="RIGHT_LEFT_MATCHING">
+ <token>"</token>
+ </entry>
+ <entry operation="RIGHT_LEFT_MATCHING">
+ <token>'</token>
+ </entry>
<entry operation="MOVE_LEFT">
<token>.</token>
</entry>
@@ -38,13 +44,13 @@ under the License.
<entry operation="MOVE_LEFT">
<token>:</token>
</entry>
- <entry operation="MOVE_LEFT">
- <token>)</token>
- </entry>
<entry operation="MOVE_RIGHT">
<token>(</token>
</entry>
<entry operation="MOVE_LEFT">
+ <token>)</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
<token>}</token>
</entry>
<entry operation="MOVE_RIGHT">
@@ -65,12 +71,6 @@ under the License.
<entry operation="MOVE_LEFT">
<token>%</token>
</entry>
- <entry operation="RIGHT_LEFT_MATCHING">
- <token>"</token>
- </entry>
- <entry operation="RIGHT_LEFT_MATCHING">
- <token>"</token>
- </entry>
<entry operation="MOVE_LEFT">
<token>n't</token>
</entry>
@@ -92,4 +92,16 @@ under the License.
<entry operation="MOVE_LEFT">
<token>'m</token>
</entry>
+ <entry operation="MOVE_LEFT">
+ <token>.org</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>.com</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>.net</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>#</token>
+ </entry>
</dictionary>
View
62 src/opennlp/nlp.clj
@@ -7,7 +7,8 @@
(:import [opennlp.tools.util Span])
(:import [opennlp.tools.tokenize TokenizerModel TokenizerME
DictionaryDetokenizer DetokenizationDictionary Detokenizer
- Detokenizer$DetokenizationOperation])
+ Detokenizer$DetokenizationOperation
+ DetokenizationDictionary$Operation])
(:import [opennlp.tools.sentdetect SentenceModel SentenceDetectorME])
(:import [opennlp.tools.namefind TokenNameFinderModel NameFinderME])
(:import [opennlp.tools.postag POSModel POSTaggerME]))
@@ -122,38 +123,43 @@
(make-detokenizer (DetokenizationDictionary. model-stream))))
;; TODO: clean this up, recursion is a smell
+;; TODO: remove debug printlns once I'm satisfied
(defn- collapse-tokens
[tokens detoken-ops]
- (let [sb (StringBuilder.)]
+ (let [sb (StringBuilder.)
+ token-set (atom #{})]
+ ;;(println :ops detoken-ops)
(loop [ts tokens dt-ops detoken-ops]
(let [op (first dt-ops)
op2 (second dt-ops)]
- (if (or (= op2 nil)
- (= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT))
- (.append sb (first ts))
- (.append sb (str (first ts) " ")))
- (when (and op op2)
- (recur (next ts) (next dt-ops)))))
- (.toString sb)))
-
-;; older, cruddier version
-#_(defn- collapse-tokens
- [tokens detoken-ops]
- (let [sb (StringBuilder.)]
- (loop [ts tokens dt-ops detoken-ops]
- (let [op (first dt-ops)
- op2 (second dt-ops)]
- (println :ts ts)
- (println :op op)
- (println :op2 op2)
- (if (and op
- (or op2
- (= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT)
- (= op Detokenizer$DetokenizationOperation/MERGE_TO_RIGHT)))
- (.append sb (first ts))
- (if (> (count dt-ops) 1)
- (.append sb (str (first ts) " "))
- (.append sb (str (first ts)))))
+ ;;(println :op op)
+ ;;(println :op2 op2)
+ ;;(println :ts (first ts))
+ ;;(println :sb (.toString sb))
+ (cond
+ (or (= op2 nil)
+ (= op2 Detokenizer$DetokenizationOperation/MERGE_TO_LEFT))
+ (.append sb (first ts))
+
+ (or (= op nil)
+ (= op Detokenizer$DetokenizationOperation/MERGE_TO_RIGHT))
+ (.append sb (first ts))
+
+ (= op DetokenizationDictionary$Operation/RIGHT_LEFT_MATCHING)
+ (if (contains? @token-set (first ts))
+ (do
+ ;;(println :token-set @token-set)
+ ;;(println :ts (first ts))
+ (swap! token-set disj (first ts))
+ (.append sb (first ts)))
+ (do
+ ;;(println :token-set @token-set)
+ ;;(println :ts (first ts))
+ (swap! token-set conj (first ts))
+ (.append sb (str (first ts) " "))))
+
+ :else
+ (.append sb (str (first ts) " ")))
(when (and op op2)
(recur (next ts) (next dt-ops)))))
(.toString sb)))
View
6 test/opennlp/test/nlp.clj
@@ -49,7 +49,11 @@
(is (= (detokenize (tokenize "She's the best."))
"She's the best."))
(is (= (detokenize (tokenize "I'm not sure."))
- "I'm not sure.")))
+ "I'm not sure."))
+ (is (= (detokenize (tokenize "Mary likes cows (Mary is a cow)."))
+ "Mary likes cows (Mary is a cow)."))
+ (is (= (detokenize (tokenize "Mary exclaimed \"I am a cow!\""))
+ "Mary exclaimed \"I am a cow!\"")))
(deftest precondition-test
(is (thrown? java.lang.AssertionError (get-sentences 1)))
Please sign in to comment.
Something went wrong with that request. Please try again.