-
Notifications
You must be signed in to change notification settings - Fork 9
/
analysis.clj
107 lines (95 loc) · 4.22 KB
/
analysis.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
(ns clucie.analysis
(:import [org.apache.lucene.analysis.standard StandardAnalyzer StandardTokenizer StandardFilter]
[org.apache.lucene.analysis.core KeywordAnalyzer LowerCaseFilter StopFilter]
[org.apache.lucene.analysis.cjk CJKAnalyzer]
[org.apache.lucene.analysis.ngram NGramTokenizer NGramTokenFilter]
[org.apache.lucene.analysis.ja JapaneseAnalyzer JapaneseTokenizer JapaneseTokenizer$Mode]
[org.apache.lucene.analysis Analyzer Analyzer$TokenStreamComponents]
[org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper]
[org.apache.lucene.analysis.tokenattributes OffsetAttribute]
[org.apache.lucene.analysis CharArraySet Tokenizer]
[org.apache.lucene.analysis.util CharFilterFactory]
[java.io StringReader]))
(defmacro build-analyzer
  "Builds a custom Lucene Analyzer as an anonymous proxy class.

  tokenizer              - a form evaluating to a Tokenizer; it becomes the
                           source of the token-stream component chain.
  :char-filter-factories - a form evaluating to a collection of
                           CharFilterFactory instances; each is applied to the
                           input Reader (in collection order) before
                           tokenization.
  :token-filters         - a literal vector of constructor forms, each missing
                           its first (TokenStream) argument; they are threaded
                           onto the tokenizer with `->` at macroexpansion time
                           to build the filter chain."
  [tokenizer & {:keys [char-filter-factories token-filters]}]
  `(proxy [Analyzer] []
     (createComponents [field-name#]
       ;; src# is both the chain's source and the first argument of the
       ;; threaded filter constructors below.
       (let [src# ~tokenizer
             token# (-> src#
                        ~@token-filters)]
         (Analyzer$TokenStreamComponents. src# token#)))
     (initReader [field-name# reader#]
       (proxy-super initReader
                    field-name#
                    ;; Fold each CharFilterFactory over the reader: the output
                    ;; of one factory is the input of the next.
                    (reduce #(.create ^CharFilterFactory %2 %1) reader# ~char-filter-factories)))))
(defn- char-set
  "Wraps a collection of stop words in a Lucene CharArraySet.
  The one-argument arity builds a case-sensitive set."
  (^CharArraySet [^java.util.Collection stop-words ^Boolean case-insensitive?]
   (CharArraySet. stop-words case-insensitive?))
  (^CharArraySet [stop-words]
   (char-set stop-words false)))
(defn standard-analyzer
  "Returns a Lucene StandardAnalyzer.
  With no arguments the analyzer is built with an empty stop-word set."
  (^org.apache.lucene.analysis.Analyzer [stop-words]
   (StandardAnalyzer. (char-set stop-words)))
  (^org.apache.lucene.analysis.Analyzer []
   (standard-analyzer [])))
(defn keyword-analyzer
  "Returns a Lucene KeywordAnalyzer, which emits the entire input as a single token."
  ^org.apache.lucene.analysis.Analyzer []
  (KeywordAnalyzer.))
(defn ngram-analyzer
  "Returns an analyzer producing n-grams of min-length..max-length characters,
  lower-cased, with stop-words filtered out."
  [min-length max-length stop-words]
  (let [stop-set (char-set stop-words)]
    ;; The proxy produced by build-analyzer closes over stop-set.
    (build-analyzer (NGramTokenizer. min-length max-length)
                    :token-filters [(NGramTokenFilter. min-length max-length)
                                    ;; (StandardFilter.) ; is it necessary?
                                    (LowerCaseFilter.)
                                    (StopFilter. stop-set)])))
(defn cjk-analyzer
  "Returns a Lucene CJKAnalyzer.
  The zero-argument arity uses the analyzer's built-in default stop words."
  (^org.apache.lucene.analysis.Analyzer [stop-words]
   (CJKAnalyzer. (char-set stop-words)))
  (^org.apache.lucene.analysis.Analyzer []
   (CJKAnalyzer.)))
(defn- kuromoji-mode
  "Coerces mode to a JapaneseTokenizer$Mode.
  Accepts :extended / :normal / :search, a Mode instance (passed through),
  or nil (defaults to NORMAL)."
  [mode]
  (let [keyword->mode {:extended JapaneseTokenizer$Mode/EXTENDED
                       :normal   JapaneseTokenizer$Mode/NORMAL
                       :search   JapaneseTokenizer$Mode/SEARCH}]
    (or (keyword->mode mode)
        mode
        JapaneseTokenizer$Mode/NORMAL)))
(defn kuromoji-analyzer
  "Returns a JapaneseAnalyzer (kuromoji).
  The four-argument arity accepts a user dictionary, a mode (see kuromoji-mode),
  stop-words (a CharArraySet is used as-is, any other collection is wrapped),
  and a set of stop part-of-speech tags."
  (^org.apache.lucene.analysis.Analyzer []
   (JapaneseAnalyzer.))
  (^org.apache.lucene.analysis.Analyzer [user-dict mode stop-words stop-tags]
   (let [^CharArraySet stops (if (instance? CharArraySet stop-words)
                               stop-words
                               (char-set stop-words false))]
     (JapaneseAnalyzer. user-dict (kuromoji-mode mode) stops stop-tags))))
;;; TODO: Support to many tokenize options for morphological analyses
(defn- tokenize
  "Runs tokenizer over text and returns the matched substrings in input order.
  Does not close the tokenizer; that is the caller's responsibility."
  [^Tokenizer tokenizer ^String text]
  (.setReader tokenizer (StringReader. text))
  (let [^OffsetAttribute offsets (.addAttribute tokenizer OffsetAttribute)]
    (.reset tokenizer)
    (loop [acc nil]
      (if-not (.incrementToken tokenizer)
        (do (.end tokenizer)
            ;; acc was built newest-first, so restore input order.
            (reverse acc))
        (recur (cons (subs text (.startOffset offsets) (.endOffset offsets))
                     acc))))))
(defn- kuromoji-tokenizer
  "Constructs a JapaneseTokenizer.
  Optional positional args: user-dict, discard-punctuation? (coerced to a
  primitive boolean), mode (see kuromoji-mode), and an attribute factory."
  [& [user-dict discard-punctuation? mode factory]]
  (let [discard? (boolean discard-punctuation?)
        tokenizer-mode (kuromoji-mode mode)]
    (if-not factory
      (JapaneseTokenizer. user-dict discard? tokenizer-mode)
      (JapaneseTokenizer. factory user-dict discard? tokenizer-mode))))
(defn kuromoji-tokenize
  "Tokenizes text with a kuromoji JapaneseTokenizer and returns the token
  strings in order. tokenizer-args are passed through to kuromoji-tokenizer:
  [user-dict discard-punctuation? mode factory].

  Uses with-open so the tokenizer is closed even when tokenization throws;
  the previous explicit (.close t) leaked the tokenizer on exception."
  [text & tokenizer-args]
  (with-open [^Tokenizer t (apply kuromoji-tokenizer tokenizer-args)]
    (tokenize t text)))
(defn analyzer-mapping
  "Returns a PerFieldAnalyzerWrapper.
  mapping associates field names (keywords or strings; converted with name)
  to analyzers; default is used for any field not in the mapping."
  ^org.apache.lucene.analysis.Analyzer
  [default mapping]
  (let [field->analyzer (reduce-kv (fn [acc field analyzer]
                                     (assoc acc (name field) analyzer))
                                   {}
                                   mapping)]
    (PerFieldAnalyzerWrapper. default field->analyzer)))