-
Notifications
You must be signed in to change notification settings - Fork 0
/
markov_naive.clj
217 lines (178 loc) · 6.89 KB
/
markov_naive.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
;; Copyright (C) 2015, Jozef Wagner. All rights reserved.
;;
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0
;; (http://opensource.org/licenses/eclipse-1.0.php) which can be
;; found in the file epl-v10.html at the root of this distribution.
;;
;; By using this software in any fashion, you are agreeing to be bound
;; by the terms of this license.
;;
;; You must not remove this notice, or any other, from this software.
(ns try.markov-naive
"Naive Markov chain."
(:api dunaj)
(:require
[dunaj.host.int :refer [iCOMMA iDOT iQMARK iBANG iAPOS iint i==
ismall-letter? icapital-letter? idec]]
[dunaj.coll :refer [edit conj!]]
[dunaj.format.parser :refer
[ITokenizerMachine IParserMachineFactory tokenizer-engine]]
[incanter.core :as ic :refer [view save]]
[incanter.charts :as ich :refer [bar-chart]]))
;;; Word tokenizer
(defn ^:private unread-char! :- nil
"Unreads one char from `_char-batch_`. Returns `nil`."
[char-batch :- (Batch Char)]
(.position char-batch (idec (.position char-batch)))
nil)
(defn ^:private next-char! :- (Maybe Char)
"Returns next character from `_char-batch_`, or returns `nil` if
there are no more chars to read."
[char-batch :- (Batch Char)]
(when (.hasRemaining char-batch)
(.get char-batch)))
(defn ^:private word-char? :- Boolean
"Returns `true` if `_c_` is a valid character for word, otherwise
returns `false`."
[c]
(let [x (iint c)]
(or (ismall-letter? x) (icapital-letter? x)
(i== x (iCOMMA)) (i== (iAPOS) x))))
(defn ^:private word-token
"Returns new instance of word tokenizer machine that contains
`_item_`"
[item]
(let [word-ref (atom [item])]
(reify
ITokenizerMachine
(-analyze-batch! [this bm batch]
(loop [c (next-char! batch)]
(cond
(nil? c) this
(word-char? c) (do (alter! word-ref conj c)
(recur (next-char! batch)))
:else (do (unread-char! batch)
(str @word-ref)))))
(-analyze-eof! [this] (str @word-ref)))))
(def word-tokenizer
"A tokenizer machine factory for words."
(reify
IParserMachineFactory
(-parser-config [this]
{})
(-parser-from-type [this]
(keyword->class :char))
(-dispatch-tokenizer [this config state item]
(cond (word-char? item) (word-token item)
(= item \.) :stop-dot
(= item \!) :stop-bang
(= item \?) :stop-qmark
:else this))))
;;; Markov chain
(defn assoc-transition :- {}
"Returns markov chain with added cur-nxt association."
[chain :- {}, cur, nxt]
(let [key (when-not (keyword? cur) cur)
vals (get chain key (edit []))]
(assoc chain key (conj! vals nxt))))
(defn markov-chain :- {}
"Returns markov chain, in form of a map, from given collection of words."
[words :- []]
(let [transitions (zip (cons :start words) words)]
(reduce-unpacked assoc-transition {} transitions)))
;;; Sentence generation
(defn random-sentence :- String
"Returns a random sentence generated from a given markov-chain."
[markov-chain :- {}]
(let [stop-map {:stop-bang "!" :stop-qmark "?"}]
(loop [sentence [], cur (rand-nth (markov-chain nil))]
(if (keyword? cur)
(str (append (get stop-map cur ".") (interpose " " sentence)))
(recur (conj sentence cur) (rand-nth (markov-chain cur)))))))
;;;; Scratch
(scratch [[clojure.core :refer [binding]]]
[]
;;; load text from a file on a classpath
(def text
(with-scope
(str (slurp (classpath "try/document/moby_dick.txt")))))
;;=> #'try.markov-naive/text
(count text)
;;=> 1210052
(pp! text)
;; "Call me Ishmael. Some years ago--never mind how long precisely--having\r\nlittle or no money in my purse, and nothing particular to interest me on\r\nshore, I thought I would sail about a little and see ..."
;;; parse text into words
(def words (tokenizer-engine word-tokenizer text))
;;=> #'try.markov-naive/words
(count words)
;;=> 221846
(seq (take 45 words))
;;=> ("Call" "me" "Ishmael" :stop-dot "Some" "years" "ago" "never" "mind" "how" "long" "precisely" "having" "little" "or" "no" "money" "in" "my" "purse," "and" "nothing" "particular" "to" "interest" "me" "on" "shore," "I" "thought" "I" "would" "sail" "about" "a" "little" "and" "see" "the" "watery" "part" "of" "the" "world" :stop-dot)
;;; create markov chain from words
(def mc (markov-chain words))
;;=> #'try.markov-naive/words
(count mc)
;;=> 23453
(seq (take 20 (keys mc)))
;;=> (nil "shouted," "convince" "weary," "Lower" "mounting" "howled" "posse" "declares," "shelf," "heaped" "new," "rainbow" "Bartholomew" "in," "blandishments" "Christianity" "absorbed," "float" "sweet")
;; number of different starting words
(count (distinct (mc nil)))
;;=> 1763
;; see possible successors for "Ishmael" state
(seq (mc "Ishmael"))
;; (:stop-dot :stop-dot "can" :stop-dot "hope" :stop-qmark :stop-dot "but")
;; some more insight
(let [s (vec (sort-by #(count (second %)) (vec mc)))
pf #(->vec (first %)
(take 3 (vec (second %)))
(count (second %))
(count (distinct (second %))))]
(vec (map pf (take 10 (reverse s)))))
;; [["the" ["watery" "world" "spleen"] 13514 4694]
;; [nil ["Call" "Some" "It"] 9854 1763]
;; ["of" ["the" "driving" "every"] 6400 1698]
;; ["and" ["nothing" "see" "regulating"] 5859 2623]
;; ["a" ["little" "way" "damp,"] 4476 2146]
;; ["to" ["interest" "prevent" "get"] 4443 1155]
;; ["in" ["my" "my" "this"] 3796 771]
;; ["that" ["it" "noble" "man"] 2767 975]
;; ["his" ["sword" "deepest" "legs,"] 2414 1281]
;; ["I" ["thought" "would" "have"] 1882 494]]
;; tokens with most diverse transitions
(let [cf (comp count distinct second)
s (take 9 (reverse (sort-by cf (vec mc))))]
(ic/view
(ich/bar-chart
(vec (map #(or (first %) :start) s))
(vec (map cf s))
:title "Tokens with most diverse transitions"
:x-label ""
:y-label "")))
;;; generating random sentences
(random-sentence mc)
;;=> "I obtain ample vengeance, eternal democracy!"
;;=> "At last degree succeed in the cry."
;;=> "HUZZA PORPOISE."
;;=> "An old, old man, in the bows, at times you say, should be placed as laborers."
;;=> "So, so suddenly started on and strength, let the far fiercer curse the maid."
;; summary
(let [cf (comp count distinct second)
s (take 9 (reverse (sort-by cf (vec mc))))]
(ic/save
(ich/bar-chart
(vec (map #(or (first %) :start) s))
(vec (map cf s))
:title "Tokens with most diverse transitions"
:x-label ""
:y-label "")
"markov-bar.png"))
(time
(def mc
(->> (slurp (classpath "try/document/moby_dick.txt"))
(tokenizer-engine word-tokenizer)
seq
markov-chain
with-scope)))
(random-sentence mc)
)