-
Notifications
You must be signed in to change notification settings - Fork 3
/
core.clj
176 lines (154 loc) · 5.75 KB
/
core.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
(ns couplet.core
"Core utilities for working with Unicode code points."
(:require [clojure.core.reducers :as r]
[clojure.spec.alpha :as s]
[clojure.spec.gen.alpha :as gen])
(:import java.io.Writer
[java.util.concurrent ForkJoinPool ForkJoinTask]))
(defn codepoint?
  "Predicate: true when x is a fixed-precision integer lying between
  Character/MIN_CODE_POINT and Character/MAX_CODE_POINT (both inclusive),
  false for anything else.

  Matches what the spec :couplet.core/codepoint accepts."
  [x]
  (if (int? x)
    (<= Character/MIN_CODE_POINT x Character/MAX_CODE_POINT)
    false))
(defmacro codepoint-in
  "Expands to a spec that accepts integers between start and end (both
  inclusive) and whose generator produces ints chosen from that same range.

  The forms start and end are spliced into the expansion as given.

  See :couplet.core/codepoint for the predefined spec covering every code
  point."
  [start end]
  ;; The predicate is deliberately kept as a #(...) literal: s/spec records the
  ;; predicate form, so its shape stays exactly as before.
  `(s/spec #(and (int? %) (<= ~start % ~end))
           :gen (fn []
                  (gen/fmap int (gen/choose ~start ~end)))))
;; Registers the spec ::codepoint (:couplet.core/codepoint), built with
;; codepoint-in over the range Character/MIN_CODE_POINT through
;; Character/MAX_CODE_POINT.
(s/def ::codepoint
(codepoint-in Character/MIN_CODE_POINT Character/MAX_CODE_POINT))
(defn codepoint-str
  "Builds the string for the single Unicode code point cp: the chars produced
  by Character/toChars for cp, wrapped in a String."
  [cp]
  (let [utf16 (Character/toChars cp)]
    (String. utf16)))
(defn- codepoint-xform
  "Stateful transducer turning char inputs into code point (int) outputs.

  A high surrogate is held back until the next char arrives. If that char is a
  low surrogate, the pair is emitted as one code point; otherwise the held
  high surrogate is emitted on its own as its int value. A high surrogate
  still held at completion is flushed the same way before rf is completed."
  [rf]
  (let [pending (volatile! nil)]
    (fn
      ([] (rf))
      ([acc]
       (let [lead @pending]
         (rf (if (some? lead)
               (unreduced (rf acc (int lead)))
               acc))))
      ([acc ch]
       (let [lead @pending]
         (cond
           ;; Nothing held back: either start holding a high surrogate or
           ;; pass the char straight through.
           (nil? lead)
           (if (Character/isHighSurrogate ch)
             (do (vreset! pending ch)
                 acc)
             (rf acc (int ch)))

           ;; Held high surrogate + low surrogate: a complete pair.
           (Character/isLowSurrogate ch)
           (do (vreset! pending nil)
               (rf acc (Character/toCodePoint lead ch)))

           ;; Two high surrogates in a row: emit the first unpaired and hold
           ;; the second, unless the reduction has already terminated.
           (Character/isHighSurrogate ch)
           (let [acc2 (rf acc (int lead))]
             (vreset! pending (when-not (reduced? acc2) ch))
             acc2)

           ;; Held high surrogate followed by an ordinary char: emit both,
           ;; stopping after the first if rf signals termination.
           :else
           (do (vreset! pending nil)
               (let [acc2 (rf acc (int lead))]
                 (if (reduced? acc2)
                   acc2
                   (rf acc2 (int ch)))))))))))
(defn- codepoint-reduce
  "Reduces the code points of s with f, starting at char index i and with val
  as the initial accumulator. Advances one char per BMP code point and two per
  supplementary code point. When f returns a reduced value, the loop stops and
  the unwrapped value is returned."
  [^CharSequence s ^long i f val]
  (loop [idx i
         acc val]
    (if-not (< idx (.length s))
      acc
      (let [cp (Character/codePointAt s idx)
            width (if (Character/isSupplementaryCodePoint cp) 2 1)
            acc (f acc cp)]
        (if (reduced? acc)
          (deref acc)
          (recur (+ idx width) acc))))))
(deftype CodePointSeq [^CharSequence s]
  ;; Iteration is delegated to the iterator of the CharSequence's code point
  ;; stream.
  Iterable
  (iterator [_]
    (-> s .codePoints .iterator))

  clojure.lang.Sequential

  clojure.lang.IReduce
  ;; Reduce without an initial value: an empty sequence yields (f), a single
  ;; char yields that char's int value without calling f, and otherwise the
  ;; first code point (one or two chars wide) seeds the reduction over the
  ;; remaining chars.
  (reduce [_ f]
    (let [len (.length s)]
      (cond
        (zero? len)
        (f)

        (= len 1)
        (int (.charAt s 0))

        ;; The sequence opens with a complete surrogate pair.
        (and (Character/isHighSurrogate (.charAt s 0))
             (Character/isLowSurrogate (.charAt s 1)))
        (let [init (Character/toCodePoint (.charAt s 0) (.charAt s 1))]
          (if (= len 2)
            init
            (codepoint-reduce s 2 f init)))

        ;; The first char stands on its own.
        :else
        (codepoint-reduce s 1 f (int (.charAt s 0))))))
  ;; Reduce with an initial value: val is returned as-is for an empty sequence.
  (reduce [_ f val]
    (if (pos? (.length s))
      (codepoint-reduce s 0 f val)
      val)))
;; Printing a CodePointSeq: when *print-readably* is true, writes the tag
;; #couplet.core.CodePointSeq followed by a one-element vector holding the
;; wrapped text as a string; otherwise prints the seq of one-code-point strings.
(defmethod print-method CodePointSeq
  [^CodePointSeq cps ^Writer w]
  (if-not *print-readably*
    (print-method (map codepoint-str cps) w)
    (do
      (.write w "#couplet.core.CodePointSeq")
      (print-method [(str (.s cps))] w))))
(defn codepoints
  "With a CharSequence s, returns a CodePointSeq wrapping s: a sequence-like
  view of the code points in s that can be iterated (and so passed to seq),
  reduced, and folded. The wrapped CharSequence is treated as immutable, like
  a string.

  The returned value is not counted? and offers no random access; call seq on
  it to get a regular (lazy) seq of code points.

  With no arguments, returns a stateful transducer that converts char inputs
  into code points."
  ([] codepoint-xform)
  ([s]
   {:pre [(instance? CharSequence s)]}
   (CodePointSeq. s)))
(defn append!
  "Reducing function over code points that accumulates into a (mutable)
  StringBuilder.

  - No arguments: returns a fresh StringBuilder.
  - One argument (a StringBuilder): returns its contents as a string, which
    serves as the completion step of transduce.
  - Two arguments: appends code point cp to sb and returns sb."
  ([] (new StringBuilder))
  ([^StringBuilder sb] (str sb))
  ([^StringBuilder sb cp]
   (doto sb
     (.appendCodePoint (int cp)))))
(defn to-str
  "Builds a string from the code points in coll. If a transducer xform is
  given, the inputs are passed through it before being appended.

  Equivalent to (transduce xform append! coll); what reaches append!, whether
  straight from coll or after xform, must therefore be code points."
  ([coll]
   (transduce identity append! coll))
  ([xform coll]
   (transduce xform append! coll)))
(defn- fold-codepoints
  "Recursive fork/join worker that folds the chars of s in the index range
  from start (inclusive) to end (exclusive).

  A range is reduced sequentially, with reducef seeded by (combinef), when it
  holds at most n chars or consists of exactly one surrogate pair. Any larger
  range is split near its midpoint; the split point is moved one char to the
  right when it would otherwise fall between the two halves of a surrogate
  pair. The right part is forked as a ForkJoinTask, the left part is folded by
  a direct recursive call, and the two results are merged with combinef.

  The partition size n is measured in chars, not code points. A value of n
  below 1 is treated as 1: without that floor a one-char range would never be
  a leaf, its split point would equal start, and the right part would be the
  same range again, recursing without end."
  [^CharSequence s start end n combinef reducef]
  (let [size (- end start)]
    (if (or (<= size (max 1 n))
            (and (= size 2)
                 (Character/isHighSurrogate (.charAt s start))
                 (Character/isLowSurrogate (.charAt s (inc start)))))
      (reduce reducef (combinef) (->CodePointSeq (.subSequence s start end)))
      ;; size is at least 2 here, so mid is strictly greater than start and
      ;; (dec mid) stays inside the range.
      (let [mid (+ start (quot size 2))
            split (cond-> mid
                    (and (Character/isHighSurrogate (.charAt s (dec mid)))
                         (Character/isLowSurrogate (.charAt s mid)))
                    inc)
            ^ForkJoinTask task
            (r/fjtask #(fold-codepoints s split end n combinef reducef))]
        (.fork task)
        (combinef (fold-codepoints s start split n combinef reducef)
                  (.join task))))))
;; The partition size n is counted in chars, not in code points.
;; An empty sequence folds to (combinef); a sequence of at most n chars is
;; reduced sequentially; anything longer is handed to the reducers fork/join
;; pool via fold-codepoints.
(extend-type CodePointSeq
  r/CollFold
  (coll-fold [cps n combinef reducef]
    (let [^CharSequence text (.s cps)
          len (.length text)]
      (if (zero? len)
        (combinef)
        (if (<= len n)
          (reduce reducef (combinef) cps)
          (let [work #(fold-codepoints text 0 (.length text) n combinef reducef)]
            (.invoke ^ForkJoinPool @r/pool (r/fjtask work))))))))