/
grapheme.cr
258 lines (227 loc) · 9.78 KB
/
grapheme.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
require "./properties"
class String
# Walks over the grapheme cluster boundaries of this string and yields once
# per detected cluster.
#
# The first block argument is the byte span of the cluster within the entire
# string, given as an exclusive range: it begins at the byte index of the
# cluster's first character and ends at the reader position at which the
# boundary was detected.
# The second block argument is the last character of the cluster, which lets
# callers avoid decoding again when the cluster is a single character.
#
# NOTE(review): a cluster is only yielded after a following character has been
# read and judged to start a new cluster; presumably `Char::Reader#next_char`
# hands back a terminating character at the end of the string so that the
# final cluster is emitted as well - confirm against `Char::Reader`.
private def each_grapheme_boundary(& : Range(Int32, Int32), Char -> Nil) : Nil
  reader = Char::Reader.new(self)
  previous_char = reader.current_char
  # keep the previous character's property around so it is computed only once
  previous_property = Grapheme::Property.from(previous_char)
  state = Grapheme::Property::Start
  cluster_start = 0

  while reader.has_next?
    current_char = reader.next_char
    current_property = Grapheme::Property.from(current_char)
    is_boundary, state = Grapheme.break?(previous_property, current_property, state)

    if is_boundary
      cluster_end = reader.pos
      yield cluster_start...cluster_end, previous_char
      cluster_start = cluster_end
    end

    previous_char, previous_property = current_char, current_property
  end
end
# :nodoc:
#
# Iterator that produces the grapheme clusters of a string one at a time.
#
# It keeps a `Char::Reader` on the string together with the last character
# read, that character's property, the break state returned by
# `Grapheme.break?` and the byte index at which the current cluster started.
class GraphemeIterator
  include Iterator(Grapheme)

  @last_char : Char
  @last_property : Grapheme::Property

  # Sets up the iterator at the beginning of *str*.
  def initialize(str : String)
    @reader = Char::Reader.new(str)
    @state = Grapheme::Property::Start
    @last_char = @reader.current_char
    # cache last_property to avoid re-calculation on the following iteration
    @last_property = Grapheme::Property.from(@last_char)
    @last_boundary = 0
  end

  # Returns the next grapheme cluster, or `stop` once the reader has no
  # further characters.
  def next
    return stop unless @reader.has_next?

    while char = @reader.next_char
      property = Grapheme::Property.from(char)
      boundary, @state = Grapheme.break?(@last_property, property, @state)

      # The cluster that may end here finishes with the previously read
      # character, so remember it before the cache is advanced.
      last_char = @last_char
      @last_char = char
      @last_property = property

      if boundary
        index = @reader.pos
        grapheme = Grapheme.new(@reader.string, @last_boundary...index, last_char)
        @last_boundary = index
        return grapheme
      end
    end

    # Fallback for the loop ending without a detected boundary: the rest of
    # the string forms the cluster. The range is exclusive, like the one used
    # inside the loop, so that `Grapheme.new` compares the real byte width of
    # the span against the character and keeps a single-character cluster as
    # `Char`.
    # NOTE(review): this looks unreachable as long as the reader's
    # end-of-string character always forces a boundary - confirm.
    Grapheme.new(@reader.string, @last_boundary...@reader.string.bytesize, @last_char)
  end
end
# `Grapheme` represents a Unicode grapheme cluster, which describes the smallest
# functional unit of a writing system. This is also called a *user-perceived character*.
#
# In the latin alphabet, most graphemes consist of a single Unicode codepoint
# (equivalent to `Char`). But a grapheme can also consist of a sequence of codepoints,
# which combine into a single unit.
#
# For example, the string `"e\u0301"` consists of two characters, the latin small letter `e`
# and the combining acute accent `´`. Together, they form a single grapheme: `é`.
# That same grapheme could alternatively be described in a single codepoint, `\u00E9` (latin small letter e with acute).
# But the combinatory possibilities are far bigger than the number of directly
# available codepoints.
#
# ```
# "e\u0301".size # => 2
# "é".size # => 1
#
# "e\u0301".grapheme_size # => 1
# "é".grapheme_size # => 1
# ```
#
# This combination of codepoints is common in some non-latin scripts. It's also
# often used with emojis to create customized combinations. For example, the
# thumbs up sign `👍` (`U+1F44D`) combined with an emoji modifier such as
# `U+1F3FC` assigns a colour to the emoji.
#
# Instances of this type can be acquired via `String#each_grapheme` or `String#graphemes`.
#
# The algorithm to determine boundaries between grapheme clusters is specified
# in the [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
@[Experimental("The grapheme API is still under development. Join the discussion at [#11610](https://github.com/crystal-lang/crystal/issues/11610).")]
struct Grapheme
# For efficiency reasons we avoid allocating a string for graphemes consisting
# of only a single character.
# As a trade-off, this leads to multi dispatch on this ivar. But that's
# acceptable compared to the allocation overhead.
#
# Graphemes consisting of a single character are always represented as
# `Char`, never as `String` to simplify comparability. This invariant is
# protected by the constructor.
@cluster : Char | String
# :nodoc:
#
# Builds the grapheme covering the byte span *range* of *string*.
# *char* is the last character of that span, as passed by the boundary
# iteration code in this file.
def self.new(string : String, range : Range(Int32, Int32), char : Char) : self
  # A span exactly as wide as *char* contains nothing but that character,
  # so it is stored as `Char` without allocating a string.
  return new(char) if range.size == char.bytesize

  start = range.begin
  new(string.byte_slice(start, range.end - start))
end
# :nodoc:
#
# Stores *cluster* as given, without any checks. The three-argument `.new`
# is the entry point that picks `Char` for single-character clusters.
def initialize(@cluster)
end
# Appends the characters in this grapheme cluster to *io*.
#
# The cluster is written through its own `to_s(io)`, whether it is held as a
# `Char` or as a `String`.
def to_s(io : IO) : Nil
  @cluster.to_s(io)
end
# Returns the characters in this grapheme cluster as a `String`.
#
# A cluster that is already held as a `String` is returned as is; a
# single-character cluster is converted with `Char#to_s`.
def to_s : String
  cluster = @cluster
  cluster.is_a?(String) ? cluster : cluster.to_s
end
# Appends a representation of this grapheme cluster to *io*.
#
# The output is the text `String::Grapheme(`, followed by the `inspect`
# output of the underlying `Char` or `String`, followed by `)`.
def inspect(io : IO) : Nil
io << "String::Grapheme("
@cluster.inspect(io)
io << ")"
end
# Returns the number of characters in this grapheme cluster.
#
# This is `1` for a cluster held as a `Char` and `String#size` otherwise.
def size : Int32
  cluster = @cluster
  cluster.is_a?(Char) ? 1 : cluster.size
end
# Returns the number of bytes in the UTF-8 representation of this grapheme cluster.
#
# Delegates to the `bytesize` of the underlying `Char` or `String`.
def bytesize : Int32
@cluster.bytesize
end
# Returns `true` if *other* is equivalent to `self`.
#
# Two graphemes are considered equivalent if they contain the same sequence
# of codepoints.
#
# The comparison is made directly on the stored cluster values. Since
# single-character clusters are always held as `Char`, the stored values of
# two equal clusters have the same type.
def ==(other : self) : Bool
@cluster == other.@cluster
end
# :nodoc:
#
# Convenience overload: looks up the properties of *c1* and *c2* and
# delegates to the stateless property-based overload.
def self.break?(c1 : Char, c2 : Char) : Bool
  left = Property.from(c1)
  right = Property.from(c2)
  break?(left, right)
end
# :nodoc:
#
# Decides whether a grapheme cluster break is allowed between a character
# with property *lbc* and the directly following character with property *tbc*.
#
# This overload is stateless. GB11 (emoji zwj sequences) and GB12/13
# (regional indicator code points) depend on characters further back, which
# this overload cannot see. Unless the caller tracks that context itself, the
# result for those rules can be wrong; for instance, no break is ever reported
# between two consecutive regional indicator code points, no matter how many
# of them precede. The overload taking a `state` argument accounts for this.
#
# The rules are graphically displayed in a table on https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
#
# The implementation is inspired by https://github.com/JuliaStrings/utf8proc/blob/462093b3924c7491defc67fda4bc7a27baf9b088/utf8proc.c#L261
def self.break?(lbc : Property, tbc : Property) : Bool
  # The rules are checked in order and the first one that applies wins.
  if lbc.start?
    true # GB1
  elsif lbc.cr? && tbc.lf?
    false # GB3
  elsif lbc.cr? || lbc.lf? || lbc.control?
    true # GB4
  elsif tbc.cr? || tbc.lf? || tbc.control?
    true # GB5
  elsif lbc.l? && (tbc.l? || tbc.v? || tbc.lv? || tbc.lvt?)
    false # GB6
  elsif (lbc.lv? || lbc.v?) && (tbc.v? || tbc.t?)
    false # GB7
  elsif (lbc.lvt? || lbc.t?) && tbc.t?
    false # GB8
  elsif tbc.extend? || tbc.zwj?
    false # GB9
  elsif tbc.spacing_mark?
    false # GB9a
  elsif lbc.prepend?
    false # GB9b
  elsif lbc.extended_plus_zero_width? && tbc.extended_pictographic?
    false # GB11 (requires additional handling)
  elsif lbc.regional_indicator? && tbc.regional_indicator?
    false # GB12/13 (requires additional handling)
  else
    true # GB999
  end
end
# :nodoc:
#
# Convenience overload: looks up the properties of *c1* and *c2* and
# delegates to the stateful property-based overload, passing *state* along.
def self.break?(c1 : Char, c2 : Char, state : Property) : {Bool, Property}
  left = Property.from(c1)
  right = Property.from(c2)
  break?(left, right, state)
end
# :nodoc:
#
# Decides whether a grapheme cluster break is allowed between a character
# with property *lbc* and the directly following character with property *tbc*.
#
# Evaluation of GB11 (grapheme breaks between emoji zwj sequences) and
# GB12/13 (regional indicator code points) requires knowledge of previous
# characters, which is carried in the *state* argument. The method returns a
# tuple of the break decision and the updated state; the caller is expected
# to store that state and pass it in on the next call.
# The initial value is `Property::Start`.
#
# The implementation is inspired by https://github.com/JuliaStrings/utf8proc/blob/462093b3924c7491defc67fda4bc7a27baf9b088/utf8proc.c#L291
def self.break?(lbc : Property, tbc : Property, state : Property) : {Bool, Property}
  # On the very first call there is no history yet, so the left-hand property
  # is used. Afterwards the tracked state stands in for the left-hand side.
  state = lbc if state.start?
  break_permitted = break?(state, tbc)

  next_state = if state == tbc && tbc.regional_indicator?
                 # Special support for GB12/13 made possible by GB999. After two
                 # regional indicator codepoints we want to force a break. This is
                 # done by recording `Property::Any` instead of the second
                 # regional indicator, which forces a break after that character
                 # according to GB999 (unless of course such a break is forbidden
                 # by a different rule such as GB9).
                 Property::Any
               elsif state.extended_pictographic?
                 # Special support for GB11 (emoji extend* zwj / emoji)
                 if tbc.extend?
                   # fold EXTEND codepoints into the emoji
                   Property::ExtendedPictographic
                 elsif tbc.zwj?
                   # state to record the emoji+zwj combo
                   Property::ExtendedPlusZeroWidth
                 else
                   tbc
                 end
               else
                 tbc
               end

  {break_permitted, next_state}
end
end
end