/
encoding.cr
381 lines (352 loc) · 14 KB
/
encoding.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
class URI
# Encodes *string* so it can be safely placed as a potentially multi-segmented
# URI path, replacing special characters with URI escape sequences as needed.
#
# Unreserved characters such as ASCII letters, digits, and the characters
# `_.-~` are not encoded, as well as the character `/` which represents a
# segment separator in hierarchical paths ([RFC 3986 §3.3](https://datatracker.ietf.org/doc/html/rfc3986#section-3.3)).
#
# ```
# require "uri"
#
# URI.encode_path("foo/bar/baz") # => "foo/bar/baz"
# URI.encode_path("hello world!") # => "hello%20world%21"
# URI.encode_path("put: it+й") # => "put%3A%20it%2B%D0%B9"
# ```
#
# * `.decode` is the reverse operation.
# * `.encode_path_segment` encodes a single path segment, escaping `/`.
def self.encode_path(string : String) : String
  # Delegate to the IO-based overload and collect its output in a new string.
  String.build do |buffer|
    encode_path(buffer, string)
  end
end
# :ditto:
def self.encode_path(io : IO, string : String) : Nil
  # The segment separator `/` and all unreserved characters are written
  # as-is; every other byte is percent-encoded.
  encode(string, io) do |byte|
    '/' === byte || URI.unreserved?(byte)
  end
end
# Encodes *string* so it can be safely placed inside a URI path segment,
# replacing special characters (including `/`) with URI escape sequences as needed.
#
# Unreserved characters such as ASCII letters, digits, and the characters
# `_.-~` are not encoded (see `.unreserved?`).
#
# ```
# require "uri"
#
# URI.encode_path_segment("foo;bar;baz") # => "foo%3Bbar%3Bbaz"
# URI.encode_path_segment("foo/bar/baz") # => "foo%2Fbar%2Fbaz"
# URI.encode_path_segment("foo,bar,baz") # => "foo%2Cbar%2Cbaz"
# URI.encode_path_segment("hello world!") # => "hello%20world%21"
# URI.encode_path_segment("put: it+й") # => "put%3A%20it%2B%D0%B9"
# ```
#
# * `.decode` is the reverse operation.
# * `.encode_path` encodes a path consisting of multiple segments, not escaping `/`.
# * `.encode_www_form` escapes space character as `+`.
def self.encode_path_segment(string : String) : String
  # Delegate to the IO-based overload and collect its output in a new string.
  String.build do |buffer|
    encode_path_segment(buffer, string)
  end
end
# :ditto:
def self.encode_path_segment(io : IO, string : String) : Nil
  # Only unreserved characters are written as-is, so `/` gets escaped too.
  encode(string, io) do |byte|
    URI.unreserved?(byte)
  end
end
# URL-decodes *string*.
#
# ```
# require "uri"
#
# URI.decode("hello%20world!") # => "hello world!"
# URI.decode("put:%20it+%D0%B9") # => "put: it+й"
# URI.decode("http://example.com/Crystal%20is%20awesome%20=)") # => "http://example.com/Crystal is awesome =)"
# ```
#
# By default, `+` is decoded literally. If *plus_to_space* is `true`, `+` is
# decoded as space character (`0x20`). Percent-encoded values such as `%20`
# and `%2B` are always decoded as characters with the respective codepoint.
#
# ```
# require "uri"
#
# URI.decode("peter+%2B+paul") # => "peter+++paul"
# URI.decode("peter+%2B+paul", plus_to_space: true) # => "peter + paul"
# ```
#
# * `.encode` is the reverse operation.
# * `.decode_www_form` decodes plus as space by default.
def self.decode(string : String, *, plus_to_space : Bool = false) : String
  # Fast path: with nothing to decode the input is returned unchanged.
  return string unless needs_decode?(string, plus_to_space: plus_to_space)

  String.build do |io|
    # The block never vetoes decoding, so every escape sequence is decoded.
    decode!(string, io, plus_to_space: plus_to_space) { false }
  end
end
# URL-decodes a string and writes the result to *io*.
#
# See `.decode(string : String, *, plus_to_space : Bool = false) : String` for details.
def self.decode(string : String, io : IO, *, plus_to_space : Bool = false) : Nil
  # Forward to the block-taking overload with a block that never vetoes
  # decoding, so every escape sequence is decoded.
  decode(string, io, plus_to_space: plus_to_space) do |_byte|
    false
  end
end
# URL-encodes *string*.
#
# Reserved and unreserved characters are not escaped, so this only modifies some
# special characters as well as non-ASCII characters. `.reserved?` and `.unreserved?`
# provide more details on these character classes.
#
# ```
# require "uri"
#
# URI.encode("hello world!") # => "hello%20world!"
# URI.encode("put: it+й") # => "put:%20it+%D0%B9"
# URI.encode("http://example.com/Crystal is awesome =)") # => "http://example.com/Crystal%20is%20awesome%20=)"
# ```
#
# By default, the space character (`0x20`) is encoded as `%20` and `+` is
# encoded literally. If *space_to_plus* is `true`, space character is encoded
# as `+` and `+` is encoded as `%2B`:
#
# ```
# require "uri"
#
# URI.encode("peter + paul") # => "peter%20+%20paul"
# URI.encode("peter + paul", space_to_plus: true) # => "peter+%2B+paul"
# ```
#
# * `.decode` is the reverse operation.
# * `.encode_www_form` also escapes reserved characters.
@[Deprecated("Use `.encode_path` instead.")]
def self.encode(string : String, *, space_to_plus : Bool = false) : String
  # Delegate to the IO-based overload and collect its output in a new string.
  String.build do |buffer|
    encode(string, buffer, space_to_plus: space_to_plus)
  end
end
# URL-encodes *string* and writes the result to *io*.
#
# See `.encode(string : String, *, space_to_plus : Bool = false) : String` for details.
@[Deprecated("Use `.encode_path` instead.")]
def self.encode(string : String, io : IO, *, space_to_plus : Bool = false) : Nil
  # Both unreserved and reserved characters are written as-is; only the
  # remaining bytes are percent-encoded.
  encode(string, io, space_to_plus: space_to_plus) do |byte|
    URI.unreserved?(byte) || URI.reserved?(byte)
  end
end
# URL-decodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing).
#
# ```
# require "uri"
#
# URI.decode_www_form("hello%20world!") # => "hello world!"
# URI.decode_www_form("put:%20it+%D0%B9") # => "put: it й"
# URI.decode_www_form("http://example.com/Crystal+is+awesome+=)") # => "http://example.com/Crystal is awesome =)"
# ```
#
# By default, `+` is decoded as space character (`0x20`). If *plus_to_space*
# is `false`, `+` is decoded literally as `+`. Percent-encoded values such as
# `%20` and `%2B` are always decoded as characters with the respective codepoint.
#
# ```
# require "uri"
#
# URI.decode_www_form("peter+%2B+paul") # => "peter + paul"
# URI.decode_www_form("peter+%2B+paul", plus_to_space: false) # => "peter+++paul"
# ```
#
# * `.encode_www_form` is the reverse operation.
# * `.decode` decodes plus literally by default.
def self.decode_www_form(string : String, *, plus_to_space : Bool = true) : String
  # Same as `.decode`; only the default of *plus_to_space* differs.
  self.decode(string, plus_to_space: plus_to_space)
end
# URL-decodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing)
# and writes the result to *io*.
#
# See `.decode_www_form(string : String, *, plus_to_space : Bool = true) : String`
# for details.
def self.decode_www_form(string : String, io : IO, *, plus_to_space : Bool = true) : Nil
  # Same as the IO-based `.decode`; only the default of *plus_to_space* differs.
  self.decode(string, io, plus_to_space: plus_to_space)
end
# URL-encodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing).
#
# Reserved characters are escaped, unreserved characters are not.
# `.reserved?` and `.unreserved?` provide more details on these character
# classes.
#
# ```
# require "uri"
#
# URI.encode_www_form("hello world!") # => "hello+world%21"
# URI.encode_www_form("put: it+й") # => "put%3A+it%2B%D0%B9"
# URI.encode_www_form("http://example.com/Crystal is awesome =)") # => "http%3A%2F%2Fexample.com%2FCrystal+is+awesome+%3D%29"
# ```
#
# The encoded string returned from this method can be used as name or value
# components for a `application/x-www-form-urlencoded` format serialization.
# `URI::Params` provides a higher-level API for this use case.
#
# By default, the space character (`0x20`) is encoded as `+` and `+` is encoded
# as `%2B`. If *space_to_plus* is `false`, space character is encoded as `%20`;
# `+` is still encoded as `%2B` because it is a reserved character.
#
# ```
# require "uri"
#
# URI.encode_www_form("peter + paul") # => "peter+%2B+paul"
# URI.encode_www_form("peter + paul", space_to_plus: false) # => "peter%20%2B%20paul"
# ```
#
# * `.decode_www_form` is the reverse operation.
# * `.encode_path` escapes space character as `%20`.
def self.encode_www_form(string : String, *, space_to_plus : Bool = true) : String
  # Delegate to the IO-based overload and collect its output in a new string.
  String.build { |buffer| encode_www_form(string, buffer, space_to_plus: space_to_plus) }
end
# URL-encodes *string* as [`x-www-form-urlencoded`](https://url.spec.whatwg.org/#urlencoded-serializing)
# and writes the result to *io*.
#
# See `.encode_www_form(string : String, *, space_to_plus : Bool = true)` for
# details.
def self.encode_www_form(string : String, io : IO, *, space_to_plus : Bool = true) : Nil
  # Only unreserved characters pass through unescaped.
  self.encode(string, io, space_to_plus: space_to_plus) { |byte| URI.unreserved?(byte) }
end
# Returns whether given byte is reserved character defined in
# [RFC 3986 §2.2](https://datatracker.ietf.org/doc/html/rfc3986#section-2.2).
#
# Reserved characters are ':', '/', '?', '#', '[', ']', '@', '!',
# '$', '&', "'", '(', ')', '*', '+', ',', ';' and '='.
def self.reserved?(byte) : Bool
  # The reserved set is the union of gen-delims and sub-delims.
  return true if sub_delim?(byte)

  gen_delim?(byte)
end
# :nodoc:
# Returns `true` if the byte is a URI gen-delim (https://datatracker.ietf.org/doc/html/rfc3986#section-2.2).
def self.gen_delim?(byte)
  # Compare the byte, viewed as a character, against the gen-delims set.
  case byte.unsafe_chr
  when '#', '/', ':', '?', '@', '[', ']'
    true
  else
    false
  end
end
# :nodoc:
# Returns `true` if the byte is a URI sub-delim (https://datatracker.ietf.org/doc/html/rfc3986#section-2.2).
def self.sub_delim?(byte) : Bool
  # The range '&'..',' covers the contiguous characters & ' ( ) * + ,
  # and the remaining sub-delims are listed individually.
  case byte.unsafe_chr
  when '!', '$', ';', '=', '&'..','
    true
  else
    false
  end
end
# Returns whether given byte is unreserved character defined in
# [RFC 3986 §2.3](https://datatracker.ietf.org/doc/html/rfc3986#section-2.3).
#
# Unreserved characters are ASCII letters, ASCII digits, `_`, `.`, `-` and `~`.
def self.unreserved?(byte) : Bool
  # The four punctuation marks are checked explicitly; everything else is
  # unreserved only if it is an ASCII letter or digit.
  case symbol = byte.unsafe_chr
  when '-', '.', '_', '~'
    true
  else
    symbol.ascii_alphanumeric?
  end
end
# URL-decodes *string* and writes the result to *io*.
#
# The block is called for each percent-encoded ASCII character and determines
# whether the value is to be decoded. When the return value is falsey,
# the character is decoded. Non-ASCII characters are always decoded.
#
# By default, `+` is decoded literally. If *plus_to_space* is `true`, `+` is
# decoded as space character (`0x20`).
#
# This method enables some customization, but typical use cases can be implemented
# by either `.decode(string : String, *, plus_to_space : Bool = false) : String` or
# `.decode_www_form(string : String, *, plus_to_space : Bool = true) : String`.
def self.decode(string : String, io : IO, *, plus_to_space : Bool = false, &block) : Nil
  if needs_decode?(string, plus_to_space: plus_to_space)
    # Forward each candidate byte to the caller's block, which decides
    # whether that escape sequence stays encoded.
    decode!(string, io, plus_to_space: plus_to_space) { |byte| yield byte }
  else
    # Nothing to decode: copy the input through unchanged.
    io.write_string(string.to_slice)
  end
end
# Similar to the above, but it assumes there will be chars to decode (% or +).
private def self.decode!(string : String, io : IO, *, plus_to_space : Bool, &block) : Nil
  bytesize = string.bytesize

  # For a UTF-8 IO the decoded bytes are written directly. For any other
  # encoding they are collected in a memory buffer first and handed to
  # `io.write_string` in one piece at the end.
  buffer = io.encoding == "UTF-8" ? nil : IO::Memory.new(bytesize)
  target_io = buffer || io

  index = 0
  while index < bytesize
    current = string.to_unsafe[index]
    # `decode_one` consumes one or more bytes and returns the next index.
    index = decode_one(string, bytesize, index, current, current.unsafe_chr, target_io, plus_to_space) { |decoded| yield decoded }
  end

  io.write_string(buffer.to_slice) if buffer
end
# URL-encodes *string* and writes the result to an `IO`.
#
# The block is called for each ascii character (codepoint less than `0x80`) and
# determines whether the value is to be encoded. When the return value is falsey,
# the character is encoded. Non-ASCII characters are always encoded.
#
# By default, the space character (`0x20`) is encoded as `%20` and `+` is
# encoded literally. If *space_to_plus* is `true`, space character is encoded
# as `+` and `+` is encoded as `%2B`.
#
# This method enables some customization, but typical use cases can be implemented
# by either `.encode_path(string : String) : String`, `.encode_path_segment(string : String) : String` or
# `.encode_www_form(string : String, *, space_to_plus : Bool = true) : String`.
def self.encode(string : String, io : IO, space_to_plus : Bool = false, &block) : Nil
  # Walks the string byte by byte. Each byte is written to *io* either
  # literally, as `+` (space, when *space_to_plus* is set) or as an
  # upper-case `%XX` escape sequence.
  #
  # The block receives every ASCII byte; a truthy result means the byte may
  # be written literally. Nothing is returned: the method is restricted to
  # `Nil`, so no trailing expression is needed.
  string.each_byte do |byte|
    char = byte.unsafe_chr
    if space_to_plus && char == ' '
      io << '+'
    elsif char.ascii? && yield(byte) && !(space_to_plus && char == '+')
      # A literal `+` would be ambiguous when spaces are written as `+`,
      # so in that mode it falls through to the escape branch below.
      io << char
    else
      io << '%'
      # Pad to two hex digits.
      io << '0' if byte < 16
      byte.to_s(io, 16, upcase: true)
    end
  end
end
# :nodoc:
def self.decode_one(string, bytesize, i, byte, char, io, plus_to_space = false)
  # No veto block supplied: every valid escape sequence gets decoded.
  decode_one(string, bytesize, i, byte, char, io, plus_to_space) do |_encoded|
    false
  end
end
# :nodoc:
# Decodes one character. Private API.
def self.decode_one(string, bytesize, i, byte, char, io, plus_to_space = false, &)
# Processes the byte at index *i* of *string* (passed in as both *byte* and
# *char*), writes the result to *io* and returns the index at which the
# caller should continue.
#
# The block is yielded the decoded value of an ASCII escape sequence; a
# truthy result keeps the sequence encoded in the output.
if plus_to_space && char == '+'
# In plus-to-space mode a `+` becomes a single space byte.
io.write_byte ' '.ord.to_u8
i += 1
return i
end
# A `%` only starts an escape sequence when at least two more bytes follow;
# this bound also keeps the unchecked `to_unsafe` reads below inside the string.
if char == '%' && i < bytesize - 2
i += 1
first = string.to_unsafe[i]
first_num = first.unsafe_chr.to_i? 16
unless first_num
# First digit is not hexadecimal: emit the `%` literally and return the
# index of that digit so it is processed as an ordinary byte next.
io.write_byte byte
return i
end
i += 1
second = string.to_unsafe[i]
second_num = second.unsafe_chr.to_i? 16
unless second_num
# Second digit is not hexadecimal: emit `%` and the first digit literally
# and return the index of the second byte for regular processing.
io.write_byte byte
io.write_byte first
return i
end
# Combine the two hex digits into the decoded byte value.
encoded = (first_num * 16 + second_num).to_u8
i += 1
# The block is consulted for ASCII values only; a truthy result means the
# original three bytes (`%XX`) are copied through undecoded.
if encoded < 0x80 && yield encoded
io.write_byte byte
io.write_byte first
io.write_byte second
return i
end
io.write_byte encoded
return i
end
# Any other byte (including a `%` too close to the end) is copied verbatim.
io.write_byte byte
i += 1
i
end
# Checks whether there are any chars that actually need decoding.
# If not, decoding a string can be done much faster, because
# it's just the same string.
private def self.needs_decode?(string : String, *, plus_to_space : Bool) : Bool
  # A percent sign always requires a decoding pass.
  return true if string.includes?('%')

  # A plus sign only matters when it is to be turned into a space.
  plus_to_space && string.includes?('+')
end
end