Permalink
Browse files

Support escaped unicode surrogate pairs.

  • Loading branch information...
1 parent 77a2e98 commit d2ce667c604cb85b0e4697fddee3796a9530912b @dgraham committed Aug 4, 2010
Showing with 54 additions and 14 deletions.
  1. +3 −3 lib/json/stream/buffer.rb
  2. +39 −11 lib/json/stream/parser.rb
  3. +12 −0 test/parser_test.rb
@@ -47,9 +47,9 @@ def <<(data)
end
end
end
- encoded = bytes.pack('C*').force_encoding(Encoding::UTF_8)
- error('Invalid UTF-8 byte sequence') unless encoded.valid_encoding?
- encoded
+ bytes.pack('C*').force_encoding(Encoding::UTF_8).tap do |str|
+ error('Invalid UTF-8 byte sequence') unless str.valid_encoding?
+ end
end
private
View
@@ -36,6 +36,7 @@ class Parser
PLUS = '+'
POINT = '.'
EXPONENT = /[eE]/
+ B,F,N,R,T,U = %w[b f n r t u]
# Parses a full JSON document from a String or an IO stream and returns
# the parsed object graph. For parsing small JSON documents with small
@@ -147,22 +148,22 @@ def <<(data)
when QUOTE, BACKSLASH, SLASH
@buf << ch
@state = :start_string
- when 'b'
+ when B
@buf << "\b"
@state = :start_string
- when 'f'
+ when F
@buf << "\f"
@state = :start_string
- when 'n'
+ when N
@buf << "\n"
@state = :start_string
- when 'r'
+ when R
@buf << "\r"
@state = :start_string
- when 't'
+ when T
@buf << "\t"
@state = :start_string
- when 'u'
+ when U
@state = :unicode_escape
else
error("Expected escaped character")
@@ -172,12 +173,39 @@ def <<(data)
when HEX
@unicode << ch
if @unicode.size == 4
- @buf << @unicode.slice!(0, 4).hex
- @state = :start_string
+ codepoint = @unicode.slice!(0, 4).hex
+ if codepoint >= 0xD800 && codepoint <= 0xDBFF
+ error('Expected low surrogate pair half') if @stack[-1].is_a?(Fixnum)
+ @state = :start_surrogate_pair
+ @stack.push(codepoint)
+ elsif codepoint >= 0xDC00 && codepoint <= 0xDFFF
+ high = @stack.pop
+ error('Expected high surrogate pair half') unless high.is_a?(Fixnum)
+ pair = ((high - 0xD800) * 0x400) + (codepoint - 0xDC00) + 0x10000
+ @buf << pair
+ @state = :start_string
+ else
+ @buf << codepoint
+ @state = :start_string
+ end
end
else
error('Expected unicode escape hex digit')
end
+ when :start_surrogate_pair
+ case ch
+ when BACKSLASH
+ @state = :start_surrogate_pair_u
+ else
+ error('Expected low surrogate pair half')
+ end
+ when :start_surrogate_pair_u
+ case ch
+ when U
+ @state = :unicode_escape
+ else
+ error('Expected low surrogate pair half')
+ end
when :start_negative_number
case ch
when ZERO
@@ -368,13 +396,13 @@ def start_value(ch)
when QUOTE
@state = :start_string
@stack.push(:string)
- when 't'
+ when T
@state = :start_true
@buf << ch
- when 'f'
+ when F
@state = :start_false
@buf << ch
- when 'n'
+ when N
@state = :start_null
@buf << ch
when MINUS
View
@@ -241,6 +241,18 @@ def test_unicode_escape
assert_equal(expected, events(%q{ {"snow\\u26033 man": 1} }))
end
+ def test_unicode_escape_surrogate_pairs # Checks parser events for escaped UTF-16 surrogate halves: unpaired or misordered halves yield :error, a high+low pair yields one character.
+ expected = [:start_document, :start_array, :error] # every malformed input below should stop with :error right after the array opens
+ assert_equal(expected, events(%q{ ["\uD834"] })) # lone high half (0xD800..0xDBFF) with no low half after it
+ assert_equal(expected, events(%q{ ["\uD834\uD834"] })) # high half followed by a second high half
+ assert_equal(expected, events(%q{ ["\uDD1E"] })) # lone low half (0xDC00..0xDFFF) with no preceding high half
+ assert_equal(expected, events(%q{ ["\uDD1E\uDD1E"] })) # two low halves in a row
+
+ expected = [:start_document, :start_object, [:key, "\u{1D11E}"], # the pair D834/DD1E is expected to combine into the single code point U+1D11E
+ [:value, "g\u{1D11E}clef"], :end_object, :end_document] # the pair is also exercised mid-string, between ordinary characters
+ assert_equal(expected, events(%q{ {"\uD834\uDD1E": "g\uD834\uDD1Eclef"} })) # covers the pair both in an object key and in a string value
+ end
+
def test_array_trailing_comma
expected = [:start_document, :start_array, [:value, 12], :error]
assert_equal(expected, events('[12, ]'))

0 comments on commit d2ce667

Please sign in to comment.