Skip to content

Commit

Permalink
UTF-8 encoding/decoding fixes related to #317
Browse files Browse the repository at this point in the history
  • Loading branch information
asterite committed Dec 18, 2014
1 parent 76aa655 commit 64d2c85
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 19 deletions.
33 changes: 33 additions & 0 deletions spec/std/char_reader_spec.cr
Expand Up @@ -77,4 +77,37 @@ describe "CharReader" do
fail "reader each shouldn't yield on empty string"
end
end

# Bytes 0x80..0xBF are continuation bytes and 0xC0/0xC1 could only start an
# overlong encoding of an ASCII code point, so none of them may lead a
# sequence. Both ends of the rejected range (0x80 and 0xC1) are probed.
it "errors if 0x80 <= first_byte < 0xC2" do
expect_raises { CharReader.new(String.new [0x80_u8].buffer) }
expect_raises { CharReader.new(String.new [0xC1_u8].buffer) }
end

# 0xD0 announces a two-byte sequence, but 0x00 does not carry the 10xxxxxx
# continuation marker.
it "errors if (second_byte & 0xC0) != 0x80" do
expect_raises { CharReader.new(String.new [0xd0_u8, 0_u8].buffer) }
end

# E0 9F A0 is an overlong three-byte form: its payload is below U+0800 and
# would fit in two bytes.
it "errors if first_byte == 0xE0 && second_byte < 0xA0" do
expect_raises { CharReader.new(String.new [0xe0_u8, 0x9F_u8, 0xA0_u8].buffer) }
end

# Valid three-byte lead and second byte, but the third byte (0x00) is not a
# continuation byte.
it "errors if first_byte < 0xF0 && (third_byte & 0xC0) != 0x80" do
expect_raises { CharReader.new(String.new [0xe0_u8, 0xA0_u8, 0_u8].buffer) }
end

# F0 8F .. is an overlong four-byte form: its payload is below U+10000 and
# would fit in three bytes. Only three bytes are supplied; the expectation is
# that the sequence is rejected on the second byte alone.
it "errors if first_byte == 0xF0 && second_byte < 0x90" do
expect_raises { CharReader.new(String.new [0xf0_u8, 0x8F_u8, 0xA0_u8].buffer) }
end

# F4 90 .. would decode to a code point above U+10FFFF, the upper limit of
# Unicode.
it "errors if first_byte == 0xF4 && second_byte >= 0x90" do
expect_raises { CharReader.new(String.new [0xf4_u8, 0x90_u8, 0xA0_u8].buffer) }
end

# First three bytes are acceptable for a four-byte sequence, but the fourth
# byte (0x00) is not a continuation byte.
it "errors if first_byte < 0xF5 && (fourth_byte & 0xC0) != 0x80" do
expect_raises { CharReader.new(String.new [0xf4_u8, 0x8F_u8, 0xA0_u8, 0_u8].buffer) }
end

# Lead bytes 0xF5 and above are never valid. The trailing bytes here are all
# well-formed continuation bytes, so the lead byte is the only reason left to
# reject the sequence.
it "errors if first_byte >= 0xF5" do
expect_raises { CharReader.new(String.new [0xf5_u8, 0x8F_u8, 0xA0_u8, 0xA0_u8].buffer) }
end
end
18 changes: 10 additions & 8 deletions src/char.cr
Expand Up @@ -143,8 +143,10 @@ struct Char
end

def each_byte
# See http://en.wikipedia.org/wiki/UTF-8#Sample_code

c = ord
if c <= 0x7f
if c < 0x80
# 0xxxxxxx
yield c.to_u8
elsif c <= 0x7ff
Expand All @@ -153,15 +155,15 @@ struct Char
yield (0x80 | c & 0x3f).to_u8
elsif c <= 0xffff
# 1110xxxx 10xxxxxx 10xxxxxx
yield (0xe0 | c >> 12).to_u8
yield (0x80 | c >> 6 & 0x3f).to_u8
yield (0x80 | c & 0x3f).to_u8
yield (0xe0 | (c >> 12)).to_u8
yield (0x80 | ((c >> 6) & 0x3f)).to_u8
yield (0x80 | (c & 0x3f)).to_u8
elsif c <= 0x10ffff
# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
yield (0xf0 | c >> 18).to_u8
yield (0x80 | c >> 12 & 0x3f).to_u8
yield (0x80 | c >> 6 & 0x3f).to_u8
yield (0x80 | c & 0x3f).to_u8
yield (0xf0 | (c >> 18)).to_u8
yield (0x80 | ((c >> 12) & 0x3f)).to_u8
yield (0x80 | ((c >> 6) & 0x3f)).to_u8
yield (0x80 | (c & 0x3f)).to_u8
else
raise "Invalid char value"
end
Expand Down
52 changes: 41 additions & 11 deletions src/char_reader.cr
Expand Up @@ -71,26 +71,60 @@ struct CharReader
end

# Decodes the UTF-8 sequence that starts at byte offset `pos` and yields two
# values to the block: the decoded code point (as the UInt32 produced by
# `byte_at` arithmetic) and the number of bytes the sequence occupies (1..4).
#
# Every ill-formed sequence raises through `invalid_byte_sequence`:
# * a lead byte in 0x80..0xC1 or >= 0xF5,
# * a trailing byte that is not of the form 10xxxxxx,
# * overlong forms (E0 followed by < A0, F0 followed by < 90),
# * UTF-16 surrogates U+D800..U+DFFF (ED followed by >= A0),
# * code points above U+10FFFF (F4 followed by >= 90).
#
# Trailing bytes are read lazily, one at a time, and only after the bytes
# before them have been accepted.
private def decode_char_at(pos)
  # See http://en.wikipedia.org/wiki/UTF-8#Sample_code

  first = byte_at(pos)

  # 0xxxxxxx: plain ASCII.
  if first < 0x80
    return yield first, 1
  end

  # 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 can only start overlong
  # encodings of ASCII.
  if first < 0xc2
    invalid_byte_sequence
  end

  second = byte_at(pos + 1)
  if (second & 0xc0) != 0x80
    invalid_byte_sequence
  end

  # 110xxxxx 10xxxxxx
  if first < 0xe0
    # 0x3080 == (0xC0 << 6) + 0x80: the marker bits of both bytes, removed in
    # one subtraction. Subtracting last keeps every intermediate value
    # non-negative, so the result does not depend on unsigned wraparound.
    return yield (first << 6) + second - 0x3080, 2
  end

  third = byte_at(pos + 2)
  if (third & 0xc0) != 0x80
    invalid_byte_sequence
  end

  # 1110xxxx 10xxxxxx 10xxxxxx
  if first < 0xf0
    # Overlong: the value would fit in two bytes.
    if first == 0xe0 && second < 0xa0
      invalid_byte_sequence
    end

    # ED A0..BF xx would decode to a UTF-16 surrogate (U+D800..U+DFFF), which
    # must not appear in UTF-8.
    if first == 0xed && second >= 0xa0
      invalid_byte_sequence
    end

    # 0xE2080 == (0xE0 << 12) + (0x80 << 6) + 0x80
    return yield (first << 12) + (second << 6) + third - 0xE2080, 3
  end

  # Overlong: the value would fit in three bytes.
  if first == 0xf0 && second < 0x90
    invalid_byte_sequence
  end

  # Beyond U+10FFFF.
  if first == 0xf4 && second >= 0x90
    invalid_byte_sequence
  end

  fourth = byte_at(pos + 3)
  if (fourth & 0xc0) != 0x80
    invalid_byte_sequence
  end

  # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if first < 0xf5
    # 0x3C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
    return yield (first << 18) + (second << 12) + (third << 6) + fourth - 0x3C82080, 4
  end

  # Lead bytes 0xF5..0xFF are never valid.
  invalid_byte_sequence
end

# Raises the error used for every ill-formed UTF-8 sequence detected by
# `decode_char_at`, keeping the message in a single place. Never returns.
private def invalid_byte_sequence
raise "Invalid byte sequence in UTF-8 string"
end

Expand All @@ -105,8 +139,4 @@ struct CharReader
# Returns the byte at offset `i` of the underlying string, widened to UInt32.
# The wider type presumably exists so `decode_char_at` can shift a byte left
# by up to 18 bits without losing the high bits.
# NOTE(review): `unsafe_byte_at` suggests no bounds check is performed, so
# callers are responsible for passing a sensible offset - confirm.
private def byte_at(i)
@string.unsafe_byte_at(i).to_u32
end

# Returns the low six bits of the byte at offset `i`, i.e. the payload bits of
# a UTF-8 continuation byte (10xxxxxx). It does not check that the two marker
# bits are actually `10`, so a malformed trailing byte is silently accepted.
# NOTE(review): the hunk header just above this section (-105,8 +139,4)
# indicates the commit shown here deletes this helper in favour of explicit
# validation - confirm before relying on it.
private def byte_masked_at(i)
byte_at(i) & 0x3f
end
end

0 comments on commit 64d2c85

Please sign in to comment.