diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr index 660ad492f432..f683bc7c0a34 100644 --- a/spec/std/string/utf16_spec.cr +++ b/spec/std/string/utf16_spec.cr @@ -43,5 +43,11 @@ describe "String UTF16" do input = Slice[0xdc00_u16, 0xd800_u16] String.from_utf16(input).should eq("\u{fffd}\u{fffd}") end + + it "handles null bytes" do + slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16] + String.from_utf16(slice).should eq("hi\0000𐂥") + String.from_utf16(slice.to_unsafe).should eq("hi") + end end end diff --git a/src/string/utf16.cr b/src/string/utf16.cr index 3a308debddbc..e733780d9aba 100644 --- a/src/string/utf16.cr +++ b/src/string/utf16.cr @@ -53,7 +53,15 @@ class String # slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16] # String.from_utf16(slice) # => "hi 𐂥" # ``` - def self.from_utf16(slice : Slice(UInt16)) : String + # + # If *slice* is a pointer, the string ends when a zero value is found. + # + # ``` + # slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16] + # String.from_utf16(slice) # => "hi\0000𐂥" + # String.from_utf16(slice.to_unsafe) # => "hi" + # ``` + def self.from_utf16(slice : Slice(UInt16) | Pointer(UInt16)) : String bytesize = 0 size = 0 @@ -97,4 +105,29 @@ class String i += 1 end end + + # Yields each decoded char in the given pointer, stopping at the first null byte. + private def self.each_utf16_char(pointer : Pointer(UInt16)) + loop do + byte = pointer.value.to_i + break if byte == 0 + + if byte < 0xd800 || byte >= 0xe000 + # One byte + codepoint = byte + elsif 0xd800 <= byte < 0xdc00 && + 0xdc00 <= (pointer + 1).value <= 0xdfff + # Surrougate pair + pointer = pointer + 1 + codepoint = ((byte - 0xd800) << 10) + (pointer.value - 0xdc00) + 0x10000 + else + # Invalid byte + codepoint = 0xfffd + end + + yield codepoint.chr + + pointer = pointer + 1 + end + end end