diff --git a/src/parse_to_value.rs b/src/parse_to_value.rs index 975abe1..1f6c455 100644 --- a/src/parse_to_value.rs +++ b/src/parse_to_value.rs @@ -155,4 +155,68 @@ mod tests { assert_eq!(err.range().end, 11); assert_eq!(err.kind().clone(), ParseErrorKind::UnexpectedToken); } + + #[test] + fn it_should_parse_surrogate_pair() { + // RFC 8259 § 7: non-BMP character 𝄞 (U+1D11E) should be escaped as surrogate pair \uD834\uDD1E + let src = r#""\uD834\uDD1E""#; + let v = parse_to_value(src, &Default::default()).unwrap().unwrap(); + if let JsonValue::String(s) = v { + assert_eq!("\u{1D11E}", s.as_ref()); + } else { + panic!("Expected string value, got {:?}", v); + } + } + + #[test] + fn it_should_parse_multiple_surrogate_pairs() { + let src = r#""\uD834\uDD1E\uD834\uDD1E""#; + let v = parse_to_value(src, &Default::default()).unwrap().unwrap(); + if let JsonValue::String(s) = v { + assert_eq!("\u{1D11E}\u{1D11E}", s.as_ref()); + } else { + panic!("Expected string value, got {:?}", v); + } + } + + #[test] + fn it_should_parse_mixed_escapes_with_surrogate_pairs() { + // "A𝄞B" where 𝄞 is encoded as surrogate pair + let src = r#""\u0041\uD834\uDD1E\u0042""#; + let v = parse_to_value(src, &Default::default()).unwrap().unwrap(); + if let JsonValue::String(s) = v { + assert_eq!("A\u{1D11E}B", s.as_ref()); + } else { + panic!("Expected string value, got {:?}", v); + } + } + + #[test] + fn it_should_error_on_unpaired_high_surrogate_with_text() { + let src = r#""\uD834x""#; + let err = parse_to_value(src, &Default::default()).err().unwrap(); + assert!(err.to_string().contains("unpaired high surrogate")); + } + + #[test] + fn it_should_error_on_unpaired_high_surrogate_at_eof() { + let src = r#""\uD834""#; + let err = parse_to_value(src, &Default::default()).err().unwrap(); + assert!(err.to_string().contains("unpaired high surrogate")); + } + + #[test] + fn it_should_error_on_high_surrogate_followed_by_non_low_surrogate() { + let src = r#""\uD834\u0041""#; + let err = 
parse_to_value(src, &Default::default()).err().unwrap(); + assert!(err.to_string().contains("not followed by low surrogate")); + } + + #[test] + fn it_should_error_on_unpaired_low_surrogate() { + // A lone low surrogate must still be rejected, now with the more specific "unpaired low surrogate" message + let src = r#""\uDC00""#; + let err = parse_to_value(src, &Default::default()).err().unwrap(); + assert!(err.to_string().contains("unpaired low surrogate")); + } } diff --git a/src/scanner.rs b/src/scanner.rs index 9de46db..105fc7d 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -620,7 +620,7 @@ mod tests { fn it_errors_on_invalid_utf8_char_for_issue_6() { assert_has_error( "\"\\uDF06\"", - "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2", + "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2", ); } diff --git a/src/string.rs b/src/string.rs index e3437a3..cf4a17f 100644 --- a/src/string.rs +++ b/src/string.rs @@ -151,16 +151,115 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>( } } - let hex_u32 = u32::from_str_radix(&hex_text, 16); - let hex_char = match hex_u32.ok().and_then(std::char::from_u32) { - Some(hex_char) => hex_char, - None => { + let hex_value = match u32::from_str_radix(&hex_text, 16) { + Ok(v) => v, + Err(_) => { return Err(ParseStringError { byte_index: escape_start, kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text), }); } }; + + // Check if this is a high surrogate (0xD800-0xDBFF) + let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) { + // High surrogate - must be followed by low surrogate + // Advance past the next characters, which must start a second \uXXXX escape; any mismatch is reported as an error + let next_char = chars.move_next_char(); + if next_char != Some('\\') { + return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (unpaired high surrogate)", + hex_text + )), + }); + } + + let next_char = chars.move_next_char(); + if next_char != Some('u') { 
+ return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (unpaired high surrogate)", + hex_text + )), + }); + } + + // Parse the second \uXXXX + let mut hex_text2 = String::new(); + for _ in 0..4 { + let current_char = chars.move_next_char(); + if !is_hex(current_char) { + return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::ExpectedFourHexDigits, + }); + } + if let Some(current_char) = current_char { + hex_text2.push(current_char); + } + } + + let hex_value2 = match u32::from_str_radix(&hex_text2, 16) { + Ok(v) => v, + Err(_) => { + return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2), + }); + } + }; + + // Verify it's a low surrogate (0xDC00-0xDFFF) + if !(0xDC00..=0xDFFF).contains(&hex_value2) { + return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (high surrogate not followed by low surrogate)", + hex_text + )), + }); + } + + // Combine the surrogate pair using the UTF-16 decoding formula (RFC 2781 § 2.2) + let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000; + + match std::char::from_u32(code_point) { + Some(c) => c, + None => { + return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{}\\u{} (invalid surrogate pair)", + hex_text, hex_text2 + )), + }); + } + } + } else if (0xDC00..=0xDFFF).contains(&hex_value) { + // Low surrogate without high surrogate + return Err(ParseStringError { + byte_index: escape_start, + kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!( + "{} (unpaired low surrogate)", + hex_text + )), + }); + } else { + // Normal unicode escape + match std::char::from_u32(hex_value) { + Some(hex_char) => hex_char, + None => { + return Err(ParseStringError { + byte_index: escape_start, + kind: 
ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text), + }); + } + } + }; + text.push(hex_char); last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0); } else {