Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions src/parse_to_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,68 @@ mod tests {
assert_eq!(err.range().end, 11);
assert_eq!(err.kind().clone(), ParseErrorKind::UnexpectedToken);
}

#[test]
fn it_should_parse_surrogate_pair() {
  // Per RFC 8259 § 7, a character outside the BMP such as 𝄞 (U+1D11E) is
  // written in JSON as the UTF-16 surrogate pair \uD834\uDD1E.
  let src = r#""\uD834\uDD1E""#;
  let parsed = parse_to_value(src, &Default::default());
  match parsed.unwrap().unwrap() {
    // The two escapes must collapse into a single decoded character.
    JsonValue::String(text) => assert_eq!("\u{1D11E}", text.as_ref()),
    other => panic!("Expected string value, got {:?}", other),
  }
}

#[test]
fn it_should_parse_multiple_surrogate_pairs() {
  // Two back-to-back surrogate pairs must decode to two separate characters.
  let src = r#""\uD834\uDD1E\uD834\uDD1E""#;
  let parsed = parse_to_value(src, &Default::default());
  match parsed.unwrap().unwrap() {
    JsonValue::String(text) => assert_eq!("\u{1D11E}\u{1D11E}", text.as_ref()),
    other => panic!("Expected string value, got {:?}", other),
  }
}

#[test]
fn it_should_parse_mixed_escapes_with_surrogate_pairs() {
  // Encodes "A𝄞B": plain \uXXXX escapes on either side of a surrogate pair.
  let src = r#""\u0041\uD834\uDD1E\u0042""#;
  let parsed = parse_to_value(src, &Default::default());
  match parsed.unwrap().unwrap() {
    JsonValue::String(text) => assert_eq!("A\u{1D11E}B", text.as_ref()),
    other => panic!("Expected string value, got {:?}", other),
  }
}

#[test]
fn it_should_error_on_unpaired_high_surrogate_with_text() {
  // The high surrogate is followed by an ordinary character, not a second escape.
  let src = r#""\uD834x""#;
  let outcome = parse_to_value(src, &Default::default());
  let message = outcome.err().unwrap().to_string();
  assert!(message.contains("unpaired high surrogate"));
}

#[test]
fn it_should_error_on_unpaired_high_surrogate_at_eof() {
  // The string literal closes immediately after the high surrogate escape.
  let src = r#""\uD834""#;
  let outcome = parse_to_value(src, &Default::default());
  let message = outcome.err().unwrap().to_string();
  assert!(message.contains("unpaired high surrogate"));
}

#[test]
fn it_should_error_on_high_surrogate_followed_by_non_low_surrogate() {
  // The second escape is well-formed (\u0041) but is not in the low surrogate range.
  let src = r#""\uD834\u0041""#;
  let outcome = parse_to_value(src, &Default::default());
  let message = outcome.err().unwrap().to_string();
  assert!(message.contains("not followed by low surrogate"));
}

#[test]
fn it_should_error_on_unpaired_low_surrogate() {
  // Guards the pre-existing rule that a low surrogate on its own is rejected.
  let src = r#""\uDC00""#;
  let outcome = parse_to_value(src, &Default::default());
  let message = outcome.err().unwrap().to_string();
  assert!(message.contains("unpaired low surrogate"));
}
}
2 changes: 1 addition & 1 deletion src/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ mod tests {
// Regression test (issue 6, going by the test name): scanning the lone escape
// `\uDF06` must report an invalid unicode escape sequence at line 1 column 2.
// NOTE(review): this diff view lists two expected-message lines. The hunk records
// 1 addition and 1 deletion, so presumably the first is the removed text and the
// second (with the "(unpaired low surrogate)" suffix) is the current one — confirm
// against the file itself.
fn it_errors_on_invalid_utf8_char_for_issue_6() {
assert_has_error(
"\"\\uDF06\"",
"Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",
"Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
);
}

Expand Down
107 changes: 103 additions & 4 deletions src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,16 +151,115 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
}
}

let hex_u32 = u32::from_str_radix(&hex_text, 16);
let hex_char = match hex_u32.ok().and_then(std::char::from_u32) {
Some(hex_char) => hex_char,
None => {
let hex_value = match u32::from_str_radix(&hex_text, 16) {
Ok(v) => v,
Err(_) => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
});
}
};

// Check if this is a high surrogate (0xD800-0xDBFF)
let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
// High surrogate - must be followed by low surrogate
// Peek ahead for \uXXXX pattern
let next_char = chars.move_next_char();
if next_char != Some('\\') {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired high surrogate)",
hex_text
)),
});
}

let next_char = chars.move_next_char();
if next_char != Some('u') {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired high surrogate)",
hex_text
)),
});
}

// Parse the second \uXXXX
let mut hex_text2 = String::new();
for _ in 0..4 {
let current_char = chars.move_next_char();
if !is_hex(current_char) {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::ExpectedFourHexDigits,
});
}
if let Some(current_char) = current_char {
hex_text2.push(current_char);
}
}

let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
Ok(v) => v,
Err(_) => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2),
});
}
};

// Verify it's a low surrogate (0xDC00-0xDFFF)
if !(0xDC00..=0xDFFF).contains(&hex_value2) {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (high surrogate not followed by low surrogate)",
hex_text
)),
});
}

// Combine surrogate pair using RFC 8259 formula
let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;

match std::char::from_u32(code_point) {
Some(c) => c,
None => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{}\\u{} (invalid surrogate pair)",
hex_text, hex_text2
)),
});
}
}
} else if (0xDC00..=0xDFFF).contains(&hex_value) {
// Low surrogate without high surrogate
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired low surrogate)",
hex_text
)),
});
} else {
// Normal unicode escape
match std::char::from_u32(hex_value) {
Some(hex_char) => hex_char,
None => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
});
}
}
};

text.push(hex_char);
last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
} else {
Expand Down
Loading