Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 101 additions & 124 deletions src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,130 +136,10 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
let text = text.as_mut().unwrap();
text.push_str(previous_text);
if current_char == 'u' {
let mut hex_text = String::new();
// expect four hex values
for _ in 0..4 {
let current_char = chars.move_next_char();
if !is_hex(current_char) {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::ExpectedFourHexDigits,
});
}
if let Some(current_char) = current_char {
hex_text.push(current_char);
}
}

let hex_value = match u32::from_str_radix(&hex_text, 16) {
Ok(v) => v,
Err(_) => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
});
}
};

// Check if this is a high surrogate (0xD800-0xDBFF)
let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
// High surrogate - must be followed by low surrogate
// Peek ahead for \uXXXX pattern
let next_char = chars.move_next_char();
if next_char != Some('\\') {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired high surrogate)",
hex_text
)),
});
}

let next_char = chars.move_next_char();
if next_char != Some('u') {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired high surrogate)",
hex_text
)),
});
}

// Parse the second \uXXXX
let mut hex_text2 = String::new();
for _ in 0..4 {
let current_char = chars.move_next_char();
if !is_hex(current_char) {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::ExpectedFourHexDigits,
});
}
if let Some(current_char) = current_char {
hex_text2.push(current_char);
}
}

let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
Ok(v) => v,
Err(_) => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2),
});
}
};

// Verify it's a low surrogate (0xDC00-0xDFFF)
if !(0xDC00..=0xDFFF).contains(&hex_value2) {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (high surrogate not followed by low surrogate)",
hex_text
)),
});
}

// Combine surrogate pair using RFC 8259 formula
let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;

match std::char::from_u32(code_point) {
Some(c) => c,
None => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{}\\u{} (invalid surrogate pair)",
hex_text, hex_text2
)),
});
}
}
} else if (0xDC00..=0xDFFF).contains(&hex_value) {
// Low surrogate without high surrogate
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired low surrogate)",
hex_text
)),
});
} else {
// Normal unicode escape
match std::char::from_u32(hex_value) {
Some(hex_char) => hex_char,
None => {
return Err(ParseStringError {
byte_index: escape_start,
kind: ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text),
});
}
}
};

let hex_char = parse_hex_char(chars).map_err(|kind| ParseStringError {
byte_index: escape_start,
kind,
})?;
text.push(hex_char);
last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
} else {
Expand Down Expand Up @@ -308,6 +188,103 @@ pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
}
}

fn parse_hex_char<'a, T: CharProvider<'a>>(chars: &mut T) -> Result<char, ParseStringErrorKind> {
let mut hex_text = String::new();
// expect four hex values
for _ in 0..4 {
let current_char = chars.move_next_char();
if !is_hex(current_char) {
return Err(ParseStringErrorKind::ExpectedFourHexDigits);
}
if let Some(current_char) = current_char {
hex_text.push(current_char);
}
}

let hex_value = match u32::from_str_radix(&hex_text, 16) {
Ok(v) => v,
Err(_) => {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
}
};

// Check if this is a high surrogate (0xD800-0xDBFF)
let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
// High surrogate - must be followed by low surrogate
// Peek ahead for \uXXXX pattern
let next_char = chars.move_next_char();
if next_char != Some('\\') {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired high surrogate)",
hex_text
)));
}

let next_char = chars.move_next_char();
if next_char != Some('u') {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired high surrogate)",
hex_text
)));
}

// Parse the second \uXXXX
let mut hex_text2 = String::new();
for _ in 0..4 {
let current_char = chars.move_next_char();
if !is_hex(current_char) {
return Err(ParseStringErrorKind::ExpectedFourHexDigits);
}
if let Some(current_char) = current_char {
hex_text2.push(current_char);
}
}

let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
Ok(v) => v,
Err(_) => {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2));
}
};

// Verify it's a low surrogate (0xDC00-0xDFFF)
if !(0xDC00..=0xDFFF).contains(&hex_value2) {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (high surrogate not followed by low surrogate)",
hex_text
)));
}

// Combine surrogate pair using RFC 8259 formula
let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;

match std::char::from_u32(code_point) {
Some(c) => c,
None => {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{}\\u{} (invalid surrogate pair)",
hex_text, hex_text2
)));
}
}
} else if (0xDC00..=0xDFFF).contains(&hex_value) {
// Low surrogate without high surrogate
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
"{} (unpaired low surrogate)",
hex_text
)));
} else {
// Normal unicode escape
match std::char::from_u32(hex_value) {
Some(hex_char) => hex_char,
None => {
return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
}
}
};
Ok(hex_char)
}

fn is_hex(c: Option<char>) -> bool {
let Some(c) = c else {
return false;
Expand Down
Loading