Merge pull request #486 from epage/change
fix(parser): Improve detection of ignorable cases
epage committed May 10, 2022
2 parents (8cd9cef + fd53983), commit 5e7e699
Showing 1 changed file: crates/typos/src/tokens.rs (92 changes: 68 additions, 24 deletions)
```diff
@@ -142,6 +142,7 @@ mod parser {
             + nom::Slice<std::ops::RangeTo<usize>>
             + nom::Offset
             + Clone
+            + Default
             + PartialEq
             + std::fmt::Debug,
         <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
```
```diff
@@ -172,6 +173,7 @@ mod parser {
             + nom::Slice<std::ops::RangeTo<usize>>
             + nom::Offset
             + Clone
+            + Default
             + PartialEq
             + std::fmt::Debug,
         <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
```
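These two hunks add only a `Default` bound on the generic input type. The reason appears in the rewritten `sep1` further down: its `map(eof, |_| T::default())` arm has to produce an empty input value when the separator is end-of-file.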
```diff
@@ -181,15 +183,15 @@ mod parser {
             // CAUTION: If adding an ignorable literal, if it doesn't start with `is_xid_continue`,
             // - Update `is_ignore_char` to make sure `sep1` doesn't eat it all up
             // - Make sure you always consume it
-            terminated(uuid_literal, sep1),
-            terminated(hash_literal, sep1),
-            terminated(hex_literal, sep1),
-            terminated(dec_literal, sep1),
-            terminated(ordinal_literal, sep1),
-            terminated(base64_literal, sep1),
-            terminated(email_literal, sep1),
-            terminated(url_literal, sep1),
-            terminated(css_color, sep1),
+            terminated(uuid_literal, peek(sep1)),
+            terminated(hash_literal, peek(sep1)),
+            terminated(base64_literal, peek(sep1)), // base64 should be quoted or something
+            terminated(ordinal_literal, peek(sep1)),
+            terminated(hex_literal, peek(sep1)),
+            terminated(dec_literal, peek(sep1)), // Allow digit-prefixed words
+            terminated(email_literal, peek(sep1)),
+            terminated(url_literal, peek(sep1)),
+            terminated(css_color, peek(sep1)),
             c_escape,
             printf,
             other,
```
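The substantive change in this hunk is `sep1` becoming `peek(sep1)`: the separator after an ignorable literal is now asserted but not consumed, so the boundary character stays in the input for whatever runs next. A minimal sketch of the difference, using plain nom 7 combinators; `digit1` and `tag("-")` are stand-ins for the crate's literal parsers, not part of this diff:

```rust
use nom::{
    bytes::complete::tag,
    character::complete::digit1,
    combinator::peek,
    sequence::terminated,
    IResult,
};

// Consuming form: the separator is eaten together with the literal.
fn consuming(input: &str) -> IResult<&str, &str> {
    terminated(digit1, tag("-"))(input)
}

// Peeking form (what this commit switches to): the separator must be
// present, but remains in the input.
fn peeking(input: &str) -> IResult<&str, &str> {
    terminated(digit1, peek(tag("-")))(input)
}

fn main() {
    assert_eq!(consuming("123-abc"), Ok(("abc", "123")));
    assert_eq!(peeking("123-abc"), Ok(("-abc", "123")));
}
```

Leaving the separator in place means one boundary character can both terminate a candidate literal and start the next token, which is what the reworked `sep1` below relies on.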
```diff
@@ -198,10 +200,24 @@ mod parser {
 
     fn sep1<T>(input: T) -> IResult<T, T>
     where
-        T: nom::InputTakeAtPosition + std::fmt::Debug,
+        T: nom::InputTakeAtPosition
+            + nom::InputTake
+            + nom::InputIter
+            + nom::InputLength
+            + nom::Slice<std::ops::RangeFrom<usize>>
+            + nom::Slice<std::ops::RangeTo<usize>>
+            + nom::Offset
+            + Clone
+            + Default
+            + PartialEq
+            + std::fmt::Debug,
         <T as nom::InputTakeAtPosition>::Item: AsChar + Copy,
+        <T as nom::InputIter>::Item: AsChar + Copy,
     {
-        take_while1(is_ignore_char)(input)
+        alt((
+            recognize(satisfy(|c| !is_xid_continue(c))),
+            map(eof, |_| T::default()),
+        ))(input)
     }
 
     fn other<T>(input: T) -> IResult<T, T>
```
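`sep1` itself changes meaning: it previously consumed a whole run of ignorable characters via `take_while1(is_ignore_char)`; now it matches exactly one character that cannot continue an identifier, or end-of-input. The `eof` arm still has to return a value of the input type, which is what the new `Default` bounds provide. A self-contained sketch over `&str`; the `is_xid_continue` below is a rough stand-in for the real Unicode XID check, which is not part of this diff:

```rust
use nom::{
    branch::alt,
    character::complete::satisfy,
    combinator::{eof, map, recognize},
    IResult,
};

// Rough approximation of `is_xid_continue` (the real check uses the
// Unicode XID tables).
fn is_xid_continue(c: char) -> bool {
    c.is_alphanumeric() || c == '_'
}

// One separator: a single non-identifier character, or end of input.
fn sep1(input: &str) -> IResult<&str, &str> {
    alt((
        recognize(satisfy(|c| !is_xid_continue(c))),
        map(eof, |_| <&str>::default()), // "" stands in for the empty rest
    ))(input)
}

fn main() {
    assert_eq!(sep1(" rest"), Ok(("rest", " ")));
    assert_eq!(sep1(""), Ok(("", "")));
}
```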
```diff
@@ -391,7 +407,16 @@ mod parser {
         <T as nom::InputIter>::Item: AsChar + Copy,
     {
         let (padding, captured) = take_while1(is_base64_digit)(input.clone())?;
+
+        const CHUNK: usize = 4;
+        let padding_offset = input.offset(&padding);
+        let mut padding_len = CHUNK - padding_offset % CHUNK;
+        if padding_len == CHUNK {
+            padding_len = 0;
+        }
+
         if captured.input_len() < 90
+            && padding_len == 0
             && captured
                 .iter_elements()
                 .all(|c| !['/', '+'].contains(&c.as_char()))
```
```diff
@@ -402,14 +427,8 @@ mod parser {
             )));
         }
 
-        const CHUNK: usize = 4;
-        let padding_offset = input.offset(&padding);
-        let mut padding_len = CHUNK - padding_offset % CHUNK;
-        if padding_len == CHUNK {
-            padding_len = 0;
-        }
-
         let (after, _) = take_while_m_n(padding_len, padding_len, is_base64_padding)(padding)?;
+
         let after_offset = input.offset(&after);
         Ok(input.take_split(after_offset))
     }
```
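These two hunks hoist the `CHUNK` arithmetic above the early return and add `padding_len == 0` to its condition: a short, plain-looking run is now rejected as base64 only when it needs no `=` padding, so a correctly padded value (like the `sha512-...==` string in the new `tokenize_ignore_base64_case_3` test below) falls through to the padding check and is consumed. A small standalone sketch of the arithmetic, not the crate's code:

```rust
// base64 output comes in 4-character chunks, so a captured run of length
// `len` needs (4 - len % 4) % 4 trailing `=` characters to be complete.
fn expected_padding(len: usize) -> usize {
    const CHUNK: usize = 4;
    let mut padding_len = CHUNK - len % CHUNK;
    if padding_len == CHUNK {
        padding_len = 0; // already a whole number of chunks
    }
    padding_len
}

fn main() {
    assert_eq!(expected_padding(88), 0); // whole chunks: no '=' needed
    // A base64-encoded sha512 digest is 86 characters before its "==".
    assert_eq!(expected_padding(86), 2);
}
```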
```diff
@@ -1132,12 +1151,6 @@ mod test {
             ("D41D8CD98F00B204E9800998ECF8427E", true),
             // A 31-character hexadecimal string: too short to be a hash.
             ("D41D8CD98F00B204E9800998ECF8427", false),
-            // A 40-character string, but with non-hex characters (in
-            // several positions.)
-            ("Z85865fd0412e40d041e861506bb3ac11a3a91e3", false),
-            ("485865fd04Z2e40d041e861506bb3ac11a3a91e3", false),
-            ("485865fd0412e40d041e8Z1506bb3ac11a3a91e3", false),
-            ("485865fd0412e40d041e861506bb3ac11a3a91eZ", false),
         ] {
             let input = format!("Hello {} World", hashlike);
             let mut expected: Vec<Identifier> = vec![
```
```diff
@@ -1154,6 +1167,22 @@ mod test {
         }
     }
 
+    #[test]
+    fn tokenize_hash_in_mixed_path() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = " /// at /rustc/c7087fe00d2ba919df1d813c040a5d47e43b0fe7\\/src\\libstd\\rt.rs:51";
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("at", Case::None, 25),
+            // `rustc...` looks like the start of a URL
+            Identifier::new_unchecked("rs", Case::None, 91),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn tokenize_ignore_base64_case_1() {
         let parser = TokenizerBuilder::new().build();
```
```diff
@@ -1181,6 +1210,21 @@ mod test {
         assert_eq!(expected, actual);
     }
 
+    #[test]
+    fn tokenize_ignore_base64_case_3() {
+        let parser = TokenizerBuilder::new().build();
+
+        let input = r#" "integrity": "sha512-hCmlUAIlUiav8Xdqw3Io4LcpA1DOt7h3LSTAC4G6JGHFFaWzI6qvFt9oilvl8BmkbBRX1IhM90ZAmpk68zccQA==","#;
+        let expected: Vec<Identifier> = vec![
+            Identifier::new_unchecked("integrity", Case::None, 8),
+            Identifier::new_unchecked("sha512", Case::None, 21),
+        ];
+        let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
+        assert_eq!(expected, actual);
+        let actual: Vec<_> = parser.parse_str(input).collect();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn tokenize_ignore_email() {
         let parser = TokenizerBuilder::new().build();
```
