Skip to content

Commit

Permalink
fix: Improve the quality of symbols being reported
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Jun 14, 2019
1 parent c7ca904 commit d78713d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 26 deletions.
30 changes: 14 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,20 @@ pub fn process_file(
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokens::Symbol::parse(line) {
-            if let Ok(word) = std::str::from_utf8(token.token) {
-                // Correct tokens as-is
-                if let Some(correction) = dictionary.correct_str(word) {
-                    let col_num = token.offset;
-                    let msg = report::Message {
-                        path,
-                        line,
-                        line_num,
-                        col_num,
-                        word,
-                        correction,
-                        non_exhaustive: (),
-                    };
-                    report(msg);
-                }
+        for symbol in tokens::Symbol::parse(line) {
+            // Correct tokens as-is
+            if let Some(correction) = dictionary.correct_str(symbol.token) {
+                let col_num = symbol.offset;
+                let msg = report::Message {
+                    path,
+                    line,
+                    line_num,
+                    col_num,
+                    word: symbol.token,
+                    correction,
+                    non_exhaustive: (),
+                };
+                report(msg);
             }
         }
     }
Expand Down
21 changes: 11 additions & 10 deletions src/tokens.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Symbol<'t> {
-    pub token: &'t [u8],
+    pub token: &'t str,
     pub offset: usize,
 }

 impl<'t> Symbol<'t> {
-    pub fn new(token: &'t [u8], offset: usize) -> Self {
+    pub fn new(token: &'t str, offset: usize) -> Self {
         Self { token, offset }
     }

Expand All @@ -15,9 +15,10 @@ impl<'t> Symbol<'t> {
             #[allow(clippy::invalid_regex)]
             static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
         }
-        SPLIT
-            .find_iter(content)
-            .map(|m| Symbol::new(m.as_bytes(), m.start()))
+        SPLIT.find_iter(content).filter_map(|m| {
+            let s = std::str::from_utf8(m.as_bytes()).ok();
+            s.map(|s| Symbol::new(s, m.start()))
+        })
     }
 }

Expand All @@ -36,39 +37,39 @@ mod test {
     #[test]
     fn tokenize_word_is_word() {
         let input = b"word";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("word", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }

     #[test]
     fn tokenize_space_separated_words() {
         let input = b"A B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }

     #[test]
     fn tokenize_dot_separated_words() {
         let input = b"A.B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }

     #[test]
     fn tokenize_namespace_separated_words() {
         let input = b"A::B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A", 0), Symbol::new("B", 3)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }

     #[test]
     fn tokenize_underscore_doesnt_separate() {
         let input = b"A_B";
-        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
+        let expected: Vec<Symbol> = vec![Symbol::new("A_B", 0)];
         let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
Expand Down

0 comments on commit d78713d

Please sign in to comment.