Skip to content

Commit

Permalink
feat(parse): Process words composing symbols
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Jun 16, 2019
1 parent 63a8486 commit 3d1fb3b
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 33 deletions.
14 changes: 8 additions & 6 deletions benches/corrections.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ fn load_corrections(b: &mut test::Bencher) {
}

#[bench]
fn correction(b: &mut test::Bencher) {
fn correct_word_hit(b: &mut test::Bencher) {
let corrections = defenestrate::Dictionary::new();
assert_eq!(corrections.correct_str("successs"), Some("successes"));
b.iter(|| corrections.correct_str("successs"));
let input = defenestrate::tokens::Word::new("successs", 0).unwrap();
assert_eq!(corrections.correct_word(input), Some("successes"));
b.iter(|| corrections.correct_word(input));
}

#[bench]
fn no_correction(b: &mut test::Bencher) {
fn correct_word_miss(b: &mut test::Bencher) {
let corrections = defenestrate::Dictionary::new();
assert_eq!(corrections.correct_str("success"), None);
b.iter(|| corrections.correct_str("success"));
let input = defenestrate::tokens::Word::new("success", 0).unwrap();
assert_eq!(corrections.correct_word(input), None);
b.iter(|| corrections.correct_word(input));
}
40 changes: 34 additions & 6 deletions benches/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,60 @@ mod data;

#[bench]
fn symbol_parse_empty(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::EMPTY.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::EMPTY.as_bytes()).last());
}

#[bench]
fn symbol_parse_no_tokens(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::NO_TOKENS.as_bytes()).last());
}

#[bench]
fn symbol_parse_single_token(b: &mut test::Bencher) {
b.iter(|| {
defenestrate::tokens::Symbol::parse(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>()
defenestrate::tokens::Symbol::parse(data::SINGLE_TOKEN.as_bytes()).last();
});
}

#[bench]
fn symbol_parse_sherlock(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::SHERLOCK.as_bytes()).last());
}

#[bench]
fn symbol_parse_code(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::CODE.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::CODE.as_bytes()).last());
}

#[bench]
fn symbol_parse_corpus(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::CORPUS.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::CORPUS.as_bytes()).last());
}

#[bench]
fn symbol_split_lowercase_short(b: &mut test::Bencher) {
    // Short, purely lowercase token: exercises the path where `split()`
    // encounters no case or digit boundaries.
    let sample = "abcabcabcabc";
    let sym = defenestrate::tokens::Symbol::new(sample, 0).unwrap();
    // `.last()` drives the iterator to completion without collecting,
    // so the measurement excludes Vec allocation.
    b.iter(|| sym.split().last());
}

#[bench]
fn symbol_split_lowercase_long(b: &mut test::Bencher) {
    // Same all-lowercase content as the short variant, repeated to make
    // a long token; shows how `split()` scales with input length.
    let sample = "abcabcabcabc".repeat(90);
    let sym = defenestrate::tokens::Symbol::new(&sample, 0).unwrap();
    // `.last()` consumes the iterator without building a Vec.
    b.iter(|| sym.split().last());
}

#[bench]
fn symbol_split_mixed_short(b: &mut test::Bencher) {
    // Mixed-case token with digits: forces `split()` to emit multiple
    // words at case and digit boundaries.
    let sample = "abcABCAbc123";
    let sym = defenestrate::tokens::Symbol::new(sample, 0).unwrap();
    // Drain with `.last()` so only splitting itself is timed.
    b.iter(|| sym.split().last());
}

#[bench]
fn symbol_split_mixed_long(b: &mut test::Bencher) {
    // Long mixed-case input built by repetition; pairs with the short
    // variant to show scaling on boundary-heavy tokens.
    let sample = "abcABCAbc123".repeat(90);
    let sym = defenestrate::tokens::Symbol::new(&sample, 0).unwrap();
    // `.last()` exhausts the word iterator without collecting.
    b.iter(|| sym.split().last());
}
3 changes: 2 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ fn main() {

println!("rerun-if-changed=./assets/words.csv");
write!(&mut file, "use unicase::UniCase;").unwrap();

write!(
&mut file,
"pub(crate) static DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
"pub(crate) static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
)
.unwrap();
let mut builder = phf_codegen::Map::new();
Expand Down
11 changes: 5 additions & 6 deletions src/dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@ impl Dictionary {
Dictionary {}
}

pub fn correct_str<'s, 'w>(&'s self, word: &'w str) -> Option<&'s str> {
map_lookup(&crate::dict_codegen::DICTIONARY, word)
pub fn correct_symbol<'s, 'w>(&'s self, _sym: crate::tokens::Symbol<'w>) -> Option<&'s str> {
None
}

pub fn correct_bytes<'s, 'w>(&'s self, word: &'w [u8]) -> Option<&'s str> {
std::str::from_utf8(word)
.ok()
.and_then(|word| self.correct_str(word))
pub fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<&'s str> {
map_lookup(&crate::dict_codegen::WORD_DICTIONARY, word.token())
}
}

Expand All @@ -27,6 +25,7 @@ fn map_lookup(
// the expanded lifetime. This is due to `Borrow` being overly strict and
// can't have an impl for `&'static str` to `Borrow<&'a str>`.
//
//
// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
unsafe {
let key = ::std::mem::transmute::<_, &'static str>(key);
Expand Down
22 changes: 18 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,34 @@ pub fn process_file(
for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
let line_num = line_idx + 1;
for symbol in tokens::Symbol::parse(line) {
// Correct tokens as-is
if let Some(correction) = dictionary.correct_str(symbol.token) {
let col_num = symbol.offset;
if let Some(correction) = dictionary.correct_symbol(symbol) {
let col_num = symbol.offset();
let msg = report::Message {
path,
line,
line_num,
col_num,
word: symbol.token,
word: symbol.token(),
correction,
non_exhaustive: (),
};
report(msg);
}
for word in symbol.split() {
if let Some(correction) = dictionary.correct_word(word) {
let col_num = word.offset();
let msg = report::Message {
path,
line,
line_num,
col_num,
word: word.token(),
correction,
non_exhaustive: (),
};
report(msg);
}
}
}
}

Expand Down

0 comments on commit 3d1fb3b

Please sign in to comment.