Skip to content

Commit

Permalink
feat(parse): Process words composing symbols
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Jun 16, 2019
1 parent 63a8486 commit 3d1fb3b
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 33 deletions.
14 changes: 8 additions & 6 deletions benches/corrections.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ fn load_corrections(b: &mut test::Bencher) {
}

#[bench]
fn correction(b: &mut test::Bencher) {
fn correct_word_hit(b: &mut test::Bencher) {
let corrections = defenestrate::Dictionary::new();
assert_eq!(corrections.correct_str("successs"), Some("successes"));
b.iter(|| corrections.correct_str("successs"));
let input = defenestrate::tokens::Word::new("successs", 0).unwrap();
assert_eq!(corrections.correct_word(input), Some("successes"));
b.iter(|| corrections.correct_word(input));
}

#[bench]
fn no_correction(b: &mut test::Bencher) {
fn correct_word_miss(b: &mut test::Bencher) {
let corrections = defenestrate::Dictionary::new();
assert_eq!(corrections.correct_str("success"), None);
b.iter(|| corrections.correct_str("success"));
let input = defenestrate::tokens::Word::new("success", 0).unwrap();
assert_eq!(corrections.correct_word(input), None);
b.iter(|| corrections.correct_word(input));
}
40 changes: 34 additions & 6 deletions benches/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,60 @@ mod data;

#[bench]
fn symbol_parse_empty(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::EMPTY.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::EMPTY.as_bytes()).last());
}

#[bench]
fn symbol_parse_no_tokens(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::NO_TOKENS.as_bytes()).last());
}

#[bench]
fn symbol_parse_single_token(b: &mut test::Bencher) {
b.iter(|| {
defenestrate::tokens::Symbol::parse(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>()
defenestrate::tokens::Symbol::parse(data::SINGLE_TOKEN.as_bytes()).last();
});
}

#[bench]
fn symbol_parse_sherlock(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::SHERLOCK.as_bytes()).last());
}

#[bench]
fn symbol_parse_code(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::CODE.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::CODE.as_bytes()).last());
}

#[bench]
fn symbol_parse_corpus(b: &mut test::Bencher) {
b.iter(|| defenestrate::tokens::Symbol::parse(data::CORPUS.as_bytes()).collect::<Vec<_>>());
b.iter(|| defenestrate::tokens::Symbol::parse(data::CORPUS.as_bytes()).last());
}

#[bench]
fn symbol_split_lowercase_short(b: &mut test::Bencher) {
    // Short, purely lowercase token: exercises the path where `split()`
    // encounters no case or digit boundaries.
    let sample = "abcabcabcabc";
    let sym = defenestrate::tokens::Symbol::new(sample, 0).unwrap();
    // `.last()` drives the iterator to completion without collecting,
    // so the measurement excludes Vec allocation.
    b.iter(|| sym.split().last());
}

#[bench]
fn symbol_split_lowercase_long(b: &mut test::Bencher) {
    // Same all-lowercase content as the short variant, repeated to make
    // a long token; shows how `split()` scales with input length.
    let sample = "abcabcabcabc".repeat(90);
    let sym = defenestrate::tokens::Symbol::new(&sample, 0).unwrap();
    // `.last()` consumes the iterator without building a Vec.
    b.iter(|| sym.split().last());
}

#[bench]
fn symbol_split_mixed_short(b: &mut test::Bencher) {
    // Mixed-case token with digits: forces `split()` to emit multiple
    // words at case and digit boundaries.
    let sample = "abcABCAbc123";
    let sym = defenestrate::tokens::Symbol::new(sample, 0).unwrap();
    // Drain with `.last()` so only splitting itself is timed.
    b.iter(|| sym.split().last());
}

#[bench]
fn symbol_split_mixed_long(b: &mut test::Bencher) {
    // Long mixed-case input built by repetition; pairs with the short
    // variant to show scaling on boundary-heavy tokens.
    let sample = "abcABCAbc123".repeat(90);
    let sym = defenestrate::tokens::Symbol::new(&sample, 0).unwrap();
    // `.last()` exhausts the word iterator without collecting.
    b.iter(|| sym.split().last());
}
3 changes: 2 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ fn main() {

println!("rerun-if-changed=./assets/words.csv");
write!(&mut file, "use unicase::UniCase;").unwrap();

write!(
&mut file,
"pub(crate) static DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
"pub(crate) static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static str> = "
)
.unwrap();
let mut builder = phf_codegen::Map::new();
Expand Down
11 changes: 5 additions & 6 deletions src/dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@ impl Dictionary {
Dictionary {}
}

pub fn correct_str<'s, 'w>(&'s self, word: &'w str) -> Option<&'s str> {
map_lookup(&crate::dict_codegen::DICTIONARY, word)
pub fn correct_symbol<'s, 'w>(&'s self, _sym: crate::tokens::Symbol<'w>) -> Option<&'s str> {
None
}

pub fn correct_bytes<'s, 'w>(&'s self, word: &'w [u8]) -> Option<&'s str> {
std::str::from_utf8(word)
.ok()
.and_then(|word| self.correct_str(word))
pub fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<&'s str> {
map_lookup(&crate::dict_codegen::WORD_DICTIONARY, word.token())
}
}

Expand All @@ -27,6 +25,7 @@ fn map_lookup(
// the expanded lifetime. This is due to `Borrow` being overly strict and
// can't have an impl for `&'static str` to `Borrow<&'a str>`.
//
//
// See https://github.com/rust-lang/rust/issues/28853#issuecomment-158735548
unsafe {
let key = ::std::mem::transmute::<_, &'static str>(key);
Expand Down
22 changes: 18 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,34 @@ pub fn process_file(
for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
let line_num = line_idx + 1;
for symbol in tokens::Symbol::parse(line) {
// Correct tokens as-is
if let Some(correction) = dictionary.correct_str(symbol.token) {
let col_num = symbol.offset;
if let Some(correction) = dictionary.correct_symbol(symbol) {
let col_num = symbol.offset();
let msg = report::Message {
path,
line,
line_num,
col_num,
word: symbol.token,
word: symbol.token(),
correction,
non_exhaustive: (),
};
report(msg);
}
for word in symbol.split() {
if let Some(correction) = dictionary.correct_word(word) {
let col_num = word.offset();
let msg = report::Message {
path,
line,
line_num,
col_num,
word: word.token(),
correction,
non_exhaustive: (),
};
report(msg);
}
}
}
}

Expand Down

0 comments on commit 3d1fb3b

Please sign in to comment.