From daba5aa3533948d41427ae5ebe0cb8e91be82022 Mon Sep 17 00:00:00 2001
From: Bernhard Schuster
Date: Fri, 14 Aug 2020 09:59:16 +0200
Subject: [PATCH] feat/quirks: extract quirks from hunspell, add allow_concatenation

Closes #82
---
 src/checker/hunspell.rs | 179 ++++++----------------------------------
 src/checker/mod.rs      |   3 +
 src/checker/quirks.rs   | 166 +++++++++++++++++++++++++++++++++++++
 src/config.rs           |  55 +++++++++++-
 4 files changed, 243 insertions(+), 160 deletions(-)
 create mode 100644 src/checker/quirks.rs

diff --git a/src/checker/hunspell.rs b/src/checker/hunspell.rs
index e81fd308..2810e97e 100644
--- a/src/checker/hunspell.rs
+++ b/src/checker/hunspell.rs
@@ -10,14 +10,15 @@ use super::{tokenize, Checker, Detector, Documentation, Suggestion, SuggestionSe
 use crate::documentation::{CheckableChunk, ContentOrigin, PlainOverlay};
 use crate::util::sub_chars;
 use crate::Range;
-use fancy_regex::Regex;
-use log::{debug, trace, warn};
+use log::{debug, trace};
 use std::path::PathBuf;
 
 use hunspell_rs::Hunspell;
 
 use anyhow::{anyhow, bail, Result};
 
+use super::quirks::{replacements_contain_dashless, transform, Transformed};
+
 pub struct HunspellChecker;
 
 impl HunspellChecker {
@@ -117,7 +118,11 @@ impl Checker for HunspellChecker {
    {
        let hunspell = Self::inner_init(config)?;

-        let transform_regex = config.transform_regex.clone().unwrap_or_else(|| Vec::new());
+        let (transform_regex, allow_concatenated) = if let Some(quirks) = &config.quirks {
+            (quirks.transform_regex(), quirks.allow_concatenated())
+        } else {
+            (&[][..], false)
+        };

        let suggestions = docu.iter().try_fold::<SuggestionSet, _, Result<_>>(
            SuggestionSet::new(),
@@ -132,7 +137,14 @@ impl Checker for HunspellChecker {
                let word = sub_chars(txt, range.clone());
                if transform_regex.is_empty() {
                    obtain_suggestions(
-                        &plain, chunk, &hunspell, origin, word, range, &mut acc,
+                        &plain,
+                        chunk,
+                        &hunspell,
+                        origin,
+                        word,
+                        range,
+                        allow_concatenated,
+                        &mut acc,
                    )
                } else {
                    match transform(&transform_regex[..], word.as_str(), range.clone()) {
@@ -145,6 +157,7 @@ impl Checker for HunspellChecker {
                                    origin,
                                    word_fragment.to_owned(),
                                    range,
+                                    allow_concatenated,
                                    &mut acc,
                                );
                            }
@@ -157,6 +170,7 @@ impl Checker for HunspellChecker {
                                origin,
                                word.to_owned(),
                                range,
+                                allow_concatenated,
                                &mut acc,
                            );
                        }
@@ -181,6 +195,7 @@ fn obtain_suggestions<'s>(
    origin: &ContentOrigin,
    word: String,
    range: Range,
+    allow_concatenated: bool,
    acc: &mut SuggestionSet<'s>,
 ) {
    if !hunspell.check(&word) {
@@ -192,6 +207,10 @@ fn obtain_suggestions<'s>(
            .filter(|x| x.len() > 1) // single char suggestions tend to be useless
            .collect::<Vec<String>>();

+        if allow_concatenated && replacements_contain_dashless(&word, replacements.as_slice()) {
+            trace!(target: "quirks", "Found dashed word in replacement suggestions, treating {} as ok", &word);
+            return;
+        }
        for (range, span) in plain.find_spans(range.clone()) {
            acc.add(
                origin.clone(),
@@ -214,155 +233,3 @@ fn obtain_suggestions<'s>(
        );
    }
 }
-
-/// Transformed word with information on the transformation outcome.
-#[derive(Debug, Eq, PartialEq)]
-enum Transformed<'i> {
-    /// A whitelisted chunk
-    Whitelisted((Range, &'i str)),
-    /// A set of word-fragments to be checked.
-    Fragments(Vec<(Range, &'i str)>),
-    /// A word to be checked. Equiv to no match.
-    Atomic((Range, &'i str)),
-}
-
-/// Transforms a word into a set of fragment-ranges and associated str slices.
-fn transform<'i, R: AsRef<Regex>>(
-    transform_regex: &[R],
-    word: &'i str,
-    range: Range,
-) -> Transformed<'i> {
-    let mut q = std::collections::VecDeque::<(Range, &'_ str)>::with_capacity(32);
-    let mut words = Vec::with_capacity(16);
-    let mut whitelisted = 0usize;
-    q.push_back((range.clone(), word));
-    while let Some((range, word)) = q.pop_front() {
-        // work on a fragment now
-        match transform_inner(transform_regex, word, range.clone()) {
-            // we try to match the fragments with the regex expr until they become atomic words or whitelisted
-            Transformed::Fragments(v) => q.extend(v),
-            Transformed::Atomic(word) => words.push(word),
-            Transformed::Whitelisted(_) => whitelisted += 1,
-        }
-    }
-
-    // no match found at all, this word is "atomic" and will be checked as is
-    if whitelisted == 0usize {
-        // empty means nothing, one word with the same range means we only found the initial provided word
-        if words.is_empty() || (words.len() == 1 && words[0].0.len() == word.len()) {
-            return Transformed::Atomic((range, word));
-        }
-    }
-
-    if !words.is_empty() {
-        // collect all the words as fragments again (they actually really are)
-        Transformed::Fragments(words)
-    } else {
-        // if there are no words to be checked, everything is whitelisted
-        Transformed::Whitelisted((range, word))
-    }
-}
-
-/// Inner loop transform
-///
-/// Returns `Some(vec![..])` if any captures were found.
-fn transform_inner<'i, R: AsRef<Regex>>(
-    transform_regex: &[R],
-    word: &'i str,
-    range: Range,
-) -> Transformed<'i> {
-    for regex in transform_regex.iter().map(AsRef::as_ref) {
-        match regex.captures(word) {
-            Ok(Some(captures)) => {
-                // first one is always the full match
-                if captures.len() == 1 {
-                    // means match, but no captures,
-                    // which is equiv to an implicit whitelist
-                    return Transformed::Whitelisted((range, word));
-                }
-                let intermediate = captures
-                    .iter()
-                    .skip(1)
-                    .filter_map(|m_opt| m_opt)
-                    .map(|m| {
-                        let intra_word_range = m.start()..m.end();
-                        trace!(
-                            "Found capture for word >{}<, with match >{}< and capture >{}< at {:?}",
-                            captures.get(0).unwrap().as_str(),
-                            word,
-                            m.as_str(),
-                            &intra_word_range
-                        );
-                        let offset = word
-                            .char_indices()
-                            .take_while(|(byte_pos, _)| m.start() > *byte_pos)
-                            .count();
-                        let range = Range {
-                            start: range.start + offset,
-                            end: range.start + offset + m.as_str().chars().count(),
-                        };
-                        (range, &word[intra_word_range])
-                    })
-                    .collect::<Vec<_>>();
-
-                return Transformed::Fragments(intermediate);
-            }
-            Ok(None) => {
-                // no regex match, try the next regex
-                continue;
-            }
-            Err(e) => {
-                warn!("Matching regex >{}< errored: {}", regex.as_str(), e);
-                break;
-            }
-        }
-    }
-    // nothing matched, check the entire word instead
-    Transformed::Atomic((range, word))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::config::WrappedRegex;
-    use env_logger;
-    #[test]
-    fn transformer() {
-        let _ = env_logger::builder()
-            .is_test(true)
-            .filter(None, log::LevelFilter::Trace)
-            .try_init();
-
-        let re = vec![
-            WrappedRegex::from(Regex::new("^[0-9]+x$").unwrap()), //whitelist
-            WrappedRegex::from(Regex::new(r#"^'([^\s]+)'$"#).unwrap()),
-            WrappedRegex::from(Regex::new("(Alpha)(beta)").unwrap()),
-        ];
-
-        let words = vec!["2x", r#"''so-to-speak''"#, "Alphabeta", "Nothing"];
-
-        // whitelist
-        assert_eq!(
-            transform(re.as_slice(), words[0], 10..24),
-            Transformed::Whitelisted((10..24, words[0]))
-        );
-
-        // single quoted, recursive 2x
-        assert_eq!(
-            transform(re.as_slice(), words[1], 10..25),
-            Transformed::Fragments(vec![(12..23, &words[1][2..13])])
-        );
-
-        // multi capture
-        assert_eq!(
-            transform(re.as_slice(), words[2], 10..19),
-            Transformed::Fragments(vec![(10..15, &words[2][0..5]), (15..19, &words[2][5..9]),])
-        );
-
-        // no match
-        assert_eq!(
-            transform(re.as_slice(), words[3], 10..17),
-            Transformed::Atomic((10..17, words[3]))
-        );
-    }
-}
diff --git a/src/checker/mod.rs b/src/checker/mod.rs
index 236c28e7..07dde33d 100644
--- a/src/checker/mod.rs
+++ b/src/checker/mod.rs
@@ -16,6 +16,9 @@ mod hunspell;
 #[cfg(feature = "languagetool")]
 mod languagetool;
 
+#[cfg(any(feature = "languagetool", feature = "hunspell"))]
+mod quirks;
+
 /// Implementation for a checker
 pub(crate) trait Checker {
     type Config;
diff --git a/src/checker/quirks.rs b/src/checker/quirks.rs
new file mode 100644
index 00000000..c63e2482
--- /dev/null
+++ b/src/checker/quirks.rs
@@ -0,0 +1,166 @@
+//! A set of quirks, not necessarily specific to a checker
+
+use crate::Range;
+use fancy_regex::Regex;
+use log::{trace, warn};
+
+pub(crate) fn replacements_contain_dashless<T: AsRef<str>>(word: &str, replacements: &[T]) -> bool {
+    let dashless = word.chars().filter(|c| *c != '-').collect::<String>();
+    replacements
+        .iter()
+        .map(|s| s.as_ref())
+        .find(|x| *x == &dashless)
+        .is_some()
+}
+
+/// Transformed word with information on the transformation outcome.
+#[derive(Debug, Eq, PartialEq)]
+pub(crate) enum Transformed<'i> {
+    /// A whitelisted chunk
+    Whitelisted((Range, &'i str)),
+    /// A set of word-fragments to be checked.
+    Fragments(Vec<(Range, &'i str)>),
+    /// A word to be checked. Equiv to no match.
+    Atomic((Range, &'i str)),
+}
+
+/// Transforms a word into a set of fragment-ranges and associated str slices.
+pub(crate) fn transform<'i, R: AsRef<Regex>>(
+    transform_regex: &[R],
+    word: &'i str,
+    range: Range,
+) -> Transformed<'i> {
+    let mut q = std::collections::VecDeque::<(Range, &'_ str)>::with_capacity(32);
+    let mut words = Vec::with_capacity(16);
+    let mut whitelisted = 0usize;
+    q.push_back((range.clone(), word));
+    while let Some((range, word)) = q.pop_front() {
+        // work on a fragment now
+        match transform_inner(transform_regex, word, range.clone()) {
+            // we try to match the fragments with the regex expr until they become atomic words or whitelisted
+            Transformed::Fragments(v) => q.extend(v),
+            Transformed::Atomic(word) => words.push(word),
+            Transformed::Whitelisted(_) => whitelisted += 1,
+        }
+    }
+
+    // no match found at all, this word is "atomic" and will be checked as is
+    if whitelisted == 0usize {
+        // empty means nothing, one word with the same range means we only found the initial provided word
+        if words.is_empty() || (words.len() == 1 && words[0].0.len() == word.len()) {
+            return Transformed::Atomic((range, word));
+        }
+    }
+
+    if !words.is_empty() {
+        // collect all the words as fragments again (they actually really are)
+        Transformed::Fragments(words)
+    } else {
+        // if there are no words to be checked, everything is whitelisted
+        Transformed::Whitelisted((range, word))
+    }
+}
+
+/// Inner loop transform
+///
+/// Returns `Transformed::Fragments` if any captures were found.
+fn transform_inner<'i, R: AsRef<Regex>>(
+    transform_regex: &[R],
+    word: &'i str,
+    range: Range,
+) -> Transformed<'i> {
+    for regex in transform_regex.iter().map(AsRef::as_ref) {
+        match regex.captures(word) {
+            Ok(Some(captures)) => {
+                // first one is always the full match
+                if captures.len() == 1 {
+                    // means match, but no captures,
+                    // which is equiv to an implicit whitelist
+                    return Transformed::Whitelisted((range, word));
+                }
+                let intermediate = captures
+                    .iter()
+                    .skip(1)
+                    .filter_map(|m_opt| m_opt)
+                    .map(|m| {
+                        let intra_word_range = m.start()..m.end();
+                        trace!(target:"quirks",
+                            "Found capture for word >{}<, with match >{}< and capture >{}< at {:?}",
+                            captures.get(0).unwrap().as_str(),
+                            word,
+                            m.as_str(),
+                            &intra_word_range
+                        );
+                        let offset = word
+                            .char_indices()
+                            .take_while(|(byte_pos, _)| m.start() > *byte_pos)
+                            .count();
+                        let range = Range {
+                            start: range.start + offset,
+                            end: range.start + offset + m.as_str().chars().count(),
+                        };
+                        (range, &word[intra_word_range])
+                    })
+                    .collect::<Vec<_>>();
+
+                return Transformed::Fragments(intermediate);
+            }
+            Ok(None) => {
+                // no regex match, try the next regex
+                continue;
+            }
+            Err(e) => {
+                warn!(target:"quirks", "Matching regex >{}< errored: {}", regex.as_str(), e);
+                break;
+            }
+        }
+    }
+    // nothing matched, check the entire word instead
+    Transformed::Atomic((range, word))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::config::WrappedRegex;
+    use env_logger;
+    #[test]
+    fn transformer() {
+        let _ = env_logger::builder()
+            .is_test(true)
+            .filter(None, log::LevelFilter::Trace)
+            .try_init();
+
+        let re = vec![
+            WrappedRegex::from(Regex::new("^[0-9]+x$").unwrap()), //whitelist
+            WrappedRegex::from(Regex::new(r#"^'([^\s]+)'$"#).unwrap()),
+            WrappedRegex::from(Regex::new("(Alpha)(beta)").unwrap()),
+        ];
+
+        let words = vec!["2x", r#"''so-to-speak''"#, "Alphabeta", "Nothing"];
+
+        // whitelist
+        assert_eq!(
+            transform(re.as_slice(), words[0], 10..24),
+            Transformed::Whitelisted((10..24, words[0]))
+        );
+
+        // single quoted, recursive 2x
+        assert_eq!(
+            transform(re.as_slice(), words[1], 10..25),
+            Transformed::Fragments(vec![(12..23, &words[1][2..13])])
+        );
+
+        // multi capture
+        assert_eq!(
+            transform(re.as_slice(), words[2], 10..19),
+            Transformed::Fragments(vec![(10..15, &words[2][0..5]), (15..19, &words[2][5..9]),])
+        );
+
+        // no match
+        assert_eq!(
+            transform(re.as_slice(), words[3], 10..17),
+            Transformed::Atomic((10..17, words[3]))
+        );
+    }
+}
diff --git a/src/config.rs b/src/config.rs
index d8bb53a8..e989360a 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -105,6 +105,41 @@ impl<'de> serde::de::Visitor<'de> for RegexVisitor {
     }
 }
 
+#[derive(Deserialize, Serialize, Debug, Clone)]
+pub struct Quirks {
+    /// A regular expression whose capture groups will be checked instead of the initial token.
+    /// Only the first one that matches will be used to split the word.
+    pub transform_regex: Option<Vec<WrappedRegex>>,
+    /// Allow concatenated words instead of dashed connection.
+    /// Note that this only applies if one of the replacement suggestions matches the word with the dashes removed.
+    pub allow_concatenation: Option<bool>,
+}
+
+impl Default for Quirks {
+    fn default() -> Self {
+        // use `Some(..)` for the defaults, so the generated default config carries the default values,
+        // but keep the fields `Option`s so they can be omitted in the config file
+        Self {
+            transform_regex: Some(vec![]),
+            allow_concatenation: Some(false),
+        }
+    }
+}
+
+impl Quirks {
+    pub(crate) fn allow_concatenated(&self) -> bool {
+        self.allow_concatenation.unwrap_or(false)
+    }
+
+    pub(crate) fn transform_regex(&self) -> &[WrappedRegex] {
+        if let Some(ref tr) = self.transform_regex {
+            tr.as_slice()
+        } else {
+            &[]
+        }
+    }
+}
+
 #[derive(Deserialize, Serialize, Debug, Clone)]
 pub struct HunspellConfig {
     /// The language we want to check against, used as the dictionary and affixes file name.
@@ -115,9 +150,8 @@ pub struct HunspellConfig {
     pub search_dirs: Option<Vec<PathBuf>>,
     /// Additional dictionaries for topic specific lingo.
     pub extra_dictonaries: Option<Vec<PathBuf>>,
-    /// A regular expression, whose capture groups will be checked, instead of the initial token.
-    /// Only the first one that matches will be used to split the word.
-    pub transform_regex: Option<Vec<WrappedRegex>>,
+    /// Additional quirks besides dictionary lookups.
+    pub quirks: Option<Quirks>,
 }
 
 impl HunspellConfig {
@@ -336,7 +370,7 @@ impl Default for Config {
                 lang: Some("en_US".to_owned()),
                 search_dirs: Some(search_dirs),
                 extra_dictonaries: Some(Vec::new()),
-                transform_regex: None,
+                quirks: Some(Quirks::default()),
             }),
             languagetool: None,
         }
@@ -457,4 +491,17 @@ lang = "en_US"
         )
         .unwrap();
     }
+
+    #[test]
+    fn partial_7() {
+        let _ = Config::parse(
+            r#"
+[hunspell]
+[hunspell.quirks]
+allow_concatenation = true
+transform_regex = ["^'([^\\s])'$", "^[0-9]+x$"]
+            "#,
+        )
+        .unwrap();
+    }
 }
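
For reference only (illustrative, not part of the applied diff): a user-facing config that enables both quirks mirrors the `partial_7` test above; the table and key names are exactly those introduced in `src/config.rs`, everything else here is an assumed example:

    [hunspell]
    lang = "en_US"

    [hunspell.quirks]
    # check the regex capture groups instead of the whole token;
    # a match with no capture groups whitelists the word (see `transform_inner`)
    transform_regex = ["^'([^\\s])'$", "^[0-9]+x$"]
    # accept a flagged word when one of hunspell's replacement suggestions
    # equals the word with its dashes removed (see `replacements_contain_dashless`)
    allow_concatenation = true

Both keys are optional; omitting the whole `[hunspell.quirks]` table falls back to `Quirks::default()`, i.e. no transform regexes and `allow_concatenation = false`.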