From daba5aa3533948d41427ae5ebe0cb8e91be82022 Mon Sep 17 00:00:00 2001
From: Bernhard Schuster
Date: Fri, 14 Aug 2020 09:59:16 +0200
Subject: [PATCH] feat/quirks: extract quirks from hunspell, add allow_concatenation

Closes #82
---
 src/checker/hunspell.rs | 179 ++++++----------------------------------
 src/checker/mod.rs      |   3 +
 src/checker/quirks.rs   | 166 +++++++++++++++++++++++++++++++++++++
 src/config.rs           |  55 +++++++++++-
 4 files changed, 243 insertions(+), 160 deletions(-)
 create mode 100644 src/checker/quirks.rs

diff --git a/src/checker/hunspell.rs b/src/checker/hunspell.rs
index e81fd308..2810e97e 100644
--- a/src/checker/hunspell.rs
+++ b/src/checker/hunspell.rs
@@ -10,14 +10,15 @@ use super::{tokenize, Checker, Detector, Documentation, Suggestion, SuggestionSe
 use crate::documentation::{CheckableChunk, ContentOrigin, PlainOverlay};
 use crate::util::sub_chars;
 use crate::Range;
-use fancy_regex::Regex;
-use log::{debug, trace, warn};
+use log::{debug, trace};
 use std::path::PathBuf;
 
 use hunspell_rs::Hunspell;
 
 use anyhow::{anyhow, bail, Result};
 
+use super::quirks::{replacements_contain_dashless, transform, Transformed};
+
 pub struct HunspellChecker;
 
 impl HunspellChecker {
@@ -117,7 +118,11 @@ impl Checker for HunspellChecker {
    {
        let hunspell = Self::inner_init(config)?;

-        let transform_regex = config.transform_regex.clone().unwrap_or_else(|| Vec::new());
+        let (transform_regex, allow_concatenated) = if let Some(quirks) = &config.quirks {
+            (quirks.transform_regex(), quirks.allow_concatenated())
+        } else {
+            (&[][..], false)
+        };

        let suggestions = docu.iter().try_fold::<SuggestionSet, _, Result<_>>(
            SuggestionSet::new(),
@@ -132,7 +137,14 @@ impl Checker for HunspellChecker {
                let word = sub_chars(txt, range.clone());
                if transform_regex.is_empty() {
                    obtain_suggestions(
-                        &plain, chunk, &hunspell, origin, word, range, &mut acc,
+                        &plain,
+                        chunk,
+                        &hunspell,
+                        origin,
+                        word,
+                        range,
+                        allow_concatenated,
+                        &mut acc,
                    )
                } else {
                    match transform(&transform_regex[..], word.as_str(), range.clone()) {
@@ -145,6 +157,7 @@ impl Checker for HunspellChecker {
                                    origin,
                                    word_fragment.to_owned(),
                                    range,
+                                    allow_concatenated,
                                    &mut acc,
                                );
                            }
@@ -157,6 +170,7 @@ impl Checker for HunspellChecker {
                                origin,
                                word.to_owned(),
                                range,
+                                allow_concatenated,
                                &mut acc,
                            );
                        }
@@ -181,6 +195,7 @@ fn obtain_suggestions<'s>(
    origin: &ContentOrigin,
    word: String,
    range: Range,
+    allow_concatenated: bool,
    acc: &mut SuggestionSet<'s>,
 ) {
    if !hunspell.check(&word) {
@@ -192,6 +207,10 @@ fn obtain_suggestions<'s>(
            .filter(|x| x.len() > 1) // single char suggestions tend to be useless
            .collect::<Vec<String>>();

+        if allow_concatenated && replacements_contain_dashless(&word, replacements.as_slice()) {
+            trace!(target: "quirks", "Found dashed word in replacement suggestions, treating {} as ok", &word);
+            return;
+        }
        for (range, span) in plain.find_spans(range.clone()) {
            acc.add(
                origin.clone(),
@@ -214,155 +233,3 @@ fn obtain_suggestions<'s>(
        );
    }
 }
-
-/// Transformed word with information on the transformation outcome.
-#[derive(Debug, Eq, PartialEq)]
-enum Transformed<'i> {
-    /// A whitelisted chunk
-    Whitelisted((Range, &'i str)),
-    /// A set of word-fragments to be checked.
-    Fragments(Vec<(Range, &'i str)>),
-    /// A word to be checked. Equiv to no match.
-    Atomic((Range, &'i str)),
-}
-
-/// Transforms a word into a set of fragment-ranges and associated str slices.
-fn transform<'i, R: AsRef<Regex>>(
-    transform_regex: &[R],
-    word: &'i str,
-    range: Range,
-) -> Transformed<'i> {
-    let mut q = std::collections::VecDeque::<(Range, &'_ str)>::with_capacity(32);
-    let mut words = Vec::with_capacity(16);
-    let mut whitelisted = 0usize;
-    q.push_back((range.clone(), word));
-    while let Some((range, word)) = q.pop_front() {
-        // work on a fragment now
-        match transform_inner(transform_regex, word, range.clone()) {
-            // we try to match the fragments with the regex expr until they become atomic words or whitelisted
-            Transformed::Fragments(v) => q.extend(v),
-            Transformed::Atomic(word) => words.push(word),
-            Transformed::Whitelisted(_) => whitelisted += 1,
-        }
-    }
-
-    // no match found at all, this word is "atomic" and will be checked as is
-    if whitelisted == 0usize {
-        // empty means nothing, one word with the same range means we only found the initial provided word
-        if words.is_empty() || (words.len() == 1 && words[0].0.len() == word.len()) {
-            return Transformed::Atomic((range, word));
-        }
-    }
-
-    if !words.is_empty() {
-        // collect all the words as fragments again (they actually really are)
-        Transformed::Fragments(words)
-    } else {
-        // if there are no words to be checked, everything is whitelisted
-        Transformed::Whitelisted((range, word))
-    }
-}
-
-/// Inner loop transform
-///
-/// Returns `Some(vec![..])` if any captures were found.
-fn transform_inner<'i, R: AsRef<Regex>>(
-    transform_regex: &[R],
-    word: &'i str,
-    range: Range,
-) -> Transformed<'i> {
-    for regex in transform_regex.iter().map(AsRef::as_ref) {
-        match regex.captures(word) {
-            Ok(Some(captures)) => {
-                // first one is always the full match
-                if captures.len() == 1 {
-                    // means match, but no captures,
-                    // which is equiv to an implicit whitelist
-                    return Transformed::Whitelisted((range, word));
-                }
-                let intermediate = captures
-                    .iter()
-                    .skip(1)
-                    .filter_map(|m_opt| m_opt)
-                    .map(|m| {
-                        let intra_word_range = m.start()..m.end();
-                        trace!(
-                            "Found capture for word >{}<, with match >{}< and capture >{}< at {:?}",
-                            captures.get(0).unwrap().as_str(),
-                            word,
-                            m.as_str(),
-                            &intra_word_range
-                        );
-                        let offset = word
-                            .char_indices()
-                            .take_while(|(byte_pos, _)| m.start() > *byte_pos)
-                            .count();
-                        let range = Range {
-                            start: range.start + offset,
-                            end: range.start + offset + m.as_str().chars().count(),
-                        };
-                        (range, &word[intra_word_range])
-                    })
-                    .collect::<Vec<_>>();
-
-                return Transformed::Fragments(intermediate);
-            }
-            Ok(None) => {
-                // no regex match, try the next regex
-                continue;
-            }
-            Err(e) => {
-                warn!("Matching regex >{}< errored: {}", regex.as_str(), e);
-                break;
-            }
-        }
-    }
-    // nothing matched, check the entire word instead
-    Transformed::Atomic((range, word))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::config::WrappedRegex;
-    use env_logger;
-    #[test]
-    fn transformer() {
-        let _ = env_logger::builder()
-            .is_test(true)
-            .filter(None, log::LevelFilter::Trace)
-            .try_init();
-
-        let re = vec![
-            WrappedRegex::from(Regex::new("^[0-9]+x$").unwrap()), //whitelist
-            WrappedRegex::from(Regex::new(r#"^'([^\s]+)'$"#).unwrap()),
-            WrappedRegex::from(Regex::new("(Alpha)(beta)").unwrap()),
-        ];
-
-        let words = vec!["2x", r#"''so-to-speak''"#, "Alphabeta", "Nothing"];
-
-        // whitelist
-        assert_eq!(
-            transform(re.as_slice(), words[0], 10..24),
-            Transformed::Whitelisted((10..24, words[0]))
-        );
-
-        // single quoted, recursive 2x
-        assert_eq!(
-            transform(re.as_slice(), words[1], 10..25),
-            Transformed::Fragments(vec![(12..23, &words[1][2..13])])
-        );
-
-        // multi capture
-        assert_eq!(
-            transform(re.as_slice(), words[2], 10..19),
-            Transformed::Fragments(vec![(10..15, &words[2][0..5]), (15..19, &words[2][5..9]),])
-        );
-
-        // no match
-        assert_eq!(
-            transform(re.as_slice(), words[3], 10..17),
-            Transformed::Atomic((10..17, words[3]))
-        );
-    }
-}
diff --git a/src/checker/mod.rs b/src/checker/mod.rs
index 236c28e7..07dde33d 100644
--- a/src/checker/mod.rs
+++ b/src/checker/mod.rs
@@ -16,6 +16,9 @@ mod hunspell;
 #[cfg(feature = "languagetool")]
 mod languagetool;
 
+#[cfg(any(feature = "languagetool", feature = "hunspell"))]
+mod quirks;
+
 /// Implementation for a checker
 pub(crate) trait Checker {
     type Config;
diff --git a/src/checker/quirks.rs b/src/checker/quirks.rs
new file mode 100644
index 00000000..c63e2482
--- /dev/null
+++ b/src/checker/quirks.rs
@@ -0,0 +1,166 @@
+//! A set of quirks, not necessarily specific to a checker
+
+use crate::Range;
+use fancy_regex::Regex;
+use log::{trace, warn};
+
+pub(crate) fn replacements_contain_dashless<T: AsRef<str>>(word: &str, replacements: &[T]) -> bool {
+    let dashless = word.chars().filter(|c| *c != '-').collect::<String>();
+    replacements
+        .iter()
+        .map(|s| s.as_ref())
+        .find(|x| *x == &dashless)
+        .is_some()
+}
+
+/// Transformed word with information on the transformation outcome.
+#[derive(Debug, Eq, PartialEq)]
+pub(crate) enum Transformed<'i> {
+    /// A whitelisted chunk
+    Whitelisted((Range, &'i str)),
+    /// A set of word-fragments to be checked.
+    Fragments(Vec<(Range, &'i str)>),
+    /// A word to be checked. Equiv to no match.
+    Atomic((Range, &'i str)),
+}
+
+/// Transforms a word into a set of fragment-ranges and associated str slices.
+pub(crate) fn transform<'i, R: AsRef<Regex>>(
+    transform_regex: &[R],
+    word: &'i str,
+    range: Range,
+) -> Transformed<'i> {
+    let mut q = std::collections::VecDeque::<(Range, &'_ str)>::with_capacity(32);
+    let mut words = Vec::with_capacity(16);
+    let mut whitelisted = 0usize;
+    q.push_back((range.clone(), word));
+    while let Some((range, word)) = q.pop_front() {
+        // work on a fragment now
+        match transform_inner(transform_regex, word, range.clone()) {
+            // we try to match the fragments with the regex expr until they become atomic words or whitelisted
+            Transformed::Fragments(v) => q.extend(v),
+            Transformed::Atomic(word) => words.push(word),
+            Transformed::Whitelisted(_) => whitelisted += 1,
+        }
+    }
+
+    // no match found at all, this word is "atomic" and will be checked as is
+    if whitelisted == 0usize {
+        // empty means nothing, one word with the same range means we only found the initial provided word
+        if words.is_empty() || (words.len() == 1 && words[0].0.len() == word.len()) {
+            return Transformed::Atomic((range, word));
+        }
+    }
+
+    if !words.is_empty() {
+        // collect all the words as fragments again (they actually really are)
+        Transformed::Fragments(words)
+    } else {
+        // if there are no words to be checked, everything is whitelisted
+        Transformed::Whitelisted((range, word))
+    }
+}
+
+/// Inner loop transform
+///
+/// Returns `Transformed::Fragments` if any captures were found.
+fn transform_inner<'i, R: AsRef<Regex>>(
+    transform_regex: &[R],
+    word: &'i str,
+    range: Range,
+) -> Transformed<'i> {
+    for regex in transform_regex.iter().map(AsRef::as_ref) {
+        match regex.captures(word) {
+            Ok(Some(captures)) => {
+                // first one is always the full match
+                if captures.len() == 1 {
+                    // means match, but no captures,
+                    // which is equiv to an implicit whitelist
+                    return Transformed::Whitelisted((range, word));
+                }
+                let intermediate = captures
+                    .iter()
+                    .skip(1)
+                    .filter_map(|m_opt| m_opt)
+                    .map(|m| {
+                        let intra_word_range = m.start()..m.end();
+                        trace!(target:"quirks",
+                            "Found capture for word >{}<, with match >{}< and capture >{}< at {:?}",
+                            captures.get(0).unwrap().as_str(),
+                            word,
+                            m.as_str(),
+                            &intra_word_range
+                        );
+                        let offset = word
+                            .char_indices()
+                            .take_while(|(byte_pos, _)| m.start() > *byte_pos)
+                            .count();
+                        let range = Range {
+                            start: range.start + offset,
+                            end: range.start + offset + m.as_str().chars().count(),
+                        };
+                        (range, &word[intra_word_range])
+                    })
+                    .collect::<Vec<_>>();
+
+                return Transformed::Fragments(intermediate);
+            }
+            Ok(None) => {
+                // no regex match, try the next regex
+                continue;
+            }
+            Err(e) => {
+                warn!(target:"quirks", "Matching regex >{}< errored: {}", regex.as_str(), e);
+                break;
+            }
+        }
+    }
+    // nothing matched, check the entire word instead
+    Transformed::Atomic((range, word))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::config::WrappedRegex;
+    use env_logger;
+    #[test]
+    fn transformer() {
+        let _ = env_logger::builder()
+            .is_test(true)
+            .filter(None, log::LevelFilter::Trace)
+            .try_init();
+
+        let re = vec![
+            WrappedRegex::from(Regex::new("^[0-9]+x$").unwrap()), //whitelist
+            WrappedRegex::from(Regex::new(r#"^'([^\s]+)'$"#).unwrap()),
+            WrappedRegex::from(Regex::new("(Alpha)(beta)").unwrap()),
+        ];
+
+        let words = vec!["2x", r#"''so-to-speak''"#, "Alphabeta", "Nothing"];
+
+        // whitelist
+        assert_eq!(
+            transform(re.as_slice(), words[0], 10..24),
+            Transformed::Whitelisted((10..24, words[0]))
+        );
+
+        // single quoted, recursive 2x
+        assert_eq!(
+            transform(re.as_slice(), words[1], 10..25),
+            Transformed::Fragments(vec![(12..23, &words[1][2..13])])
+        );
+
+        // multi capture
+        assert_eq!(
+            transform(re.as_slice(), words[2], 10..19),
+            Transformed::Fragments(vec![(10..15, &words[2][0..5]), (15..19, &words[2][5..9]),])
+        );
+
+        // no match
+        assert_eq!(
+            transform(re.as_slice(), words[3], 10..17),
+            Transformed::Atomic((10..17, words[3]))
+        );
+    }
+}
diff --git a/src/config.rs b/src/config.rs
index d8bb53a8..e989360a 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -105,6 +105,41 @@ impl<'de> serde::de::Visitor<'de> for RegexVisitor {
     }
 }
 
+#[derive(Deserialize, Serialize, Debug, Clone)]
+pub struct Quirks {
+    /// A regular expression whose capture groups will be checked instead of the initial token.
+    /// Only the first one that matches will be used to split the word.
+    pub transform_regex: Option<Vec<WrappedRegex>>,
+    /// Allow concatenated words instead of dashed connection.
+    /// Note that this only applies if one of the replacement suggestions matches the word with the dashes removed.
+    pub allow_concatenation: Option<bool>,
+}
+
+impl Default for Quirks {
+    fn default() -> Self {
+        // use `Some(..)` for the defaults, so the generated default config carries the default values,
+        // but keep the fields `Option`s so they can be omitted in the config file
+        Self {
+            transform_regex: Some(vec![]),
+            allow_concatenation: Some(false),
+        }
+    }
+}
+
+impl Quirks {
+    pub(crate) fn allow_concatenated(&self) -> bool {
+        self.allow_concatenation.unwrap_or(false)
+    }
+
+    pub(crate) fn transform_regex(&self) -> &[WrappedRegex] {
+        if let Some(ref tr) = self.transform_regex {
+            tr.as_slice()
+        } else {
+            &[]
+        }
+    }
+}
+
 #[derive(Deserialize, Serialize, Debug, Clone)]
 pub struct HunspellConfig {
     /// The language we want to check against, used as the dictionary and affixes file name.
@@ -115,9 +150,8 @@ pub struct HunspellConfig {
     pub search_dirs: Option<Vec<PathBuf>>,
     /// Additional dictionaries for topic specific lingo.
     pub extra_dictonaries: Option<Vec<PathBuf>>,
-    /// A regular expression, whose capture groups will be checked, instead of the initial token.
-    /// Only the first one that matches will be used to split the word.
-    pub transform_regex: Option<Vec<WrappedRegex>>,
+    /// Additional quirks besides dictionary lookups.
+    pub quirks: Option<Quirks>,
 }
 
 impl HunspellConfig {
@@ -336,7 +370,7 @@ impl Default for Config {
                 lang: Some("en_US".to_owned()),
                 search_dirs: Some(search_dirs),
                 extra_dictonaries: Some(Vec::new()),
-                transform_regex: None,
+                quirks: Some(Quirks::default()),
             }),
             languagetool: None,
         }
@@ -457,4 +491,17 @@ lang = "en_US"
         )
         .unwrap();
     }
+
+    #[test]
+    fn partial_7() {
+        let _ = Config::parse(
+            r#"
+[hunspell]
+[hunspell.quirks]
+allow_concatenation = true
+transform_regex = ["^'([^\\s])'$", "^[0-9]+x$"]
+            "#,
+        )
+        .unwrap();
+    }
 }
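
For reference only (illustrative, not part of the applied diff): a user-facing config that enables both quirks mirrors the `partial_7` test above; the table and key names are exactly those introduced in `src/config.rs`, everything else here is an assumed example:

    [hunspell]
    lang = "en_US"

    [hunspell.quirks]
    # check the regex capture groups instead of the whole token;
    # a match with no capture groups whitelists the word (see `transform_inner`)
    transform_regex = ["^'([^\\s])'$", "^[0-9]+x$"]
    # accept a flagged word when one of hunspell's replacement suggestions
    # equals the word with its dashes removed (see `replacements_contain_dashless`)
    allow_concatenation = true

Both keys are optional; omitting the whole `[hunspell.quirks]` table falls back to `Quirks::default()`, i.e. no transform regexes and `allow_concatenation = false`.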