Skip to content

Commit

Permalink
feat/quirks: extract quirks from hunspell, add allow_concatenation
Browse files Browse the repository at this point in the history
Closes #82
  • Loading branch information
drahnr committed Aug 14, 2020
1 parent edbd1a4 commit daba5aa
Show file tree
Hide file tree
Showing 4 changed files with 243 additions and 160 deletions.
179 changes: 23 additions & 156 deletions src/checker/hunspell.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ use super::{tokenize, Checker, Detector, Documentation, Suggestion, SuggestionSe
use crate::documentation::{CheckableChunk, ContentOrigin, PlainOverlay};
use crate::util::sub_chars;
use crate::Range;
use fancy_regex::Regex;
use log::{debug, trace, warn};
use log::{debug, trace};
use std::path::PathBuf;

use hunspell_rs::Hunspell;

use anyhow::{anyhow, bail, Result};

use super::quirks::{replacements_contain_dashless, transform, Transformed};

pub struct HunspellChecker;

impl HunspellChecker {
Expand Down Expand Up @@ -117,7 +118,11 @@ impl Checker for HunspellChecker {
{
let hunspell = Self::inner_init(config)?;

let transform_regex = config.transform_regex.clone().unwrap_or_else(|| Vec::new());
let (transform_regex, allow_concatenated) = if let Some(quirks) = &config.quirks {
(quirks.transform_regex(), quirks.allow_concatenated())
} else {
(&[][..], false)
};

let suggestions = docu.iter().try_fold::<SuggestionSet, _, Result<_>>(
SuggestionSet::new(),
Expand All @@ -132,7 +137,14 @@ impl Checker for HunspellChecker {
let word = sub_chars(txt, range.clone());
if transform_regex.is_empty() {
obtain_suggestions(
&plain, chunk, &hunspell, origin, word, range, &mut acc,
&plain,
chunk,
&hunspell,
origin,
word,
range,
allow_concatenated,
&mut acc,
)
} else {
match transform(&transform_regex[..], word.as_str(), range.clone()) {
Expand All @@ -145,6 +157,7 @@ impl Checker for HunspellChecker {
origin,
word_fragment.to_owned(),
range,
allow_concatenated,
&mut acc,
);
}
Expand All @@ -157,6 +170,7 @@ impl Checker for HunspellChecker {
origin,
word.to_owned(),
range,
allow_concatenated,
&mut acc,
);
}
Expand All @@ -181,6 +195,7 @@ fn obtain_suggestions<'s>(
origin: &ContentOrigin,
word: String,
range: Range,
allow_concatenated: bool,
acc: &mut SuggestionSet<'s>,
) {
if !hunspell.check(&word) {
Expand All @@ -192,6 +207,10 @@ fn obtain_suggestions<'s>(
.filter(|x| x.len() > 1) // single char suggestions tend to be useless
.collect::<Vec<_>>();

if allow_concatenated && replacements_contain_dashless(&word, replacements.as_slice()) {
trace!(target: "quirks", "Found dashed word in replacement suggestions, treating {} as ok", &word);
return;
}
for (range, span) in plain.find_spans(range.clone()) {
acc.add(
origin.clone(),
Expand All @@ -214,155 +233,3 @@ fn obtain_suggestions<'s>(
);
}
}

/// Outcome of running the transform regexes over a single word.
///
/// Every variant carries the range(s) together with the matching `str`
/// slice(s) borrowed from the input word.
#[derive(Debug, PartialEq, Eq)]
enum Transformed<'i> {
    /// The chunk is whitelisted.
    Whitelisted((Range, &'i str)),
    /// Word fragments, each paired with its range, which are to be checked.
    Fragments(Vec<(Range, &'i str)>),
    /// A single word to be checked as is; equivalent to no regex match.
    Atomic((Range, &'i str)),
}

/// Transforms a word into a set of fragment-ranges and associated str slices.
///
/// The word is run through `transform_inner` repeatedly: every fragment produced
/// by a capturing regex is fed back in until it either becomes atomic (no regex
/// matches) or is whitelisted (a regex matches without capture groups).
///
/// Returns
///
/// * `Transformed::Atomic` with the original `range` and `word` if nothing was
///   whitelisted and the only remaining item is the word itself,
/// * `Transformed::Fragments` with all remaining fragments if there are any,
/// * `Transformed::Whitelisted` with the original `range` and `word` if no
///   fragment is left to be checked.
fn transform<'i, R: AsRef<Regex>>(
    transform_regex: &[R],
    word: &'i str,
    range: Range,
) -> Transformed<'i> {
    let mut q = std::collections::VecDeque::<(Range, &'_ str)>::with_capacity(32);
    let mut words = Vec::with_capacity(16);
    let mut whitelisted = 0usize;
    q.push_back((range.clone(), word));
    while let Some((current_range, current)) = q.pop_front() {
        // work on a fragment now
        match transform_inner(transform_regex, current, current_range.clone()) {
            // we try to match the fragments with the regex expr until they become atomic words or whitelisted
            Transformed::Fragments(fragments) => {
                for (fragment_range, fragment) in fragments {
                    if fragment_range == current_range {
                        // The capture spans the complete input. Queueing it again would
                        // reproduce the very same match forever, so it is treated as
                        // an atomic word instead.
                        words.push((fragment_range, fragment));
                    } else {
                        q.push_back((fragment_range, fragment));
                    }
                }
            }
            Transformed::Atomic(atom) => words.push(atom),
            Transformed::Whitelisted(_) => whitelisted += 1,
        }
    }

    // no match found at all, this word is "atomic" and will be checked as is
    if whitelisted == 0usize {
        // Ranges are derived from character counts, hence the comparison must use the
        // number of characters rather than the byte length of the word.
        let only_initial_word =
            words.len() == 1 && words[0].0.len() == word.chars().count();
        // empty means nothing, one word with the same range means we only found the initial provided word
        if words.is_empty() || only_initial_word {
            return Transformed::Atomic((range, word));
        }
    }

    if words.is_empty() {
        // if there are no words to be checked, everything is whitelisted
        Transformed::Whitelisted((range, word))
    } else {
        // collect all the words as fragments again (they actually really are)
        Transformed::Fragments(words)
    }
}

/// Inner loop transform.
///
/// Tries the regexes in order against `word` and acts on the first one that matches:
///
/// * a match without capture groups returns `Transformed::Whitelisted`,
/// * a match with capture groups returns `Transformed::Fragments` holding every
///   participating capture paired with its range; that range starts at
///   `range.start` plus the number of characters preceding the capture and is
///   as long as the capture has characters,
/// * if no regex matches, `Transformed::Atomic` is returned.
///
/// If matching a regex errors, a warning is logged, the remaining regexes are
/// skipped and `Transformed::Atomic` is returned.
fn transform_inner<'i, R: AsRef<Regex>>(
    transform_regex: &[R],
    word: &'i str,
    range: Range,
) -> Transformed<'i> {
    for regex in transform_regex.iter().map(AsRef::as_ref) {
        match regex.captures(word) {
            Ok(Some(captures)) => {
                // first one is always the full match
                if captures.len() == 1 {
                    // means match, but no captures,
                    // which is equiv to an implicit whitelist
                    return Transformed::Whitelisted((range, word));
                }
                let intermediate = captures
                    .iter()
                    .skip(1)
                    // capture groups which did not participate in the match are `None`
                    .flatten()
                    .map(|m| {
                        let intra_word_range = m.start()..m.end();
                        trace!(
                            "Found capture for word >{}<, with match >{}< and capture >{}< at {:?}",
                            word,
                            captures.get(0).map_or("", |full| full.as_str()),
                            m.as_str(),
                            &intra_word_range
                        );
                        // `m.start()` is a byte offset, the resulting range counts characters
                        let offset = word
                            .char_indices()
                            .take_while(|(byte_pos, _)| m.start() > *byte_pos)
                            .count();
                        let fragment_start = range.start + offset;
                        let fragment_range = Range {
                            start: fragment_start,
                            end: fragment_start + m.as_str().chars().count(),
                        };
                        (fragment_range, &word[intra_word_range])
                    })
                    .collect::<Vec<_>>();

                return Transformed::Fragments(intermediate);
            }
            Ok(None) => {
                // no regex match, try the next regex
                continue;
            }
            Err(e) => {
                warn!("Matching regex >{}< errored: {}", regex.as_str(), e);
                break;
            }
        }
    }
    // nothing matched, check the entire word instead
    Transformed::Atomic((range, word))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::WrappedRegex;
    use env_logger;

    /// Runs `transform` against one whitelisting regex, one regex stripping
    /// single quotes and one regex with two capture groups.
    #[test]
    fn transformer() {
        // the result of `try_init` is deliberately discarded
        let _ = env_logger::builder()
            .is_test(true)
            .filter(None, log::LevelFilter::Trace)
            .try_init();

        // the first pattern has no capture group and hence acts as a whitelist
        let patterns = ["^[0-9]+x$", r#"^'([^\s]+)'$"#, "(Alpha)(beta)"];
        let re: Vec<WrappedRegex> = patterns
            .iter()
            .copied()
            .map(|pattern| WrappedRegex::from(Regex::new(pattern).unwrap()))
            .collect();

        let numeric = "2x";
        let quoted = r#"''so-to-speak''"#;
        let concatenated = "Alphabeta";
        let unmatched = "Nothing";

        // whitelist
        let outcome = transform(re.as_slice(), numeric, 10..24);
        assert_eq!(outcome, Transformed::Whitelisted((10..24, numeric)));

        // single quoted, recursive 2x
        let outcome = transform(re.as_slice(), quoted, 10..25);
        assert_eq!(
            outcome,
            Transformed::Fragments(vec![(12..23, &quoted[2..13])])
        );

        // multi capture
        let outcome = transform(re.as_slice(), concatenated, 10..19);
        assert_eq!(
            outcome,
            Transformed::Fragments(vec![
                (10..15, &concatenated[0..5]),
                (15..19, &concatenated[5..9]),
            ])
        );

        // no match
        let outcome = transform(re.as_slice(), unmatched, 10..17);
        assert_eq!(outcome, Transformed::Atomic((10..17, unmatched)));
    }
}
3 changes: 3 additions & 0 deletions src/checker/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ mod hunspell;
#[cfg(feature = "languagetool")]
mod languagetool;

#[cfg(any(feature = "languagetool", feature = "hunspell"))]
mod quirks;

/// Implementation for a checker
pub(crate) trait Checker {
type Config;
Expand Down
Loading

0 comments on commit daba5aa

Please sign in to comment.