add explicit concat quirks #91

Merged (4 commits, Aug 14, 2020). Showing changes from all commits.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "cargo-spellcheck"
version = "0.3.2-alpha.0"
version = "0.4.0-alpha.1"
authors = ["Bernhard Schuster <bernhard@ahoi.io>"]
edition = "2018"
repository = "https://github.com/drahnr/cargo-spellcheck.git"
179 changes: 23 additions & 156 deletions src/checker/hunspell.rs
@@ -10,14 +10,15 @@ use super::{tokenize, Checker, Detector, Documentation, Suggestion, SuggestionSe
use crate::documentation::{CheckableChunk, ContentOrigin, PlainOverlay};
use crate::util::sub_chars;
use crate::Range;
use fancy_regex::Regex;
use log::{debug, trace, warn};
use log::{debug, trace};
use std::path::PathBuf;

use hunspell_rs::Hunspell;

use anyhow::{anyhow, bail, Result};

use super::quirks::{replacements_contain_dashless, transform, Transformed};

pub struct HunspellChecker;

impl HunspellChecker {
@@ -117,7 +118,11 @@ impl Checker for HunspellChecker {
{
let hunspell = Self::inner_init(config)?;

let transform_regex = config.transform_regex.clone().unwrap_or_else(|| Vec::new());
let (transform_regex, allow_concatenated) = if let Some(quirks) = &config.quirks {
(quirks.transform_regex(), quirks.allow_concatenated())
} else {
(&[][..], false)
};

let suggestions = docu.iter().try_fold::<SuggestionSet, _, Result<_>>(
SuggestionSet::new(),
@@ -132,7 +137,14 @@
let word = sub_chars(txt, range.clone());
if transform_regex.is_empty() {
obtain_suggestions(
&plain, chunk, &hunspell, origin, word, range, &mut acc,
&plain,
chunk,
&hunspell,
origin,
word,
range,
allow_concatenated,
&mut acc,
)
} else {
match transform(&transform_regex[..], word.as_str(), range.clone()) {
@@ -145,6 +157,7 @@
origin,
word_fragment.to_owned(),
range,
allow_concatenated,
&mut acc,
);
}
@@ -157,6 +170,7 @@
origin,
word.to_owned(),
range,
allow_concatenated,
&mut acc,
);
}
@@ -181,6 +195,7 @@ fn obtain_suggestions<'s>(
origin: &ContentOrigin,
word: String,
range: Range,
allow_concatenated: bool,
acc: &mut SuggestionSet<'s>,
) {
if !hunspell.check(&word) {
@@ -192,6 +207,10 @@ fn obtain_suggestions<'s>(
.filter(|x| x.len() > 1) // single char suggestions tend to be useless
.collect::<Vec<_>>();

if allow_concatenated && replacements_contain_dashless(&word, replacements.as_slice()) {
trace!(target: "quirks", "Found dashed word in replacement suggestions, treating {} as ok", &word);
return;
}
for (range, span) in plain.find_spans(range.clone()) {
acc.add(
origin.clone(),
@@ -214,155 +233,3 @@
);
}
}

/// Transformed word with information on the transformation outcome.
#[derive(Debug, Eq, PartialEq)]
enum Transformed<'i> {
/// A whitelisted chunk
Whitelisted((Range, &'i str)),
/// A set of word-fragments to be checked.
Fragments(Vec<(Range, &'i str)>),
/// A word to be checked. Equiv to no match.
Atomic((Range, &'i str)),
}

/// Transforms a word into a set of fragment-ranges and associated str slices.
fn transform<'i, R: AsRef<Regex>>(
transform_regex: &[R],
word: &'i str,
range: Range,
) -> Transformed<'i> {
let mut q = std::collections::VecDeque::<(Range, &'_ str)>::with_capacity(32);
let mut words = Vec::with_capacity(16);
let mut whitelisted = 0usize;
q.push_back((range.clone(), word));
while let Some((range, word)) = q.pop_front() {
// work on a fragment now
match transform_inner(transform_regex, word, range.clone()) {
// we try to match the fragments with the regex expr until they become atomic words or whitelisted
Transformed::Fragments(v) => q.extend(v),
Transformed::Atomic(word) => words.push(word),
Transformed::Whitelisted(_) => whitelisted += 1,
}
}

// no match found at all, this word is "atomic" and will be checked as is
if whitelisted == 0usize {
// empty means nothing, one word with the same range means we only found the initial provided word
if words.is_empty() || (words.len() == 1 && words[0].0.len() == word.len()) {
return Transformed::Atomic((range, word));
}
}

if !words.is_empty() {
// collect all the words as fragments again (they actually really are)
Transformed::Fragments(words)
} else {
// if there are no words to be checked, everything is whitelisted
Transformed::Whitelisted((range, word))
}
}

/// Inner loop transform
///
/// Returns `Some(vec![..])` if any captures were found.
fn transform_inner<'i, R: AsRef<Regex>>(
transform_regex: &[R],
word: &'i str,
range: Range,
) -> Transformed<'i> {
for regex in transform_regex.iter().map(AsRef::as_ref) {
match regex.captures(word) {
Ok(Some(captures)) => {
// first one is always the full match
if captures.len() == 1 {
// means match, but no captures,
// which is equiv to an implicit whitelist
return Transformed::Whitelisted((range, word));
}
let intermediate = captures
.iter()
.skip(1)
.filter_map(|m_opt| m_opt)
.map(|m| {
let intra_word_range = m.start()..m.end();
trace!(
"Found capture for word >{}<, with match >{}< and capture >{}< at {:?}",
captures.get(0).unwrap().as_str(),
word,
m.as_str(),
&intra_word_range
);
let offset = word
.char_indices()
.take_while(|(byte_pos, _)| m.start() > *byte_pos)
.count();
let range = Range {
start: range.start + offset,
end: range.start + offset + m.as_str().chars().count(),
};
(range, &word[intra_word_range])
})
.collect::<Vec<_>>();

return Transformed::Fragments(intermediate);
}
Ok(None) => {
// no regex match, try the next regex
continue;
}
Err(e) => {
warn!("Matching regex >{}< errored: {}", regex.as_str(), e);
break;
}
}
}
// nothing matched, check the entire word instead
Transformed::Atomic((range, word))
}

#[cfg(test)]
mod tests {
use super::*;
use crate::config::WrappedRegex;
use env_logger;
#[test]
fn transformer() {
let _ = env_logger::builder()
.is_test(true)
.filter(None, log::LevelFilter::Trace)
.try_init();

let re = vec![
WrappedRegex::from(Regex::new("^[0-9]+x$").unwrap()), //whitelist
WrappedRegex::from(Regex::new(r#"^'([^\s]+)'$"#).unwrap()),
WrappedRegex::from(Regex::new("(Alpha)(beta)").unwrap()),
];

let words = vec!["2x", r#"''so-to-speak''"#, "Alphabeta", "Nothing"];

// whitelist
assert_eq!(
transform(re.as_slice(), words[0], 10..24),
Transformed::Whitelisted((10..24, words[0]))
);

// single quoted, recursive 2x
assert_eq!(
transform(re.as_slice(), words[1], 10..25),
Transformed::Fragments(vec![(12..23, &words[1][2..13])])
);

// multi capture
assert_eq!(
transform(re.as_slice(), words[2], 10..19),
Transformed::Fragments(vec![(10..15, &words[2][0..5]), (15..19, &words[2][5..9]),])
);

// no match
assert_eq!(
transform(re.as_slice(), words[3], 10..17),
Transformed::Atomic((10..17, words[3]))
);
}
}
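
Note on the deletions above: the `Transformed` enum, the `transform` functions, and their tests are not dropped, they move into the new `quirks` module imported at the top of the file. The concatenation check imported alongside them, `replacements_contain_dashless`, is what backs the `allow_concatenated` quirk in `obtain_suggestions`; its definition lives in src/checker/quirks.rs, which this excerpt does not include. Below is a minimal sketch of such a check, inferred only from the call site and the trace message above; everything in it is an assumption, not the PR's actual implementation.

// Hypothetical sketch only, not the code from src/checker/quirks.rs (which is
// not shown in this diff): treat `word` as acceptable when one of the
// suggested replacements equals it after stripping dashes, e.g. "reallocate"
// vs. the suggestion "re-allocate".
fn replacements_contain_dashless(word: &str, replacements: &[String]) -> bool {
    replacements
        .iter()
        .any(|replacement| replacement.replace('-', "") == word)
}

When such a check matches, the word is skipped before any spans are collected, which is what the early `return` in `obtain_suggestions` does.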
3 changes: 3 additions & 0 deletions src/checker/mod.rs
@@ -16,6 +16,9 @@ mod hunspell;
#[cfg(feature = "languagetool")]
mod languagetool;

#[cfg(any(feature = "languagetool", feature = "hunspell"))]
mod quirks;

/// Implementation for a checker
pub(crate) trait Checker {
type Config;
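
As a closing note on the configuration side: the hunspell hunk above only shows the two accessors being read, `quirks.transform_regex()` and `quirks.allow_concatenated()`, with an empty slice and `false` as fallbacks when no quirks section is configured. The following is a rough sketch of the shape such a section could take; field names and types are guesses based on those accessors, and the real definition sits in the crate's config module, which is not part of this diff.

// Hypothetical sketch of the quirks config shape implied by the accessors
// used in hunspell.rs; not the actual definition from the config module.
use crate::config::WrappedRegex;

#[derive(Default)]
pub struct Quirks {
    /// Regexes applied to words before checking (split into fragments or
    /// whitelist entirely).
    transform_regex: Vec<WrappedRegex>,
    /// Accept a word when hunspell suggests its dashed variant.
    allow_concatenated: bool,
}

impl Quirks {
    pub fn transform_regex(&self) -> &[WrappedRegex] {
        self.transform_regex.as_slice()
    }

    pub fn allow_concatenated(&self) -> bool {
        self.allow_concatenated
    }
}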