Skip to content

Commit

Permalink
Add symbol / character whitelist (fixes #50)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelKohler committed Dec 30, 2019
1 parent 7443ab8 commit cbc63e2
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 13 deletions.
3 changes: 2 additions & 1 deletion README.md
Expand Up @@ -50,7 +50,8 @@ The following rules can be configured per language. Add a `<language>.toml` file
| min_characters | Minimum of character occurrences | integer | 0
| may_end_with_colon | If a sentence can end with a : or not | boolean | false
| quote_start_with_letter | If a quote needs to start with a letter | boolean | true
| disallowed_symbols | Array of disallowed symbols or letters | String Array | all symbols allowed
| allowed_symbols_regex | Regex of allowed symbols or letters. Each character gets matched against this pattern. | String | not used
| disallowed_symbols | Array of disallowed symbols or letters. Only used when allowed_symbols_regex is not set or is an empty String. | String Array | all symbols allowed
| disallowed_words | Array of disallowed words | String Array | all words allowed
| broken_whitespace | Array of broken whitespaces. This could for example disallow two spaces following each other | String Array | all types of whitespaces allowed
| min_word_count | Minimum number of words in a sentence | integer | 1
Expand Down
40 changes: 35 additions & 5 deletions src/checker.rs
Expand Up @@ -23,10 +23,19 @@ pub fn check(rules: &Config, raw: &&str) -> bool {
return false;
}

let symbols = trimmed.chars().any(|c| {
rules.disallowed_symbols.contains(&Value::try_from(c).unwrap())
});
if symbols {
let mut invalid_symbols = false;
if rules.allowed_symbols_regex.len() > 0 {
let regex = Regex::new(&rules.allowed_symbols_regex).unwrap();
invalid_symbols = trimmed.chars().any(|c| {
!regex.is_match(&c.to_string())
});
} else {
invalid_symbols = trimmed.chars().any(|c| {
rules.disallowed_symbols.contains(&Value::try_from(c).unwrap())
});
}

if invalid_symbols {
return false;
}

Expand Down Expand Up @@ -220,6 +229,28 @@ mod test {
assert_eq!(check(&rules, &"This has a %"), false);
}

#[test]
fn test_allowed_symbols_regex() {
    // The character class spans U+0020 (space) through U+005A ('Z'), so
    // lowercase ASCII letters fall outside of it.
    let rules = Config {
        allowed_symbols_regex: String::from("[\u{0020}-\u{005A}]"),
        ..Default::default()
    };

    // Every character is inside the allowed range.
    assert!(check(&rules, &"ONLY UPPERCASE AND SPACE IS ALLOWED"));
    // Lowercase letters do not match the pattern, so the sentence is rejected.
    assert!(!check(&rules, &"This is not uppercase"));
}

#[test]
fn test_allowed_symbols_regex_over_disallowed() {
    // Both options are configured on purpose: 'O' is listed as disallowed but
    // also lies inside the allowed range U+0020..=U+005A.
    let forbidden = vec![Value::try_from('O').unwrap()];
    let rules = Config {
        allowed_symbols_regex: String::from("[\u{0020}-\u{005A}]"),
        disallowed_symbols: forbidden,
        ..Default::default()
    };

    // A non-empty allowed_symbols_regex wins over disallowed_symbols, so the
    // 'O' characters must not cause a rejection.
    assert!(check(&rules, &"ONLY UPPERCASE AND SPACE IS ALLOWED AND DISALLOWED O IS OKAY"));
}

#[test]
fn test_disallowed_words() {
let rules : Config = Config {
Expand All @@ -240,7 +271,6 @@ mod test {
assert_eq!(check(&rules, &"This has a's"), false);
}


#[test]
fn test_broken_whitespace() {
let rules : Config = Config {
Expand Down
2 changes: 2 additions & 0 deletions src/config.rs
Expand Up @@ -46,6 +46,7 @@ pub struct Config {
pub needs_punctuation_end: bool,
pub needs_uppercase_start: bool,
pub needs_letter_start: bool,
pub allowed_symbols_regex: String,
pub disallowed_symbols: Array,
pub disallowed_words: HashSet<String>,
pub broken_whitespace: Array,
Expand All @@ -66,6 +67,7 @@ impl Default for Config {
needs_punctuation_end: false,
needs_uppercase_start: false,
needs_letter_start: true,
allowed_symbols_regex: String::from(""),
disallowed_symbols: vec![],
disallowed_words: HashSet::new(),
broken_whitespace: vec![],
Expand Down
9 changes: 2 additions & 7 deletions src/rules/german.toml
Expand Up @@ -7,13 +7,8 @@ quote_start_with_letter = true
needs_punctuation_end = false
needs_letter_start = true
needs_uppercase_start = true
disallowed_symbols = [
'<', '>', '+', '*', '\', '#', '@', '^', '[', ']', '(', ')', '/',
'é', 'è', 'à', 'ç', 'Å',
'α', 'β', 'Γ', 'γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'Θ', 'θ', 'ι', 'κ',
'Λ', 'λ', 'μ', 'ν', 'Ξ', 'ξ', 'Π', 'π', 'ρ', 'Σ', 'σ', 'ς', 'τ',
'υ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω',
]
allowed_symbols_regex = "[\u0020-\u007BäöüßÄÖÜ„“‚‘’–\\.]"
disallowed_symbols = []
broken_whitespace = [" ", " ,", " .", " ?", " !", " ;"]

# Abbreviation examples for each regex, also cheating a bit and adding more regex which has nothing to do with abbreviations:
Expand Down

0 comments on commit cbc63e2

Please sign in to comment.