Add symbol / character whitelist (fixes #50)

common-voice · Dec 30, 2019 · cbc63e2 · cbc63e2
1 parent 7443ab8
commit cbc63e2
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -50,7 +50,8 @@ The following rules can be configured per language. Add a `<language>.toml` file
 | min_characters |  Minimum number of character occurrences | integer | 0
 | may_end_with_colon |  If a sentence can end with a : or not | boolean | false
 | quote_start_with_letter |  If a quote needs to start with a letter | boolean | true
-| disallowed_symbols |  Array of disallowed symbols or letters | String Array | all symbols allowed
+| allowed_symbols_regex |  Regex of allowed symbols or letters. Each character gets matched against this pattern. | String | not used
+| disallowed_symbols |  Array of disallowed symbols or letters. Only used when allowed_symbols_regex is not set or is an empty String. | String Array | all symbols allowed
 | disallowed_words |  Array of disallowed words | String Array | all words allowed
 | broken_whitespace |  Array of broken whitespaces. This could for example disallow two spaces following each other | String Array | all types of whitespaces allowed
 | min_word_count |  Minimum number of words in a sentence | integer | 1

diff --git a/src/checker.rs b/src/checker.rs
@@ -23,10 +23,19 @@ pub fn check(rules: &Config, raw: &&str) -> bool {
         return false;
     }
 
-    let symbols = trimmed.chars().any(|c| {
-        rules.disallowed_symbols.contains(&Value::try_from(c).unwrap())
-    });
-    if symbols {
+    let mut invalid_symbols = false;
+    if rules.allowed_symbols_regex.len() > 0 {
+        let regex = Regex::new(&rules.allowed_symbols_regex).unwrap();
+        invalid_symbols = trimmed.chars().any(|c| {
+            !regex.is_match(&c.to_string())
+        });
+    } else {
+        invalid_symbols = trimmed.chars().any(|c| {
+            rules.disallowed_symbols.contains(&Value::try_from(c).unwrap())
+        });
+    }
+
+    if invalid_symbols {
         return false;
     }
 
@@ -220,6 +229,28 @@ mod test {
         assert_eq!(check(&rules, &"This has a %"), false);
     }
 
+    #[test]
+    fn test_allowed_symbols_regex() {
+        let rules : Config = Config {
+            allowed_symbols_regex: String::from("[\u{0020}-\u{005A}]"),
+            ..Default::default()
+        };
+
+        assert_eq!(check(&rules, &"ONLY UPPERCASE AND SPACE IS ALLOWED"), true);
+        assert_eq!(check(&rules, &"This is not uppercase"), false);
+    }
+
+    #[test]
+    fn test_allowed_symbols_regex_over_disallowed() {
+        let rules : Config = Config {
+            allowed_symbols_regex: String::from("[\u{0020}-\u{005A}]"),
+            disallowed_symbols: vec![Value::try_from('O').unwrap()],
+            ..Default::default()
+        };
+
+        assert_eq!(check(&rules, &"ONLY UPPERCASE AND SPACE IS ALLOWED AND DISALLOWED O IS OKAY"), true);
+    }
+
     #[test]
     fn test_disallowed_words() {
         let rules : Config = Config {
@@ -240,7 +271,6 @@ mod test {
         assert_eq!(check(&rules, &"This has a's"), false);
     }
 
-
     #[test]
     fn test_broken_whitespace() {
         let rules : Config = Config {

diff --git a/src/config.rs b/src/config.rs
@@ -46,6 +46,7 @@ pub struct Config {
     pub needs_punctuation_end: bool,
     pub needs_uppercase_start: bool,
     pub needs_letter_start: bool,
+    pub allowed_symbols_regex: String,
     pub disallowed_symbols: Array,
     pub disallowed_words: HashSet<String>,
     pub broken_whitespace: Array,
@@ -66,6 +67,7 @@ impl Default for Config {
             needs_punctuation_end: false,
             needs_uppercase_start: false,
             needs_letter_start: true,
+            allowed_symbols_regex: String::from(""),
             disallowed_symbols: vec![],
             disallowed_words: HashSet::new(),
             broken_whitespace: vec![],

diff --git a/src/rules/german.toml b/src/rules/german.toml
@@ -7,13 +7,8 @@ quote_start_with_letter = true
 needs_punctuation_end = false
 needs_letter_start = true
 needs_uppercase_start = true
-disallowed_symbols = [
-  '<', '>', '+', '*', '\', '#', '@', '^', '[', ']', '(', ')', '/',
-  'é', 'è', 'à', 'ç', 'Å',
-  'α', 'β', 'Γ', 'γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'Θ', 'θ', 'ι', 'κ',
-  'Λ', 'λ', 'μ', 'ν', 'Ξ', 'ξ', 'Π', 'π', 'ρ', 'Σ', 'σ', 'ς', 'τ',
-  'υ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω',
-]
+allowed_symbols_regex = "[\u0020-\u007BäöüßÄÖÜ„“‚‘’–\\.]"
+disallowed_symbols = []
 broken_whitespace = ["  ", " ,", " .", " ?", " !", " ;"]
 
 # Abbreviation examples for each regex, also cheating a bit and adding more regex which has nothing to do with abbreviations: