feat: add normalization for specific languages (#631)

common-voice · Aug 3, 2022 · 5a86a81 · 5a86a81
1 parent 16c011b
commit 5a86a81
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ The [Sentence Collector](https://commonvoice.mozilla.org/sentence-collector/) is
 - Is everything working as expected? If not, submit [a new issue](https://github.com/Common-Voice/sentence-collector/issues/new).
 - Review the pending issues in the [project](https://github.com/Common-Voice/sentence-collector/projects/2).
 - Create a [new PR](https://github.com/Common-Voice/sentence-collector/compare) to fix any of the existing issues in the project.
-- To add or adjust validation and cleanup for a language see [VALIDATION.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/VALIDATION.md) and [CLEANUP.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/cleanup/CLEANUP.md). Cleanup is optional and should only be used in rare cases.
+- To add or adjust normalization, validation and cleanup for a language see [VALIDATION.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/VALIDATION.md) and [CLEANUP.md](https://github.com/common-voice/sentence-collector/blob/main/server/lib/cleanup/CLEANUP.md). Cleanup is optional and should only be used in rare cases.
 
 ## Prerequisites
 

diff --git a/server/lib/validation/VALIDATION.md b/server/lib/validation/VALIDATION.md
@@ -48,3 +48,11 @@ In this example we are defining one function and one regex:
 * Using a regex: the second validation uses a regex. If the sentence contains any numbers, the regex matches and we will mark the sentence as invalid. The user will see `Sentence contains numbers` in the frontend. If no numbers are found (the regex doesn't match), the sentence will be marked as valid.
 
 You can return the same error message for multiple invalidation rules if appropriate, however try to be as specific as possible. In the frontend the errors will be grouped by this error message.
+
+## Normalization
+
+For certain languages there are benefits to normalizing the sentence to NFC before running it through validation. This can be enabled by adding the language code to the `USE_NFC_NORMALIZATION` array in `index.js`. Activating normalization means that any further steps will get the normalized sentence. This includes the validation rules, as well as saving it to the database and then later on exporting it to the Common Voice repository.
+
+**Example:** In Korean you can either type `"ᄏ", "ᅩ" and "ᆯ"` which results in `콜` of length 3 (when checked with `.length`), or `콜` which is one code point.
+
+If we apply NFC the validation process gets easier to define. This topic came up in [this PR](https://github.com/common-voice/sentence-collector/pull/630#issuecomment-1201099593).
diff --git a/server/lib/validation/index.js b/server/lib/validation/index.js
@@ -35,27 +35,38 @@ const VALIDATORS = {
   yue,
 };
 
+// For certain languages we want to normalize before we validate.
+// This then also means that the returned sentence is normalized
+// and therefore will be saved to the database in normalized form.
+const USE_NFC_NORMALIZATION = [
+  'ko',
+];
+
 module.exports = {
   validateSentences,
 };
 
 function validateSentences(language, sentences) {
   const validator = getValidatorFor(language);
 
-  return runValidation(validator, sentences);
+  return runValidation(validator, {
+    sentences,
+    normalize: USE_NFC_NORMALIZATION.includes(language),
+  });
 }
 
-function runValidation(validator, sentences = { unreviewed: [], validated: [] }) {
+function runValidation(validator, { sentences = { unreviewed: [], validated: [] }, normalize }) {
   let filtered = [];
 
   const validate = (validSentences, sentence) => {
-    const validationResult = validateSentence(validator, sentence);
+    const sentenceToValidate = normalize ? sentence.normalize('NFC') : sentence;
+    const validationResult = validateSentence(validator, sentenceToValidate);
     if (validationResult.error) {
       filtered.push(validationResult);
       return validSentences;
     }
 
-    validSentences.push(sentence);
+    validSentences.push(sentenceToValidate);
     return validSentences;
   };
 

diff --git a/server/tests/lib/validation/index.test.js b/server/tests/lib/validation/index.test.js
@@ -2,17 +2,23 @@ import test from 'ava';
 import validation from '../../../lib/validation';
 
 function validate(t, language, sentences, expected) {
+  const validationResult = validation.validateSentences(language, sentences);
+  t.log(validationResult.valid);
+  t.deepEqual(validationResult.valid, expected);
+}
+
+function validateFiltered(t, language, sentences, expected) {
   const validationResult = validation.validateSentences(language, sentences);
   t.log(validationResult.filtered);
   t.deepEqual(validationResult.filtered, expected);
 }
 
-test('validates valid sentences', validate, 'en', {
+test('validates valid sentences', validateFiltered, 'en', {
   unreviewed: ['This is valid'],
   validated: ['This is valid too'],
 }, []);
 
-test('validates invalid sentences - too long', validate, 'en', {
+test('validates invalid sentences - too long', validateFiltered, 'en', {
   unreviewed: ['This is very very very very very very very very very very very very very very very very very very very very long'],
   validated: ['This is very very very very very very very very very very very very very very very very very very very very long too'],
 }, [{
@@ -23,7 +29,7 @@ test('validates invalid sentences - too long', validate, 'en', {
   error: 'Number of words must be between 1 and 14 (inclusive)',
 }]);
 
-test('validates invalid sentences - contains numbers', validate, 'en', {
+test('validates invalid sentences - contains numbers', validateFiltered, 'en', {
   unreviewed: ['This is 2valid'],
   validated: ['This is 3valid'],
 }, [{
@@ -34,7 +40,7 @@ test('validates invalid sentences - contains numbers', validate, 'en', {
   error: 'Sentence should not contain numbers',
 }]);
 
-test('validates invalid sentences - contains abbreviation', validate, 'en', {
+test('validates invalid sentences - contains abbreviation', validateFiltered, 'en', {
   unreviewed: ['This is A.B.C.'],
   validated: ['This ABC too'],
 }, [{
@@ -45,7 +51,7 @@ test('validates invalid sentences - contains abbreviation', validate, 'en', {
   error: 'Sentence should not contain abbreviations',
 }]);
 
-test('validates invalid sentences - contains symbols', validate, 'en', {
+test('validates invalid sentences - contains symbols', validateFiltered, 'en', {
   unreviewed: ['This is # test'],
   validated: ['This is @ test', 'This is / test'],
 }, [{
@@ -59,7 +65,7 @@ test('validates invalid sentences - contains symbols', validate, 'en', {
   error: 'Sentence should not contain symbols',
 }]);
 
-test('validates invalid sentences - multiple sentences', validate, 'it', {
+test('validates invalid sentences - multiple sentences', validateFiltered, 'it', {
   unreviewed: ['This is test. And more.'],
   validated: ['This is one. This is two.'],
 }, [{
@@ -70,7 +76,7 @@ test('validates invalid sentences - multiple sentences', validate, 'it', {
   error: 'Sentence should not contain sentence punctuation inside a sentence',
 }]);
 
-test('validates invalid sentences - english chars', validate, 'ru', {
+test('validates invalid sentences - english chars', validateFiltered, 'ru', {
   unreviewed: ['This is test'],
   validated: ['This too'],
 }, [{
@@ -81,7 +87,7 @@ test('validates invalid sentences - english chars', validate, 'ru', {
   error: 'Sentence should not contain latin alphabet characters',
 }]);
 
-test('validates invalid sentences - other rules', validate, 'bas', {
+test('validates invalid sentences - other rules', validateFiltered, 'bas', {
   unreviewed: ['This is valid', 'This is wrong .', 'This as well!.', 'No;', 'Definitely not,'],
   validated: ['This too'],
 }, [{
@@ -97,3 +103,8 @@ test('validates invalid sentences - other rules', validate, 'bas', {
   sentence: 'Definitely not,',
   error: 'Sentence should not end with a comma',
 }]);
+
+test('normalizes', validate, 'ko', {
+  unreviewed: ['콜'],
+  validated: [],
+}, ['콜']);