cleanup tokenizer
dhowe committed Oct 13, 2020
1 parent b590a0e commit 7ad5e92
Showing 3 changed files with 50 additions and 75 deletions.
1 change: 0 additions & 1 deletion issues/known-issues.js
@@ -115,7 +115,6 @@ describe('RiTa.KnownIssues', () => {
"chaises", "chaise",
"metroes", "metro",
"briefcases", "briefcase"
- ]
];

let res1, res2, res3, dbug = 0;
122 changes: 50 additions & 72 deletions src/tokenizer.js
@@ -41,110 +41,85 @@ class Tokenizer {
}

tokenize(words, regex) {

if (typeof words !== 'string') return [];

if (regex) return words.split(regex);

words = words.trim(); // ???
- for (let i = 0; i < TOKENIZE_REGEXS_A.length; i += 2){
- words = words.replace(TOKENIZE_REGEXS_A[i], TOKENIZE_REGEXS_A[i+1]);
+ for (let i = 0; i < TOKENIZE_RE.length; i += 2) {
+ words = words.replace(TOKENIZE_RE[i], TOKENIZE_RE[i + 1]);
}
- // words = words.replace(/([Ee])[.]([Gg])[.]/g, "_$1$2_"); // E.G.
- // words = words.replace(/([Ii])[.]([Ee])[.]/g, "_$1$2_"); // I.E.
- //
- // words = words.replace(/([\\?!\"\u201C\\.,;:@#$%&])/g, " $1 ");
- // words = words.replace(/\\.\\.\\./g, " ... ");
- // words = words.replace(/\\s+/g, ' ');
- // words = words.replace(/,([^0-9])/g, " , $1");
- // words = words.replace(/([^.])([.])([\])}>\"'’]*)\\s*$/g, "$1 $2$3 ");
- // words = words.replace(/([\[\](){}<>])/g, " $1 ");
- // words = words.replace(/--/g, " -- ");
- // words = words.replace(/$/g, ' ');
- // words = words.replace(/^/g, ' ');
- // words = words.replace(/([^'])' | '/g, "$1 ' ");
- // words = words.replace(/ \u2018/g, " \u2018 ");
- // words = words.replace(/'([SMD]) /g, " '$1 ");

if (this.RiTa.SPLIT_CONTRACTIONS) {
- for (let i = 0; i < TOKENIZE_REGEXS_B.length; i += 2){
- words = words.replace(TOKENIZE_REGEXS_B[i], TOKENIZE_REGEXS_B[i+1]);
+ for (let i = 0; i < CONTRACTIONS_RE.length; i += 2) {
+ words = words.replace(CONTRACTIONS_RE[i], CONTRACTIONS_RE[i + 1]);
}

- // words = words.replace(/([Cc])an['’]t/g, "$1an not");
- // words = words.replace(/([Dd])idn['’]t/g, "$1id not");
- // words = words.replace(/([CcWw])ouldn['’]t/g, "$1ould not");
- // words = words.replace(/([Ss])houldn['’]t/g, "$1hould not");
- // words = words.replace(/ ([Ii])t['’]s/g, " $1t is");
- // words = words.replace(/n['’]t /g, " not ");
- // words = words.replace(/['’]ve /g, " have ");
- // words = words.replace(/['’]re /g, " are ");
}

// "Nicole I. Kidman" gets tokenized as "Nicole I . Kidman"
// words = words.replace(/ ([A-Z]) \\./g, " $1. ");
// words = words.replace(/\\s+/g, ' ');
// words = words.replace(/^\\s+/g, '');
//
// words = words.replace(/_([Ee])([Gg])_/g, "$1.$2."); // E.G.
// words = words.replace(/_([Ii])([Ee])_/g, "$1.$2."); // I.E.

let result = words.trim().split(/\s+/);
- result.forEach((token, i) => {
- if (token.includes('_')){
+ result.forEach((token, i) => { // use filter?
+ if (token.includes('_')) {
result[i] = token.replace(/([a-zA-z]|[\.\,])_([a-zA-Z])/g, "$1 $2");
}
});

return result;
}

- untokenize(arr, delim) { // TODO: should be state machine
+ untokenize(arr, delim) { // so ugly

delim = delim || ' ';

let thisNBPunct, thisNAPunct, lastNBPunct, lastNAPunct, thisQuote, lastQuote, thisComma, isLast,
- lastComma, lastEndWithS, nextIsS, thisLBracket, thisRBracket, lastLBracket, lastRBracket, lastIsWWW, thisDomin, dbug = 0,
- nextNoSpace = false,
- afterQuote = false,
- withinQuote = arr.length && QUOTE_RE.test(arr[0]),
- result = arr[0] || '',
- midSentence = false;
+ lastComma, lastEndWithS, nextIsS, thisLBracket, thisRBracket, lastLBracket, lastRBracket,
+ lastIsWWW, thisDomin, dbug = 0, nextNoSpace = false, afterQuote = false, midSentence = false,
+ withinQuote = arr.length && QUOTE_RE.test(arr[0]), result = arr[0] || '';

for (let i = 1; i < arr.length; i++) {

if (!arr[i]) continue;

thisComma = arr[i] === ',';
- thisNBPunct = NO_SPACE_BEFORE_PUNCT_RE.test(arr[i]);//NB -> no space before the punctuation
+ thisNBPunct = NO_SPACE_BF_PUNCT_RE.test(arr[i]);//NB -> no space before the punctuation
thisNAPunct = NO_SPACE_AFTER_PUNCT_RE.test(arr[i]);//NA -> no space after the punctuation
thisQuote = QUOTE_RE.test(arr[i]);
thisLBracket = LEFT_BRACKETS_RE.test(arr[i]);//LBracket -> left bracket
thisRBracket = RIGHT_BRACKETS_RE.test(arr[i]);//RBracket -> right bracket
lastComma = arr[i - 1] === ',';
- lastNBPunct = NO_SPACE_BEFORE_PUNCT_RE.test(arr[i - 1]);//NB -> no space before
+ lastNBPunct = NO_SPACE_BF_PUNCT_RE.test(arr[i - 1]);//NB -> no space before
lastNAPunct = NO_SPACE_AFTER_PUNCT_RE.test(arr[i - 1]);//NA -> no space after
lastQuote = QUOTE_RE.test(arr[i - 1]);
lastLBracket = LEFT_BRACKETS_RE.test(arr[i - 1]);
lastRBracket = RIGHT_BRACKETS_RE.test(arr[i - 1]);
- lastEndWithS = (arr[i - 1].charAt(arr[i - 1].length - 1) === 's' && arr[i - 1]!= "is" && arr[i - 1] != "Is" && arr[i - 1] != "IS");
+ lastEndWithS = (arr[i - 1].charAt(arr[i - 1].length - 1) === 's'
+ && arr[i - 1] != "is" && arr[i - 1] != "Is" && arr[i - 1] != "IS");
lastIsWWW = WWW_RE.test(arr[i - 1]);
- thisDomin = DOMIN_RE.test(arr[i]);
+ thisDomin = DOMAIN_RE.test(arr[i]);
nextIsS = i == arr.length - 1 ? false : (arr[i + 1] === "s" || arr[i + 1] === "S");
isLast = (i == arr.length - 1);

- if ((arr[i - 1] === "." && thisDomin) || nextNoSpace){
+ if ((arr[i - 1] === "." && thisDomin) || nextNoSpace) {

nextNoSpace = false;
result += arr[i];
continue;
- } else if (arr[i] === "." && lastIsWWW){

+ } else if (arr[i] === "." && lastIsWWW) {

//console.log('yes');
nextNoSpace = true;

} else if (thisLBracket) {

result += delim;
- } else if (lastRBracket){
- if (!thisNBPunct && !thisLBracket){

+ } else if (lastRBracket) {

+ if (!thisNBPunct && !thisLBracket) {
result += delim;
}

} else if (thisQuote) {

if (withinQuote) {
@@ -172,15 +147,14 @@ class Tokenizer {
result += delim;
midSentence = false;

- } else if ((!thisNBPunct && !lastQuote && !lastNAPunct && !lastLBracket && !thisRBracket) || (!isLast && thisNBPunct && lastNBPunct && !lastNAPunct && !lastQuote && !lastLBracket && !thisRBracket)) {

+ } else if ((!thisNBPunct && !lastQuote && !lastNAPunct && !lastLBracket && !thisRBracket)
+ || (!isLast && thisNBPunct && lastNBPunct && !lastNAPunct
+ && !lastQuote && !lastLBracket && !thisRBracket)) {
result += delim;
}

result += arr[i]; // add to result

if (thisNBPunct && !lastNBPunct && !withinQuote && SQUOTE_RE.test(arr[i]) && lastEndWithS) {

result += delim; // ??
}
}
}
}

- const NO_SPACE_BEFORE_PUNCT_RE = /^[,\.\;\:\?\!\)""“”\u2019‘`'%…\u2103\^\*°\/⁄\-@]+$/;
- const QUOTE_RE = /^[""“”\u2019‘`''«»‘’]+$/;
const LEFT_BRACKETS_RE = /^[\[\(\{⟨]+$/;
const RIGHT_BRACKETS_RE = /^[\)\]\}⟩]+$/;
- const NO_SPACE_AFTER_PUNCT_RE = /^[\^\*\$\/⁄#\-@°]+$/;
+ const QUOTE_RE = /^[""“”\u2019‘`''«»‘’]+$/;
const SQUOTE_RE = /^[\u2019‘`']+$/;
const APOS_RE = /^[\u2019'’]+$/;
const LB_RE = /(\r?\n)+/g;
const WWW_RE = /^(www[0-9]?|WWW[0-9]?)$/;
- const DOMIN_RE = /^(com|org|edu|net|xyz|gov|int|eu|hk|tw|cn|de|ch|fr)$/;
- const TOKENIZE_REGEXS_A = [
- //save abbreviations-------
+ const NO_SPACE_AFTER_PUNCT_RE = /^[\^\*\$\/⁄#\-@°]+$/;
+ const DOMAIN_RE = /^(com|org|edu|net|xyz|gov|int|eu|hk|tw|cn|de|ch|fr)$/;
+ const NO_SPACE_BF_PUNCT_RE = /^[,\.\;\:\?\!\)""“”\u2019‘`'%…\u2103\^\*°\/⁄\-@]+$/;

+ const TOKENIZE_RE = [

+ // save abbreviations --------
/([Ee])[.]([Gg])[.]/g, "_$1$2_",//E.g
/([Ii])[.]([Ee])[.]/g, "_$1$2_",//i.e
/([Aa])[.]([Mm])[.]/g, "_$1$2_",//a.m.
/([Pp])[.]([Mm])[.]/g, "_$1$2_",//p.m.
/(Cap)[\.]/g, "_Cap_",//Cap.
/([Cc])[\.]/g, "_$1_",//c.
- /([Ee][Tt])[\s]([Aa][Ll])[\.]/,"_$1zzz$2_",// et al.
+ /([Ee][Tt])[\s]([Aa][Ll])[\.]/, "_$1zzz$2_",// et al.
/(etc|ETC)[\.]/g, "_$1_",//etc.
/([Pp])[\.]([Ss])[\.]/g, "_$1$2dot_", // p.s.
/([Pp])[\.]([Ss])/g, "_$1$2_", // p.s
@@ -217,13 +193,14 @@ const TOKENIZE_REGEXS_A = [
/([Mm])([Rr]|[Ss]|[Xx])[\.]/g, "_$1$2_", // Mr. Ms. and Mx.
/([Dd])([Rr])[\.]/g, "_$1$2_", // Dr.
/([Pp])([Ff])[\.]/g, "_$1$2_", // Pf.
- /([Ii])([Nn])([Dd]|[Cc])[\.]/g,"_$1$2$3_", // Ind. and Inc.
+ /([Ii])([Nn])([Dd]|[Cc])[\.]/g, "_$1$2$3_", // Ind. and Inc.
/([Cc])([Oo])[\.][\,][\s]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dcs$3$4$5_", // co., ltd.
/([Cc])([Oo])[\.][\s]([Ll])([Tt])([Dd])[\.]/g, "_$1$2ds$3$4$5_", // co. ltd.
/([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dc$3$4$5_", // co.,ltd.
/([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g, "_$1$2$3$4_",// Corp. and Co.
/([Ll])([Tt])([Dd])[\.]/g, "_$1$2$3_", // ltd.
/(prof|Prof|PROF)[\.]/g, "_$1_", //Prof.

//--------------------------
/\.\.\.\s/g, "_elipsisDDD_ ",
/([\?!\"\u201C\.,;:@#$%&])/g, " $1 ",
@@ -241,16 +218,17 @@ const TOKENIZE_REGEXS_A = [
/ ([A-Z]) \./g, " $1. ",
/^\s+/g, '',
/\^/g, " ^ ",
/°/g," ° ",
/°/g, " ° ",
/_elipsisDDD_/g, " ... ",

//pop abbreviations------------------
/_([Ee])([Gg])_/g, "$1.$2.",//Eg
/_([Ii])([Ee])_/g, "$1.$2.",//ie
/_([Aa])([Mm])_/g, "$1.$2.",//a.m.
/_([Pp])([Mm])_/g, "$1.$2.",//p.m.
/_Cap_/g, "Cap.",//Cap.
/_([Cc])_/g, "$1.",//c.
- /_([Ee][Tt])zzz([Aa][Ll])_/,"$1_$2.",// et al.
+ /_([Ee][Tt])zzz([Aa][Ll])_/, "$1_$2.",// et al.
/_(etc|ETC)_/g, "$1.",//etc.
/_([Pp])([Ss])dot_/g, "$1.$2.", // p.s.
/_([Pp])([Ss])_/g, "$1.$2",
@@ -260,24 +238,24 @@ const TOKENIZE_REGEXS_A = [
/_([Mm])([Rr]|[Ss]|[Xx])_/g, "$1$2.", // Mr. Ms. and Mx.
/_([Dd])([Rr])_/g, "$1$2.", // Dr.
/_([Pp])([Ff])_/g, "$1$2.", // Pf.
- /_([Ii])([Nn])([Dd]|[Cc])_/g,"$1$2$3.", // Ind. and Inc.
+ /_([Ii])([Nn])([Dd]|[Cc])_/g, "$1$2$3.", // Ind. and Inc.
/_([Cc])([Oo])([Rr]?)([Pp]?)_/g, "$1$2$3$4.",// Corp. and Co.
/_([Cc])([Oo])dc([Ll])([Tt])([Dd])_/g, "$1$2.,$3$4$5.", // co.,ltd.
/_([Ll])([Tt])([Dd])_/g, "$1$2$3.", // ltd.
/_([Cc])([Oo])dcs([Ll])([Tt])([Dd])_/g, "$1$2.,_$3$4$5.", // co., ltd.
/_([Cc])([Oo])ds([Ll])([Tt])([Dd])_/g, "$1$2._$3$4$5.", // co. ltd.
- /_(prof|PROF|Prof)_/g, "$1.", //Prof.

+ /_(prof|PROF|Prof)_/g, "$1." //Prof.
];
- const TOKENIZE_REGEXS_B = [

+ const CONTRACTIONS_RE = [
/([Cc])an['’]t/g, "$1an not",
/([Dd])idn['’]t/g, "$1id not",
/([CcWw])ouldn['’]t/g, "$1ould not",
/([Ss])houldn['’]t/g, "$1hould not",
/([Ii])t['’]s/g, " $1t is",
/n['’]t /g, " not ",
/['’]ve /g, " have ",
- /['’]re /g, " are ",
+ /['’]re /g, " are "
];

module && (module.exports = Tokenizer);
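
Note on the tables above: the renamed TOKENIZE_RE and CONTRACTIONS_RE are flat arrays of (pattern, replacement) pairs. Even indices hold a RegExp, odd indices the replacement string for String.replace, and the loops in tokenize() walk them in order. A minimal sketch of that pattern follows; the applyPairs helper and its two sample pairs are illustrative names, not part of this commit.

// Sketch of the pairwise regex-table pattern used by tokenize().
// `applyPairs` and SAMPLE_PAIRS are hypothetical, for illustration only.
const SAMPLE_PAIRS = [
  /([Ee])[.]([Gg])[.]/g, "_$1$2_", // protect "e.g." before punctuation is padded
  /([\?!,;:])/g, " $1 "            // pad remaining punctuation with spaces
];

function applyPairs(input, pairs) {
  let out = input;
  for (let i = 0; i < pairs.length; i += 2) {
    out = out.replace(pairs[i], pairs[i + 1]); // even index: pattern, odd index: replacement
  }
  return out;
}

console.log(applyPairs("Try it, e.g. now!", SAMPLE_PAIRS));
// -> "Try it ,  _eg_ now ! " (later passes collapse whitespace and restore "e.g.")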
2 changes: 0 additions & 2 deletions test/tokenizer-tests.js
@@ -1,5 +1,3 @@
- // const expect = require('chai').expect;
- // const RiTa = require('../src/rita_api');

describe('RiTa.Tokenizer', () => {

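A minimal shape for a spec in this suite, assuming the chai expect global and the RiTa entry point used elsewhere in these tests; the sentence and the round-trip assertion are illustrative only, not taken from this commit.

it('Should round-trip tokenize and untokenize', () => {
  const input = "The dog, Dr. Jones said, won't bite.";
  const tokens = RiTa.tokenize(input);       // "Dr." is kept intact via the abbreviation table
  expect(RiTa.untokenize(tokens)).eq(input); // untokenize restores spacing around punctuation
});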
