cleanup tokenizer
dhowe committed Oct 13, 2020
1 parent b590a0e commit 7ad5e92
Showing 3 changed files with 50 additions and 75 deletions.
1 change: 0 additions & 1 deletion issues/known-issues.js
@@ -115,7 +115,6 @@ describe('RiTa.KnownIssues', () => {
"chaises", "chaise",
"metroes", "metro",
"briefcases", "briefcase"
- ]
];

let res1, res2, res3, dbug = 0;
122 changes: 50 additions & 72 deletions src/tokenizer.js
@@ -41,110 +41,85 @@ class Tokenizer {
}

tokenize(words, regex) {

if (typeof words !== 'string') return [];

if (regex) return words.split(regex);

words = words.trim(); // ???
- for (let i = 0; i < TOKENIZE_REGEXS_A.length; i += 2){
- words = words.replace(TOKENIZE_REGEXS_A[i], TOKENIZE_REGEXS_A[i+1]);
+ for (let i = 0; i < TOKENIZE_RE.length; i += 2) {
+ words = words.replace(TOKENIZE_RE[i], TOKENIZE_RE[i + 1]);
}
- // words = words.replace(/([Ee])[.]([Gg])[.]/g, "_$1$2_"); // E.G.
- // words = words.replace(/([Ii])[.]([Ee])[.]/g, "_$1$2_"); // I.E.
- //
- // words = words.replace(/([\\?!\"\u201C\\.,;:@#$%&])/g, " $1 ");
- // words = words.replace(/\\.\\.\\./g, " ... ");
- // words = words.replace(/\\s+/g, ' ');
- // words = words.replace(/,([^0-9])/g, " , $1");
- // words = words.replace(/([^.])([.])([\])}>\"'’]*)\\s*$/g, "$1 $2$3 ");
- // words = words.replace(/([\[\](){}<>])/g, " $1 ");
- // words = words.replace(/--/g, " -- ");
- // words = words.replace(/$/g, ' ');
- // words = words.replace(/^/g, ' ');
- // words = words.replace(/([^'])' | '/g, "$1 ' ");
- // words = words.replace(/ \u2018/g, " \u2018 ");
- // words = words.replace(/'([SMD]) /g, " '$1 ");

if (this.RiTa.SPLIT_CONTRACTIONS) {
- for (let i = 0; i < TOKENIZE_REGEXS_B.length; i += 2){
- words = words.replace(TOKENIZE_REGEXS_B[i], TOKENIZE_REGEXS_B[i+1]);
+ for (let i = 0; i < CONTRACTIONS_RE.length; i += 2) {
+ words = words.replace(CONTRACTIONS_RE[i], CONTRACTIONS_RE[i + 1]);
}

- // words = words.replace(/([Cc])an['’]t/g, "$1an not");
- // words = words.replace(/([Dd])idn['’]t/g, "$1id not");
- // words = words.replace(/([CcWw])ouldn['’]t/g, "$1ould not");
- // words = words.replace(/([Ss])houldn['’]t/g, "$1hould not");
- // words = words.replace(/ ([Ii])t['’]s/g, " $1t is");
- // words = words.replace(/n['’]t /g, " not ");
- // words = words.replace(/['’]ve /g, " have ");
- // words = words.replace(/['’]re /g, " are ");
}

// "Nicole I. Kidman" gets tokenized as "Nicole I . Kidman"
// words = words.replace(/ ([A-Z]) \\./g, " $1. ");
// words = words.replace(/\\s+/g, ' ');
// words = words.replace(/^\\s+/g, '');
//
// words = words.replace(/_([Ee])([Gg])_/g, "$1.$2."); // E.G.
// words = words.replace(/_([Ii])([Ee])_/g, "$1.$2."); // I.E.

let result = words.trim().split(/\s+/);
- result.forEach((token, i) => {
- if (token.includes('_')){
+ result.forEach((token, i) => { // use filter?
+ if (token.includes('_')) {
result[i] = token.replace(/([a-zA-z]|[\.\,])_([a-zA-Z])/g, "$1 $2");
}
});

return result;
}

- untokenize(arr, delim) { // TODO: should be state machine
+ untokenize(arr, delim) { // so ugly

delim = delim || ' ';

let thisNBPunct, thisNAPunct, lastNBPunct, lastNAPunct, thisQuote, lastQuote, thisComma, isLast,
- lastComma, lastEndWithS, nextIsS, thisLBracket, thisRBracket, lastLBracket, lastRBracket, lastIsWWW, thisDomin, dbug = 0,
- nextNoSpace = false,
- afterQuote = false,
- withinQuote = arr.length && QUOTE_RE.test(arr[0]),
- result = arr[0] || '',
- midSentence = false;
+ lastComma, lastEndWithS, nextIsS, thisLBracket, thisRBracket, lastLBracket, lastRBracket,
+ lastIsWWW, thisDomin, dbug = 0, nextNoSpace = false, afterQuote = false, midSentence = false,
+ withinQuote = arr.length && QUOTE_RE.test(arr[0]), result = arr[0] || '';

for (let i = 1; i < arr.length; i++) {

if (!arr[i]) continue;

thisComma = arr[i] === ',';
- thisNBPunct = NO_SPACE_BEFORE_PUNCT_RE.test(arr[i]);//NB -> no space before the punctuation
+ thisNBPunct = NO_SPACE_BF_PUNCT_RE.test(arr[i]);//NB -> no space before the punctuation
thisNAPunct = NO_SPACE_AFTER_PUNCT_RE.test(arr[i]);//NA -> no space after the punctuation
thisQuote = QUOTE_RE.test(arr[i]);
thisLBracket = LEFT_BRACKETS_RE.test(arr[i]);//LBracket -> left bracket
thisRBracket = RIGHT_BRACKETS_RE.test(arr[i]);//RBracket -> right bracket
lastComma = arr[i - 1] === ',';
- lastNBPunct = NO_SPACE_BEFORE_PUNCT_RE.test(arr[i - 1]);//NB -> no space before
+ lastNBPunct = NO_SPACE_BF_PUNCT_RE.test(arr[i - 1]);//NB -> no space before
lastNAPunct = NO_SPACE_AFTER_PUNCT_RE.test(arr[i - 1]);//NA -> no space after
lastQuote = QUOTE_RE.test(arr[i - 1]);
lastLBracket = LEFT_BRACKETS_RE.test(arr[i - 1]);
lastRBracket = RIGHT_BRACKETS_RE.test(arr[i - 1]);
- lastEndWithS = (arr[i - 1].charAt(arr[i - 1].length - 1) === 's' && arr[i - 1]!= "is" && arr[i - 1] != "Is" && arr[i - 1] != "IS");
+ lastEndWithS = (arr[i - 1].charAt(arr[i - 1].length - 1) === 's'
+ && arr[i - 1] != "is" && arr[i - 1] != "Is" && arr[i - 1] != "IS");
lastIsWWW = WWW_RE.test(arr[i - 1]);
- thisDomin = DOMIN_RE.test(arr[i]);
+ thisDomin = DOMAIN_RE.test(arr[i]);
nextIsS = i == arr.length - 1 ? false : (arr[i + 1] === "s" || arr[i + 1] === "S");
isLast = (i == arr.length - 1);

- if ((arr[i - 1] === "." && thisDomin) || nextNoSpace){
+ if ((arr[i - 1] === "." && thisDomin) || nextNoSpace) {

nextNoSpace = false;
result += arr[i];
continue;
- } else if (arr[i] === "." && lastIsWWW){

+ } else if (arr[i] === "." && lastIsWWW) {

//console.log('yes');
nextNoSpace = true;

} else if (thisLBracket) {

result += delim;
- } else if (lastRBracket){
- if (!thisNBPunct && !thisLBracket){

+ } else if (lastRBracket) {

+ if (!thisNBPunct && !thisLBracket) {
result += delim;
}

} else if (thisQuote) {

if (withinQuote) {
@@ -172,15 +147,14 @@ class Tokenizer {
result += delim;
midSentence = false;

- } else if ((!thisNBPunct && !lastQuote && !lastNAPunct && !lastLBracket && !thisRBracket) || (!isLast && thisNBPunct && lastNBPunct && !lastNAPunct && !lastQuote && !lastLBracket && !thisRBracket)) {

+ } else if ((!thisNBPunct && !lastQuote && !lastNAPunct && !lastLBracket && !thisRBracket)
+ || (!isLast && thisNBPunct && lastNBPunct && !lastNAPunct
+ && !lastQuote && !lastLBracket && !thisRBracket)) {
result += delim;
}

result += arr[i]; // add to result

if (thisNBPunct && !lastNBPunct && !withinQuote && SQUOTE_RE.test(arr[i]) && lastEndWithS) {

result += delim; // ??
}
}
}
}

- const NO_SPACE_BEFORE_PUNCT_RE = /^[,\.\;\:\?\!\)""“”\u2019‘`'%…\u2103\^\*°\/⁄\-@]+$/;
- const QUOTE_RE = /^[""“”\u2019‘`''«»‘’]+$/;
const LEFT_BRACKETS_RE = /^[\[\(\{⟨]+$/;
const RIGHT_BRACKETS_RE = /^[\)\]\}⟩]+$/;
- const NO_SPACE_AFTER_PUNCT_RE = /^[\^\*\$\/⁄#\-@°]+$/;
+ const QUOTE_RE = /^[""“”\u2019‘`''«»‘’]+$/;
const SQUOTE_RE = /^[\u2019‘`']+$/;
const APOS_RE = /^[\u2019'’]+$/;
const LB_RE = /(\r?\n)+/g;
const WWW_RE = /^(www[0-9]?|WWW[0-9]?)$/;
- const DOMIN_RE = /^(com|org|edu|net|xyz|gov|int|eu|hk|tw|cn|de|ch|fr)$/;
- const TOKENIZE_REGEXS_A = [
- //save abbreviations-------
+ const NO_SPACE_AFTER_PUNCT_RE = /^[\^\*\$\/⁄#\-@°]+$/;
+ const DOMAIN_RE = /^(com|org|edu|net|xyz|gov|int|eu|hk|tw|cn|de|ch|fr)$/;
+ const NO_SPACE_BF_PUNCT_RE = /^[,\.\;\:\?\!\)""“”\u2019‘`'%…\u2103\^\*°\/⁄\-@]+$/;

+ const TOKENIZE_RE = [

+ // save abbreviations --------
/([Ee])[.]([Gg])[.]/g, "_$1$2_",//E.g
/([Ii])[.]([Ee])[.]/g, "_$1$2_",//i.e
/([Aa])[.]([Mm])[.]/g, "_$1$2_",//a.m.
/([Pp])[.]([Mm])[.]/g, "_$1$2_",//p.m.
/(Cap)[\.]/g, "_Cap_",//Cap.
/([Cc])[\.]/g, "_$1_",//c.
- /([Ee][Tt])[\s]([Aa][Ll])[\.]/,"_$1zzz$2_",// et al.
+ /([Ee][Tt])[\s]([Aa][Ll])[\.]/, "_$1zzz$2_",// et al.
/(etc|ETC)[\.]/g, "_$1_",//etc.
/([Pp])[\.]([Ss])[\.]/g, "_$1$2dot_", // p.s.
/([Pp])[\.]([Ss])/g, "_$1$2_", // p.s
@@ -217,13 +193,14 @@ const TOKENIZE_REGEXS_A = [
/([Mm])([Rr]|[Ss]|[Xx])[\.]/g, "_$1$2_", // Mr. Ms. and Mx.
/([Dd])([Rr])[\.]/g, "_$1$2_", // Dr.
/([Pp])([Ff])[\.]/g, "_$1$2_", // Pf.
- /([Ii])([Nn])([Dd]|[Cc])[\.]/g,"_$1$2$3_", // Ind. and Inc.
+ /([Ii])([Nn])([Dd]|[Cc])[\.]/g, "_$1$2$3_", // Ind. and Inc.
/([Cc])([Oo])[\.][\,][\s]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dcs$3$4$5_", // co., ltd.
/([Cc])([Oo])[\.][\s]([Ll])([Tt])([Dd])[\.]/g, "_$1$2ds$3$4$5_", // co. ltd.
/([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dc$3$4$5_", // co.,ltd.
/([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g, "_$1$2$3$4_",// Corp. and Co.
/([Ll])([Tt])([Dd])[\.]/g, "_$1$2$3_", // ltd.
/(prof|Prof|PROF)[\.]/g, "_$1_", //Prof.

//--------------------------
/\.\.\.\s/g, "_elipsisDDD_ ",
/([\?!\"\u201C\.,;:@#$%&])/g, " $1 ",
@@ -241,16 +218,17 @@ const TOKENIZE_REGEXS_A = [
/ ([A-Z]) \./g, " $1. ",
/^\s+/g, '',
/\^/g, " ^ ",
/°/g," ° ",
/°/g, " ° ",
/_elipsisDDD_/g, " ... ",

//pop abbreviations------------------
/_([Ee])([Gg])_/g, "$1.$2.",//Eg
/_([Ii])([Ee])_/g, "$1.$2.",//ie
/_([Aa])([Mm])_/g, "$1.$2.",//a.m.
/_([Pp])([Mm])_/g, "$1.$2.",//p.m.
/_Cap_/g, "Cap.",//Cap.
/_([Cc])_/g, "$1.",//c.
- /_([Ee][Tt])zzz([Aa][Ll])_/,"$1_$2.",// et al.
+ /_([Ee][Tt])zzz([Aa][Ll])_/, "$1_$2.",// et al.
/_(etc|ETC)_/g, "$1.",//etc.
/_([Pp])([Ss])dot_/g, "$1.$2.", // p.s.
/_([Pp])([Ss])_/g, "$1.$2",
@@ -260,24 +238,24 @@ const TOKENIZE_REGEXS_A = [
/_([Mm])([Rr]|[Ss]|[Xx])_/g, "$1$2.", // Mr. Ms. and Mx.
/_([Dd])([Rr])_/g, "$1$2.", // Dr.
/_([Pp])([Ff])_/g, "$1$2.", // Pf.
- /_([Ii])([Nn])([Dd]|[Cc])_/g,"$1$2$3.", // Ind. and Inc.
+ /_([Ii])([Nn])([Dd]|[Cc])_/g, "$1$2$3.", // Ind. and Inc.
/_([Cc])([Oo])([Rr]?)([Pp]?)_/g, "$1$2$3$4.",// Corp. and Co.
/_([Cc])([Oo])dc([Ll])([Tt])([Dd])_/g, "$1$2.,$3$4$5.", // co.,ltd.
/_([Ll])([Tt])([Dd])_/g, "$1$2$3.", // ltd.
/_([Cc])([Oo])dcs([Ll])([Tt])([Dd])_/g, "$1$2.,_$3$4$5.", // co., ltd.
/_([Cc])([Oo])ds([Ll])([Tt])([Dd])_/g, "$1$2._$3$4$5.", // co. ltd.
- /_(prof|PROF|Prof)_/g, "$1.", //Prof.

+ /_(prof|PROF|Prof)_/g, "$1." //Prof.
];
- const TOKENIZE_REGEXS_B = [

+ const CONTRACTIONS_RE = [
/([Cc])an['’]t/g, "$1an not",
/([Dd])idn['’]t/g, "$1id not",
/([CcWw])ouldn['’]t/g, "$1ould not",
/([Ss])houldn['’]t/g, "$1hould not",
/([Ii])t['’]s/g, " $1t is",
/n['’]t /g, " not ",
/['’]ve /g, " have ",
- /['’]re /g, " are ",
+ /['’]re /g, " are "
];

module && (module.exports = Tokenizer);
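
Note on the tables above: the renamed TOKENIZE_RE and CONTRACTIONS_RE are flat arrays of (pattern, replacement) pairs. Even indices hold a RegExp, odd indices the replacement string for String.replace, and the loops in tokenize() walk them in order. A minimal sketch of that pattern follows; the applyPairs helper and its two sample pairs are illustrative names, not part of this commit.

// Sketch of the pairwise regex-table pattern used by tokenize().
// `applyPairs` and SAMPLE_PAIRS are hypothetical, for illustration only.
const SAMPLE_PAIRS = [
  /([Ee])[.]([Gg])[.]/g, "_$1$2_", // protect "e.g." before punctuation is padded
  /([\?!,;:])/g, " $1 "            // pad remaining punctuation with spaces
];

function applyPairs(input, pairs) {
  let out = input;
  for (let i = 0; i < pairs.length; i += 2) {
    out = out.replace(pairs[i], pairs[i + 1]); // even index: pattern, odd index: replacement
  }
  return out;
}

console.log(applyPairs("Try it, e.g. now!", SAMPLE_PAIRS));
// -> "Try it ,  _eg_ now ! " (later passes collapse whitespace and restore "e.g.")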
2 changes: 0 additions & 2 deletions test/tokenizer-tests.js
@@ -1,5 +1,3 @@
- // const expect = require('chai').expect;
- // const RiTa = require('../src/rita_api');

describe('RiTa.Tokenizer', () => {

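A minimal shape for a spec in this suite, assuming the chai expect global and the RiTa entry point used elsewhere in these tests; the sentence and the round-trip assertion are illustrative only, not taken from this commit.

it('Should round-trip tokenize and untokenize', () => {
  const input = "The dog, Dr. Jones said, won't bite.";
  const tokens = RiTa.tokenize(input);       // "Dr." is kept intact via the abbreviation table
  expect(RiTa.untokenize(tokens)).eq(input); // untokenize restores spacing around punctuation
});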
