feat: improve the matching algorithm when there are competing fixing choices
codsen · Mar 24, 2021 · ce43906 · ce43906
1 parent 20859d5
commit ce43906
Show file tree

Hide file tree

Showing 12 changed files with 353 additions and 68 deletions.
diff --git a/packages/string-fix-broken-named-entities/coverage/coverage-summary.json b/packages/string-fix-broken-named-entities/coverage/coverage-summary.json
@@ -1 +1 @@
-{"total":{"lines":{"total":230,"covered":230,"skipped":0,"pct":100},"statements":{"total":241,"covered":241,"skipped":0,"pct":100},"functions":{"total":26,"covered":26,"skipped":0,"pct":100},"branches":{"total":323,"covered":323,"skipped":0,"pct":100}}}
+{"total":{"lines":{"total":246,"covered":246,"skipped":0,"pct":100},"statements":{"total":260,"covered":260,"skipped":0,"pct":100},"functions":{"total":29,"covered":29,"skipped":0,"pct":100},"branches":{"total":335,"covered":334,"skipped":0,"pct":99.7}}}
diff --git a/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.cjs.js b/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.cjs.js
@@ -343,7 +343,7 @@ function fixEnt(str, originalOpts) {
               }).join("");
               if (potentialEntityOnlyNonWhitespaceChars.length <= allNamedHtmlEntities.maxLength && allNamedHtmlEntities.allNamedEntitiesSetOnlyCaseInsensitive.has(potentialEntityOnlyNonWhitespaceChars.toLowerCase())) {
                 if (
-                !allNamedHtmlEntities.allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
+                typeof potentialEntityOnlyNonWhitespaceChars === "string" && !allNamedHtmlEntities.allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
                   var matchingEntitiesOfCorrectCaseArr = [].concat(allNamedHtmlEntities.allNamedEntitiesSetOnly).filter(function (ent) {
                     return ent.toLowerCase() === potentialEntityOnlyNonWhitespaceChars.toLowerCase();
                   });
@@ -446,6 +446,37 @@ function fixEnt(str, originalOpts) {
                     rangeValDecoded: allNamedHtmlEntities.decode("&" + _tempEnt2 + ";")
                   });
                   pingAmps(whatsOnTheLeft, i);
+                } else if (temp) {
+                  var missingLettersCount = temp.map(function (ent) {
+                    var splitStr = str.split("");
+                    return ent.split("").reduce(function (acc, curr) {
+                      if (splitStr.includes(curr)) {
+                        splitStr.splice(splitStr.indexOf(curr), 1);
+                        return acc + 1;
+                      }
+                      return acc;
+                    }, 0);
+                  });
+                  var maxVal = Math.max.apply(Math, missingLettersCount);
+                  if (maxVal && missingLettersCount.filter(function (v) {
+                    return v === maxVal;
+                  }).length === 1) {
+                    for (var z = 0, _len = missingLettersCount.length; z < _len; z++) {
+                      if (missingLettersCount[z] === maxVal) {
+                        _tempEnt2 = temp[z];
+                        rangesArr2.push({
+                          ruleName: "bad-html-entity-malformed-" + _tempEnt2,
+                          entityName: _tempEnt2,
+                          rangeFrom: whatsOnTheLeft,
+                          rangeTo: i + 1,
+                          rangeValEncoded: "&" + _tempEnt2 + ";",
+                          rangeValDecoded: allNamedHtmlEntities.decode("&" + _tempEnt2 + ";")
+                        });
+                        pingAmps(whatsOnTheLeft, i);
+                        break;
+                      }
+                    }
+                  }
                 }
               }
               if (!_tempEnt2) {

diff --git a/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.dev.umd.js b/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.dev.umd.js
@@ -10374,7 +10374,7 @@ function fixEnt(str, originalOpts) { //
               if (potentialEntityOnlyNonWhitespaceChars.length <= maxLength && allNamedEntitiesSetOnlyCaseInsensitive.has(potentialEntityOnlyNonWhitespaceChars.toLowerCase())) {
 
                 if ( // first, check is the letter case allright
-                !allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
+                typeof potentialEntityOnlyNonWhitespaceChars === "string" && !allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
                   var matchingEntitiesOfCorrectCaseArr = [].concat(allNamedEntitiesSetOnly).filter(function (ent) {
                     return ent.toLowerCase() === potentialEntityOnlyNonWhitespaceChars.toLowerCase();
                   });
@@ -10495,6 +10495,48 @@ function fixEnt(str, originalOpts) { //
                     rangeValDecoded: decode("&" + _tempEnt2 + ";")
                   });
                   pingAmps(whatsOnTheLeft, i);
+                } else if (temp) {
+                  // For example, the malformed &rsqo; could be matched, by
+                  // Levenshtein distance, to either &rsqb; or &rsquo;.
+                  // As a last resort, count how many letters of each
+                  // candidate entity can be found in the input string.
+                  var missingLettersCount = temp.map(function (ent) {
+                    var splitStr = str.split("");
+                    return ent.split("").reduce(function (acc, curr) {
+                      if (splitStr.includes(curr)) {
+                        // remove that character from splitStr
+                        // so that we count only once, repetitions need to
+                        // be matched equally
+                        splitStr.splice(splitStr.indexOf(curr), 1);
+                        return acc + 1;
+                      }
+
+                      return acc;
+                    }, 0);
+                  });
+                  var maxVal = Math.max.apply(Math, missingLettersCount); // if there's only one value with more characters matched
+                  // than others, &rsqb; vs &rsquo; - latter would win matching
+                  // against messed up &rsqo; - we pick that winning-one
+
+                  if (maxVal && missingLettersCount.filter(function (v) {
+                    return v === maxVal;
+                  }).length === 1) {
+                    for (var z = 0, _len = missingLettersCount.length; z < _len; z++) {
+                      if (missingLettersCount[z] === maxVal) {
+                        _tempEnt2 = temp[z];
+                        rangesArr2.push({
+                          ruleName: "bad-html-entity-malformed-" + _tempEnt2,
+                          entityName: _tempEnt2,
+                          rangeFrom: whatsOnTheLeft,
+                          rangeTo: i + 1,
+                          rangeValEncoded: "&" + _tempEnt2 + ";",
+                          rangeValDecoded: decode("&" + _tempEnt2 + ";")
+                        });
+                        pingAmps(whatsOnTheLeft, i);
+                        break;
+                      }
+                    }
+                  }
                 }
               } // if "tempEnt" was not set by now, it is not a known HTML entity
 

diff --git a/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.esm.js b/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.esm.js
@@ -322,7 +322,7 @@ function fixEnt(str, originalOpts) {
               const potentialEntityOnlyNonWhitespaceChars = Array.from(potentialEntity).filter(char => char.trim().length).join("");
               if (potentialEntityOnlyNonWhitespaceChars.length <= maxLength && allNamedEntitiesSetOnlyCaseInsensitive.has(potentialEntityOnlyNonWhitespaceChars.toLowerCase())) {
                 if (
-                !allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
+                typeof potentialEntityOnlyNonWhitespaceChars === "string" && !allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
                   const matchingEntitiesOfCorrectCaseArr = [...allNamedEntitiesSetOnly].filter(ent => ent.toLowerCase() === potentialEntityOnlyNonWhitespaceChars.toLowerCase());
                   if (matchingEntitiesOfCorrectCaseArr.length === 1) {
                     rangesArr2.push({
@@ -417,6 +417,35 @@ function fixEnt(str, originalOpts) {
                     rangeValDecoded: decode(`&${tempEnt};`)
                   });
                   pingAmps(whatsOnTheLeft, i);
+                } else if (temp) {
+                  const missingLettersCount = temp.map(ent => {
+                    const splitStr = str.split("");
+                    return ent.split("").reduce((acc, curr) => {
+                      if (splitStr.includes(curr)) {
+                        splitStr.splice(splitStr.indexOf(curr), 1);
+                        return acc + 1;
+                      }
+                      return acc;
+                    }, 0);
+                  });
+                  const maxVal = Math.max(...missingLettersCount);
+                  if (maxVal && missingLettersCount.filter(v => v === maxVal).length === 1) {
+                    for (let z = 0, len = missingLettersCount.length; z < len; z++) {
+                      if (missingLettersCount[z] === maxVal) {
+                        tempEnt = temp[z];
+                        rangesArr2.push({
+                          ruleName: `bad-html-entity-malformed-${tempEnt}`,
+                          entityName: tempEnt,
+                          rangeFrom: whatsOnTheLeft,
+                          rangeTo: i + 1,
+                          rangeValEncoded: `&${tempEnt};`,
+                          rangeValDecoded: decode(`&${tempEnt};`)
+                        });
+                        pingAmps(whatsOnTheLeft, i);
+                        break;
+                      }
+                    }
+                  }
                 }
               }
               if (!tempEnt) {

diff --git a/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.mjs b/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.mjs
diff --git a/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.umd.js b/packages/string-fix-broken-named-entities/dist/string-fix-broken-named-entities.umd.js
diff --git a/packages/string-fix-broken-named-entities/examples/api.json b/packages/string-fix-broken-named-entities/examples/api.json
@@ -1 +1 @@
-{"_quickTake.js":{"title":"Quick Take","content":"import &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&nsp;x&nsp;y&nsp;\";\n\n// returns Ranges notation, see codsen.com/ranges/\nassert.deepEqual(fixEnt(source), [\n  [0, 5, \"&nbsp;\"],\n  [6, 11, \"&nbsp;\"],\n  [12, 17, \"&nbsp;\"],\n]);\n\n// render result from ranges using \"ranges-apply\":\nassert.equal(rApply(source, fixEnt(source)), \"&nbsp;x&nbsp;y&nbsp;\");"},"sift-raw-ampersands-from-entities.js":{"title":"Sift raw ampersands in a string from broken character references","content":"// encode those raw ampersands and fix broken character references\n\nimport &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&&nsp;&&nsp;&\";\n\nconst finalRanges = [];\nconst indexesOfRawAmpersands = [];\n\n// fixEnt() returns Ranges (see codsen.com/ranges/)\nconst resultRanges = fixEnt(source, &#x7B;\n  textAmpersandCatcherCb: (idx) => indexesOfRawAmpersands.push(idx),\n&#x7D;);\n\n// check the ranges - all broken NBSP's were fixed:\nassert.deepEqual(resultRanges, [\n  [1, 6, \"&nbsp;\"],\n  [7, 12, \"&nbsp;\"],\n]);\n\n// don't apply the ranges yet, dump them into the \"finalRanges\" array\n// it's because applying them onto a string,\n// rApply(source, resultRanges);\n// will mess up the index positions, we'll need to calculate again.\n// The whole point of Ranges is they're COMPOSABLE.\n\nresultRanges.forEach((range) => &#x7B;\n  finalRanges.push(range);\n&#x7D;);\n\n// check what's been gathered so far:\nassert.deepEqual(resultRanges, [\n  [1, 6, \"&nbsp;\"],\n  [7, 12, \"&nbsp;\"],\n]);\n\n// check the positions of reported raw ampersands:\nassert.deepEqual(indexesOfRawAmpersands, [0, 6, 12]);\n\n// replace each character at these 
positions: 0, 7 and 14\n// with string \"&amp;\" - in terms of Ranges, it's a matter\n// of building a Ranges array:\nconst replacementRanges = indexesOfRawAmpersands.map((idx) => [\n  idx,\n  idx + 1,\n  \"&amp;\",\n]);\n// this is Ranges notation, array of arrays: [from index, to index, what-to-replace]\nassert.deepEqual(replacementRanges, [\n  [0, 1, \"&amp;\"], // we're saying, replace indexes from 0 to 1 with &amp;\n  [6, 7, \"&amp;\"],\n  [12, 13, \"&amp;\"],\n]);\n\n// push them into resultRanges as well:\nreplacementRanges.forEach((range) => &#x7B;\n  resultRanges.push(range);\n&#x7D;);\n\n// check what's been gathered so far:\nassert.deepEqual(resultRanges, [\n  [1, 6, \"&nbsp;\"],\n  [7, 12, \"&nbsp;\"],\n  [0, 1, \"&amp;\"],\n  [6, 7, \"&amp;\"],\n  [12, 13, \"&amp;\"],\n]);\n\n// apply Ranges onto a string - all amendments at once!\nconst finalResultStr = rApply(source, resultRanges);\n\n// check result\nassert.equal(finalResultStr, \"&amp;&nbsp;&amp;&nbsp;&amp;\");\n\n// Voilà! We fixed broken entities and encoded raw ampersands"}}
+{"_quickTake.js":{"title":"Quick Take","content":"import &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&nsp;x&nsp;y&nsp;\";\n\n// returns Ranges notation, see codsen.com/ranges/\nassert.deepEqual(fixEnt(source), [\n  [0, 5, \"&nbsp;\"],\n  [6, 11, \"&nbsp;\"],\n  [12, 17, \"&nbsp;\"],\n]);\n\n// render result from ranges using \"ranges-apply\":\nassert.equal(rApply(source, fixEnt(source)), \"&nbsp;x&nbsp;y&nbsp;\");"},"sift-raw-ampersands-from-entities.js":{"title":"Sift raw ampersands in a string from broken character references","content":"// encode those raw ampersands and fix broken character references\n\nimport &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&&nsp;&&nsp;&\";\n\nconst finalRanges = [];\nconst indexesOfRawAmpersands = [];\n\n// fixEnt() returns Ranges (see codsen.com/ranges/)\nconst resultRanges = fixEnt(source, &#x7B;\n  textAmpersandCatcherCb: (idx) => indexesOfRawAmpersands.push(idx),\n&#x7D;);\n\n// check the ranges - all broken NBSP's were fixed:\nassert.deepEqual(resultRanges, [\n  [1, 6, \"&nbsp;\"],\n  [7, 12, \"&nbsp;\"],\n]);\n\n// don't apply the ranges yet, dump them into the \"finalRanges\" array\n// it's because applying them onto a string,\n// rApply(source, resultRanges);\n// will mess up the index positions, we'll need to calculate again.\n// The whole point of Ranges is they're COMPOSABLE.\n\nresultRanges.forEach((range) => &#x7B;\n  finalRanges.push(range);\n&#x7D;);\n\n// check the positions of reported raw ampersands:\nassert.deepEqual(indexesOfRawAmpersands, [0, 6, 12]);\n\n// replace each character at these positions: 0, 6 and 12\n// with string \"&amp;\" - in terms of Ranges, it's a matter\n// of building a Ranges array:\nconst 
replacementRanges = indexesOfRawAmpersands.map((idx) => [\n  idx,\n  idx + 1,\n  \"&amp;\",\n]);\n// this is Ranges notation, array of arrays: [from index, to index, what-to-replace]\nassert.deepEqual(replacementRanges, [\n  [0, 1, \"&amp;\"], // we're saying, replace indexes from 0 to 1 with &amp;\n  [6, 7, \"&amp;\"],\n  [12, 13, \"&amp;\"],\n]);\n\n// push them into resultRanges as well:\nreplacementRanges.forEach((range) => &#x7B;\n  resultRanges.push(range);\n&#x7D;);\n\n// check what's been gathered so far:\nassert.deepEqual(resultRanges, [\n  [1, 6, \"&nbsp;\"],\n  [7, 12, \"&nbsp;\"],\n  [0, 1, \"&amp;\"],\n  [6, 7, \"&amp;\"],\n  [12, 13, \"&amp;\"],\n]);\n\n// apply Ranges onto a string - all amendments at once!\nconst finalResultStr = rApply(source, resultRanges);\n\n// check result\nassert.equal(finalResultStr, \"&amp;&nbsp;&amp;&nbsp;&amp;\");\n\n// Voilà! We fixed broken entities and encoded raw ampersands"}}
diff --git a/packages/string-fix-broken-named-entities/package.json b/packages/string-fix-broken-named-entities/package.json
@@ -124,4 +124,4 @@
     "tslib": "^2.1.0",
     "typescript": "^4.2.3"
   }
-}
+}
diff --git a/packages/string-fix-broken-named-entities/perf/historical.json b/packages/string-fix-broken-named-entities/perf/historical.json
@@ -50,6 +50,7 @@
     "5.0.4": 13082.545513315305,
     "5.0.6": 76307.2180750803,
     "5.0.8": 89346.35423707803,
-    "lastPublished": 89346.35423707803,
-    "lastRan": 89346.35423707803
+    "5.1.0": 91621.61148895523,
+    "lastPublished": 91621.61148895523,
+    "lastRan": 91621.61148895523
 }