Skip to content

Commit

Permalink
feat: improve the matching algorithm when there are competing fixing …
Browse files Browse the repository at this point in the history
…choices
  • Loading branch information
revelt committed Mar 24, 2021
1 parent 20859d5 commit ce43906
Show file tree
Hide file tree
Showing 12 changed files with 353 additions and 68 deletions.
@@ -1 +1 @@
{"total":{"lines":{"total":230,"covered":230,"skipped":0,"pct":100},"statements":{"total":241,"covered":241,"skipped":0,"pct":100},"functions":{"total":26,"covered":26,"skipped":0,"pct":100},"branches":{"total":323,"covered":323,"skipped":0,"pct":100}}}
{"total":{"lines":{"total":246,"covered":246,"skipped":0,"pct":100},"statements":{"total":260,"covered":260,"skipped":0,"pct":100},"functions":{"total":29,"covered":29,"skipped":0,"pct":100},"branches":{"total":335,"covered":334,"skipped":0,"pct":99.7}}}
Expand Up @@ -343,7 +343,7 @@ function fixEnt(str, originalOpts) {
}).join("");
if (potentialEntityOnlyNonWhitespaceChars.length <= allNamedHtmlEntities.maxLength && allNamedHtmlEntities.allNamedEntitiesSetOnlyCaseInsensitive.has(potentialEntityOnlyNonWhitespaceChars.toLowerCase())) {
if (
!allNamedHtmlEntities.allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
typeof potentialEntityOnlyNonWhitespaceChars === "string" && !allNamedHtmlEntities.allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
var matchingEntitiesOfCorrectCaseArr = [].concat(allNamedHtmlEntities.allNamedEntitiesSetOnly).filter(function (ent) {
return ent.toLowerCase() === potentialEntityOnlyNonWhitespaceChars.toLowerCase();
});
Expand Down Expand Up @@ -446,6 +446,37 @@ function fixEnt(str, originalOpts) {
rangeValDecoded: allNamedHtmlEntities.decode("&" + _tempEnt2 + ";")
});
pingAmps(whatsOnTheLeft, i);
} else if (temp) {
var missingLettersCount = temp.map(function (ent) {
var splitStr = str.split("");
return ent.split("").reduce(function (acc, curr) {
if (splitStr.includes(curr)) {
splitStr.splice(splitStr.indexOf(curr), 1);
return acc + 1;
}
return acc;
}, 0);
});
var maxVal = Math.max.apply(Math, missingLettersCount);
if (maxVal && missingLettersCount.filter(function (v) {
return v === maxVal;
}).length === 1) {
for (var z = 0, _len = missingLettersCount.length; z < _len; z++) {
if (missingLettersCount[z] === maxVal) {
_tempEnt2 = temp[z];
rangesArr2.push({
ruleName: "bad-html-entity-malformed-" + _tempEnt2,
entityName: _tempEnt2,
rangeFrom: whatsOnTheLeft,
rangeTo: i + 1,
rangeValEncoded: "&" + _tempEnt2 + ";",
rangeValDecoded: allNamedHtmlEntities.decode("&" + _tempEnt2 + ";")
});
pingAmps(whatsOnTheLeft, i);
break;
}
}
}
}
}
if (!_tempEnt2) {
Expand Down
Expand Up @@ -10374,7 +10374,7 @@ function fixEnt(str, originalOpts) { //
if (potentialEntityOnlyNonWhitespaceChars.length <= maxLength && allNamedEntitiesSetOnlyCaseInsensitive.has(potentialEntityOnlyNonWhitespaceChars.toLowerCase())) {

if ( // first, check whether the letter case is correct
!allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
typeof potentialEntityOnlyNonWhitespaceChars === "string" && !allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
var matchingEntitiesOfCorrectCaseArr = [].concat(allNamedEntitiesSetOnly).filter(function (ent) {
return ent.toLowerCase() === potentialEntityOnlyNonWhitespaceChars.toLowerCase();
});
Expand Down Expand Up @@ -10495,6 +10495,48 @@ function fixEnt(str, originalOpts) { //
rangeValDecoded: decode("&" + _tempEnt2 + ";")
});
pingAmps(whatsOnTheLeft, i);
} else if (temp) {
// For example, the malformed &rsqo; could be matched, by
// Levenshtein distance, to both &rsqb; and &rsquo;.
// As a last resort, for each candidate entity count how many
// of its letters are present in the input string.
var missingLettersCount = temp.map(function (ent) {
var splitStr = str.split("");
return ent.split("").reduce(function (acc, curr) {
if (splitStr.includes(curr)) {
// remove that character from splitStr
// so that we count only once, repetitions need to
// be matched equally
splitStr.splice(splitStr.indexOf(curr), 1);
return acc + 1;
}

return acc;
}, 0);
});
var maxVal = Math.max.apply(Math, missingLettersCount); // if there's only one value with more characters matched
// than others, &rsqb; vs &rsquo; - latter would win matching
// against messed up &rsqo; - we pick that winning-one

if (maxVal && missingLettersCount.filter(function (v) {
return v === maxVal;
}).length === 1) {
for (var z = 0, _len = missingLettersCount.length; z < _len; z++) {
if (missingLettersCount[z] === maxVal) {
_tempEnt2 = temp[z];
rangesArr2.push({
ruleName: "bad-html-entity-malformed-" + _tempEnt2,
entityName: _tempEnt2,
rangeFrom: whatsOnTheLeft,
rangeTo: i + 1,
rangeValEncoded: "&" + _tempEnt2 + ";",
rangeValDecoded: decode("&" + _tempEnt2 + ";")
});
pingAmps(whatsOnTheLeft, i);
break;
}
}
}
}
} // if "tempEnt" was not set by now, it is not a known HTML entity

Expand Down
Expand Up @@ -322,7 +322,7 @@ function fixEnt(str, originalOpts) {
const potentialEntityOnlyNonWhitespaceChars = Array.from(potentialEntity).filter(char => char.trim().length).join("");
if (potentialEntityOnlyNonWhitespaceChars.length <= maxLength && allNamedEntitiesSetOnlyCaseInsensitive.has(potentialEntityOnlyNonWhitespaceChars.toLowerCase())) {
if (
!allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
typeof potentialEntityOnlyNonWhitespaceChars === "string" && !allNamedEntitiesSetOnly.has(potentialEntityOnlyNonWhitespaceChars)) {
const matchingEntitiesOfCorrectCaseArr = [...allNamedEntitiesSetOnly].filter(ent => ent.toLowerCase() === potentialEntityOnlyNonWhitespaceChars.toLowerCase());
if (matchingEntitiesOfCorrectCaseArr.length === 1) {
rangesArr2.push({
Expand Down Expand Up @@ -417,6 +417,35 @@ function fixEnt(str, originalOpts) {
rangeValDecoded: decode(`&${tempEnt};`)
});
pingAmps(whatsOnTheLeft, i);
} else if (temp) {
const missingLettersCount = temp.map(ent => {
const splitStr = str.split("");
return ent.split("").reduce((acc, curr) => {
if (splitStr.includes(curr)) {
splitStr.splice(splitStr.indexOf(curr), 1);
return acc + 1;
}
return acc;
}, 0);
});
const maxVal = Math.max(...missingLettersCount);
if (maxVal && missingLettersCount.filter(v => v === maxVal).length === 1) {
for (let z = 0, len = missingLettersCount.length; z < len; z++) {
if (missingLettersCount[z] === maxVal) {
tempEnt = temp[z];
rangesArr2.push({
ruleName: `bad-html-entity-malformed-${tempEnt}`,
entityName: tempEnt,
rangeFrom: whatsOnTheLeft,
rangeTo: i + 1,
rangeValEncoded: `&${tempEnt};`,
rangeValDecoded: decode(`&${tempEnt};`)
});
pingAmps(whatsOnTheLeft, i);
break;
}
}
}
}
}
if (!tempEnt) {
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -1 +1 @@
{"_quickTake.js":{"title":"Quick Take","content":"import &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&nsp;x&nsp;y&nsp;\";\n\n// returns Ranges notation, see codsen.com/ranges/\nassert.deepEqual(fixEnt(source), [\n [0, 5, \"&nbsp;\"],\n [6, 11, \"&nbsp;\"],\n [12, 17, \"&nbsp;\"],\n]);\n\n// render result from ranges using \"ranges-apply\":\nassert.equal(rApply(source, fixEnt(source)), \"&nbsp;x&nbsp;y&nbsp;\");"},"sift-raw-ampersands-from-entities.js":{"title":"Sift raw ampersands in a string from broken character references","content":"// encode those raw ampersands and fix broken character references\n\nimport &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&&nsp;&&nsp;&\";\n\nconst finalRanges = [];\nconst indexesOfRawAmpersands = [];\n\n// fixEnt() returns Ranges (see codsen.com/ranges/)\nconst resultRanges = fixEnt(source, &#x7B;\n textAmpersandCatcherCb: (idx) => indexesOfRawAmpersands.push(idx),\n&#x7D;);\n\n// check the ranges - all broken NBSP's were fixed:\nassert.deepEqual(resultRanges, [\n [1, 6, \"&nbsp;\"],\n [7, 12, \"&nbsp;\"],\n]);\n\n// don't apply the ranges yet, dump them into the \"finalRanges\" array\n// it's because applying them onto a string,\n// rApply(source, resultRanges);\n// will mess up the index positions, we'll need to calculate again.\n// The whole point of Ranges is they're COMPOSABLE.\n\nresultRanges.forEach((range) => &#x7B;\n finalRanges.push(range);\n&#x7D;);\n\n// check what's been gathered so far:\nassert.deepEqual(resultRanges, [\n [1, 6, \"&nbsp;\"],\n [7, 12, \"&nbsp;\"],\n]);\n\n// check the positions of reported raw ampersands:\nassert.deepEqual(indexesOfRawAmpersands, [0, 6, 12]);\n\n// replace each character at these positions: 0, 
7 and 14\n// with string \"&amp;\" - in terms of Ranges, it's a matter\n// of building a Ranges array:\nconst replacementRanges = indexesOfRawAmpersands.map((idx) => [\n idx,\n idx + 1,\n \"&amp;\",\n]);\n// this is Ranges notation, array of arrays: [from index, to index, what-to-replace]\nassert.deepEqual(replacementRanges, [\n [0, 1, \"&amp;\"], // we're saying, replace indexes from 0 to 1 with &amp;\n [6, 7, \"&amp;\"],\n [12, 13, \"&amp;\"],\n]);\n\n// push them into resultRanges as well:\nreplacementRanges.forEach((range) => &#x7B;\n resultRanges.push(range);\n&#x7D;);\n\n// check what's been gathered so far:\nassert.deepEqual(resultRanges, [\n [1, 6, \"&nbsp;\"],\n [7, 12, \"&nbsp;\"],\n [0, 1, \"&amp;\"],\n [6, 7, \"&amp;\"],\n [12, 13, \"&amp;\"],\n]);\n\n// apply Ranges onto a string - all amendments at once!\nconst finalResultStr = rApply(source, resultRanges);\n\n// check result\nassert.equal(finalResultStr, \"&amp;&nbsp;&amp;&nbsp;&amp;\");\n\n// Voilà! We fixed broken entities and encoded raw ampersands"}}
{"_quickTake.js":{"title":"Quick Take","content":"import &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&nsp;x&nsp;y&nsp;\";\n\n// returns Ranges notation, see codsen.com/ranges/\nassert.deepEqual(fixEnt(source), [\n [0, 5, \"&nbsp;\"],\n [6, 11, \"&nbsp;\"],\n [12, 17, \"&nbsp;\"],\n]);\n\n// render result from ranges using \"ranges-apply\":\nassert.equal(rApply(source, fixEnt(source)), \"&nbsp;x&nbsp;y&nbsp;\");"},"sift-raw-ampersands-from-entities.js":{"title":"Sift raw ampersands in a string from broken character references","content":"// encode those raw ampersands and fix broken character references\n\nimport &#x7B; strict as assert &#x7D; from \"assert\";\nimport &#x7B; fixEnt &#x7D; from \"string-fix-broken-named-entities\";\nimport &#x7B; rApply &#x7D; from \"ranges-apply\";\n\nconst source = \"&&nsp;&&nsp;&\";\n\nconst finalRanges = [];\nconst indexesOfRawAmpersands = [];\n\n// fixEnt() returns Ranges (see codsen.com/ranges/)\nconst resultRanges = fixEnt(source, &#x7B;\n textAmpersandCatcherCb: (idx) => indexesOfRawAmpersands.push(idx),\n&#x7D;);\n\n// check the ranges - all broken NBSP's were fixed:\nassert.deepEqual(resultRanges, [\n [1, 6, \"&nbsp;\"],\n [7, 12, \"&nbsp;\"],\n]);\n\n// don't apply the ranges yet, dump them into the \"finalRanges\" array\n// it's because applying them onto a string,\n// rApply(source, resultRanges);\n// will mess up the index positions, we'll need to calculate again.\n// The whole point of Ranges is they're COMPOSABLE.\n\nresultRanges.forEach((range) => &#x7B;\n finalRanges.push(range);\n&#x7D;);\n\n// check the positions of reported raw ampersands:\nassert.deepEqual(indexesOfRawAmpersands, [0, 6, 12]);\n\n// replace each character at these positions: 0, 6 and 12\n// with string \"&amp;\" - in terms of Ranges, it's a matter\n// of building a Ranges array:\nconst 
replacementRanges = indexesOfRawAmpersands.map((idx) => [\n idx,\n idx + 1,\n \"&amp;\",\n]);\n// this is Ranges notation, array of arrays: [from index, to index, what-to-replace]\nassert.deepEqual(replacementRanges, [\n [0, 1, \"&amp;\"], // we're saying, replace indexes from 0 to 1 with &amp;\n [6, 7, \"&amp;\"],\n [12, 13, \"&amp;\"],\n]);\n\n// push them into resultRanges as well:\nreplacementRanges.forEach((range) => &#x7B;\n resultRanges.push(range);\n&#x7D;);\n\n// check what's been gathered so far:\nassert.deepEqual(resultRanges, [\n [1, 6, \"&nbsp;\"],\n [7, 12, \"&nbsp;\"],\n [0, 1, \"&amp;\"],\n [6, 7, \"&amp;\"],\n [12, 13, \"&amp;\"],\n]);\n\n// apply Ranges onto a string - all amendments at once!\nconst finalResultStr = rApply(source, resultRanges);\n\n// check result\nassert.equal(finalResultStr, \"&amp;&nbsp;&amp;&nbsp;&amp;\");\n\n// Voilà! We fixed broken entities and encoded raw ampersands"}}
2 changes: 1 addition & 1 deletion packages/string-fix-broken-named-entities/package.json
Expand Up @@ -124,4 +124,4 @@
"tslib": "^2.1.0",
"typescript": "^4.2.3"
}
}
}
Expand Up @@ -50,6 +50,7 @@
"5.0.4": 13082.545513315305,
"5.0.6": 76307.2180750803,
"5.0.8": 89346.35423707803,
"lastPublished": 89346.35423707803,
"lastRan": 89346.35423707803
"5.1.0": 91621.61148895523,
"lastPublished": 91621.61148895523,
"lastRan": 91621.61148895523
}

0 comments on commit ce43906

Please sign in to comment.