Skip to content

Commit

Permalink
fix: assure fuzzyMatch does not steal units that could have been matc…
Browse files Browse the repository at this point in the history
…hed better
  • Loading branch information
daniel-sc committed May 5, 2022
1 parent 5ee663d commit 95ae654
Show file tree
Hide file tree
Showing 2 changed files with 225 additions and 21 deletions.
187 changes: 187 additions & 0 deletions __tests__/merge.test.ts
Expand Up @@ -45,6 +45,70 @@ describe('merge', () => {
'</xliff>'));
});

test('should keep sorting', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID4" datatype="html">\n' +
' <source>new4</source>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID3" datatype="html">\n' +
' <source>source val3</source>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID2" datatype="html">\n' +
' <source>source val2</source>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val</source>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';
const destFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val</source>\n' +
' <target state="new">target val</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="changed-id-2" datatype="html">\n' +
' <source>source val2</source>\n' +
' <target state="new">target val2</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID3" datatype="html">\n' +
' <source>source val3</source>\n' +
' <target state="new">target val3</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';

const result = merge(sourceFileContent, destFileContent);

expect(norm(result)).toEqual(norm('<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val</source>\n' +
' <target state="new">target val</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID2" datatype="html">\n' +
' <source>source val2</source>\n' +
' <target state="new">target val2</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID3" datatype="html">\n' +
' <source>source val3</source>\n' +
' <target state="new">target val3</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID4" datatype="html">\n' +
' <source>new4</source>\n' +
' <target state="new">new4</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>'));
});

test('should remove obsolete node', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
Expand Down Expand Up @@ -153,6 +217,94 @@ describe('merge', () => {
'</xliff>'));
});

test('should handle changed sorting and changed IDs', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID2" datatype="html">\n' +
' <source>2source val that is long enough</source>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';
const destFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="random-id-2" datatype="html">\n' +
' <source>b2source val that is long enough</source>\n' +
' <target>2target val</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="random-id-1" datatype="html">\n' +
' <source>bsource val that is long enough</source>\n' +
' <target>target val</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';

const result = merge(sourceFileContent, destFileContent);

expect(norm(result)).toEqual(norm('<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID2" datatype="html">\n' +
' <source>2source val that is long enough</source>\n' +
' <target state="new">2target val</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' <target state="new">target val</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>'));
});

test('should handle new IDs and worse match before better match', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="new-ID2" datatype="html">\n' +
' <source>other source val that is long enough</source>\n' +
' </trans-unit>\n' +
' <trans-unit id="new-ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';
const destFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' <target>target val</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';

const result = merge(sourceFileContent, destFileContent);

expect(norm(result)).toEqual(norm('<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="new-ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' <target state="new">target val</target>\n' +
' </trans-unit>\n' +
' <trans-unit id="new-ID2" datatype="html">\n' +
' <source>other source val that is long enough</source>\n' +
' <target state="new">other source val that is long enough</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>'));
});

test('should fuzzy match changed node', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
Expand Down Expand Up @@ -188,6 +340,41 @@ describe('merge', () => {
'</xliff>'));
});

test('should not fuzzy match changed node if fuzzyMatch=false', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';
const destFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="random-id" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' <target>target val</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>';

const result = merge(sourceFileContent, destFileContent, {fuzzyMatch: false});

expect(norm(result)).toEqual(norm('<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" target-language="fr-ch" datatype="plaintext" original="ng2.template">\n' +
' <body>\n' +
' <trans-unit id="ID1" datatype="html">\n' +
' <source>source val that is long enough</source>\n' +
' <target state="new">source val that is long enough</target>\n' +
' </trans-unit>\n' +
' </body>\n' +
' </file>\n' +
'</xliff>'));
});

test('should ignore whitespace changes', () => {
const sourceFileContent = '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">\n' +
' <file source-language="de" datatype="plaintext" original="ng2.template">\n' +
Expand Down
59 changes: 38 additions & 21 deletions src/merge.ts
Expand Up @@ -17,24 +17,19 @@ const FUZZY_THRESHOLD = 0.2;
const STATE_INITIAL_XLF_2_0 = 'initial';
const STATE_INITIAL_XLF_1_2 = 'new';

function getDestUnit(originUnit: XmlElement, destUnitsParent: XmlElement, removedNodes: XmlNode[]): XmlElement | undefined {
const destUnit = destUnitsParent.childWithAttribute('id', originUnit.attr.id);
if (destUnit) {
return destUnit;
function findClosestMatch(originUnit: XmlElement, destUnits: XmlNode[]): [targetUnit: XmlElement | undefined, score: number] {
const originText = toString(getSourceElement(originUnit)!);
const closestUnit = destUnits
.filter(n => n.type === 'element')
.map(n => ({
node: n,
dist: levenshtein(originText, toString(getSourceElement(n as XmlElement)!))
}))
.reduce((previousValue, currentValue) => (previousValue?.dist ?? Number.MAX_VALUE) > currentValue.dist ? currentValue : previousValue, undefined as { node: XmlNode, dist: number } | undefined);
if (closestUnit && closestUnit.dist / originText.length < FUZZY_THRESHOLD) {
return [closestUnit.node as XmlElement, closestUnit.dist / originText.length];
} else {
const originText = toString(getSourceElement(originUnit)!);
const closestUnit = removedNodes
.filter(n => n.type === 'element')
.map(n => ({
node: n,
dist: levenshtein(originText, toString(getSourceElement(n as XmlElement)!))
}))
.reduce((previousValue, currentValue) => (previousValue?.dist ?? Number.MAX_VALUE) > currentValue.dist ? currentValue : previousValue, undefined as { node: XmlNode, dist: number } | undefined);
if (closestUnit && closestUnit.dist / originText.length < FUZZY_THRESHOLD) {
return closestUnit.node as XmlElement;
} else {
return undefined;
}
return [undefined, 0];
}
}

Expand Down Expand Up @@ -92,6 +87,26 @@ function isUntranslated(destUnit: XmlElement, xliffVersion: '1.2' | '2.0', destS
return isInitialState(destUnit, xliffVersion) && destSourceText === destTargetText;
}

function getUnitAndDestUnit(inUnits: XmlElement[], removeNodes: XmlElement[], destUnitsParent: XmlElement, xliffVersion: '1.2' | '2.0', fuzzyMatch: boolean): [unit: XmlElement | undefined, destUnit: XmlElement | undefined] {
const unit = inUnits?.[0];
if (!unit) {
return [undefined, undefined];
}
const destUnit = destUnitsParent.childWithAttribute('id', unit.attr.id);
if (destUnit) {
return [unit, destUnit];
} else if (fuzzyMatch) {
// find best match first to make sure we don't steal a better match just because the other unit was first:
const allInUnitsWithoutDestinationUnit = inUnits.filter(u => !destUnitsParent.childWithAttribute('id', u.attr.id)); // non-empty as it contains at least `unit`!
const bestMatch = allInUnitsWithoutDestinationUnit
.map((inUnit: XmlElement): [XmlElement, [XmlElement | undefined, number]] => [inUnit, findClosestMatch(inUnit, removeNodes)])
.reduce((previousValue, currentValue) => (previousValue[1][1] ?? Number.MAX_VALUE) > currentValue[1][1] ? currentValue : previousValue, [undefined, [undefined, Number.MAX_VALUE]] as [XmlElement | undefined, [XmlElement | undefined, number]]);
return [bestMatch[0], bestMatch[1][0]];
} else {
return [unit, undefined];
}
}

export function merge(inFileContent: string, destFileContent: string, options?: MergeOptions) {
const inDoc = new XmlDocument(inFileContent);
const destDoc = new XmlDocument(destFileContent);
Expand All @@ -103,11 +118,13 @@ export function merge(inFileContent: string, destFileContent: string, options?:

// collect (potentially) obsolete units (defer actual removal to allow for fuzzy matching..):
const originIds = new Set(inUnits.map(u => u.attr.id));
let removeNodes = getUnits(destDoc, xliffVersion)!.filter(destUnit => !originIds.has(destUnit.attr.id));
const removeNodes = getUnits(destDoc, xliffVersion)!.filter(destUnit => !originIds.has(destUnit.attr.id));

// add missing units and update existing ones:
for (const unit of inUnits) {
const destUnit = getDestUnit(unit, destUnitsParent, options?.fuzzyMatch ?? true ? removeNodes : []);
for (let [unit, destUnit] = getUnitAndDestUnit(inUnits, removeNodes, destUnitsParent, xliffVersion, options?.fuzzyMatch ?? true);
unit !== undefined;
[unit, destUnit] = getUnitAndDestUnit(inUnits, removeNodes, destUnitsParent, xliffVersion, options?.fuzzyMatch ?? true)) {
inUnits.splice(inUnits.indexOf(unit), 1);
const unitSource = getSourceElement(unit)!;
const unitSourceText = toString(...unitSource.children);
if (destUnit) {
Expand All @@ -124,7 +141,7 @@ export function merge(inFileContent: string, destFileContent: string, options?:
}
if (destUnit.attr.id !== unit.attr.id) {
console.debug(`matched unit with previous id "${destUnit.attr.id}" to new id: "${unit.attr.id}"`);
removeNodes = removeNodes.filter(n => n !== destUnit);
removeNodes.splice(removeNodes.indexOf(destUnit), 1);
destUnit.attr.id = unit.attr.id;
resetTranslationState(destUnit, xliffVersion, options);
}
Expand Down

0 comments on commit 95ae654

Please sign in to comment.