Skip to content

Commit

Permalink
Merge pull request #385 from getodk/issa/myers-diff
Browse files Browse the repository at this point in the history
improve the diff implementation with schema input and myers algo
  • Loading branch information
issa-tseng committed Aug 4, 2021
2 parents 524b3e2 + a669ac1 commit f16636d
Show file tree
Hide file tree
Showing 7 changed files with 642 additions and 498 deletions.
325 changes: 218 additions & 107 deletions lib/data/submission.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
// except according to the terms contained in the LICENSE file.

const { Readable } = require('stream');
const { createHash } = require('crypto');
const hparser = require('htmlparser2');
const fmdiff = require('fast-myers-diff');
const { SchemaStack } = require('./schema');
const { noop } = require('../util/util');
const { contains, isEmpty, map, max, union } = require('ramda');
const { union, last, pluck } = require('ramda');


// reads submission xml with the streaming parser, and outputs a stream of every
Expand Down Expand Up @@ -59,136 +61,245 @@ const submissionXmlToFieldStream = (fields, xml) => {
return outStream;
};

// Reads XML without reading form schema
const submissionXmlToObj = (xml) => {
const fieldStack = [];
const data = {};
let currNode = data;
const nodeStack = [ currNode ];

let textBuffer = ''; // agglomerates text nodes that come as multiple events.
////////////////////////////////////////////////////////////////////////////////
// SUBMISSION DIFFING
//
// there are two phases to processing submission diffs:
// 1 translating the xml to data trees in _hashedTree below
// 2 using those trees to determine deltas in _recurseDiff below
//
// when we do the tree conversion, we process hashes for each data subtree so that
// when we're running through the trees in step 2, we don't have to navigate a bunch
// of identical data to learn it's identical; we can just check that the hashes match.
//
// when repeat (array) data changes, we run the hashes through a myers diffing routine
// to determine where the change hunks are. from there, we treat pure additions and
// removals as is, and 1:1 trades as edits. in the trickier case where the removals and
// additions in a single hunk are different lengths, we run all the candidates in the
// change range against each other, looking for the ones that match the worst and
// knocking the necessary number out.
// this resolution could be made more clever by improving the match-scoring system,
// and by better-respecting linearity between the a and b changelists.

const subhash = Symbol('subhash');
const subhashes = Symbol('subhashes');
const keys = Symbol('keys');
const ptr = last;

/* eslint-disable no-param-reassign */
// helper for _hashedTree: attaches the computed-metadata slots to a data
// structure in place and hands the same structure back.
//   obj:   the object or array to decorate (mutated).
//   array: when truthy the structure gets an empty [subhashes] list, otherwise
//          it gets an empty [keys] list.
// every structure receives a fresh sha1 hash object under [subhash].
const _decorated = (obj, array = false) => {
  obj[subhash] = createHash('sha1');
  obj[array ? subhashes : keys] = [];
  return obj;
};

// iff (rare) we get a complex (n removed, <>n added) repeat diff delta back from
// the myers diff algo, we need to score diffs against each other to determine which
// to match w whom. this recursive util computes (very naively, TODO improve) a "difference"
// score given a { old, new, path } diffline, where lower numbers are smaller diffs.
//
// _withScore is the entrypoint, and _deepCount handles structural data values.
const score = Symbol('diff score');
// counts how much data sits under a value: nullish values count 0, strings
// count 1, and anything else counts 1 per own enumerable key plus whatever
// is nested beneath that key.
const _deepCount = (x) => {
  if (x == null) return 0;
  if (typeof x === 'string') return 1;
  return Object.keys(x).reduce((total, k) => total + 1 + _deepCount(x[k]), 0);
};
// stamps a [score] onto a list of difflines and returns that same list. the
// score starts at 1 and grows by the deep size of every old and new value in
// the list, so lower scores mean smaller diffs.
const _withScore = (diff) => {
  let total = 1;
  for (const { old: before, new: after } of diff)
    total += _deepCount(before) + _deepCount(after);
  diff[score] = total;
  return diff;
};
/* eslint-enable no-param-reassign */

// converts an xml submission to a js tree, computing branch subhashes and
// decorating them as it goes. the resulting tree has definitive {} and []
// structures where schema-appropriate, and every structure has a [subhash]
// Symbol key stored on it which indicates the subhash of that tree. every
// object has a [keys] where we've cached the keys so we don't have to continually
// requery them later when analyzing. every array also has a [subhashes] which
// plucks all subhashes from direct-child structures, again for quick analysis.
//
// takes in the set of all structural fields ever known to exist on the form,
// to establish group/repeat structure.
//
// structurals: handed straight to SchemaStack to recognise groups and repeats.
// xml: the submission xml text, written to the streaming parser in one go.
// returns the decorated root tree object.
//
// NOTE(review): in this view the body below is interleaved with lines from an
// older schema-less parser; fieldStack, nodeStack, currNode and data are not
// declared inside this function. confirm which lines are live before editing.
const _hashedTree = (structurals, xml) => {
const tree = _decorated({});
const treeStack = [ tree ];
const stack = new SchemaStack(structurals, true);
// every repeat array created below is remembered here so its running hash can
// be digested once parsing has finished.
const repeats = new Set();

let textBuffer;
const parser = new hparser.Parser({
onopentag: (tagName) => {
fieldStack.push(tagName);
nodeStack.push(currNode);

if (tagName in currNode) {
// tagname is already present so this is probably a repeat
if (!Array.isArray(currNode[tagName])) {
// make it into an array if not already an array
currNode[tagName] = [currNode[tagName]];
}
// the structure currently being filled is whatever sits on top of treeStack.
const context = ptr(treeStack);
if (stack.droppedWrapper === true) context[keys].push(tagName);
textBuffer = '';

// push empty object to put child contents into
const newObj = {};
currNode[tagName].push(newObj);
currNode = newObj;
} else {
// tag name does not yet exist, make empty object
currNode[tagName] = {};
currNode = currNode[tagName];
}
const structural = stack.push(tagName);
if ((structural != null) && (structural !== SchemaStack.Wrapper)) {
// no matter what we have a new object context to create.
const treeNode = _decorated({});
treeStack.push(treeNode);

textBuffer = '';
if (structural.type === 'structure') { // new obj just gets stuck on groups,
context[tagName] = treeNode;
} else if (structural.type === 'repeat') { // but for repeats,
if (context[tagName] == null) { // sometimes an array must be created first.
const repeat = _decorated([ treeNode ], true);
context[tagName] = repeat;
// NOTE(review): _decorated(..., true) has already set [subhashes] to a fresh
// empty array, so this reassignment looks redundant.
repeat[subhashes] = [];
repeats.add(repeat);
} else {
context[tagName].push(treeNode);
}
}
}
},
ontext(text) {
ontext: (text) => {
textBuffer += text;
},
onclosetag() {
const field = fieldStack.pop();
currNode = nodeStack.pop();
onclosetag: (tagName) => {
const structural = stack.pop();
if (stack.exited === true) {
// nothing routine to do, but since we are done let's digest the root hash.
tree[subhash] = tree[subhash].digest('base64');
} else if (structural == null) {
// primitive values should update their context hash.
// TODO: if we want empty and nonexistent nodes to coalesce we could do it here.
const context = ptr(treeStack);
context[tagName] = textBuffer;
// the hash input is name, NUL, value, NUL, NUL; presumably the NUL bytes keep
// name/value boundaries unambiguous (TODO confirm).
context[subhash].update(`${tagName}\0${textBuffer}\0\0`);
} else {
// repeats have to deal with updating array running totals.
// the finished structure's hash object is swapped for its base64 digest, which
// is then fed into the hash of the structure that contains it.
const structure = treeStack.pop();
structure[subhash] = structure[subhash].digest('base64');
const context = ptr(treeStack);

if (isEmpty(currNode[field])) {
// only set terminal node text values
currNode[field] = textBuffer;
if (structural.type === 'repeat') {
const repeat = context[structural.name];
repeat[subhashes].push(structure[subhash]);
repeat[subhash].update(structure[subhash]);
}
context[subhash].update(structure[subhash]);
}
},
}
}, { xmlMode: true, decodeEntities: true });

parser.write(xml);
parser.end();

return data;
// now that everything is done go and finalize all our repeat subhashes.
for (const repeat of repeats) repeat[subhash] = repeat[subhash].digest('base64');
return tree;
};

// Helper function for formatting the diff representation of one node
// curr and prev: the newer and older values; any falsy value (including an
//                empty string) is reported as null
// keyStack: full tree path as an array, not including the current node; its
//           first element (the root 'data' node) is dropped from the output
// key: current node key
// index: node index if it is within a repeat group, otherwise null
// returns { new, old, path }
const formatDiff = (curr, prev, keyStack, key, index = null) => ({
  new: curr || null,
  old: prev || null,
  // test against null rather than truthiness so that the first repeat instance
  // (index 0) still gets its [ key, index ] path segment.
  path: keyStack.slice(1).concat((index != null) ? [[ key, index ]] : [ key ]),
});

const compareObjects = (a, b, keyStack = []) => {
const ak = Object.keys(a); // more recent submission
const bk = Object.keys(b); // previous submission
const allKeys = union(ak, bk);

const differences = [];

for (const key of allKeys) {
// Check for keys that are not both present
if (!(contains(key, ak)) || !(contains(key, bk))) {
// if one key is missing, that one will be undefined
differences.push(formatDiff(a[key], b[key], keyStack, key));
} else {
// Compare the same keys
let valueA = a[key];
let valueB = b[key];

// If one is an array and the other isn't, make both into arrays
if (Array.isArray(valueA) && !Array.isArray(valueB))
valueB = [valueB];
else if (!Array.isArray(valueA) && Array.isArray(valueB))
valueA = [valueA];

if (Array.isArray(valueA) && Array.isArray(valueB)) {
// If they are both arrays, iterate through the longer one
for (let i = 0; i < max(valueA.length, valueB.length); i += 1) {
const innerValueA = valueA[i];
const innerValueB = valueB[i];

if (!innerValueA || !innerValueB) {
differences.push(formatDiff(innerValueA, innerValueB, keyStack, key, i));
} else {
differences.push(...compareObjects(
innerValueA,
innerValueB,
keyStack.concat([[ key, i ]])
));
// dispatches to the array or object differ depending on what a is. arrays get
// subpath and subkey passed through separately; objects get subkey appended
// to the path first, when one was given.
const _recurseDiff = (a, b, subpath, subkey) => {
  if (Array.isArray(a)) return _diffArray(a, b, subpath, subkey); // eslint-disable-line no-use-before-define
  const path = (subkey == null) ? subpath : subpath.concat([ subkey ]);
  return _diffObj(a, b, path); // eslint-disable-line no-use-before-define
};

// compares two object-type submission data trees key by key and returns the
// list of difflines between them. a is the old tree, b the new one, and
// subpath is the path at which both of them live.
const _diffObj = (a, b, subpath) => {
  const owns = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const pathTo = (key) => subpath.concat([ key ]);

  const results = [];
  for (const key of union(a[keys], b[keys])) {
    if (!owns(a, key)) {
      // only b has it: addition.
      results.push({ new: b[key], path: pathTo(key) });
    } else if (!owns(b, key)) {
      // only a has it: removal.
      results.push({ old: a[key], path: pathTo(key) });
    } else {
      const before = a[key];
      const after = b[key];
      if (before[subhash] == null) {
        // no subhash means a primitive value, so compare directly.
        if (before !== after) results.push({ old: before, new: after, path: pathTo(key) });
      } else if (before[subhash] !== after[subhash]) {
        // structures are only descended into when their hashes disagree.
        results.push(..._recurseDiff(before, after, subpath, key));
      }
    }
  }
  return results;
};

// diffs two array-type (repeat) submission data trees and returns a set of diffs.
// a is the old array and b the new one, both carrying [subhashes]. subpath is the
// path of the containing structure and parentKey the name of the repeat, so each
// instance is addressed as subpath + [ parentKey, index ].
//
// n.b. fast-myers-diff outputs eg [ aa, az ) and [ ba, bz ) patch ranges.
const _diffArray = (a, b, subpath, parentKey) => {
  const results = [];
  for (const [ aa, az, ba, bz ] of fmdiff.diff(a[subhashes], b[subhashes])) {
    if (aa === az) { // null -> bs
      for (let i = ba; i < bz; i += 1)
        results.push({ new: b[i], path: subpath.concat([[ parentKey, i ]]) });
    } else if (ba === bz) { // as -> null
      for (let i = aa; i < az; i += 1)
        results.push({ old: a[i], path: subpath.concat([[ parentKey, i ]]) });
    } else if ((az - aa) === (bz - ba)) { // as -> bs direct 1:1 match
      for (let i = 0; i < (az - aa); i += 1)
        results.push(..._recurseDiff(a[aa + i], b[ba + i], subpath.concat([[ parentKey, aa + i ]])));
    } else { // as -> bs complex
      // if we have too many on one side, we want to eliminate the worst cross-matches
      // as pure add/remove so we can diff the rest across
      const alen = az - aa;
      const blen = bz - ba;
      const diffs = [];
      for (let i = 0; i < alen; i += 1) diffs.push([]); // init subarrays
      for (let i = 0; i < alen; i += 1) // cartesian cross-diff the whole delta
        for (let j = 0; j < blen; j += 1)
          diffs[i][j] = _withScore(_recurseDiff(a[aa + i], b[ba + j], subpath.concat([[ parentKey, aa + i ]])));

      // now that we have all diffs find the worst matches and mark them for atomic add/remove diffs
      // TODO: the lookup thing sort of sucks.
      // l for longer, s for shorter.
      const [ l, la, llen, slen, polarity, lookup ] = (alen > blen)
        ? [ a, aa, alen, blen, 'old', ((x, y) => diffs[x][y]) ]
        : [ b, ba, blen, alen, 'new', ((y, x) => diffs[x][y]) ];
      const delta = llen - slen; // need to take this many out

      // for each ko candidate we want to find its minimum match score (best match)
      const candidates = [];
      for (let i = 0; i < llen; i += 1) {
        let min = Number.MAX_SAFE_INTEGER;
        for (let j = 0; j < slen; j += 1) min = Math.min(min, lookup(i, j)[score]);
        candidates.push({ min, idx: la + i });
      }

      // take exactly delta candidates with the highest (worst) best-match score.
      // ranking every candidate at once means none is lost when a worse one shows
      // up later, and guarantees the remainder is exactly slen long. ties go to
      // the later index so we tend towards leaving earlier values alone.
      candidates.sort((x, y) => (y.min - x.min) || (y.idx - x.idx));
      const skips = new Set(pluck('idx', candidates.slice(0, delta)));

      // finally output all our diffs in one go. we already calculated them so we just
      // need to sort out the correct responses and look them up.
      let j = 0;
      for (let i = 0; i < llen; i += 1) {
        if (skips.has(la + i)) {
          results.push({ [polarity]: l[la + i], path: subpath.concat([[ parentKey, la + i ]]) });
        } else {
          results.push(...lookup(i, j));
          j += 1;
        }
      }
    }
  }

  return results;
};

// parses every version's xml without a schema, then compares each version with
// the one that follows it in the list. resolves with an object keyed by the
// instanceId of the first version of each compared pair.
const diffSubmissions = (versions) => new Promise((resolve) => {
  const parsed = map(({ instanceId, xml }) => ({ instanceId, content: submissionXmlToObj(xml) }), versions);

  const diffs = {};
  for (let curr = 0, next = 1; next < versions.length; curr += 1, next += 1)
    diffs[parsed[curr].instanceId] = compareObjects(parsed[curr].content, parsed[next].content);
  resolve(diffs);
});
// public entrypoint: diffs each given version of a submission against the one
// before it. versions are expected newest-first, as the database returns them.
//
// structurals is passed through to _hashedTree for every version.
//
// returns { instanceId: [{ [new], [old], path }] } where each instanceId maps
// to the changes that produced that version from its predecessor. the last
// (oldest) version in the list has nothing to be compared with and gets no entry.
const diffSubmissions = (structurals, versions) => {
  const trees = Array.from(versions, ({ xml }) => _hashedTree(structurals, xml));

  const byVersion = {};
  for (let newer = 0, older = 1; older < versions.length; newer += 1, older += 1)
    byVersion[versions[newer].instanceId] = _recurseDiff(trees[older], trees[newer], []);
  return byVersion;
};

module.exports = { submissionXmlToFieldStream, submissionXmlToObj, compareObjects, diffSubmissions, formatDiff };
module.exports = { submissionXmlToFieldStream, _hashedTree, _diffObj, _diffArray, diffSubmissions, _symbols: { subhash, subhashes, keys, score } };

6 changes: 5 additions & 1 deletion lib/model/query/forms.js
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,10 @@ const getBinaryFields = (formDefId) => ({ all }) =>
all(sql`select * from form_fields where "formDefId"=${formDefId} and "binary"=true order by "order" asc`)
.then(map(construct(Form.Field)));

// fetches the form_fields rows of type repeat or structure for the given form
// def, in "order" order, each constructed as a Form.Field.
const getStructuralFields = (formDefId) => ({ all }) => {
  const query = sql`select * from form_fields where "formDefId"=${formDefId} and (type='repeat' or type='structure') order by "order" asc`;
  const toFields = map(construct(Form.Field));
  return all(query).then(toFields);
};


////////////////////////////////////////////////////////////////////////////////
// MISC
Expand All @@ -337,7 +341,7 @@ module.exports = {
setManagedKey,
getByAuthForOpenRosa,
getVersions, getByActeeIdForUpdate, getByProjectId, getByProjectAndXmlFormId,
getFields, getBinaryFields,
getFields, getBinaryFields, getStructuralFields,
lockDefs, getAllSubmitters
};

3 changes: 2 additions & 1 deletion lib/resources/submissions.js
Original file line number Diff line number Diff line change
Expand Up @@ -481,11 +481,12 @@ module.exports = (service, endpoint) => {
getForm(params, Forms)
.then(auth.canOrReject('submission.read'))
.then((form) => Promise.all([
Forms.getStructuralFields(form.def.id),
Submissions.getDefsByFormAndLogicalId(form.id, params.rootId, draft),
Submissions.getByIds(params.projectId, params.formId, params.rootId, draft)
.then(getOrNotFound)
]))
.then(([ versions ]) => diffSubmissions(versions))));
.then(([ structurals, versions ]) => diffSubmissions(structurals, versions))));
};

// reify for draft/nondraft
Expand Down
Loading

0 comments on commit f16636d

Please sign in to comment.