Merge pull request #385 from getodk/issa/myers-diff

improve the diff implementation with schema input and myers algo
getodk · Aug 4, 2021 · f16636d · f16636d
2 parents 524b3e2 + a669ac1
commit f16636d
Show file tree

Hide file tree

Showing 7 changed files with 642 additions and 498 deletions.
diff --git a/lib/data/submission.js b/lib/data/submission.js
@@ -8,10 +8,12 @@
 // except according to the terms contained in the LICENSE file.
 
 const { Readable } = require('stream');
+const { createHash } = require('crypto');
 const hparser = require('htmlparser2');
+const fmdiff = require('fast-myers-diff');
 const { SchemaStack } = require('./schema');
 const { noop } = require('../util/util');
-const { contains, isEmpty, map, max, union } = require('ramda');
+const { union, last, pluck } = require('ramda');
 
 
 // reads submission xml with the streaming parser, and outputs a stream of every
@@ -59,136 +61,245 @@ const submissionXmlToFieldStream = (fields, xml) => {
   return outStream;
 };
 
-// Reads XML without reading form schema
-const submissionXmlToObj = (xml) => {
-  const fieldStack = [];
-  const data = {};
-  let currNode = data;
-  const nodeStack = [ currNode ];
 
-  let textBuffer = ''; // agglomerates text nodes that come as multiple events.
+////////////////////////////////////////////////////////////////////////////////
+// SUBMISSION DIFFING
+//
+// there are two phases to processing submission diffs:
+// 1 translating the xml to data trees in _hashedTree below
+// 2 using those trees to determine deltas in _recurseDiff below
+//
+// when we do the tree conversion, we process hashes for each data subtree so that
+// when we're running through the trees in step 2, we don't have to navigate a bunch
+// of identical data to learn it's identical; we can just check that the hashes match.
+//
+// when repeat (array) data changes, we run the hashes through a myers diffing routine
+// to determine where the change hunks are. from there, we treat pure additions and
+// removals as is, and 1:1 trades as edits. in the trickier case where the removals and
+// additions in a single hunk are different lengths, we run all the candidates in the
+// change range against each other, looking for the ones that match the worst and
+// knocking the necessary number out.
+// this resolution could be made more clever by improving the match-scoring system,
+// and by better-respecting linearity between the a and b changelists.
+
+const subhash = Symbol('subhash');
+const subhashes = Symbol('subhashes');
+const keys = Symbol('keys');
+const ptr = last;
+
+/* eslint-disable no-param-reassign */
+// this utility is used in _hashedTree to decorate a data object with the structures
+// that hold its computed metadata, depending on its type (array or plain object).
+const _decorated = (obj, array = false) => {
+  obj[subhash] = createHash('sha1');
+  if (array) obj[subhashes] = [];
+  else obj[keys] = [];
+  return obj;
+};
+
+// iff (rare) we get a complex (n removed, <>n added) repeat diff delta back from
+// the myers diff algo, we need to score diffs against each other to determine which
+// to match w whom. this recursive util computes (very naively, TODO improve) a "difference"
+// score given a { old, new, path } diffline, where lower numbers are smaller diffs.
+//
+// _withScore is the entrypoint, and _deepCount handles structural data values.
+const score = Symbol('diff score');
+const _deepCount = (x) => {
+  if (x == null) return 0;
+  if (typeof x === 'string') return 1;
+  let result = 0;
+  for (const k of Object.keys(x)) result += 1 + _deepCount(x[k]);
+  return result;
+};
+const _withScore = (diff) => {
+  diff[score] = 1;
+  for (const change of diff) diff[score] += _deepCount(change.old) + _deepCount(change.new);
+  return diff;
+};
+/* eslint-enable no-param-reassign */
 
+// converts an xml submission to a js tree, computing branch subhashes and
+// decorating them as it goes. the resulting tree has definitive {} and []
+// structures where schema-appropriate, and every structure has a [subhash]
+// Symbol key stored on it which indicates the subhash of that tree. every
+// object has a [keys] where we've cached the keys so we don't have to continually
+// requery them later when analyzing. every array also has a [subhashes] which
+// plucks all subhashes from direct-child structures, again for quick analysis.
+//
+// takes in the set of all structural fields ever known to exist on the form,
+// to establish group/repeat structure.
+const _hashedTree = (structurals, xml) => {
+  const tree = _decorated({});
+  const treeStack = [ tree ];
+  const stack = new SchemaStack(structurals, true);
+  const repeats = new Set();
+
+  let textBuffer;
   const parser = new hparser.Parser({
     onopentag: (tagName) => {
-      fieldStack.push(tagName);
-      nodeStack.push(currNode);
-
-      if (tagName in currNode) {
-        // tagname is already present so this is probably a repeat
-        if (!Array.isArray(currNode[tagName])) {
-          // make it into an array if not already an arary
-          currNode[tagName] = [currNode[tagName]];
-        }
+      const context = ptr(treeStack);
+      if (stack.droppedWrapper === true) context[keys].push(tagName);
+      textBuffer = '';
 
-        // push empty object to put child contents into
-        const newObj = {};
-        currNode[tagName].push(newObj);
-        currNode = newObj;
-      } else {
-        // tag name does not yet exist, make empty object
-        currNode[tagName] = {};
-        currNode = currNode[tagName];
-      }
+      const structural = stack.push(tagName);
+      if ((structural != null) && (structural !== SchemaStack.Wrapper)) {
+        // no matter what we have a new object context to create.
+        const treeNode = _decorated({});
+        treeStack.push(treeNode);
 
-      textBuffer = '';
+        if (structural.type === 'structure') { // new obj just gets stuck on groups,
+          context[tagName] = treeNode;
+        } else if (structural.type === 'repeat') { // but for repeats,
+          if (context[tagName] == null) { // sometimes an array must be created first.
+            const repeat = _decorated([ treeNode ], true);
+            context[tagName] = repeat;
+            repeat[subhashes] = [];
+            repeats.add(repeat);
+          } else {
+            context[tagName].push(treeNode);
+          }
+        }
+      }
     },
-    ontext(text) {
+    ontext: (text) => {
       textBuffer += text;
     },
-    onclosetag() {
-      const field = fieldStack.pop();
-      currNode = nodeStack.pop();
+    onclosetag: (tagName) => {
+      const structural = stack.pop();
+      if (stack.exited === true) {
+        // nothing routine to do, but since we are done let's digest the root hash.
+        tree[subhash] = tree[subhash].digest('base64');
+      } else if (structural == null) {
+        // primitive values should update their context hash.
+        // TODO: if we want empty and nonexistent nodes to coalesce we could do it here.
+        const context = ptr(treeStack);
+        context[tagName] = textBuffer;
+        context[subhash].update(`${tagName}\0${textBuffer}\0\0`);
+      } else {
+        // any structure finalizes its own hash; repeats must also update array running totals.
+        const structure = treeStack.pop();
+        structure[subhash] = structure[subhash].digest('base64');
+        const context = ptr(treeStack);
 
-      if (isEmpty(currNode[field])) {
-        // only set terminal node text values
-        currNode[field] = textBuffer;
+        if (structural.type === 'repeat') {
+          const repeat = context[structural.name];
+          repeat[subhashes].push(structure[subhash]);
+          repeat[subhash].update(structure[subhash]);
+        }
+        context[subhash].update(structure[subhash]);
       }
-    },
+    }
   }, { xmlMode: true, decodeEntities: true });
 
   parser.write(xml);
   parser.end();
 
-  return data;
+  // now that everything is done go and finalize all our repeat subhashes.
+  for (const repeat of repeats) repeat[subhash] = repeat[subhash].digest('base64');
+  return tree;
 };
 
-// Helper function for formatting the diff representation of one node
-//   curr and prev: values
-//   xpath: full tree path as an array, not including the current node
-//   key: current node key
-//   index: node index if it is within a repeat group
-const formatDiff = (curr, prev, keyStack, key, index = null) => ({
-  new: curr || null,
-  old: prev || null,
-  path: keyStack.slice(1).concat(index ? [[key, index]] : [key]), // first stack element 'data' removed
-});
-
-const compareObjects = (a, b, keyStack = []) => {
-  const ak = Object.keys(a); // more recent submission
-  const bk = Object.keys(b); // previous submission
-  const allKeys = union(ak, bk);
-
-  const differences = [];
-
-  for (const key of allKeys) {
-    // Check for keys that are not both present
-    if (!(contains(key, ak)) || !(contains(key, bk))) {
-      // if one key is missing, that one will be undefined
-      differences.push(formatDiff(a[key], b[key], keyStack, key));
-    } else {
-      // Compare the same keys
-      let valueA = a[key];
-      let valueB = b[key];
-
-      // If one is an array and the other isn't, make both into arrays
-      if (Array.isArray(valueA) && !Array.isArray(valueB))
-        valueB = [valueB];
-      else if (!Array.isArray(valueA) && Array.isArray(valueB))
-        valueA = [valueA];
-
-      if (Array.isArray(valueA) && Array.isArray(valueB)) {
-        // If they are both arrays, iterate through the longer one
-        for (let i = 0; i < max(valueA.length, valueB.length); i += 1) {
-          const innerValueA = valueA[i];
-          const innerValueB = valueB[i];
-
-          if (!innerValueA || !innerValueB) {
-            differences.push(formatDiff(innerValueA, innerValueB, keyStack, key, i));
-          } else {
-            differences.push(...compareObjects(
-              innerValueA,
-              innerValueB,
-              keyStack.concat([[ key, i ]])
-            ));
+// deals w the subtle differences between recursing into an obj vs array.
+const _recurseDiff = (a, b, subpath, subkey) => (!Array.isArray(a)
+  ? _diffObj(a, b, (subkey != null) ? subpath.concat([ subkey ]) : subpath) // eslint-disable-line no-use-before-define
+  : _diffArray(a, b, subpath, subkey)); // eslint-disable-line no-use-before-define
+
+// diffs two object-type submission data trees and returns a set of diffs.
+const _diffObj = (a, b, subpath) => {
+  const results = [];
+  for (const key of union(a[keys], b[keys])) {
+    const av = a[key];
+    if (!Object.prototype.hasOwnProperty.call(a, key)) { // null -> b
+      results.push({ new: b[key], path: subpath.concat([ key ]) });
+    } else if (!Object.prototype.hasOwnProperty.call(b, key)) { // a -> null
+      results.push({ old: av, path: subpath.concat([ key ]) });
+    } else if (av[subhash] == null) { // primitive
+      if (av !== b[key]) // a -> b
+        results.push({ old: av, new: b[key], path: subpath.concat([ key ]) });
+    } else if (av[subhash] !== b[key][subhash]) { // structural a -> b
+      results.push(..._recurseDiff(av, b[key], subpath, key));
+    }
+  }
+  return results;
+};
+
+// diffs two array-type submission data trees and returns a set of diffs.
+// n.b. fast-myers-diff outputs eg [ aa, az ) and [ ba, bz ) patch ranges.
+const _diffArray = (a, b, subpath, parentKey) => {
+  const results = [];
+  for (const [ aa, az, ba, bz ] of fmdiff.diff(a[subhashes], b[subhashes])) {
+    if (aa === az) { // null -> bs
+      for (let i = ba; i < bz; i += 1)
+        results.push({ new: b[i], path: subpath.concat([[ parentKey, i ]]) });
+    } else if (ba === bz) { // as -> null
+      for (let i = aa; i < az; i += 1)
+        results.push({ old: a[i], path: subpath.concat([[ parentKey, i ]]) });
+    } else if ((az - aa) === (bz - ba)) { // as -> bs direct 1:1 match
+      for (let i = 0; i < (az - aa); i += 1)
+        results.push(..._recurseDiff(a[aa + i], b[ba + i], subpath.concat([[ parentKey, aa + i ]])));
+    } else { // as -> bs complex
+      // if we have too many on one side, we want to eliminate the worst cross-matches
+      // as pure add/remove so we can diff the rest across
+      const alen = az - aa;
+      const blen = bz - ba;
+      const diffs = [];
+      for (let i = 0; i < alen; i += 1) diffs.push([]); // init subarrays
+      for (let i = 0; i < alen; i += 1) // cartesian cross-diff the whole delta
+        for (let j = 0; j < blen; j += 1)
+          diffs[i][j] = _withScore(_recurseDiff(a[aa + i], b[ba + j], subpath.concat([[ parentKey, aa + i ]])));
+
+      // now that we have all diffs find the worst matches and mark them for atomic add/remove diffs
+      // TODO: the lookup thing sort of sucks.
+      // l for longer, s for shorter.
+      const [ l, la, llen, slen, polarity, lookup ] = (alen > blen)
+        ? [ a, aa, alen, blen, 'old', ((x, y) => diffs[x][y]) ]
+        : [ b, ba, blen, alen, 'new', ((y, x) => diffs[x][y]) ];
+      const delta = llen - slen; // need to take this many out
+      const knockouts = new Array(delta); // going to take these ones out
+      for (let i = 0; i < llen; i += 1) {
+        // for each ko candidate we want to find its minimum match score (best match)
+        let min = Number.MAX_SAFE_INTEGER;
+        for (let j = 0; j < slen; j += 1) min = Math.min(min, lookup(i, j)[score]);
+
+        // now let's see if that's one of the maximum (worst) ones we know of.
+        // we do <= to tend towards leaving earlier values alone all things equal.
+        for (let k = 0; k < delta; k += 1)
+          if ((knockouts[k] == null) || (knockouts[k].min <= min)) {
+            knockouts[k] = { min, idx: la + i };
+            break; // don't overwrite multiple
           }
+      }
+
+      // finally output all our diffs in one go. we already calculated them so we just
+      // need to sort out the correct responses and look them up.
+      const skips = new Set(pluck('idx', knockouts));
+      let j = 0;
+      for (let i = 0; i < llen; i += 1) {
+        if (skips.has(la + i)) {
+          results.push({ [polarity]: l[la + i], path: subpath.concat([[ parentKey, la + i ]]) });
+        } else {
+          results.push(...lookup(i, j));
+          j += 1;
         }
-      } else if (typeof (a[key]) === 'object' && typeof (b[key]) === 'object') {
-        // If children are both objects, compare them recursively
-        differences.push(...compareObjects(
-          a[key],
-          b[key],
-          keyStack.concat(key)
-        ));
-      } else if (valueA.toString() !== valueB.toString()) {
-        // If they are both different values, note the change
-        differences.push(formatDiff(valueA, valueB, keyStack, key));
       }
-      // else: the values are the same
     }
   }
-
-  return differences;
+  return results;
 };
 
-const diffSubmissions = (versions) => new Promise((resolve) => {
-  const diffs = {};
-  const jsonVersions = map((v) => ({instanceId: v.instanceId, content: submissionXmlToObj(v.xml)}), versions);
-
-  for (let i = 0; i < versions.length - 1; i += 1) {
-    diffs[jsonVersions[i].instanceId] = compareObjects(jsonVersions[i].content, jsonVersions[i + 1].content);
-  }
-  resolve(diffs);
-});
+// actual public interface to diff all given versions of a submission in sequential
+// order. because of database query ordering, we expect versions in newest-first order.
+//
+// will return { instanceId: [{ [new], [old], path }] } where each instanceId
+// indicates the changes that resulted in that version from the previous.
+const diffSubmissions = (structurals, versions) => {
+  const byVersion = {};
+  const _trees = [];
+  for (const version of versions) _trees.push(_hashedTree(structurals, version.xml));
+  for (let i = 0; i < versions.length - 1; i += 1)
+    byVersion[versions[i].instanceId] = _recurseDiff(_trees[i + 1], _trees[i], []);
+  return byVersion;
+};
 
-module.exports = { submissionXmlToFieldStream, submissionXmlToObj, compareObjects, diffSubmissions, formatDiff };
+module.exports = { submissionXmlToFieldStream, _hashedTree, _diffObj, _diffArray, diffSubmissions, _symbols: { subhash, subhashes, keys, score } };
 
diff --git a/lib/model/query/forms.js b/lib/model/query/forms.js
@@ -313,6 +313,10 @@ const getBinaryFields = (formDefId) => ({ all }) =>
   all(sql`select * from form_fields where "formDefId"=${formDefId} and "binary"=true order by "order" asc`)
     .then(map(construct(Form.Field)));
 
+const getStructuralFields = (formDefId) => ({ all }) =>
+  all(sql`select * from form_fields where "formDefId"=${formDefId} and (type='repeat' or type='structure') order by "order" asc`)
+    .then(map(construct(Form.Field)));
+
 
 ////////////////////////////////////////////////////////////////////////////////
 // MISC
@@ -337,7 +341,7 @@ module.exports = {
   setManagedKey,
   getByAuthForOpenRosa,
   getVersions, getByActeeIdForUpdate, getByProjectId, getByProjectAndXmlFormId,
-  getFields, getBinaryFields,
+  getFields, getBinaryFields, getStructuralFields,
   lockDefs, getAllSubmitters
 };
 
diff --git a/lib/resources/submissions.js b/lib/resources/submissions.js
@@ -481,11 +481,12 @@ module.exports = (service, endpoint) => {
       getForm(params, Forms)
         .then(auth.canOrReject('submission.read'))
         .then((form) => Promise.all([
+          Forms.getStructuralFields(form.def.id),
           Submissions.getDefsByFormAndLogicalId(form.id, params.rootId, draft),
           Submissions.getByIds(params.projectId, params.formId, params.rootId, draft)
             .then(getOrNotFound)
         ]))
-        .then(([ versions ]) => diffSubmissions(versions))));
+        .then(([ structurals, versions ]) => diffSubmissions(structurals, versions))));
   };
 
   // reify for draft/nondraft