/
Match.cs
392 lines (329 loc) · 16.8 KB
/
Match.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
#nullable disable
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Linq;
namespace Microsoft.CodeAnalysis.Differencing
{
public sealed partial class Match<TNode>
{
private const double ExactMatchDistance = 0.0;
private const double EpsilonDistance = 0.00001;
private const double MatchingDistance1 = 0.25;
private const double MatchingDistance2 = 0.5;
private const double MatchingDistance3 = 0.75;
private const double MaxDistance = 1.0;
private readonly TreeComparer<TNode> _comparer;
private readonly TNode _root1;
private readonly TNode _root2;
private readonly Dictionary<TNode, TNode> _oneToTwo = new();
private readonly Dictionary<TNode, TNode> _twoToOne = new();
internal Match(TNode root1, TNode root2, TreeComparer<TNode> comparer, IEnumerable<KeyValuePair<TNode, TNode>> knownMatches)
{
_root1 = root1;
_root2 = root2;
_comparer = comparer;
var labelCount = comparer.LabelCount;
CategorizeNodesByLabels(comparer, root1, labelCount, out var nodes1, out _);
CategorizeNodesByLabels(comparer, root2, labelCount, out var nodes2, out _);
// Root nodes always match. Add them before adding known matches to make sure we always have root mapping.
TryAdd(root1, root2);
if (knownMatches != null)
{
foreach (var knownMatch in knownMatches)
{
if (comparer.GetLabel(knownMatch.Key) != comparer.GetLabel(knownMatch.Value))
{
throw new ArgumentException(string.Format(WorkspacesResources.Matching_nodes_0_and_1_must_have_the_same_label, knownMatch.Key, knownMatch.Value), nameof(knownMatches));
}
if (!comparer.TreesEqual(knownMatch.Key, root1))
{
throw new ArgumentException(string.Format(WorkspacesResources.Node_0_must_be_contained_in_the_old_tree, knownMatch.Key), nameof(knownMatches));
}
if (!comparer.TreesEqual(knownMatch.Value, root2))
{
throw new ArgumentException(string.Format(WorkspacesResources.Node_0_must_be_contained_in_the_new_tree, knownMatch.Value), nameof(knownMatches));
}
// skip pairs whose key or value is already mapped:
TryAdd(knownMatch.Key, knownMatch.Value);
}
}
ComputeMatch(nodes1, nodes2);
}
private static void CategorizeNodesByLabels(
TreeComparer<TNode> comparer,
TNode root,
int labelCount,
out List<TNode>[] nodes,
out int totalCount)
{
nodes = new List<TNode>[labelCount];
var count = 0;
// It is important that we add the nodes in depth-first prefix order.
// This order ensures that a node of a certain kind can have a parent of the same kind
// and we can still use tied-to-parent for that kind. That's because the parent will always
// be processed earlier than the child due to depth-first prefix ordering.
foreach (var node in comparer.GetDescendants(root))
{
var label = comparer.GetLabel(node);
if (label < 0 || label >= labelCount)
{
throw new InvalidOperationException(string.Format(WorkspacesResources.Label_for_node_0_is_invalid_it_must_be_within_bracket_0_1, node, labelCount));
}
var list = nodes[label];
if (list == null)
{
nodes[label] = list = new List<TNode>();
}
list.Add(node);
count++;
}
totalCount = count;
}
private void ComputeMatch(List<TNode>[] nodes1, List<TNode>[] nodes2)
{
Debug.Assert(nodes1.Length == nodes2.Length);
// --- The original FastMatch algorithm ---
//
// For each leaf label l, and then for each internal node label l do:
// a) S1 := chain T1(l)
// b) S2 := chain T2(l)
// c) lcs := LCS(S1, S2, Equal)
// d) For each pair of nodes (x,y) in lcs add (x,y) to M.
// e) Pair unmatched nodes with label l as in Algorithm Match, adding matches to M:
// For each unmatched node x in T1, if there is an unmatched node y in T2 such that equal(x,y)
// then add (x,y) to M.
//
// equal(x,y) is defined as follows:
// x, y are leafs => equal(x,y) := label(x) == label(y) && compare(value(x), value(y)) <= f
// x, y are nodes => equal(x,y) := label(x) == label(y) && |common(x,y)| / max(|x|, |y|) > t
// where f, t are constants.
//
// --- Actual implementation ---
//
// We also categorize nodes by their labels, but then we proceed differently:
//
// 1) A label may be marked "tied to parent". Let x, y have both label l and l is "tied to parent".
// Then (x,y) can be in M only if (parent(x), parent(y)) in M.
// Thus we require labels of children tied to a parent to be preceded by all their possible parent labels.
//
// 2) Rather than defining function equal in terms of constants f and t, which are hard to get right,
// we try to match multiple times with different threshold for node distance.
// The comparer defines the distance [0..1] between two nodes and it can do so by analyzing
// the node structure and value. The comparer can tune the distance specifically for each node kind.
// We first try to match nodes of the same labels to the exactly matching or almost matching counterparts.
// The we keep increasing the threshold and keep adding matches.
for (var l = 0; l < nodes1.Length; l++)
{
if (nodes1[l] != null && nodes2[l] != null)
{
ComputeMatchForLabel(l, nodes1[l], nodes2[l]);
}
}
}
private void ComputeMatchForLabel(int label, List<TNode> s1, List<TNode> s2)
{
var tiedToAncestor = _comparer.TiedToAncestor(label);
ComputeMatchForLabel(s1, s2, tiedToAncestor, EpsilonDistance); // almost exact match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance1); // ok match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance2); // ok match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance3); // ok match
ComputeMatchForLabel(s1, s2, tiedToAncestor, MaxDistance); // any match
}
private void ComputeMatchForLabel(List<TNode> s1, List<TNode> s2, int tiedToAncestor, double maxAcceptableDistance)
{
// Obviously, the algorithm below is O(n^2). However, in the common case, the 2 lists will
// be sequences that exactly match. The purpose of "firstNonMatch2" is to reduce the complexity
// to O(n) in this case. Basically, the pointer is the 1st non-matched node in the list of nodes of tree2
// with the given label.
// Whenever we match to firstNonMatch2 we set firstNonMatch2 to the subsequent node.
// So in the case of totally matching sequences, we process them in O(n) -
// both node1 and firstNonMatch2 will be advanced simultaneously.
Debug.Assert(maxAcceptableDistance is >= ExactMatchDistance and <= MaxDistance);
var count1 = s1.Count;
var count2 = s2.Count;
var firstNonMatch2 = 0;
for (var i1 = 0; i1 < count1; i1++)
{
var node1 = s1[i1];
// Skip this guy if it already has a partner
if (HasPartnerInTree2(node1))
{
continue;
}
// Find node2 that matches node1 the best, i.e. has minimal distance.
var bestDistance = MaxDistance * 2;
TNode bestMatch = default;
var matched = false;
int i2;
for (i2 = firstNonMatch2; i2 < count2; i2++)
{
var node2 = s2[i2];
// Skip this guy if it already has a partner
if (HasPartnerInTree1(node2))
{
continue;
}
// this requires parents to be processed before their children:
if (tiedToAncestor > 0)
{
// TODO (tomat): For nodes tied to their parents,
// consider avoiding matching them to all other nodes of the same label.
// Rather we should only match them with their siblings that share the same parent.
// Check if nodes that are configured to be tied to their ancestor have the respective ancestor matching.
// In cases when we compare substrees rooted below both of these ancestors we assume the ancestors are
// matching since the roots of the subtrees must match and therefore their ancestors must match as well.
// If one node's ancestor is present in the subtree and the other isn't then we are not in the scenario
// of comparing subtrees with matching roots and thus we consider the nodes not matching.
var hasAncestor1 = _comparer.TryGetAncestor(node1, tiedToAncestor, out var ancestor1);
var hasAncestor2 = _comparer.TryGetAncestor(node2, tiedToAncestor, out var ancestor2);
if (hasAncestor1 != hasAncestor2)
{
continue;
}
if (hasAncestor1)
{
// Since CategorizeNodesByLabels added nodes to the s1/s2 lists in depth-first prefix order,
// we can also accept equality in the following condition. That's because we find the partner
// of the parent node before we get to finding it for the child node of the same kind.
Debug.Assert(_comparer.GetLabel(ancestor1) <= _comparer.GetLabel(node1));
if (!Contains(ancestor1, ancestor2))
{
continue;
}
}
}
// We know that
// 1. (node1, node2) not in M
// 2. Both of their parents are matched to the same parent (or are not matched)
//
// Now, we have no other choice than comparing the node "values"
// and looking for the one with the smaller distance.
var distance = _comparer.GetDistance(node1, node2);
if (distance < bestDistance)
{
matched = true;
bestMatch = node2;
bestDistance = distance;
// We only stop if we've got an exact match. This is to resolve the problem
// of entities with identical names(name is often used as the "value" of a
// node) but with different "sub-values" (e.g. two locals may have the same name
// but different types. Since the type is not part of the value, we don't want
// to stop looking for the best match if we don't have an exact match).
if (distance == ExactMatchDistance)
{
break;
}
}
}
if (matched && bestDistance <= maxAcceptableDistance)
{
var added = TryAdd(node1, bestMatch);
// We checked above that node1 doesn't have a partner.
// The map is a bijection by construction, so we should be able to add the mapping.
Debug.Assert(added);
// If we exactly matched to firstNonMatch2 we can advance it.
if (i2 == firstNonMatch2)
{
firstNonMatch2 = i2 + 1;
}
if (firstNonMatch2 == count2)
{
return;
}
}
}
}
internal bool TryAdd(TNode node1, TNode node2)
{
Debug.Assert(_comparer.TreesEqual(node1, _root1));
Debug.Assert(_comparer.TreesEqual(node2, _root2));
if (_oneToTwo.ContainsKey(node1) || _twoToOne.ContainsKey(node2))
{
return false;
}
_oneToTwo.Add(node1, node2);
_twoToOne.Add(node2, node1);
return true;
}
internal bool TryGetPartnerInTree1(TNode node2, out TNode partner1)
{
var result = _twoToOne.TryGetValue(node2, out partner1);
Debug.Assert(_comparer.TreesEqual(node2, _root2));
Debug.Assert(!result || _comparer.TreesEqual(partner1, _root1));
return result;
}
internal bool HasPartnerInTree1(TNode node2)
{
Debug.Assert(_comparer.TreesEqual(node2, _root2));
return _twoToOne.ContainsKey(node2);
}
internal bool TryGetPartnerInTree2(TNode node1, out TNode partner2)
{
var result = _oneToTwo.TryGetValue(node1, out partner2);
Debug.Assert(_comparer.TreesEqual(node1, _root1));
Debug.Assert(!result || _comparer.TreesEqual(partner2, _root2));
return result;
}
internal bool HasPartnerInTree2(TNode node1)
{
Debug.Assert(_comparer.TreesEqual(node1, _root1));
return _oneToTwo.ContainsKey(node1);
}
internal bool Contains(TNode node1, TNode node2)
{
Debug.Assert(_comparer.TreesEqual(node2, _root2));
return TryGetPartnerInTree2(node1, out var partner2) && node2.Equals(partner2);
}
public TreeComparer<TNode> Comparer => _comparer;
public TNode OldRoot => _root1;
public TNode NewRoot => _root2;
public IReadOnlyDictionary<TNode, TNode> Matches
{
get
{
return new ReadOnlyDictionary<TNode, TNode>(_oneToTwo);
}
}
public IReadOnlyDictionary<TNode, TNode> ReverseMatches
{
get
{
return new ReadOnlyDictionary<TNode, TNode>(_twoToOne);
}
}
public bool TryGetNewNode(TNode oldNode, out TNode newNode)
=> _oneToTwo.TryGetValue(oldNode, out newNode);
public bool TryGetOldNode(TNode newNode, out TNode oldNode)
=> _twoToOne.TryGetValue(newNode, out oldNode);
/// <summary>
/// Returns an edit script (a sequence of edits) that transform <see cref="OldRoot"/> subtree
/// to <see cref="NewRoot"/> subtree.
/// </summary>
public EditScript<TNode> GetTreeEdits()
=> new(this);
/// <summary>
/// Returns an edit script (a sequence of edits) that transform a sequence of nodes <paramref name="oldNodes"/>
/// to a sequence of nodes <paramref name="newNodes"/>.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="oldNodes"/> or <paramref name="newNodes"/> is a null reference.</exception>
public IEnumerable<Edit<TNode>> GetSequenceEdits(IEnumerable<TNode> oldNodes, IEnumerable<TNode> newNodes)
{
if (oldNodes == null)
{
throw new ArgumentNullException(nameof(oldNodes));
}
if (newNodes == null)
{
throw new ArgumentNullException(nameof(newNodes));
}
var oldList = (oldNodes as IReadOnlyList<TNode>) ?? oldNodes.ToList();
var newList = (newNodes as IReadOnlyList<TNode>) ?? newNodes.ToList();
return new LongestCommonSubsequence(this).GetEdits(oldList, newList);
}
}
}