Skip to content

Commit

Permalink
Myers diff algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
fisharebest committed Oct 20, 2015
1 parent d2272de commit 2d2df57
Show file tree
Hide file tree
Showing 4 changed files with 317 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
CHANGE LOG
==========

## 1.1.0 (2015-10-20)
- Myers’ diff

## 1.0.1 (2015-05-15)
- Exclude test scripts in export.

Expand Down
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,34 @@ $path = $dijkstra->shortestPaths('A', 'E'); // array(array('A', 'B', 'D', 'E'))
$path = $dijkstra->shortestPaths('A', 'E', array('B')); // array(array('A', 'B', 'D', 'E'))
$path = $dijkstra->shortestPaths('A', 'E', array('D')); // array(array('A', 'B', 'C', 'E'))
$path = $dijkstra->shortestPaths('A', 'E', array('B', 'D')); // array(array('A', 'F', 'C', 'E'))
```

## Myers’ diff

Find the difference between two sequences of tokens (characters, words, lines, etc.) using
[An O(ND) Difference Algorithm and Its Variations](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.6927)
by Eugene W. Myers.

The output can be interpreted as either:

* A series of instructions to transform the first sequence into the second sequence.
* A list of matches (tokens that appear in both sequences) and mismatches (tokens that appear in
just one sequence).

``` php
$x = array('a', 'b', 'c', 'a', 'b', 'b', 'a');
$y = array('c', 'b', 'a', 'b', 'a', 'c');
$algorithm = new MyersDiff;
$diff = $algorithm->calculate($x, $y);
// array(
// array('a', MyersDiff::DELETE), i.e. 'a' occurs only in $x
// array('b', MyersDiff::DELETE), i.e. 'b' occurs only in $x
// array('c', MyersDiff::KEEP), i.e. 'c' occurs in both $x and $y
// array('b', MyersDiff::INSERT), i.e. 'b' occurs only in $y
// array('a', MyersDiff::KEEP), i.e. 'a' occurs in both $x and $y
// array('b', MyersDiff::KEEP), i.e. 'b' occurs in both $x and $y
// array('b', MyersDiff::DELETE), i.e. 'b' occurs only in $x
// array('a', MyersDiff::KEEP), i.e. 'a' occurs in both $x and $y
// array('c', MyersDiff::INSERT), i.e. 'c' occurs only in $y
// );
```
153 changes: 153 additions & 0 deletions src/MyersDiff.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
<?php
namespace Fisharebest\Algorithm;

/**
* @package fisharebest/algorithm
* @author Greg Roach <greg@subaqua.co.uk>
* @copyright (c) 2015 Greg Roach <greg@subaqua.co.uk>
* @license GPL-3.0+
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/**
* Class MyersDiff - find the shortest edit sequence to transform one string into another.
*
* Based on "An O(ND) Difference Algorithm and Its Variations" by Eugene W Myers.
*
* http://www.xmailserver.org/diff2.pdf
* http://www.codeproject.com/Articles/42279/Investigating-Myers-diff-algorithm-Part-of
*/
class MyersDiff {
    /** Instruction to delete a token which only appears in the first sequence */
    const DELETE = -1;

    /** Instruction to keep a token which is common to both sequences */
    const KEEP = 0;

    /** Instruction to insert a token which only appears in the last sequence */
    const INSERT = 1;

    /**
     * When one of the sequences is empty, there is nothing to compare: every
     * token of the other sequence simply receives the same instruction.
     *
     * @param array $x      - tokens of the non-empty sequence
     * @param int   $action - instruction to pair with every token (self::INSERT or self::DELETE)
     *
     * @return array[] - pairs of token and edit
     */
    private function degenerateCase(array $x, $action) {
        $solution = array();
        foreach ($x as $token) {
            $solution[] = array($token, $action);
        }

        return $solution;
    }

    /**
     * Calculate the shortest edit sequence to convert $a into $b.
     *
     * Tokens are compared with the identity operator (===).
     *
     * @param string[] $a - tokens (characters, words or lines)
     * @param string[] $b - tokens (characters, words or lines)
     *
     * @return array[] - pairs of token and edit (-1 for delete, 0 for keep, +1 for insert)
     */
    public function calculate(array $a, array $b) {
        // Check for degenerate cases.
        if (empty($a)) {
            return $this->degenerateCase($b, self::INSERT);
        }
        if (empty($b)) {
            return $this->degenerateCase($a, self::DELETE);
        }

        // The algorithm uses array keys numbered from one.
        $n   = count($a);
        $m   = count($b);
        $a   = array_combine(range(1, $n), array_values($a));
        $b   = array_combine(range(1, $m), array_values($b));
        $max = $m + $n;

        // Keep a copy of $v after each iteration of $d.
        $v_save = array();

        // Find the shortest "D-path".
        // Seed value, so that the first step (d = 0, k = 0) starts from x = 0.
        $v    = array();
        $v[1] = 0;
        for ($d = 0; $d <= $max; ++$d) {
            // Examine all possible "K-lines" for this "D-path".
            for ($k = -$d; $k <= $d; $k += 2) {
                if ($k === -$d || $k !== $d && $v[$k - 1] < $v[$k + 1]) {
                    // Move down.
                    $x = $v[$k + 1];
                } else {
                    // Move right.
                    $x = $v[$k - 1] + 1;
                }
                // Derive Y from X.
                $y = $x - $k;
                // Follow the diagonal.
                while ($x < $n && $y < $m && $a[$x + 1] === $b[$y + 1]) {
                    ++$x;
                    ++$y;
                }
                // Just store X, as we can calculate Y (from X - K).
                $v[$k] = $x;
                // Solution found?
                if ($x === $n && $y === $m) {
                    // Snapshot the state of the final (partial) iteration.
                    $v_save[$d] = $v;
                    break 2;
                }
            }
            // Only the state at the end of each iteration is needed for back-tracking.
            $v_save[$d] = $v;
        }

        // Extract the solution by back-tracking through the saved results.
        // Continue until the origin is reached: stopping as soon as the path
        // touches an axis would lose the edits that lie along that axis, and
        // they would then be mis-reported as matches.
        $x      = $n;
        $y      = $m;
        $snakes = array();
        for ($d = count($v_save) - 1; $x > 0 || $y > 0; --$d) {
            array_unshift($snakes, array($x, $y));

            $v = $v_save[$d];
            $k = $x - $y;

            // Repeat the decision made on the forward pass to find the previous K-line.
            if ($k === -$d || $k !== $d && $v[$k - 1] < $v[$k + 1]) {
                $k_prev = $k + 1;
            } else {
                $k_prev = $k - 1;
            }

            $x = $v[$k_prev];
            $y = $x - $k_prev;
        }

        // Extract the solution from the snake endpoints
        $solution = array();
        $x        = 0;
        $y        = 0;
        foreach ($snakes as $snake) {
            // Horizontals
            while ($snake[0] - $snake[1] > $x - $y) {
                ++$x;
                $solution[] = array($a[$x], self::DELETE);
            }
            // Verticals
            while ($snake[0] - $snake[1] < $x - $y) {
                ++$y;
                $solution[] = array($b[$y], self::INSERT);
            }
            // Diagonals
            while ($x < $snake[0]) {
                ++$x;
                ++$y;
                $solution[] = array($a[$x], self::KEEP);
            }
        }

        return $solution;
    }
}
133 changes: 133 additions & 0 deletions test/MyersDiffTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
<?php
namespace Fisharebest\Algorithm;

/**
* @package fisharebest/algorithm
* @author Greg Roach <greg@subaqua.co.uk>
* @copyright (c) 2015 Greg Roach <greg@subaqua.co.uk>
* @license GPL-3.0+
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

class MyersDiffTest extends \PHPUnit_Framework_TestCase {
    /**
     * Two empty sequences produce an empty edit script.
     */
    public function testBothEmpty() {
        $differ   = new MyersDiff;
        $expected = array();

        $this->assertSame($expected, $differ->calculate(array(), array()));
    }

    /**
     * An empty first sequence means every token of the second is an insertion.
     */
    public function testFirstEmpty() {
        $differ   = new MyersDiff;
        $from     = array();
        $to       = array('a', 'b', 'c');
        $expected = array(
            array('a', MyersDiff::INSERT),
            array('b', MyersDiff::INSERT),
            array('c', MyersDiff::INSERT),
        );

        $actual = $differ->calculate($from, $to);

        $this->assertSame($expected, $actual);
    }

    /**
     * An empty second sequence means every token of the first is a deletion.
     */
    public function testSecondEmpty() {
        $differ   = new MyersDiff;
        $from     = array('a', 'b', 'c');
        $to       = array();
        $expected = array(
            array('a', MyersDiff::DELETE),
            array('b', MyersDiff::DELETE),
            array('c', MyersDiff::DELETE),
        );

        $actual = $differ->calculate($from, $to);

        $this->assertSame($expected, $actual);
    }

    /**
     * Identical sequences consist solely of kept tokens.
     */
    public function testIdentical() {
        $differ   = new MyersDiff;
        $from     = array('a', 'b', 'c');
        $to       = array('a', 'b', 'c');
        $expected = array(
            array('a', MyersDiff::KEEP),
            array('b', MyersDiff::KEEP),
            array('c', MyersDiff::KEEP),
        );

        $actual = $differ->calculate($from, $to);

        $this->assertSame($expected, $actual);
    }

    /**
     * Two non-empty, different sequences: a mixture of deletions, insertions and kept tokens.
     */
    public function testBothNonEmpty() {
        $differ   = new MyersDiff;
        $from     = array('a', 'b', 'c', 'a', 'b', 'b', 'a');
        $to       = array('c', 'b', 'a', 'b', 'a', 'c');
        $expected = array(
            array('a', MyersDiff::DELETE),
            array('b', MyersDiff::DELETE),
            array('c', MyersDiff::KEEP),
            array('b', MyersDiff::INSERT),
            array('a', MyersDiff::KEEP),
            array('b', MyersDiff::KEEP),
            array('b', MyersDiff::DELETE),
            array('a', MyersDiff::KEEP),
            array('c', MyersDiff::INSERT),
        );

        $actual = $differ->calculate($from, $to);

        $this->assertSame($expected, $actual);
    }

    /**
     * When a token is replaced, the deletion is reported before the insertion.
     */
    public function testDeleteBeforeInsert() {
        $differ   = new MyersDiff;
        $from     = array('a', 'b', 'c');
        $to       = array('a', 'd', 'c');
        $expected = array(
            array('a', MyersDiff::KEEP),
            array('b', MyersDiff::DELETE),
            array('d', MyersDiff::INSERT),
            array('c', MyersDiff::KEEP),
        );

        $actual = $differ->calculate($from, $to);

        $this->assertSame($expected, $actual);
    }
}

0 comments on commit 2d2df57

Please sign in to comment.