-
Notifications
You must be signed in to change notification settings - Fork 742
/
NeedlemanWunschEditDistance.java
209 lines (176 loc) · 8.19 KB
/
NeedlemanWunschEditDistance.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/*
* Copyright 2017 The Error Prone Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.errorprone.names;
/**
* The Needleman-Wunsch algorithm for finding least-cost string edit distances between pairs of
* strings. Like Levenshtein, but this version allows for a sequence of adjacent
* deletions/insertions to cost less than the total cost of each individual deletion/insertion, so
* that, for example editing {@code Christopher} into {@code Chris} (dropping 6 characters) is not 6
* times as expensive as editing {@code Christopher} into {@code Christophe}.
*
* <p>See http://en.wikipedia.org/wiki/Needleman-Wunsch_algorithm
*
* @author alanw@google.com (Alan Wendt)
*/
public final class NeedlemanWunschEditDistance {
private NeedlemanWunschEditDistance() {
/* disallow instantiation */
}
/**
* Returns the edit distance between two strings. Levenshtein charges the same cost for each
* insertion or deletion. This algorithm is slightly more general in that it charges a sequence of
* adjacent insertions/deletions an up-front cost plus an incremental cost per insert/delete
* operation. The idea is that Christopher -> Chris should be less than 6 times as expensive as
* Christopher -> Christophe. The algorithm used to calculate this distance takes time and
* space proportional to the product of {@code source.length()} and {@code target.length()} to
* build the 3 arrays.
*
* @param source source string.
* @param target target string
* @param caseSensitive if true, case is used in comparisons and 'a' != 'A'.
* @param changeCost cost of changing one character
* @param openGapCost cost to open a gap to insert or delete some characters.
* @param continueGapCost marginal cost to insert or delete next character.
* @return edit distance between the source and target strings.
*/
public static int getEditDistance(
String source,
String target,
boolean caseSensitive,
int changeCost,
int openGapCost,
int continueGapCost) {
if (!caseSensitive) {
source = source.toLowerCase();
target = target.toLowerCase();
}
int sourceLength = source.length();
int targetLength = target.length();
if (sourceLength == 0) {
return scriptCost(openGapCost, continueGapCost, targetLength);
}
if (targetLength == 0) {
return scriptCost(openGapCost, continueGapCost, sourceLength);
}
// mMatrix[i][j] = Cost of aligning source.substring(0,i) with
// target.substring(0,j), using an edit script ending with
// matched characters.
int[][] mMatrix = new int[sourceLength + 1][targetLength + 1];
// Cost of an alignment that ends with a bunch of deletions.
// dMatrix[i][j] = best found cost of changing the first i chars
// of source into the first j chars of target, ending with one
// or more deletes of source characters.
int[][] dMatrix = new int[sourceLength + 1][targetLength + 1];
// Cost of an alignment that ends with one or more insertions.
int[][] iMatrix = new int[sourceLength + 1][targetLength + 1];
mMatrix[0][0] = dMatrix[0][0] = iMatrix[0][0] = 0;
// Any edit script that changes i chars of source into zero
// chars of target will only involve deletions. So only the
// d&m Matrix entries are relevant, because dMatrix[i][0] gives
// the cost of changing an i-length string into a 0-length string,
// using an edit script ending in deletions.
for (int i = 1; i <= sourceLength; i++) {
mMatrix[i][0] = dMatrix[i][0] = scriptCost(openGapCost, continueGapCost, i);
// Make the iMatrix entries impossibly expensive, so they'll be
// ignored as inputs to min(). Use a big cost but not
// max int because that will overflow if anything's added to it.
iMatrix[i][0] = Integer.MAX_VALUE / 2;
}
for (int j = 1; j <= targetLength; j++) {
// Only the i&m Matrix entries are relevant here, because they represent
// the cost of changing a 0-length string into a j-length string, using
// an edit script ending in insertions.
mMatrix[0][j] = iMatrix[0][j] = scriptCost(openGapCost, continueGapCost, j);
// Make the dMatrix entries impossibly expensive, so they'll be
// ignored as inputs to min(). Use a big cost but not
// max int because that will overflow if anything's added to it.
dMatrix[0][j] = Integer.MAX_VALUE / 2;
}
for (int i = 1; i <= sourceLength; i++) {
char sourceI = source.charAt(i - 1);
for (int j = 1; j <= targetLength; j++) {
char targetJ = target.charAt(j - 1);
int cost = (sourceI == targetJ) ? 0 : changeCost;
// Cost of changing i chars of source into j chars of target,
// using an edit script ending in matched characters.
mMatrix[i][j] =
cost
+ Math.min(
mMatrix[i - 1][j - 1], Math.min(iMatrix[i - 1][j - 1], dMatrix[i - 1][j - 1]));
// Cost of an edit script ending in a deletion.
dMatrix[i][j] =
Math.min(
mMatrix[i - 1][j] + openGapCost + continueGapCost,
dMatrix[i - 1][j] + continueGapCost);
// Cost of an edit script ending in an insertion.
iMatrix[i][j] =
Math.min(
mMatrix[i][j - 1] + openGapCost + continueGapCost,
iMatrix[i][j - 1] + continueGapCost);
}
}
// Return the minimum cost.
int costOfEditScriptEndingWithMatch = mMatrix[sourceLength][targetLength];
int costOfEditScriptEndingWithDelete = dMatrix[sourceLength][targetLength];
int costOfEditScriptEndingWithInsert = iMatrix[sourceLength][targetLength];
return Math.min(
costOfEditScriptEndingWithMatch,
Math.min(costOfEditScriptEndingWithDelete, costOfEditScriptEndingWithInsert));
}
/** Return the worst case edit distance between strings of this length */
public static int getWorstCaseEditDistance(
int sourceLength, int targetLength, int changeCost, int openGapCost, int continueGapCost) {
int maxLen = Math.max(sourceLength, targetLength);
int minLen = Math.min(sourceLength, targetLength);
// Compute maximum cost of changing one string into another. If the
// lengths differ, you'll need maxLen - minLen insertions or deletions.
int totChangeCost =
scriptCost(openGapCost, continueGapCost, maxLen - minLen) + minLen * changeCost;
// Another possibility is to just delete the entire source and insert the
// target, and not do any changes.
int blowAwayCost =
scriptCost(openGapCost, continueGapCost, sourceLength)
+ scriptCost(openGapCost, continueGapCost, targetLength);
return Math.min(totChangeCost, blowAwayCost);
}
/**
* Returns a normalized edit distance between 0 and 1. This is useful if you are comparing or
* aggregating distances of different pairs of strings
*/
public static double getNormalizedEditDistance(
String source,
String target,
boolean caseSensitive,
int changeCost,
int openGapCost,
int continueGapCost) {
if (source.isEmpty() && target.isEmpty()) {
return 0.0;
}
return (double)
getEditDistance(source, target, caseSensitive, changeCost, openGapCost, continueGapCost)
/ (double)
getWorstCaseEditDistance(
source.length(), target.length(), changeCost, openGapCost, continueGapCost);
}
/**
* Return the cost of a script consisting of a contiguous sequence of insertions or a contiguous
* sequence of deletions.
*/
private static int scriptCost(int openGapCost, int continueGapCost, int scriptLength) {
return (scriptLength == 0) ? 0 : openGapCost + scriptLength * continueGapCost;
}
}