-
Notifications
You must be signed in to change notification settings - Fork 0
/
JaroWinklerDistance.java
350 lines (321 loc) · 12.4 KB
/
JaroWinklerDistance.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
package eu.fbk.utils.core.strings;
/**
* A similarity algorithm indicating the percentage of matched characters between two character sequences.
*
* <p>
* The Jaro measure is the weighted sum of percentage of matched characters
* from each file and transposed characters. Winkler increased this measure
* for matching initial characters.
* </p>
*
* <p>
* This implementation is based on the Jaro Winkler similarity algorithm
* from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
* http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
* </p>
*
* <p>
* This code has been adapted from Apache Commons Lang 3.3.
* </p>
*/
public class JaroWinklerDistance implements EditDistance<Double> {
/**
* The default prefix length limit set to four.
*/
private static final int PREFIX_LENGTH_LIMIT = 4;
/**
* Represents a failed index search.
*/
public static final int INDEX_NOT_FOUND = -1;
/**
* Find the Jaro Winkler Distance which indicates the similarity score
* between two CharSequences.
*
* <pre>
* distance.apply(null, null) = IllegalArgumentException
* distance.apply("","") = 0.0
* distance.apply("","a") = 0.0
* distance.apply("aaapppp", "") = 0.0
* distance.apply("frog", "fog") = 0.93
* distance.apply("fly", "ant") = 0.0
* distance.apply("elephant", "hippo") = 0.44
* distance.apply("hippo", "elephant") = 0.44
* distance.apply("hippo", "zzzzzzzz") = 0.0
* distance.apply("hello", "hallo") = 0.88
* distance.apply("ABC Corporation", "ABC Corp") = 0.91
* distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93
* distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94
* distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9
* </pre>
*
* @param left the first String, must not be null
* @param right the second String, must not be null
* @return result distance
* @throws IllegalArgumentException if either String input {@code null}
*/
public Double apply(CharSequence left, CharSequence right) {
final double defaultScalingFactor = 0.1;
final double percentageRoundValue = 100.0;
if (left == null || right == null) {
throw new IllegalArgumentException("Strings must not be null");
}
final double jaro = score(left, right);
final int cl = commonPrefixLength(left, right);
final double matchScore = Math.round((jaro + defaultScalingFactor
* cl * (1.0 - jaro)) * percentageRoundValue) / percentageRoundValue;
return matchScore;
}
/**
* Calculates the number of characters from the beginning of the strings
* that match exactly one-to-one, up to a maximum of four (4) characters.
*
* @param first The first string.
* @param second The second string.
* @return A number between 0 and 4.
*/
private static int commonPrefixLength(final CharSequence first,
final CharSequence second) {
final int result = getCommonPrefix(first.toString(), second.toString())
.length();
// Limit the result to 4.
return result > PREFIX_LENGTH_LIMIT ? PREFIX_LENGTH_LIMIT : result;
}
/**
* Compares all Strings in an array and returns the initial sequence of
* characters that is common to all of them.
*
* <p>
* For example,
* <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) -> "i am a "</code>
* </p>
*
* <pre>
* getCommonPrefix(null) = ""
* getCommonPrefix(new String[] {}) = ""
* getCommonPrefix(new String[] {"abc"}) = "abc"
* getCommonPrefix(new String[] {null, null}) = ""
* getCommonPrefix(new String[] {"", ""}) = ""
* getCommonPrefix(new String[] {"", null}) = ""
* getCommonPrefix(new String[] {"abc", null, null}) = ""
* getCommonPrefix(new String[] {null, null, "abc"}) = ""
* getCommonPrefix(new String[] {"", "abc"}) = ""
* getCommonPrefix(new String[] {"abc", ""}) = ""
* getCommonPrefix(new String[] {"abc", "abc"}) = "abc"
* getCommonPrefix(new String[] {"abc", "a"}) = "a"
* getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab"
* getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab"
* getCommonPrefix(new String[] {"abcde", "xyz"}) = ""
* getCommonPrefix(new String[] {"xyz", "abcde"}) = ""
* getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i am a "
* </pre>
*
* @param strs array of String objects, entries may be null
* @return the initial sequence of characters that are common to all Strings
* in the array; empty String if the array is null, the elements are
* all null or if there is no common prefix.
*/
public static String getCommonPrefix(final String... strs) {
if (strs == null || strs.length == 0) {
return "";
}
final int smallestIndexOfDiff = indexOfDifference(strs);
if (smallestIndexOfDiff == INDEX_NOT_FOUND) {
// all strings were identical
if (strs[0] == null) {
return "";
}
return strs[0];
} else if (smallestIndexOfDiff == 0) {
// there were no common initial characters
return "";
} else {
// we found a common initial character sequence
return strs[0].substring(0, smallestIndexOfDiff);
}
}
/**
* This method returns the Jaro-Winkler score for string matching.
*
* @param first the first string to be matched
* @param second the second string to be machted
* @return matching score without scaling factor impact
*/
protected static double score(final CharSequence first,
final CharSequence second) {
String shorter;
String longer;
// Determine which String is longer.
if (first.length() > second.length()) {
longer = first.toString().toLowerCase();
shorter = second.toString().toLowerCase();
} else {
longer = second.toString().toLowerCase();
shorter = first.toString().toLowerCase();
}
// Calculate the half length() distance of the shorter String.
final int halflength = shorter.length() / 2 + 1;
// Find the set of matching characters between the shorter and longer
// strings. Note that
// the set of matching characters may be different depending on the
// order of the strings.
final String m1 = getSetOfMatchingCharacterWithin(shorter, longer,
halflength);
final String m2 = getSetOfMatchingCharacterWithin(longer, shorter,
halflength);
// If one or both of the sets of common characters is empty, then
// there is no similarity between the two strings.
if (m1.length() == 0 || m2.length() == 0) {
return 0.0;
}
// If the set of common characters is not the same size, then
// there is no similarity between the two strings, either.
if (m1.length() != m2.length()) {
return 0.0;
}
// Calculate the number of transposition between the two sets
// of common characters.
final int transpositions = transpositions(m1, m2);
final double defaultDenominator = 3.0;
// Calculate the distance.
final double dist = (m1.length() / ((double) shorter.length())
+ m2.length() / ((double) longer.length()) + (m1.length() - transpositions)
/ ((double) m1.length())) / defaultDenominator;
return dist;
}
/**
* Calculates the number of transposition between two strings.
*
* @param first The first string.
* @param second The second string.
* @return The number of transposition between the two strings.
*/
protected static int transpositions(final CharSequence first,
final CharSequence second) {
int transpositions = 0;
for (int i = 0; i < first.length(); i++) {
if (first.charAt(i) != second.charAt(i)) {
transpositions++;
}
}
return transpositions / 2;
}
/**
* Compares all CharSequences in an array and returns the index at which the
* CharSequences begin to differ.
*
* <p>
* For example,
* <code>indexOfDifference(new String[] {"i am a machine", "i am a robot"}) -> 7</code>
* </p>
*
* <pre>
* distance.indexOfDifference(null) = -1
* distance.indexOfDifference(new String[] {}) = -1
* distance.indexOfDifference(new String[] {"abc"}) = -1
* distance.indexOfDifference(new String[] {null, null}) = -1
* distance.indexOfDifference(new String[] {"", ""}) = -1
* distance.indexOfDifference(new String[] {"", null}) = 0
* distance.indexOfDifference(new String[] {"abc", null, null}) = 0
* distance.indexOfDifference(new String[] {null, null, "abc"}) = 0
* distance.indexOfDifference(new String[] {"", "abc"}) = 0
* distance.indexOfDifference(new String[] {"abc", ""}) = 0
* distance.indexOfDifference(new String[] {"abc", "abc"}) = -1
* distance.indexOfDifference(new String[] {"abc", "a"}) = 1
* distance.indexOfDifference(new String[] {"ab", "abxyz"}) = 2
* distance.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2
* distance.indexOfDifference(new String[] {"abcde", "xyz"}) = 0
* distance.indexOfDifference(new String[] {"xyz", "abcde"}) = 0
* distance.indexOfDifference(new String[] {"i am a machine", "i am a robot"}) = 7
* </pre>
*
* @param css array of CharSequences, entries may be null
* @return the index where the strings begin to differ; -1 if they are all
* equal
*/
protected static int indexOfDifference(final CharSequence... css) {
if (css == null || css.length <= 1) {
return INDEX_NOT_FOUND;
}
boolean anyStringNull = false;
boolean allStringsNull = true;
final int arrayLen = css.length;
int shortestStrLen = Integer.MAX_VALUE;
int longestStrLen = 0;
// find the min and max string lengths; this avoids checking to make
// sure we are not exceeding the length of the string each time through
// the bottom loop.
for (int i = 0; i < arrayLen; i++) {
if (css[i] == null) {
anyStringNull = true;
shortestStrLen = 0;
} else {
allStringsNull = false;
shortestStrLen = Math.min(css[i].length(), shortestStrLen);
longestStrLen = Math.max(css[i].length(), longestStrLen);
}
}
// handle lists containing all nulls or all empty strings
if (allStringsNull || longestStrLen == 0 && !anyStringNull) {
return INDEX_NOT_FOUND;
}
// handle lists containing some nulls or some empty strings
if (shortestStrLen == 0) {
return 0;
}
// find the position with the first difference across all strings
int firstDiff = -1;
for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) {
final char comparisonChar = css[0].charAt(stringPos);
for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) {
if (css[arrayPos].charAt(stringPos) != comparisonChar) {
firstDiff = stringPos;
break;
}
}
if (firstDiff != -1) {
break;
}
}
if (firstDiff == -1 && shortestStrLen != longestStrLen) {
// we compared all of the characters up to the length of the
// shortest string and didn't find a match, but the string lengths
// vary, so return the length of the shortest string.
return shortestStrLen;
}
return firstDiff;
}
/**
* Gets a set of matching characters between two strings.
*
* <p>
* Two characters from the first string and the second string are
* considered matching if the character's respective positions are no
* farther than the limit value.
* </p>
*
* @param first The first string.
* @param second The second string.
* @param limit The maximum distance to consider.
* @return A string contain the set of common characters.
*/
protected static String getSetOfMatchingCharacterWithin(
final CharSequence first, final CharSequence second, final int limit) {
final StringBuilder common = new StringBuilder();
final StringBuilder copy = new StringBuilder(second);
for (int i = 0; i < first.length(); i++) {
final char ch = first.charAt(i);
boolean found = false;
// See if the character is within the limit positions away from the
// original position of that character.
for (int j = Math.max(0, i - limit); !found
&& j < Math.min(i + limit, second.length()); j++) {
if (copy.charAt(j) == ch) {
found = true;
common.append(ch);
copy.setCharAt(j, '*');
}
}
}
return common.toString();
}
}