-
Notifications
You must be signed in to change notification settings - Fork 5
/
StringCompare.java
199 lines (185 loc) · 6.82 KB
/
StringCompare.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
package org.genericsystem.cv.utils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * This class contains various static methods that can be used to compare {@link String} objects:
 * shingle (k-gram) decomposition, exact and fuzzy substring detection, and pairwise similarity.
 * <p>
 * All case folding is done with {@link Locale#ROOT} so that results do not depend on the default locale of the JVM.
 *
 * @author Pierrik Lassalas
 */
public class StringCompare {

    /** Shingle size used by the overloads that do not take an explicit <code>k</code>. */
    private static final int DEFAULT_K = 3;

    /** Matches one or more consecutive whitespace characters. */
    private static final Pattern SPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Describe the method used to compute the similarity
     *
     * @author Pierrik Lassalas
     */
    public enum SIMILARITY {
        /**
         * Use the normalized Levenshtein distance (similarity = 1 - distance)
         */
        LEVENSHTEIN,
        /**
         * Use letter-pairs similarity
         */
        LETTER_PAIRS,
        /**
         * Use the cosine similarity to compare individual chars
         */
        COSINE_CHAR,
        /**
         * Use the cosine similarity to compare individual words
         */
        COSINE_WORD;
    }

    /**
     * Small demonstration entry point: prints the result of a fuzzy substring search.
     *
     * @param args - unused
     */
    public static void main(String[] args) {
        System.out.println(containsSubstring("Bonjour mon ami, avez-vous vu le soleil ?", "solell", 0.7, SIMILARITY.LEVENSHTEIN));
    }

    /**
     * Check whether a candidate string is contained in the original string. The comparison ignores case and
     * whitespace: both strings are lowercased and stripped of all whitespace before being compared.
     *
     * @param original - the original string
     * @param candidate - the candidate, expected to be found in <code>original</code>
     * @return <code>true</code> if a match was found, <code>false</code> otherwise (including when either string
     *         contains only whitespace)
     * @throws IllegalArgumentException if <code>original</code> or <code>candidate</code> is <code>null</code>
     */
    public static boolean containsSubstring(String original, String candidate) {
        if (isEitherBlank(original, candidate)) {
            return false;
        }
        String needle = lowerCaseWithoutSpaces(candidate);
        // Every substring of the right length is a shingle, so a set lookup is enough.
        return getShinglesSpacesRemoved(original.toLowerCase(Locale.ROOT), needle.length()).contains(needle);
    }

    /**
     * Check whether a candidate string is approximately contained in the original string. Each shingle of the
     * original (lowercased, whitespace removed) that has the same length as the normalized candidate is compared
     * to the candidate using the requested similarity.
     *
     * @param original - the original string
     * @param candidate - the candidate, expected to be found in <code>original</code>
     * @param similarityThreshold - the similarity threshold; a shingle matches only when its similarity is
     *            strictly greater than this value
     * @param option - the method to be used for string comparison
     * @return <code>true</code> if a match was found, <code>false</code> otherwise (including when either string
     *         contains only whitespace)
     * @throws IllegalArgumentException if <code>original</code> or <code>candidate</code> is <code>null</code>
     */
    public static boolean containsSubstring(String original, String candidate, double similarityThreshold, SIMILARITY option) {
        if (isEitherBlank(original, candidate)) {
            return false;
        }
        String needle = lowerCaseWithoutSpaces(candidate);
        Set<String> shingles = getShinglesSpacesRemoved(original.toLowerCase(Locale.ROOT), needle.length());
        return shingles.stream().anyMatch(shingle -> compare(shingle, needle, option) > similarityThreshold);
    }

    /**
     * Compute the similarity between two strings.
     *
     * @param string1 - the first string
     * @param string2 - the second string
     * @param option - the method to be used for string comparison
     * @return the score computed by the selected method (expected to lie between 0 and 1)
     * @throws NullPointerException if <code>option</code> is <code>null</code>
     */
    public static double compare(String string1, String string2, SIMILARITY option) {
        switch (option) {
        case LETTER_PAIRS:
            return LetterPairSimilarity.compareStrings(string1, string2);
        case COSINE_CHAR:
            return CosineSimilarity.cosineSimilarity(string1, string2, CosineSimilarity.PATTERN.SINGLE_CHAR);
        case COSINE_WORD:
            return CosineSimilarity.cosineSimilarity(string1, string2, CosineSimilarity.PATTERN.WORDS);
        case LEVENSHTEIN:
        default:
            // Levenshtein is also the fallback for any constant without a dedicated case.
            return Levenshtein.similarity(string1, string2);
        }
    }

    /**
     * Compute the similarity between the members of a list of strings, as the mean of the similarities of all
     * unordered pairs.
     *
     * @param strings - the list of strings
     * @param option - the method to be used for string comparison
     * @return the mean pairwise score; <code>1</code> for a single-element list and <code>0</code> for an empty
     *         list (there is no pair to compare, and <code>0</code> never exceeds a non-negative threshold)
     * @throws IllegalArgumentException if <code>strings</code> is <code>null</code>
     */
    public static double similarity(List<String> strings, SIMILARITY option) {
        if (strings == null) {
            throw new IllegalArgumentException("Attempt to compare a null list of strings");
        }
        int n = strings.size();
        if (n == 0) {
            return 0; // avoids the 0/0 = NaN of the general formula
        }
        if (n == 1) {
            return 1;
        }
        double sum = 0;
        for (int i = 0; i < n; i++) {
            for (int j = i + 1; j < n; j++) {
                sum += compare(strings.get(i), strings.get(j), option);
            }
        }
        // n * (n - 1) / 2 pairs were compared; computed as a double so that it cannot overflow an int
        return 2 * sum / ((double) n * (n - 1));
    }

    /**
     * Decompose a string in shingles of default size, without considering words (i.e., all spaces are removed).
     *
     * @param string - the input string
     * @return a {@link Set} containing the individual shingles (lowercased); empty if <code>string</code> is
     *         <code>null</code> or empty
     */
    public static Set<String> getShinglesSpacesRemoved(final String string) {
        return getShinglesSpacesRemoved(string, DEFAULT_K);
    }

    /**
     * Decompose a string in shingles of size <code>k</code>, without considering words (i.e., all spaces are removed).
     *
     * @param string - the input string
     * @param k - the size of the shingles
     * @return a {@link Set} containing the individual shingles (lowercased); empty if <code>string</code> is
     *         <code>null</code> or empty
     * @throws IllegalArgumentException if <code>k</code> is not strictly positive
     */
    public static Set<String> getShinglesSpacesRemoved(final String string, int k) {
        // A null input is forwarded untouched so that it is handled exactly like in getShingles(String, int).
        String stringNoSpaces = string == null ? null : SPACE_PATTERN.matcher(string).replaceAll("");
        return getShingles(stringNoSpaces, k);
    }

    /**
     * Decompose a string in shingles of default size.
     *
     * @param string - the input string
     * @return a {@link Set} containing the individual shingles (lowercased); empty if <code>string</code> is
     *         <code>null</code> or empty
     */
    public static Set<String> getShingles(final String string) {
        return getShingles(string, DEFAULT_K);
    }

    /**
     * Decompose a string in small chunks (shingles) of size <code>k</code>. The string is lowercased and split on
     * whitespace; shingles are taken inside each word and never span two words. Words shorter than <code>k</code>
     * do not contribute any shingle.
     *
     * @param string - the input string
     * @param k - the size of the shingles
     * @return a {@link Set} containing the individual shingles (lowercased); empty if <code>string</code> is
     *         <code>null</code> or empty
     * @throws IllegalArgumentException if <code>k</code> is not strictly positive
     */
    public static Set<String> getShingles(final String string, int k) {
        if (k <= 0) {
            throw new IllegalArgumentException(String.format("k should be positive (provided value: %d)", k));
        }
        if (string == null || string.isEmpty()) {
            return Collections.emptySet();
        }
        return SPACE_PATTERN.splitAsStream(string.toLowerCase(Locale.ROOT))
                .flatMap(word -> wordShingles(word, k))
                .collect(Collectors.toSet());
    }

    /** Validates that neither argument is null, then tells whether at least one of them contains only whitespace. */
    private static boolean isEitherBlank(String original, String candidate) {
        if (original == null || candidate == null) {
            throw new IllegalArgumentException("Attempt to compare one (or more) null strings");
        }
        return original.trim().isEmpty() || candidate.trim().isEmpty();
    }

    /** Lowercases the text (locale-independently), then removes every whitespace character from it. */
    private static String lowerCaseWithoutSpaces(String text) {
        return SPACE_PATTERN.matcher(text.toLowerCase(Locale.ROOT)).replaceAll("");
    }

    /** Streams every substring of length k of a single word, or nothing if the word is shorter than k. */
    private static Stream<String> wordShingles(String word, int k) {
        int count = word.length() - k + 1;
        if (count <= 0) {
            return Stream.empty();
        }
        List<String> chunks = new ArrayList<>(count);
        for (int start = 0; start < count; start++) {
            chunks.add(word.substring(start, start + k));
        }
        return chunks.stream();
    }
}