-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
levenshtein.cr
158 lines (137 loc) · 4.07 KB
/
levenshtein.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Levenshtein distance methods.
module Levenshtein
# Computes the [levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance) of two strings.
#
# ```
# require "levenshtein"
#
# Levenshtein.distance("algorithm", "altruistic") # => 6
# Levenshtein.distance("hello", "hallo") # => 1
# Levenshtein.distance("こんにちは", "こんちは") # => 1
# Levenshtein.distance("hey", "hey") # => 0
# ```
def self.distance(string1 : String, string2 : String) : Int32
return 0 if string1 == string2
s_size = string1.size
t_size = string2.size
return t_size if s_size == 0
return s_size if t_size == 0
# This is to allocate less memory
if t_size > s_size
string1, string2 = string2, string1
t_size, s_size = s_size, t_size
end
costs = Slice(Int32).new(t_size + 1) { |i| i }
last_cost = 0
if string1.single_byte_optimizable? && string2.single_byte_optimizable?
s = string1.to_unsafe
t = string2.to_unsafe
s_size.times do |i|
last_cost = i + 1
t_size.times do |j|
sub_cost = s[i] == t[j] ? 0 : 1
cost = Math.min(Math.min(last_cost + 1, costs[j + 1] + 1), costs[j] + sub_cost)
costs[j] = last_cost
last_cost = cost
end
costs[t_size] = last_cost
end
last_cost
else
reader = Char::Reader.new(string1)
# Use an array instead of a reader to decode the second string only once
chars = string2.chars
reader.each_with_index do |char1, i|
last_cost = i + 1
chars.each_with_index do |char2, j|
sub_cost = char1 == char2 ? 0 : 1
cost = Math.min(Math.min(last_cost + 1, costs[j + 1] + 1), costs[j] + sub_cost)
costs[j] = last_cost
last_cost = cost
end
costs[t_size] = last_cost
end
last_cost
end
end
# Finds the closest string to a given string amongst many strings.
#
# ```
# require "levenshtein"
#
# finder = Levenshtein::Finder.new "hallo"
# finder.test "hay"
# finder.test "hall"
# finder.test "hallo world"
#
# finder.best_match # => "hall"
# ```
class Finder
# :nodoc:
record Entry,
value : String,
distance : Int32
@tolerance : Int32
def initialize(@target : String, tolerance : Int? = nil)
@tolerance = tolerance || (target.size / 5.0).ceil.to_i
end
def test(name : String, value : String = name)
distance = Levenshtein.distance(@target, name)
if distance <= @tolerance
if best_entry = @best_entry
if distance < best_entry.distance
@best_entry = Entry.new(value, distance)
end
else
@best_entry = Entry.new(value, distance)
end
end
end
def best_match : String?
@best_entry.try &.value
end
def self.find(name, tolerance = nil)
sn = new name, tolerance
yield sn
sn.best_match
end
def self.find(name, all_names, tolerance = nil) : String?
find(name, tolerance) do |similar|
all_names.each do |a_name|
similar.test(a_name)
end
end
end
end
# Finds the best match for *name* among strings added within the given block.
# *tolerance* can be used to set maximum Levenshtein distance allowed.
#
# ```
# require "levenshtein"
#
# best_match = Levenshtein.find("hello") do |l|
# l.test "hulk"
# l.test "holk"
# l.test "halka"
# l.test "ello"
# end
# best_match # => "ello"
# ```
def self.find(name, tolerance = nil)
Finder.find(name, tolerance) do |sn|
yield sn
end
end
# Finds the best match for *name* among strings provided in *all_names*.
# *tolerance* can be used to set maximum Levenshtein distance allowed.
#
# ```
# require "levenshtein"
#
# Levenshtein.find("hello", ["hullo", "hel", "hall", "hell"], 2) # => "hullo"
# Levenshtein.find("hello", ["hurlo", "hel", "hall"], 1) # => nil
# ```
def self.find(name, all_names, tolerance = nil)
Finder.find(name, all_names, tolerance)
end
end