/
levenshtein.go
176 lines (156 loc) · 5 KB
/
levenshtein.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package phonetizz
// orginially from: github.com/texttheater/golang-levenshtein
// had problems with vendor so all credits to him.
import (
"fmt"
"io"
"os"
)
type EditOperation int
const (
Ins = iota
Del
Sub
Match
)
type EditScript []EditOperation
type MatchFunction func(rune, rune) bool
type Options struct {
InsCost int
DelCost int
SubCost int
Matches MatchFunction
}
// DefaultOptions is the default options: insertion cost is 1, deletion cost is
// 1, substitution cost is 2, and two runes match iff they are the same.
var DefaultOptions Options = Options{
InsCost: 1,
DelCost: 1,
SubCost: 2,
Matches: func(sourceCharacter rune, targetCharacter rune) bool {
return sourceCharacter == targetCharacter
},
}
func (operation EditOperation) String() string {
if operation == Match {
return "match"
} else if operation == Ins {
return "ins"
} else if operation == Sub {
return "sub"
}
return "del"
}
// DistanceForStrings returns the edit distance between source and target.
func DistanceForStrings(source []rune, target []rune, op Options) int {
return DistanceForMatrix(MatrixForStrings(source, target, op))
}
// DistanceForMatrix reads the edit distance off the given Levenshtein matrix.
func DistanceForMatrix(matrix [][]int) int {
return matrix[len(matrix)-1][len(matrix[0])-1]
}
// MatrixForStrings generates a 2-D array representing the dynamic programming
// table used by the Levenshtein algorithm, as described e.g. here:
// http://www.let.rug.nl/kleiweg/lev/
// The reason for putting the creation of the table into a separate function is
// that it cannot only be used for reading of the edit distance between two
// strings, but also e.g. to backtrace an edit script that provides an
// alignment between the characters of both strings.
func MatrixForStrings(source []rune, target []rune, op Options) [][]int {
// Make a 2-D matrix. Rows correspond to prefixes of source, columns to
// prefixes of target. Cells will contain edit distances.
// Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html
height := len(source) + 1
width := len(target) + 1
matrix := make([][]int, height)
// Initialize trivial distances (from/to empty string). That is, fill
// the left column and the top row with row/column indices.
for i := 0; i < height; i++ {
matrix[i] = make([]int, width)
matrix[i][0] = i
}
for j := 1; j < width; j++ {
matrix[0][j] = j
}
// Fill in the remaining cells: for each prefix pair, choose the
// (edit history, operation) pair with the lowest cost.
for i := 1; i < height; i++ {
for j := 1; j < width; j++ {
delCost := matrix[i-1][j] + op.DelCost
matchSubCost := matrix[i-1][j-1]
if !op.Matches(source[i-1], target[j-1]) {
matchSubCost += op.SubCost
}
insCost := matrix[i][j-1] + op.InsCost
matrix[i][j] = min(delCost, min(matchSubCost,
insCost))
}
}
//LogMatrix(source, target, matrix)
return matrix
}
// EditScriptForStrings returns an optimal edit script to turn source into
// target.
func EditScriptForStrings(source []rune, target []rune, op Options) EditScript {
return backtrace(len(source), len(target),
MatrixForStrings(source, target, op), op)
}
// EditScriptForMatrix returns an optimal edit script based on the given
// Levenshtein matrix.
func EditScriptForMatrix(matrix [][]int, op Options) EditScript {
return backtrace(len(matrix)-1, len(matrix[0])-1, matrix, op)
}
// WriteMatrix writes a visual representation of the given matrix for the given
// strings to the given writer.
func WriteMatrix(source []rune, target []rune, matrix [][]int, writer io.Writer) {
fmt.Fprintf(writer, " ")
for _, targetRune := range target {
fmt.Fprintf(writer, " %c", targetRune)
}
fmt.Fprintf(writer, "\n")
fmt.Fprintf(writer, " %2d", matrix[0][0])
for j, _ := range target {
fmt.Fprintf(writer, " %2d", matrix[0][j+1])
}
fmt.Fprintf(writer, "\n")
for i, sourceRune := range source {
fmt.Fprintf(writer, "%c %2d", sourceRune, matrix[i+1][0])
for j, _ := range target {
fmt.Fprintf(writer, " %2d", matrix[i+1][j+1])
}
fmt.Fprintf(writer, "\n")
}
}
// LogMatrix writes a visual representation of the given matrix for the given
// strings to os.Stderr. This function is deprecated, use
// WriteMatrix(source, target, matrix, os.Stderr) instead.
func LogMatrix(source []rune, target []rune, matrix [][]int) {
WriteMatrix(source, target, matrix, os.Stderr)
}
func backtrace(i int, j int, matrix [][]int, op Options) EditScript {
if i > 0 && matrix[i-1][j]+op.DelCost == matrix[i][j] {
return append(backtrace(i-1, j, matrix, op), Del)
}
if j > 0 && matrix[i][j-1]+op.InsCost == matrix[i][j] {
return append(backtrace(i, j-1, matrix, op), Ins)
}
if i > 0 && j > 0 && matrix[i-1][j-1]+op.SubCost == matrix[i][j] {
return append(backtrace(i-1, j-1, matrix, op), Sub)
}
if i > 0 && j > 0 && matrix[i-1][j-1] == matrix[i][j] {
return append(backtrace(i-1, j-1, matrix, op), Match)
}
return []EditOperation{}
}
func min(a int, b int) int {
if b < a {
return b
}
return a
}
func max(a int, b int) int {
if b > a {
return b
}
return a
}