-
Notifications
You must be signed in to change notification settings - Fork 0
/
data-hasher.go
191 lines (154 loc) · 5.33 KB
/
data-hasher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
package vugu
import (
"encoding/binary"
"fmt"
"math"
"reflect"
"sort"
"github.com/cespare/xxhash"
)
// DataHasher can be implemented by types to override the hashing behavior. ComputeHash() will call DataHash()
// on an instance of a type if the method is present. Useful for providing fast and stable hashing for structures
// that are large but only require a small amount data to be examined to determine if changed, or are already
// hashed internally, etc.
type DataHasher interface {
DataHash() uint64
}
// ComputeHash performs data hashing.
// It walks your data structure and hashes the information as it goes.
// It uses xxhash internally and returns a uint64. It is intended to be both fast and have good hash distribution to avoid
// collision-related bugs. Maps are sorted by key and then both keys and values are hashed. Nil values are skipped.
// Behavior is undefined for circular references via interface or pointer. Will call DataHash() method if present
// (see DataHasher) on a type. Otherwise will walk the data and find primitive values and hash them byte for byte.
// No guarantee is made about the exact hash algorithm used or how type information is or is not hashed - only that
// the same data structure should consistently hash to the same value and changing any value in the data tree should
// change the hash.
func ComputeHash(i interface{}) uint64 {
// FIXME: do we need to handle circular references here? (via pointer, via interface{}),
// right now this will recurse forever and eventually stack overflow, not sure if this is
// real world case or not. Not sure if we should panic or gracefully handle it or what.
if _, ok := i.(*reflect.Value); ok {
panic("cannot compute hash on a *reflect.Value")
}
if _, ok := i.(reflect.Value); ok {
panic("cannot compute hash on a reflect.Value")
}
if dh, ok := i.(DataHasher); ok {
return dh.DataHash()
}
b8 := make([]byte, 8)
// find dereferenced value
v := reflect.ValueOf(i)
for v.Kind() == reflect.Ptr {
if v.IsNil() {
// return xxhash.Sum64(b8)
return 0
}
v = v.Elem()
}
// log.Printf("v = %#v", v)
switch v.Kind() {
case reflect.Bool:
if v.Bool() {
b8[0] = 1
}
return xxhash.Sum64(b8)
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
binary.BigEndian.PutUint64(b8, uint64(v.Int()))
return xxhash.Sum64(b8)
case reflect.Float32, reflect.Float64:
binary.BigEndian.PutUint64(b8, math.Float64bits(v.Float()))
return xxhash.Sum64(b8)
case reflect.Complex64, reflect.Complex128:
c := v.Complex()
h := xxhash.New()
binary.BigEndian.PutUint64(b8, math.Float64bits(real(c)))
h.Write(b8)
binary.BigEndian.PutUint64(b8, math.Float64bits(imag(c)))
h.Write(b8)
return h.Sum64()
case reflect.Func, reflect.Chan, reflect.UnsafePointer:
p := v.Pointer()
binary.BigEndian.PutUint64(b8, uint64(p))
return xxhash.Sum64(b8)
case reflect.Interface:
d := v.InterfaceData()
h := xxhash.New()
binary.BigEndian.PutUint64(b8, uint64(d[0]))
h.Write(b8)
binary.BigEndian.PutUint64(b8, uint64(d[1]))
h.Write(b8)
return h.Sum64()
case reflect.Array, reflect.Slice:
h := xxhash.New()
l := v.Len()
for i := 0; i < l; i++ {
binary.BigEndian.PutUint64(b8, ComputeHash(v.Index(i).Interface()))
h.Write(b8)
}
return h.Sum64()
case reflect.Map:
h := xxhash.New()
// we have to do a little dance here to ensure the sequence is stable,
// because MapKeys() is not - both per the doc and in practice
mk := v.MapKeys()
khmap := make(map[uint64]int, len(mk)) // index_into_mk -> hash
for i, k := range mk {
// NOTE: collisions will cause a key to be skipped - not great,
// but possibly better than hashes being unstable for the same input.
// (i.e. I'd prefer an obscure bug where the UI didn't update one time, as opposed to
// one where the CPU is on fire because it's updating all the time)
khmap[ComputeHash(k.Interface())] = i
}
hs := make([]uint64, 0, len(khmap))
for h := range khmap {
hs = append(hs, h)
}
sort.Slice(hs, func(i, j int) bool {
return hs[i] < hs[j]
})
// log.Printf("START1")
// now that we have a stable sequence:
for _, i := range hs {
// write key hash
binary.BigEndian.PutUint64(b8, i)
h.Write(b8)
// compute and write value hash
mapKey := mk[khmap[i]]
mapValue := v.MapIndex(mapKey)
vhash := ComputeHash(mapValue.Interface())
binary.BigEndian.PutUint64(b8, vhash)
h.Write(b8)
// log.Printf("map: i_hash=%v, v_hash=%v", i, vhash)
}
// for _, k := range mk {
// binary.BigEndian.PutUint64(b8, ComputeHash(k.Interface()))
// h.Write(b8)
// val := v.MapIndex(k)
// binary.BigEndian.PutUint64(b8, ComputeHash(val.Interface()))
// h.Write(b8)
// }
return h.Sum64()
case reflect.Struct:
h := xxhash.New()
l := v.NumField()
t := v.Type()
for i := 0; i < l; i++ {
if t.Field(i).PkgPath != "" {
continue // skip unexported fields
}
binary.BigEndian.PutUint64(b8, ComputeHash(v.Field(i).Interface()))
h.Write(b8)
}
return h.Sum64()
case reflect.String:
return xxhash.Sum64String(v.String())
case reflect.Ptr:
panic(fmt.Errorf("should not have gotten a pointer here"))
default:
// panic(fmt.Errorf("unknown kind %v", v.Kind()))
// nop
return 0
}
panic("unreachable code")
}