forked from lytics/hll
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sparseutil.go
225 lines (187 loc) · 4.84 KB
/
sparseutil.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
package hll
import (
"math/bits"
"sort"
)
const RHOW_BITS = 6
const RHOW_MASK = uint64(1<<RHOW_BITS) - 1
func encode(sparseIndex uint32, sparseRhoW uint8, p, pPrime uint) uint32 {
mask := (uint32(1) << (pPrime - p)) - 1
if (sparseIndex & mask) != 0 {
return sparseIndex
}
var rhoEncodedFlag uint32
if pPrime >= p+RHOW_BITS {
rhoEncodedFlag = uint32(1) << pPrime
} else {
rhoEncodedFlag = uint32(1) << (p + RHOW_BITS)
}
normalIndex := sparseIndex >> (pPrime - p)
return rhoEncodedFlag | normalIndex<<RHOW_BITS | uint32(sparseRhoW)
}
func computeRhoW(value uint64, bts uint8) uint8 {
// Strip of the index and move the rhoW to a higher order.
w := value << (64 - bts)
// If the rhoW consists only of zeros, return the maximum length of bits + 1.
if w == 0 {
return bts + 1
}
return uint8(bits.LeadingZeros64(w) + 1)
}
// x is a hash code.
func encodeSparseHash(hash uint64, p, pPrime uint) (hashCode uint32) {
sparseIndex := uint32(hash >> (64 - pPrime))
sparseRhoW := computeRhoW(hash, uint8(64-pPrime))
return encode(sparseIndex, sparseRhoW, p, pPrime)
}
// k is an encoded hash.
func decodeSparseHash(k uint64, p, pPrime uint) (idx uint64, rhoW uint8) {
var rhoEncodedFlag uint64
if pPrime >= p+RHOW_BITS {
rhoEncodedFlag = uint64(1) << pPrime
} else {
rhoEncodedFlag = uint64(1) << (p + RHOW_BITS)
}
if k&rhoEncodedFlag == 0 {
return k, 0
}
return ((k ^ rhoEncodedFlag) >> RHOW_BITS) << (pPrime - p), uint8(k & RHOW_MASK)
}
func decodeSparseHashForNormal(k uint64, p, pPrime uint) (idx uint64, rhoW uint8) {
var rhoEncodedFlag uint64
if pPrime >= p+RHOW_BITS {
rhoEncodedFlag = uint64(1) << pPrime
} else {
rhoEncodedFlag = uint64(1) << (p + RHOW_BITS)
}
if k&rhoEncodedFlag == 0 {
return k >> (pPrime - p), computeRhoW(k, uint8(pPrime-p))
}
return (k ^ rhoEncodedFlag) >> RHOW_BITS, uint8((k & RHOW_MASK) + uint64(pPrime) - uint64(p))
}
type mergeElem struct {
index uint64
rho uint8
encoded uint64
}
type u64It func() (uint64, bool)
type mergeElemIt func() (mergeElem, bool)
func makeMergeElemIter(p, pPrime uint, input u64It) mergeElemIt {
firstElem := true
var lastIndex uint64
return func() (mergeElem, bool) {
for {
hashCode, ok := input()
if !ok {
return mergeElem{}, false
}
idx, r := decodeSparseHash(hashCode, p, pPrime)
if !firstElem && idx == lastIndex {
// In the case where the tmp_set is being merged with the sparse_list, the tmp_set
// may contain elements that have the same index value. In this case, they will
// have been sorted so the one with the max rho value will come first. We should
// discard all dupes after the first.
continue
}
firstElem = false
lastIndex = idx
return mergeElem{idx, r, hashCode}, true
}
}
}
// The input should be sorted by hashcode.
func makeU64SliceIt(in []uint64) u64It {
idx := 0
return func() (uint64, bool) {
if idx == len(in) {
return 0, false
}
result := in[idx]
idx++
return result, true
}
}
// Both input iterators must be sorted by hashcode.
func merge(p, pPrime uint, sizeEst uint64, it1, it2 u64It) *sparse {
leftIt := makeMergeElemIter(p, pPrime, it1)
rightIt := makeMergeElemIter(p, pPrime, it2)
left, haveLeft := leftIt()
right, haveRight := rightIt()
output := newSparse(sizeEst)
for haveLeft && haveRight {
var toAppend uint64
if left.index < right.index {
toAppend = left.encoded
left, haveLeft = leftIt()
} else if right.index < left.index {
toAppend = right.encoded
right, haveRight = rightIt()
} else { // The indexes are equal. Keep the one with the highest rho value.
if left.rho > right.rho {
toAppend = left.encoded
} else {
toAppend = right.encoded
}
left, haveLeft = leftIt()
right, haveRight = rightIt()
}
output.Add(toAppend)
}
for haveRight {
output.Add(right.encoded)
right, haveRight = rightIt()
}
for haveLeft {
output.Add(left.encoded)
left, haveLeft = leftIt()
}
return output
}
func toNormal(s *sparse, p, pPrime uint) normal {
m := 1 << p
M := newNormal(uint64(m))
it := s.GetIterator()
for {
k, ok := it()
if !ok {
break
}
idx, r := decodeSparseHashForNormal(k, p, pPrime)
val := maxU8(M.Get(idx), r)
M.Set(idx, val)
}
return M
}
func maxU8(x, y uint8) uint8 {
if x >= y {
return x
}
return y
}
func maxU64(x, y uint64) uint64 {
if x >= y {
return x
}
return y
}
func sortHashcodesByIndex(xs []uint64, p, pPrime uint) {
sort.Sort(uint64Sorter{xs, p, pPrime})
}
type uint64Sorter struct {
xs []uint64
p, pPrime uint
}
func (u uint64Sorter) Len() int {
return len(u.xs)
}
func (u uint64Sorter) Less(i, j int) bool {
iIndex, iRho := decodeSparseHash(u.xs[i], u.p, u.pPrime)
jIndex, jRho := decodeSparseHash(u.xs[j], u.p, u.pPrime)
if iIndex != jIndex {
return iIndex < jIndex
}
return iRho > jRho
}
func (u uint64Sorter) Swap(i, j int) {
u.xs[i], u.xs[j] = u.xs[j], u.xs[i]
}