-
Notifications
You must be signed in to change notification settings - Fork 289
/
hyperloglog_examples.py
41 lines (32 loc) · 1.03 KB
/
hyperloglog_examples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
'''
Some examples for HyperLogLog
'''
from datasketch.hyperloglog import HyperLogLog
# Sample word list used by the examples; 'dataset' and 'a' each appear twice,
# so the number of distinct words is smaller than the list length.
data1 = [
    'hyperloglog', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'cardinality', 'of', 'dataset', 'dataset', 'a',
]

# Second word list sharing most words with data1. Note the upper-case 'DATA'
# (a different string from 'data') and the repeated 'of', 'dataset' and 'a'.
data2 = [
    'hyperloglog', 'is', 'a', 'probabilistic', 'DATA', 'structure', 'for',
    'estimating', 'the', 'number', 'of', 'distinct', 'values', 'of',
    'dataset', 'dataset', 'a',
]
def eg1():
    """Print the HyperLogLog estimate and the exact distinct count of ``data1``.

    Every word of ``data1`` is UTF-8 encoded and fed to a fresh
    ``HyperLogLog``; the sketch's ``count()`` is printed first, followed by
    the size of ``set(data1)`` for comparison.
    """
    sketch = HyperLogLog()
    # Encode lazily so each word is encoded right before it is added.
    encoded_words = (word.encode('utf8') for word in data1)
    for item in encoded_words:
        sketch.update(item)
    print("Estimated cardinality is", sketch.count())

    # Exact answer for comparison: a set keeps one copy of each word.
    print("Actual cardinality is", len(set(data1)))
def eg2():
    """Print the estimated and exact distinct count of ``data1`` ∪ ``data2``.

    One ``HyperLogLog`` is filled per word list (words are UTF-8 encoded
    before being added), the two are combined with ``HyperLogLog.union``,
    and the combined sketch's ``count()`` is printed. The exact size of the
    union of the two word sets is printed afterwards for comparison.
    """
    first, second = HyperLogLog(), HyperLogLog()
    # Fill each sketch from its own word list, data1 first and then data2.
    for sketch, words in ((first, data1), (second, data2)):
        for word in words:
            sketch.update(word.encode('utf8'))

    merged = HyperLogLog.union(first, second)
    print("Estimated union cardinality is", merged.count())

    # Exact answer for comparison, computed with plain sets.
    exact_union = set(data1) | set(data2)
    print("Actual union cardinality is", len(exact_union))
if __name__ == "__main__":
    # Run the examples in order when the file is executed as a script.
    for example in (eg1, eg2):
        example()