-
Notifications
You must be signed in to change notification settings - Fork 73
/
min_hash.py
136 lines (93 loc) · 2.97 KB
/
min_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np
import random
import hashlib
def sigGen(matrix):
"""
* generate the signature vector
:param matrix: a ndarray var
:return a signature vector: a list var
"""
# the row sequence set
seqSet = [i for i in range(matrix.shape[0])]
# initialize the sig vector as [-1, -1, ..., -1]
result = [-1 for i in range(matrix.shape[1])]
count = 0
while len(seqSet) > 0:
# choose a row of matrix randomly
randomSeq = random.choice(seqSet)
for i in range(matrix.shape[1]):
if matrix[randomSeq][i] != 0 and result[i] == -1:
result[i] = randomSeq
count += 1
if count == matrix.shape[1]:
break
seqSet.remove(randomSeq)
# return a list
return result
def sigMatrixGen(input_matrix, n):
"""
generate the sig matrix
:param input_matrix: naarray var
:param n: the row number of sig matrix which we set
:return sig matrix: ndarray var
"""
result = []
for i in range(n):
sig = sigGen(input_matrix)
result.append(sig)
# return a ndarray
return np.array(result)
def minHash(input_matrix, b, r):
"""
map the sim vector into same hash bucket
:param input_matrix:
:param b: the number of bands
:param r: the row number of a band
:return the hash bucket: a dictionary, key is hash value, value is column number
"""
hashBuckets = {}
# permute the matrix for n times
n = b * r
# generate the sig matrix
sigMatrix = sigMatrixGen(input_matrix, n)
# begin and end of band row
begin, end = 0, r
# count the number of band level
count = 0
while end <= sigMatrix.shape[0]:
count += 1
# traverse the column of sig matrix
for colNum in range(sigMatrix.shape[1]):
# generate the hash object, we used md5
hashObj = hashlib.md5()
# calculate the hash value
band = str(sigMatrix[begin: begin + r, colNum]) + str(count)
hashObj.update(band.encode())
# use hash value as bucket tag
tag = hashObj.hexdigest()
# update the dictionary
if tag not in hashBuckets:
hashBuckets[tag] = [colNum]
elif colNum not in hashBuckets[tag]:
hashBuckets[tag].append(colNum)
begin += r
end += r
# return a dictionary
return hashBuckets
def nn_search(dataSet, query):
"""
:param dataSet: 2-dimension array
:param query: 1-dimension array
:return: the data columns in data set that are similarity with query
"""
result = set()
dataSet.append(query)
input_matrix = np.array(dataSet).T
hashBucket = minHash(input_matrix, 20, 5)
queryCol = input_matrix.shape[1] - 1
for key in hashBucket:
if queryCol in hashBucket[key]:
for i in hashBucket[key]:
result.add(i)
result.remove(queryCol)
return result