In [1]:
import findspark
# findspark.init() must run before pyspark is imported so the Spark
# installation is added to sys.path.
findspark.init()

import numpy as np
import pyspark

# getOrCreate() reuses an already running context, so this cell can be
# re-executed in a live kernel; constructing SparkContext directly a second
# time raises "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local").setAppName("lhs")
)

In [2]:
# Input vectors of different lengths. The rows are ragged, so dtype=object is
# required: without it recent NumPy releases refuse to build the array
# (ValueError: inhomogeneous shape). The result is a 1-D array whose elements
# are the four Python lists below.
data = np.array(
    [
        [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000],
        [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000],
        [1000, 2000, 3000, 4000],
        [5000, 6000, 7000, 8000],
    ],
    dtype=object,
)

In [3]:
# Distribute the rows of `data` through the SparkContext `sc`; each RDD
# element is one input vector.
dataRDD = sc.parallelize(data)

In [4]:
# zipWithIndex() yields (vector, index); flip each pair so the index comes
# first and can serve as the record key: (index, vector).
dataWithIndex = (
    dataRDD
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)

In [5]:
class VectorWrapper:
    """Plain holder that carries a whole vector around as one object.

    Wrapping a sequence in an instance lets it be passed to element-wise
    helpers (e.g. a function wrapped with np.vectorize) as a single scalar
    argument instead of being broadcast element by element.
    Hasher.minHashVector reads the payload back from the `vector` attribute.
    """

    def __init__(self, vector):
        # Stored as-is; no copy or conversion is performed.
        self.vector = vector
        
class Hasher():
    """Computes min-hash signatures with a family of simple hash functions.

    Hash function number `i` maps a value to `value % mod[i] + bias[i]`,
    where `mod` and `bias` are drawn at random when the instance is created.
    The i-th signature entry of a vector is the minimum of hash function `i`
    over all elements of the vector.

    Parameters
    ----------
    signatures : int
        Number of hash functions, i.e. the length of every signature.
    seed : int, optional
        When given, `mod` and `bias` are drawn from a private
        np.random.RandomState(seed) so the hasher is reproducible. When
        omitted the global np.random state is used.
    """

    def __init__(self, signatures, seed=None):
        self.signatures = signatures
        rng = np.random if seed is None else np.random.RandomState(seed)
        # Moduli are drawn from [1, 1000): a modulus of 0 would turn
        # `value % mod` into a division by zero.
        self.mod = rng.randint(1, 1000, size=self.signatures)
        self.bias = rng.randint(1000, size=self.signatures)

    def hashValue(self, value, signature):
        """Apply hash function number `signature` to a single value."""
        return value % self.mod[signature] + self.bias[signature]

    def hashVector(self, vector, signature):
        """Apply hash function number `signature` to every element of `vector`.

        Returns an array with the same shape as `vector`.
        """
        return np.asarray(vector) % self.mod[signature] + self.bias[signature]

    def minHashVector(self, signature, vectorWrapper):
        """Return the smallest hash of `vectorWrapper.vector` under hash
        function number `signature`."""
        return np.amin(self.hashVector(vectorWrapper.vector, signature))

    def generateSignatures(self, vector):
        """Return the min-hash signature of `vector`.

        The result is a 1-D array of length `self.signatures`; entry `i` is
        the minimum of hash function `i` over all elements of `vector`.

        Raises
        ------
        ValueError
            If `vector` has no elements (a minimum is undefined).
        """
        values = np.asarray(vector).ravel()
        if values.size == 0:
            raise ValueError("cannot compute min-hash signatures of an empty vector")
        # Broadcast to a (len(values), signatures) table holding every hash
        # function applied to every element, then reduce over the elements.
        hashed = values[:, np.newaxis] % self.mod + self.bias
        return hashed.min(axis=0)

In [16]:
# LSH banding parameters: total signature length and number of bands.
signatures, numberOfBands = 1000, 200

# Evaluates (1/b) ** (b/n) for b = numberOfBands and n = signatures. With
# r = n/b rows per band this is the usual (1/b) ** (1/r) estimate of the
# similarity at which the banding scheme starts to report candidate pairs.
t = (1 / numberOfBands) ** (numberOfBands / signatures)

In [17]:
# Use the configured signature length instead of repeating the literal 1000,
# so the hasher, the banding and the threshold `t` cannot drift apart.
hasher = Hasher(signatures)

In [18]:
# (index, min-hash signature) for every input vector.
# Cached because several later actions read this RDD; without the cache each
# of them would recompute every signature from scratch.
minHash = dataWithIndex.map(lambda x: (x[0], hasher.generateSignatures(x[1]))).cache()

In [19]:
def simpleHash(vector):
    """Hash the contents of one band into a single integer bucket id.

    The elements are hashed as an ordered tuple, so two bands only share a
    bucket when they hold the same values in the same order (up to ordinary
    hash collisions). A plain sum would also merge permutations and any other
    bands that happen to add up to the same total. Hashing a tuple of numbers
    is not affected by Python's per-process string hash randomization, so the
    result is the same in every worker process.
    """
    return hash(tuple(np.asarray(vector).ravel().tolist()))

In [20]:
def splitIntoBands(record):
    """Cut one (index, signature) record into its LSH bands.

    Returns a list with one (bandIndex, (index, bandHash)) entry per band,
    where bandHash is simpleHash applied to that band's slice of the
    signature. np.split raises ValueError when the signature length is not a
    multiple of numberOfBands.
    """
    index, signature = record
    return [
        (bandIndex, (index, simpleHash(rows)))
        for bandIndex, rows in enumerate(np.split(signature, numberOfBands))
    ]

# A single pass over minHash emits every (bandIndex, (index, bandHash)) entry.
# This avoids building three parallel RDDs and re-joining them with RDD.zip,
# which requires identical partitioning and element counts on both sides and
# evaluates minHash once per branch.
bands = minHash.flatMap(splitIntoBands)

In [21]:
def generateCandidates(vector):
    """Return the candidate pairs found inside one band.

    Parameters
    ----------
    vector : iterable of (index, bandHash)
        The entries of a single band. Any iterable is accepted, including a
        one-shot iterator, because it is materialized before use.

    Returns
    -------
    list of (index, index)
        Every pair (a, b) with a < b whose entries carry the same bandHash,
        ordered by the position of `a` in the input and then by the position
        of `b`.
    """
    entries = list(vector)

    # Bucket the indices by band hash so each entry is only compared against
    # entries that can actually match, not against the whole band.
    buckets = {}
    for index, bandHash in entries:
        buckets.setdefault(bandHash, []).append(index)

    candidates = []
    for index, bandHash in entries:
        for other in buckets[bandHash]:
            if index < other:
                candidates.append((index, other))
    return candidates

In [22]:
# Group the `bands` records by their key so that all values sharing a band
# key end up in one iterable.
bandsInGroup = bands.groupByKey()

In [23]:
# Every band proposes its own candidate pairs; distinct() keeps one copy of a
# pair that was proposed by several bands. The pairs are deduplicated
# directly: keying them and immediately calling values() changed nothing and
# only pulled `data` into the task closure.
candidates = bandsInGroup.flatMap(lambda x: generateCandidates(x[1])).distinct()

In [24]:
#print(bandsInGroup.map(lambda x : (x[0], list(x[1]))).collect())

In [25]:
# Fetch the signatures of all vectors that occur in a candidate pair with a
# single Spark job, instead of running two filter/collect jobs per pair.
candidatePairs = candidates.collect()
neededIds = {index for pair in candidatePairs for index in pair}
signatureById = minHash.filter(lambda x: x[0] in neededIds).collectAsMap()

for couple in candidatePairs:
    first = signatureById[couple[0]]
    second = signatureById[couple[1]]
    # Estimated similarity: fraction of signature positions where both
    # vectors have the same min-hash value.
    similarity = int(np.count_nonzero(first == second)) / len(first)
    print("Similarity between "+str(couple[0])+" and "+str(couple[1])+" is "+str(similarity))

Similarity between 0 and 1 is 1.0
Similarity between 0 and 3 is 0.602
Similarity between 1 and 3 is 0.602
Similarity between 0 and 2 is 0.428
Similarity between 1 and 2 is 0.428
