In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext("local", "lhs")
import numpy as np

In [2]:
import string
def cleanDocument(document):
    document = document.lower()
    document = document.translate(str.maketrans('','',string.punctuation))
    return document

In [3]:
numberOfDocuments = 9
documents = []
for i in range(9):
    documents.append(sc.textFile("data/"+str(i)+".txt"))
    documents[i] = documents[i].map(lambda x: cleanDocument(x))

In [4]:
def hashShingling(text):
    val = 0
    mod = 10**6+7
    for c in text:
        val = (val*26 + ord(c))%mod
    return val

In [5]:
shinglingSize = 9

In [6]:
data = []

In [7]:
for d in range(numberOfDocuments):
    document = []
    for s in documents[d].collect():
        document.append(s)
    for x in document:
        shinglings = []
        for i in range(shinglingSize-1, len(x)):
                shinglings.append(hashShingling(x[i-shinglingSize+1:i]))
        data.append(shinglings)
data = np.array(data)

In [8]:
#data = np.array([[1000,2000,3000,4000,5000,6000,7000,8000], [1000,2000,3000,4000,5000,6000,7000,8000], [1000,2000,3000,4000], [5000,6000,7000,8000]])

In [9]:
dataRDD = sc.parallelize(data)

In [10]:
dataWithIndex = dataRDD.zipWithIndex().map(lambda x: (x[1], x[0]))
#print(dataWithIndex.collect())

In [11]:
class VectorWrapper():
    def __init__(self, vector):
        self.vector = vector
        
class Hasher():
    def __init__(self, signatures):
        self.signatures = signatures
        self.mod = np.random.randint(10000, size=self.signatures)
        self.bias = np.random.randint(10000, size=self.signatures)
        
    def hashValue(self, value, signature):
        return value%self.mod[signature]+self.bias[signature]
    
    def hashVector(self, vector, signature):
        return np.vectorize(self.hashValue)(vector, signature)
    
    def minHashVector(self, signature, vectorWrapper):
        return np.amin(self.hashVector(vectorWrapper.vector, signature))
    
    def generateSignatures(self, vector):
        return np.vectorize(self.minHashVector)(np.arange(self.signatures), VectorWrapper(vector))

In [12]:
signatures = 1000 #number of hash functions
numberOfBands = 200 #b
t = (1/numberOfBands)**(numberOfBands/signatures)
print(t)

0.3465724215775732


In [13]:
hasher = Hasher(signatures)

In [14]:
minHash = dataWithIndex.map(lambda x: (x[0], hasher.generateSignatures(x[1])))
print(minHash.collect())

[(0, array([9337, 9221, 2727, 8562, 3968, 1040, 6714, 4721, 9542, 2409, 6246,
       2957, 2523, 2041, 1020, 3664, 8574, 3618,  448, 2500, 8953,  285,
       1732, 4060, 8470, 6603,  438, 6861, 6469,  935, 5520, 6447, 7488,
       1358, 7050, 2949, 6620, 8859, 6913,  814, 9743, 5127, 2460, 2839,
       1436, 9391, 1689, 6126, 3186, 6997, 6345, 8366, 6642, 7471, 2231,
       7725, 7133, 4427, 5996, 7591,  510,  135, 7814, 7388, 6513, 8521,
       5475, 3719, 3448, 3673, 4791, 8694, 2905, 1814,  296, 8244, 9054,
       1398, 4397, 4955, 6119, 1692, 8754, 7871, 1644, 7519, 7550, 3608,
       2185, 7933, 2225,  597, 6101, 3892, 4032, 8775, 8713, 1232, 7067,
       9343, 9420, 6600, 8528, 1402, 9693, 3304,   62, 1429, 7272, 7389,
       4510, 8396, 2382, 1550, 8672,   46, 3228, 5030, 1857, 2240, 4922,
       8375, 8365, 8936,  968,  426, 3024, 6760, 3768, 4675, 9884, 1692,
       6675, 6667, 7410, 2489, 9743, 6950, 5394, 8942, 3536, 4731, 5289,
       3359, 1990, 8622, 3075, 8316,  676, 868

In [15]:
#to improve
def simpleHash(vector):
    return np.sum(vector)

In [16]:
bands0 = minHash.flatMap(lambda x: np.arange(numberOfBands))
bands1 = minHash.flatMap(lambda x: np.ones(numberOfBands, dtype=int)*x[0])
bands2 = minHash.flatMap(lambda x: np.array(np.split(x[1], numberOfBands)))
bands2Hashed = bands2.map(lambda x: simpleHash(x))
bands12Hashed = bands1.zip(bands2Hashed)
bands = bands0.zip(bands12Hashed)
print(bands.collect()) #(bandId, (documentId, hashOverTheBand))

[(0, (0, 33815)), (1, (0, 24426)), (2, (0, 14787)), (3, (0, 18804)), (4, (0, 23500)), (5, (0, 21306)), (6, (0, 27863)), (7, (0, 26155)), (8, (0, 21605)), (9, (0, 27389)), (10, (0, 31055)), (11, (0, 32872)), (12, (0, 22360)), (13, (0, 24836)), (14, (0, 18500)), (15, (0, 28048)), (16, (0, 26080)), (17, (0, 28795)), (18, (0, 16847)), (19, (0, 35130)), (20, (0, 35643)), (21, (0, 19456)), (22, (0, 25510)), (23, (0, 12401)), (24, (0, 31566)), (25, (0, 18653)), (26, (0, 32328)), (27, (0, 33518)), (28, (0, 18905)), (29, (0, 29378)), (30, (0, 18119)), (31, (0, 26467)), (32, (0, 29586)), (33, (0, 27468)), (34, (0, 17838)), (35, (0, 31279)), (36, (0, 20645)), (37, (0, 35066)), (38, (0, 21516)), (39, (0, 29875)), (40, (0, 25526)), (41, (0, 14897)), (42, (0, 16381)), (43, (0, 19310)), (44, (0, 26773)), (45, (0, 25394)), (46, (0, 36821)), (47, (0, 36460)), (48, (0, 21033)), (49, (0, 14674)), (50, (0, 29591)), (51, (0, 20504)), (52, (0, 21883)), (53, (0, 29638)), (54, (0, 27554)), (55, (0, 27422)), (

In [17]:
def generateCandidates(vector):
    candidates = []
    for x in vector:
        for y in vector:
            if x[0] < y[0] and x[1] == y[1]:
                candidates.append((x[0], y[0]))
    return candidates

In [18]:
bandsInGroup = bands.groupByKey()

In [19]:
candidates = bandsInGroup.flatMap(lambda x: generateCandidates(x[1])).map(lambda x: (x[0]*len(data)+x[1], x)).values().distinct()
print(candidates.collect())

[(0, 2), (0, 4), (1, 3), (2, 4), (3, 4), (3, 6), (4, 6), (1, 2), (2, 3), (1, 6), (0, 3), (7, 8), (0, 1), (1, 5), (3, 8), (4, 5), (2, 7), (0, 8), (1, 4), (2, 8), (6, 7), (3, 7), (4, 8), (4, 7), (5, 8), (1, 7), (0, 6), (2, 6), (0, 5), (3, 5), (0, 7), (1, 8), (6, 8), (2, 5), (5, 7), (5, 6)]


In [20]:
print(bandsInGroup.map(lambda x : (x[0], list(x[1]))).collect())

[(0, [(0, 33815), (1, 33816), (2, 33815), (3, 33816), (4, 33815), (5, 33819), (6, 33790), (7, 33814), (8, 33796)]), (1, [(0, 24426), (1, 24417), (2, 24441), (3, 24419), (4, 24419), (5, 24429), (6, 24419), (7, 24446), (8, 24440)]), (2, [(0, 14787), (1, 14777), (2, 14777), (3, 14777), (4, 14795), (5, 14786), (6, 14758), (7, 14765), (8, 14822)]), (3, [(0, 18804), (1, 18752), (2, 18780), (3, 18787), (4, 18757), (5, 18793), (6, 18742), (7, 18766), (8, 18810)]), (4, [(0, 23500), (1, 23513), (2, 23528), (3, 23510), (4, 23499), (5, 23505), (6, 23527), (7, 23495), (8, 23501)]), (5, [(0, 21306), (1, 21313), (2, 21311), (3, 21320), (4, 21306), (5, 21310), (6, 21313), (7, 21315), (8, 21324)]), (6, [(0, 27863), (1, 27874), (2, 27879), (3, 27863), (4, 27881), (5, 27898), (6, 27857), (7, 27861), (8, 27865)]), (7, [(0, 26155), (1, 26138), (2, 26163), (3, 26155), (4, 26176), (5, 26173), (6, 26144), (7, 26135), (8, 26145)]), (8, [(0, 21605), (1, 21596), (2, 21612), (3, 21597), (4, 21602), (5, 21616), (6

In [21]:
for couple in candidates.collect():
    first = minHash.filter(lambda x: x[0]==couple[0]).collect()[0][1]
    second = minHash.filter(lambda x: x[0]==couple[1]).collect()[0][1]
    comparison = np.dstack((first, second))[0]
    print("Similarity between "+str(couple[0])+" and "+str(couple[1])+" is "+str(len(list(filter(lambda x: x[0] == x[1], comparison)))/len(comparison)))

Similarity between 0 and 2 is 0.17
Similarity between 0 and 4 is 0.178
Similarity between 1 and 3 is 0.247
Similarity between 2 and 4 is 0.202
Similarity between 3 and 4 is 0.234


KeyboardInterrupt: 