In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext("local", "lhs")
import numpy as np

In [2]:
import string
def cleanDocument(document):
    document = document.lower()
    document = document.translate(str.maketrans('','',string.punctuation))
    return document

In [3]:
numberOfDocuments = 10
documents = []
for i in range(numberOfDocuments):
    documents.append(sc.textFile("data/"+str(i)+".txt"))
    documents[i] = documents[i].map(lambda x: cleanDocument(x))

In [4]:
def hashShingling(text):
    val = 0
    mod = 2**32-1
    for c in text:
        val = (val*26 + ord(c))%mod
    return val

In [5]:
shinglingSize = 9

In [6]:
data = []

In [7]:
for d in range(numberOfDocuments):
    document = ""
    for s in documents[d].collect():
        document = document + s
    shinglings = []
    for i in range(shinglingSize-1, len(document)):
            shinglings.append(hashShingling(document[i-shinglingSize+1:i]))
    data.append(shinglings)
data = np.array(data)

In [8]:
#data = np.array([[1000,2000,3000,4000,5000,6000,7000,8000], [1000,2000,3000,4000,5000,6000,7000,8000], [1000,2000,3000,4000], [5000,6000,7000,8000]])

In [9]:
dataRDD = sc.parallelize(data)

In [10]:
dataWithIndex = dataRDD.zipWithIndex().map(lambda x: (x[1], x[0]))
print(dataWithIndex.collect())

[(0, [4165096782, 3847052731, 79622680, 2788544303, 2748935685, 1593350548, 1280398990, 2901655764, 478326072, 54620676, 1554460787, 143504058, 4234150193, 1221713064, 540268048, 128327973, 1970278777, 738504341, 2524288372, 1673820506, 3705126946, 1061803005, 2887938029, 121664876, 3666331479, 3638295676, 573239425, 2522367865, 4292521819, 2739375333, 1928812744, 3407535719, 746303017, 3526395226, 3252618838, 429400062, 2708790312, 2427426027, 2659587588, 932845265, 1412891959, 3426316242, 983074307, 3761148596, 3434906295, 4126572392, 419744520, 3500047678, 3942393191, 1516173110, 2400690225, 1254664247, 1064489754, 2957793517, 1937907633, 3644002833, 2599480679, 1795739778, 621218815, 3769831994, 2366850812, 2126933535, 516560761, 1048722604, 1631307214, 2141511162, 3815735688, 1475743769, 2517590049, 1535876469, 2580468500, 717391777, 2523180706, 602719722, 3288855583, 2746366006, 735759651, 2452926435, 4113377439, 2709097791, 4061753347, 3577235627, 2364105048, 303449724, 21027159

In [11]:
class VectorWrapper():
    def __init__(self, vector):
        self.vector = vector
        
class Hasher():
    def __init__(self, signatures):
        self.signatures = signatures
        self.coefficient = np.random.randint(2**32-1, size=self.signatures)
        self.bias = np.random.randint(2**32-1, size=self.signatures)
        self.mod = np.ones(self.signatures)*2**32-1#np.random.randint(10000, size=self.signatures)
        
    def hashValue(self, value, signature):
        return (value*self.coefficient[signature]+self.bias[signature])%self.mod[signature]
    
    def hashVector(self, vector, signature):
        return np.vectorize(self.hashValue)(vector, signature)
    
    def minHashVector(self, signature, vectorWrapper):
        return np.amin(self.hashVector(vectorWrapper.vector, signature))
    
    def generateSignatures(self, vector):
        return np.vectorize(self.minHashVector)(np.arange(self.signatures), VectorWrapper(vector))

In [12]:
signatures = 100 #number of hash functions
numberOfBands = 25 #b
t = (1/numberOfBands)**(numberOfBands/signatures)
print(t)

0.4472135954999579


In [13]:
hasher = Hasher(signatures)

In [14]:
minHash = dataWithIndex.map(lambda x: (x[0], hasher.generateSignatures(x[1])))
print(minHash.collect())

[(0, array([2.7105420e+06, 8.1953100e+05, 7.9026300e+05, 1.1477433e+07,
       1.0057026e+07, 1.0408739e+07, 5.1890770e+06, 1.5592110e+06,
       6.1694300e+06, 2.9079550e+06, 1.0861110e+06, 2.5336000e+04,
       1.1968240e+06, 8.6333470e+06, 3.6817000e+04, 4.9093130e+06,
       5.4706420e+06, 1.1133527e+07, 8.8661060e+06, 1.0767020e+06,
       1.0926700e+07, 6.0196100e+06, 8.1477250e+06, 3.2858590e+06,
       6.0853100e+05, 8.7710420e+06, 3.3030910e+06, 1.2337364e+07,
       1.6954440e+06, 1.3134437e+07, 3.1600630e+06, 3.9731880e+06,
       5.2035210e+06, 8.3360000e+03, 1.0141820e+07, 2.0179400e+06,
       1.9928900e+05, 1.1122964e+07, 7.1498500e+05, 7.7315320e+06,
       2.9813642e+07, 1.2732400e+05, 1.5201681e+07, 7.4347630e+06,
       9.5087770e+06, 3.7341110e+06, 2.7885940e+06, 3.0232380e+06,
       1.1855492e+07, 4.7718200e+05, 2.8281580e+06, 4.2278250e+06,
       1.5517435e+07, 3.5852320e+06, 3.7145150e+06, 2.5700810e+06,
       4.7369010e+06, 4.7518550e+06, 4.1594920e+06, 8.746

In [15]:
#to improve
def simpleHash(vector):
    return np.sum(vector)

In [16]:
bands0 = minHash.flatMap(lambda x: np.arange(numberOfBands))
bands1 = minHash.flatMap(lambda x: np.ones(numberOfBands, dtype=int)*x[0])
bands2 = minHash.flatMap(lambda x: np.array(np.split(x[1], numberOfBands)))
bands2Hashed = bands2.map(lambda x: simpleHash(x))
bands12Hashed = bands1.zip(bands2Hashed)
bands = bands0.zip(bands12Hashed)
print(bands.collect()) #(bandId, (documentId, hashOverTheBand))

[(0, (0, 15797769.0)), (1, (0, 27214053.0)), (2, (0, 10188832.0)), (3, (0, 14776301.0)), (4, (0, 26546977.0)), (5, (0, 28379894.0)), (6, (0, 25020028.0)), (7, (0, 21963132.0)), (8, (0, 17371617.0)), (9, (0, 19768770.0)), (10, (0, 52577410.0)), (11, (0, 19054720.0)), (12, (0, 19388657.0)), (13, (0, 25387263.0)), (14, (0, 22394806.0)), (15, (0, 31188494.0)), (16, (0, 12337885.0)), (17, (0, 30348332.0)), (18, (0, 14599386.0)), (19, (0, 16072149.0)), (20, (0, 20387332.0)), (21, (0, 14572809.0)), (22, (0, 19579563.0)), (23, (0, 15230094.0)), (24, (0, 42429538.0)), (0, (1, 10911674.0)), (1, (1, 6769044.0)), (2, (1, 7446190.0)), (3, (1, 15986955.0)), (4, (1, 7243285.0)), (5, (1, 14136322.0)), (6, (1, 16861508.0)), (7, (1, 15478375.0)), (8, (1, 9917006.0)), (9, (1, 19933340.0)), (10, (1, 10958307.0)), (11, (1, 4464052.0)), (12, (1, 12209790.0)), (13, (1, 15744014.0)), (14, (1, 10050347.0)), (15, (1, 10372350.0)), (16, (1, 13615273.0)), (17, (1, 11950018.0)), (18, (1, 11697722.0)), (19, (1, 157

In [17]:
def generateCandidates(vector):
    candidates = []
    for x in vector:
        for y in vector:
            if x[0] < y[0] and x[1] == y[1]:
                candidates.append((x[0], y[0]))
    return candidates

In [18]:
bandsInGroup = bands.groupByKey()

In [19]:
candidates = bandsInGroup.flatMap(lambda x: generateCandidates(x[1])).map(lambda x: (x[0]*len(data)+x[1], x)).values().distinct()
print(candidates.collect())

[(8, 9)]


In [20]:
print(bandsInGroup.map(lambda x : (x[0], list(x[1]))).collect())

[(0, [(0, 15797769.0), (1, 10911674.0), (2, 6163918.0), (3, 12170926.0), (4, 12730896.0), (5, 11722864.0), (6, 21126757.0), (7, 27343415.0), (8, 48313729.0), (9, 51312006.0)]), (1, [(0, 27214053.0), (1, 6769044.0), (2, 33497745.0), (3, 11988444.0), (4, 12201933.0), (5, 12487709.0), (6, 12956255.0), (7, 73310765.0), (8, 9893899.0), (9, 65802640.0)]), (2, [(0, 10188832.0), (1, 7446190.0), (2, 21474822.0), (3, 7001093.0), (4, 22045433.0), (5, 36325459.0), (6, 4401944.0), (7, 23304214.0), (8, 17689627.0), (9, 19274942.0)]), (3, [(0, 14776301.0), (1, 15986955.0), (2, 16002244.0), (3, 9222284.0), (4, 22959459.0), (5, 29394892.0), (6, 6414082.0), (7, 13991852.0), (8, 35498772.0), (9, 35498772.0)]), (4, [(0, 26546977.0), (1, 7243285.0), (2, 10527071.0), (3, 28360642.0), (4, 20975454.0), (5, 36869338.0), (6, 12814332.0), (7, 43512982.0), (8, 36895038.0), (9, 75571213.0)]), (5, [(0, 28379894.0), (1, 14136322.0), (2, 17177560.0), (3, 6782123.0), (4, 8460914.0), (5, 10237611.0), (6, 5614950.0), (7

In [21]:
for couple in candidates.collect():
    first = minHash.filter(lambda x: x[0]==couple[0]).collect()[0][1]
    second = minHash.filter(lambda x: x[0]==couple[1]).collect()[0][1]
    comparison = np.dstack((first, second))[0]
    print("Similarity between "+str(couple[0])+" and "+str(couple[1])+" is "+str(len(list(filter(lambda x: x[0] == x[1], comparison)))/len(comparison)))

Similarity between 8 and 9 is 0.49
