In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext("local", "lhs")
import numpy as np

In [2]:
import os 
print(os.getcwd())

/Users/danmontesi/Desktop/Data-Mining-KTH/Lab-1-LSH


In [3]:
# Read the documents and import as string
numberOfDocuments = 10
documents = []
for i in range(numberOfDocuments):
    documents.append(sc.textFile("data/"+str(i)+".txt"))
    documents[i] = documents[i].map(lambda x: cleanDocument(x))

In [4]:
"""
    Remove basic punctuation and clean the document
"""
import string
def cleanDocument(document):
    document = document.lower()
    document = document.translate(str.maketrans('','',string.punctuation))
    return document

In [5]:
"""
    Given a shingle, returns its hashed value
"""

def hashShingling(shingle, mod=2 ** 32 - 1):
    val = 0
    for c in shingle:
        val = (val * 26 + ord(c)) % mod
    return val


"""
    Get list of shinglings from a document as a list
"""
def get_shinglings(document, k):
    shinglings = []
    for i in range(k - 1, len(document)):
        shinglings.append(hashShingling(document[i - k + 1:i]))

    return np.array(shinglings)

In [6]:
import numpy as np

class VectorWrapper():
    def __init__(self, vector):
        self.vector = vector

"""
    The class Hasher contains all the useful functions to build a minHash signature (in the form of a vector)
    of a given length from a given set of integers (a set of hashed shingles).
"""

class Hasher():
    def __init__(self, signatures):
        self.signatures = signatures
        self.coefficient = np.random.randint(2**32-1, size=self.signatures)
        self.bias = np.random.randint(2**32-1, size=self.signatures)
        self.mod = np.ones(self.signatures)*2**32-1#np.random.randint(10000, size=self.signatures)
        
    def hashValue(self, value, signature):
        return (value*self.coefficient[signature]+self.bias[signature])%self.mod[signature]
    
    # Compute hash of each element of a vector (vectorizing operation)
    def hashVector(self, vector, signature):
        return np.vectorize(self.hashValue)(vector, signature)
    
    # gets minimum of the (hashed) values in the vector given
    def minHashVector(self, signature, vectorWrapper):
        return np.amin(self.hashVector(vectorWrapper.vector, signature))
    
    # given a vector, returns the min-hash for all possible elements. 
    def generateSignatures(self, vector):
        return np.vectorize(self.minHashVector)(np.arange(self.signatures), VectorWrapper(vector))

In [7]:
"""
    It computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.
"""
def computeJaccard(set1, set2):
    #To be sure that the user inputted a set:
    a = set(set1)
    b = set(set2)

    union = a.union(b)
    intersection = a.intersection(b)
    return round(len(intersection)/len(union), 2)

In [8]:
signatures = 100
doc_num = numberOfDocuments
hasher = Hasher(signatures)

In [9]:
"""
    Estimates similarity of two integer vectors – minhash signatures –
    as a fraction of components, in which they agree.
"""
def getSimilarity(list1, list2):
    comparison = np.dstack((list1, list2))[0]
    return len(list(filter(lambda x: x[0] == x[1], comparison))) / len(comparison)

In [10]:
"""
    Given a list of pairs of documents, it prints the min hashing similarities between these pairs.
"""
def print_min_hashing_similarities(min_hash_lists, candidates):
    for couple in candidates:
        first = min_hash_lists.filter(lambda x: x[0]==couple[0]).collect()[0][1]
        second = min_hash_lists.filter(lambda x: x[0]==couple[1]).collect()[0][1]
        print("Similarity between "+str(couple[0])+" and "+str(couple[1])+" is "+str(getSimilarity(first, second)))

"""
    Generates the list of signatures for each document.
"""
def min_hashing(signature_lists, hasher):
    min_hash_lists = signature_lists.map(lambda x: (x[0], hasher.generateSignatures(x[1])))
    return min_hash_lists

In [11]:
"""
    Computes an hash value given a vector. 
"""
def vectorHash(vector):
    return np.sum(vector) % (2 ** 32 - 1)

"""
    Given a bucket, generate all the possible pairs of candidates for that bucket. 
"""
def generateCandidates(vector):
    candidates = []
    for x in vector:
        for y in vector:
            if x[0] < y[0] and x[1] == y[1]:
                candidates.append((x[0], y[0]))
    return candidates

"""
    Given a collection of minhash signatures (integer vectors) and a similarity threshold t, the functions (using banding and hashing)
    finds all candidate pairs of signatures that agree on at least fraction t of their components and print their similarities.
"""
def lsh_test(doc_num, min_hash_lists):
    
    # Computes threshold from given bands_num and signatures
    def getThreshold(signatures, bands_num):
        return np.round((1 / bands_num) ** (bands_num / signatures),4)
        
    possible_bands = []
    for i in range(1, signatures//2 +1):
        if signatures%i == 0:
            possible_bands.append(i)
            
    for i in range(len(possible_bands)):
        print("{}: {}".format(i, getThreshold(signatures, possible_bands[i])))
    choice = int(input("Which threshold would you like to select? Please insert the index id:") )
    # TODO catch errors
    
    bands_num = possible_bands[choice]
   
    print("Threshold is set to " + str(getThreshold(signatures, bands_num)))

    # Create a tuples having as elements (bandId, (documentId, hashOverTheBand))
    
    # [1,2,3,4,5.... num_bands]
    bands0 = min_hash_lists.flatMap(lambda x: np.arange(bands_num))
    # [1,1, ..... 1] , len = num_bands
    bands1 = min_hash_lists.flatMap(lambda x: np.ones(bands_num, dtype=int) * x[0])
    # split in num_bands same length parts the dataset
    bands2 = min_hash_lists.flatMap(lambda x: np.array(np.split(x[1], bands_num)))
    
    # get hash of vectors created (bands)
    bands2Hashed = bands2.map(lambda x: vectorHash(x))
    
    print(bands2Hashed.collect())
    
    # create couples (1, hashed_vector1), (1, hashed_vector2), ...
    bands12Hashed = bands1.zip(bands2Hashed)
    
    
    # create couples (1,(1, hashed_vector1)), (2,(1, hashed_vector2)), ...
    bands = bands0.zip(bands12Hashed)

    print(bands.collect())
    # for each bend, I have all possible bucke
    bandsInGroup = bands.groupByKey()

    candidates = bandsInGroup.flatMap(lambda x: generateCandidates(x[1])).map(
        lambda x: (x[0] * doc_num + x[1], x)).values().distinct()
    print("Candidates are: " + str(candidates.collect()) )
    
    print_min_hashing_similarities(min_hash_lists, candidates.collect())
        

In [12]:
"""
    Compute the jaccard similarities between all the documents
"""
def jaccard_test():
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            print("Jaccard similarity between document {} and {} is {}".format(i, j, computeJaccard(data[i], data[j])))
    
"""
    Computes the approximate Jaccard similarities with min hashing between all the documents 
"""
def min_hashing_test(signatures_lists, min_hash_lists):
    
    candidates = []
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            candidates.append((i, j))
    
    print_min_hashing_similarities(min_hash_lists, candidates)

In [13]:
# Create class shinglings to convert document into shingles and then into number
shinglingSize = 7

# Create function Shingling that constructs k–shingles of a given length shinglingSize, from each document, 
# computes a hash value for each unique shingle, and represents the document in the form of an ordered set of 
# its hashed shinglingSize-shingles.

data = []
for d in range(numberOfDocuments):
    document = ""
    for s in documents[d].collect():
        document = document + s
    shinglings = []
    for i in range(shinglingSize - 1, len(document)):
        # get the hash of the shingling and append to the new vector of integers
        shinglings.append(hashShingling(document[i - shinglingSize + 1:i]))
    data.append(shinglings)

data = np.array(data)

# create RDD and create LSH class
dataRDD = sc.parallelize(data)
dataWithIndex = dataRDD.zipWithIndex().map(lambda x: (x[1], x[0]))

In [14]:
# compute the jaccard similarities between all the documents
jaccard_test()

Jaccard similarity between document 0 and 1 is 0.05
Jaccard similarity between document 0 and 2 is 0.05
Jaccard similarity between document 0 and 3 is 0.05
Jaccard similarity between document 0 and 4 is 0.03
Jaccard similarity between document 0 and 5 is 0.02
Jaccard similarity between document 0 and 6 is 0.02
Jaccard similarity between document 0 and 7 is 0.03
Jaccard similarity between document 0 and 8 is 0.01
Jaccard similarity between document 0 and 9 is 0.01
Jaccard similarity between document 1 and 2 is 0.06
Jaccard similarity between document 1 and 3 is 0.04
Jaccard similarity between document 1 and 4 is 0.05
Jaccard similarity between document 1 and 5 is 0.03
Jaccard similarity between document 1 and 6 is 0.03
Jaccard similarity between document 1 and 7 is 0.02
Jaccard similarity between document 1 and 8 is 0.01
Jaccard similarity between document 1 and 9 is 0.01
Jaccard similarity between document 2 and 3 is 0.04
Jaccard similarity between document 2 and 4 is 0.03
Jaccard simi

In [14]:
# retrieves the list of signatures for each document.
min_hash_lists = min_hashing(dataWithIndex, hasher)

In [15]:
# computes the approximate Jaccard similarities with min hashing between all the documents
min_hashing_test(dataWithIndex, min_hash_lists )

KeyboardInterrupt: 

In [16]:
# computes the approximate Jaccard similarities with min hashing between all the pairs of documents that are likely to have a similarity higher than the chosen treshold
lsh_test(numberOfDocuments, min_hash_lists )

0: 1.0
1: 0.9862
2: 0.9461
3: 0.9227
4: 0.7943
5: 0.5493
6: 0.4472
7: 0.1414
Which threshold would you like to select? Please insert the index id:6
Threshold is set to 0.4472
[19755788.0, 6549237.0, 17511651.0, 26770135.0, 25565318.0, 24885585.0, 40915553.0, 12264498.0, 20444336.0, 13359914.0, 17361800.0, 14808063.0, 59470232.0, 24821304.0, 17858289.0, 17998129.0, 14446236.0, 53650375.0, 37387992.0, 15891596.0, 58370091.0, 12373549.0, 21515492.0, 20064647.0, 39739272.0, 11052067.0, 16504316.0, 7413723.0, 9236455.0, 15065505.0, 7507439.0, 9296881.0, 6203383.0, 9942182.0, 5940428.0, 29831424.0, 5772867.0, 11898564.0, 13399414.0, 16082727.0, 6423907.0, 21291962.0, 12026746.0, 3785894.0, 12177190.0, 16775028.0, 12170614.0, 14111509.0, 10898755.0, 13955704.0, 35082122.0, 17753738.0, 16308988.0, 34659552.0, 22374710.0, 29040627.0, 35887947.0, 16974581.0, 3527987.0, 7296266.0, 25077902.0, 16634905.0, 23491169.0, 28833109.0, 11452917.0, 15828173.0, 21594218.0, 31947719.0, 27282422.0, 19369222.

Candidates are: []
