In [1]:


class CompareSets():
    """
    Class CompareSets:
    It computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.
    """


    def computeJaccard(self, set1, set2):
        #To be sure that the user inputted a set:
        a = set(set1)
        b = set(set2)

        union = a.union(b)
        intersection = a.intersection(b)
        print("Similarity between the 2 sets is " + str(round(len(intersection)/len(union), 2)))




if __name__ == '__main__':
    cs = CompareSets()
    cs.computeJaccard([1,2,3,3,4], [1,2,5])

Similarity between the 2 sets is 0.4


In [2]:

import numpy as np

class CompareSignatures():

    """
    A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures –
    as a fraction of components, in which they agree.
    """

    def getSimilarity(self, list1, list2):
        comparison = np.dstack((list1, list2))[0]
        print("Similarity between the 2 vectors is " + str(
        len(list(filter(lambda x: x[0] == x[1], comparison))) / len(comparison)))

In [3]:
import numpy as np



""" 1 - 
 Create a class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, 
 computes a hash value for each unique shingle, and represents the document in the form of an ordered set of 
 its hashed k-shingles.
"""
class Shingling():

    def __init__(self, k):
        """
        :param k: length of the shingles
        """

        self.k = k




    def hashShingling(self, shingle, mod=2 ** 32 - 1):
        """
        Given a shingle, returns its hashed value
        the hash function is defined as below.
        hash(char) = val*26 + getAscii(char)%mod

        :param shingle: input characters sequence
        :return val: hashed value for the string
        """

        val = 0
        for c in shingle:
            val = (val * 26 + ord(c)) % mod
        return val



    def get_shinglings(self, document):
        """
        Get list of shinglings from a document as a list
        :param document: Document in form of String
        """

        shinglings = []
        for i in range(self.k - 1, len(document)):
            shinglings.append(self.hashShingling(document[i - self.k + 1:i]))

        return np.array(shinglings)




In [4]:
import findspark
findspark.init()

import numpy as np


class VectorWrapper():
    def __init__(self, vector):
        self.vector = vector

class MinHashing():
    """
    A class MinHashing that builds a minHash signature (in the form of a vector or a set)
    of a given length n from a given set of integers (a set of hashed shingles).
    """

    def __init__(self, n):
        self.n = n
        self.coefficient = np.random.randint(2 ** 32 - 1, size=self.n)
        self.bias = np.random.randint(2 ** 32 - 1, size=self.n)
        self.mod = np.ones(self.n) * 2 ** 32 - 1

    def hashValue(self, value, signature):
        return (value * self.coefficient[signature] + self.bias[signature]) % self.mod[signature]

    def hashVector(self, vector, signature):
        return np.vectorize(self.hashValue)(vector, signature)

    def minHashVector(self, signature, vectorWrapper):
        return np.amin(self.hashVector(vectorWrapper.vector, signature))

    def generateSignatures(self, vector):
        return np.vectorize(self.minHashVector)(np.arange(self.n), VectorWrapper(vector))

In [5]:
import numpy as np


class LSH():
    """
    Bonus task:
    Class LSH that implements the LSH technique: given a collection of minhash signatures
    (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing)
    finds all candidate pairs of signatures that agree on at least fraction t of their components.
    """
    def __init__(self, signatures_lists, bands_num, signatures, doc_num):
        """
        :param signatures_lists: collection of minhash signatures (integer vectors) in the form of a RDD
        """
        self.signatures_lists = signatures_lists
        self.bands_num = bands_num
        self.signatures = signatures
        self.doc_num = doc_num

    def vectorHash(self, vector):
        """
        Computes an hash value given a vector
        :return: hashed value
        """
        return np.sum(vector) % 2 ** 32 - 1

    def generateCandidates(self, vector):
        """
        Given a vector, states whether it is a candidate or not
        """

        candidates = []
        for x in vector:
            for y in vector:
                if x[0] < y[0] and x[1] == y[1]:
                    candidates.append((x[0], y[0]))
        return candidates

    def run(self):

        # Computer threshold from given bands_num and signatures
        t = (1 / self.bands_num) ** (self.bands_num / self.signatures)
        print("Threshold is set to " + str(t))
        hasher = MinHashing(self.signatures)

        # Generate signatures of a given list of integers
        min_hash_lists = self.signatures_lists.map(lambda x: (x[0], hasher.generateSignatures(x[1])))
        #print(min_hash_lists.collect())
        # Create a tuples having as elements (bandId, (documentId, hashOverTheBand))
        bands0 = min_hash_lists.flatMap(lambda x: np.arange(self.bands_num))
        print(min_hash_lists)
        print(bands0)
        return
        bands1 = min_hash_lists.flatMap(lambda x: np.ones(self.bands_num, dtype=int) * x[0])
        bands2 = min_hash_lists.flatMap(lambda x: np.array(np.split(x[1], self.bands_num)))
        bands2Hashed = bands2.map(lambda x: self.vectorHash(x))
        bands12Hashed = bands1.zip(bands2Hashed)
        bands = bands0.zip(bands12Hashed)

        bandsInGroup = bands.groupByKey()

        candidates = bandsInGroup.flatMap(lambda x: self.generateCandidates(x[1])).map(
            lambda x: (x[0] * len(self.doc_num) + x[1], x)).values().distinct()
        print("Candidates are: " + candidates.collect())








In [6]:
import string
import findspark
import pyspark
import numpy as np



def cleanDocument(document):
    """
    Given a document in String format, remove the punctuation
    :param document:
    :return:
    """
    document = document.lower()
    document = document.translate(str.maketrans('', '', string.punctuation))
    return document



findspark.init()
sc = pyspark.SparkContext("local", "lhs")


In [7]:
numberOfDocuments = 10

# Read documents to analyze
documents = []
for i in range(numberOfDocuments):
    documents.append(sc.textFile("../data/" + str(i) + ".txt"))
    documents[i] = documents[i].map(lambda x: cleanDocument(x))

# Create class shinglings to convert document into shingles and then into number
shinglingSize = 7
shingler = Shingling(shinglingSize)

data = []
for d in range(numberOfDocuments):
    document = ""
    for s in documents[d].collect():
        document = document + s
    shinglings = []
    for i in range(shinglingSize - 1, len(document)):
        # get the hash of the shingling and append to the new vector of integers
        shinglings.append(shingler.hashShingling(document[i - shinglingSize + 1:i]))
    data.append(shinglings)

data = np.array(data)

# create RDD and create LSH class
dataRDD = sc.parallelize(data)
dataWithIndex = dataRDD.zipWithIndex().map(lambda x: (x[1], x[0]))

lsh = LSH(dataWithIndex, 25, 100, numberOfDocuments)

lsh.run()

Threshold is set to 0.4472135954999579
PythonRDD[31] at RDD at PythonRDD.scala:53


Traceback (most recent call last):
  File "/opt/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py", line 590, in dumps
    return cloudpickle.dumps(obj, 2)
  File "/opt/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 863, in dumps
    cp.dump(obj)
  File "/opt/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 260, in dump
    return Pickler.dump(self, obj)
  File "/home/strenuus/anaconda3/lib/python3.7/pickle.py", line 437, in dump
    self.save(obj)
  File "/home/strenuus/anaconda3/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/strenuus/anaconda3/lib/python3.7/pickle.py", line 786, in save_tuple
    save(element)
  File "/home/strenuus/anaconda3/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/opt/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 406, in save_function
    self.save_function_tuple(obj)
  File "

PicklingError: Could not serialize object: Exception: It appears that you are attempting to broadcast an RDD or reference an RDD from an action or transformation. RDD transformations and actions can only be invoked by the driver, not inside of other transformations; for example, rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the rdd1.map transformation. For more information, see SPARK-5063.