In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext("local", "lhs")
import numpy as np

In [2]:
import os 
print(os.getcwd())

/Users/danmontesi/Desktop/Data-Mining-KTH/Lab-1-LSH


In [3]:
# Read the documents and import as string
numberOfDocuments = 10
documents = []
for i in range(numberOfDocuments):
    documents.append(sc.textFile("data/"+str(i)+".txt"))
    documents[i] = documents[i].map(lambda x: cleanDocument(x))

In [4]:
"""
    Remove basic punctuation and clean the document
"""
import string
def cleanDocument(document):
    document = document.lower()
    document = document.translate(str.maketrans('','',string.punctuation))
    return document

In [5]:
""" 
 Create function Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, 
 computes a hash value for each unique shingle, and represents the document in the form of an ordered set of 
 its hashed k-shingles.
"""

def hashShingling(shingle, mod=2 ** 32 - 1):
    """
    Given a shingle, returns its hashed value
    the hash function is defined as below.
    hash(char) = val*26 + getAscii(char)%mod

    :param shingle: input characters sequence
    :return val: hashed value for the string
    """

    val = 0
    for c in shingle:
        val = (val * 26 + ord(c)) % mod
    return val



def get_shinglings(document, k):
    """
    Get list of shinglings from a document as a list
    :param document: Document in form of String
    """

    shinglings = []
    for i in range(k - 1, len(document)):
        shinglings.append(hashShingling(document[i - k + 1:i]))

    return np.array(shinglings)

In [6]:
import numpy as np

"""
Function MinHashing that builds a minHash signature (in the form of a vector or a set)
of a given length n from a given set of integers (a set of hashed shingles).
"""

class VectorWrapper():
    def __init__(self, vector):
        self.vector = vector

class Hasher():
    def __init__(self, signatures):
        self.signatures = signatures
        self.coefficient = np.random.randint(2**32-1, size=self.signatures)
        self.bias = np.random.randint(2**32-1, size=self.signatures)
        self.mod = np.ones(self.signatures)*2**32-1#np.random.randint(10000, size=self.signatures)
        
    def hashValue(self, value, signature):
        return (value*self.coefficient[signature]+self.bias[signature])%self.mod[signature]
    
    def hashVector(self, vector, signature):
        return np.vectorize(self.hashValue)(vector, signature)
    
    def minHashVector(self, signature, vectorWrapper):
        return np.amin(self.hashVector(vectorWrapper.vector, signature))
    
    def generateSignatures(self, vector):
        return np.vectorize(self.minHashVector)(np.arange(self.signatures), VectorWrapper(vector))

In [7]:
"""
Class CompareSets:
It computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.
"""

def computeJaccard(set1, set2):
    #To be sure that the user inputted a set:
    a = set(set1)
    b = set(set2)

    union = a.union(b)
    intersection = a.intersection(b)
    return round(len(intersection)/len(union), 2)


In [8]:
"""
CompareSignatures: estimates similarity of two integer vectors – minhash signatures –
as a fraction of components, in which they agree.
"""

def getSimilarity(list1, list2):
    comparison = np.dstack((list1, list2))[0]
    return len(list(filter(lambda x: x[0] == x[1], comparison))) / len(comparison)

In [13]:
"""
Bonus task:
Class LSH that implements the LSH technique: given a collection of minhash signatures
(integer vectors) and a similarity threshold t, the LSH class (using banding and hashing)
finds all candidate pairs of signatures that agree on at least fraction t of their components.
"""

bands_num = 25
signatures = 100
doc_num = numberOfDocuments
hasher = Hasher(signatures)

def vectorHash(vector):
    """
    Computes an hash value given a vector
    :return: hashed value  """
    return np.sum(vector) % (2 ** 32 - 1)

def generateCandidates(vector):
    """
    Given a vector, states whether it is a candidate or not
    """
    candidates = []
    for x in vector:
        for y in vector:
            if x[0] < y[0] and x[1] == y[1]:
                candidates.append((x[0], y[0]))
    return candidates

"""
    ....
"""
def min_hashing(signature_lists, hasher):
    min_hash_lists = signature_lists.map(lambda x: (x[0], hasher.generateSignatures(x[1])))
    return min_hash_lists


def LSH(signatures_lists, signatures = 100, doc_num = 10):
    
    # Computer threshold from given bands_num and signatures
    def getThreshold(signatures, bands_num):
        return np.round((1 / bands_num) ** (bands_num / signatures),4)
        
    possible_bands = []
    for i in range(1, signatures//2 +1):
        if signatures%i == 0:
            possible_bands.append(i)
            
    for i in range(len(possible_bands)):
        print("{}: {}".format(i, getThreshold(signatures, possible_bands[i])))
    choice = int(input("Which threshold would you like to select? Please insert their index id:") )
    # TODO catch errors
    
    bands_num = possible_bands[choice]
   
    print("Threshold is set to " + str(getThreshold(signatures, bands_num)))
    # Generate signatures of a given list of integers
    
    min_hash_lists = min_hashing(signatures_lists, hasher)

    # Create a tuples having as elements (bandId, (documentId, hashOverTheBand))
    bands0 = min_hash_lists.flatMap(lambda x: np.arange(bands_num))
    bands1 = min_hash_lists.flatMap(lambda x: np.ones(bands_num, dtype=int) * x[0])
    bands2 = min_hash_lists.flatMap(lambda x: np.array(np.split(x[1], bands_num)))
    bands2Hashed = bands2.map(lambda x: vectorHash(x))
    bands12Hashed = bands1.zip(bands2Hashed)
    bands = bands0.zip(bands12Hashed)

    bandsInGroup = bands.groupByKey()

    candidates = bandsInGroup.flatMap(lambda x: generateCandidates(x[1])).map(
        lambda x: (x[0] * doc_num + x[1], x)).values().distinct()
    print("Candidates are: " + str(candidates.collect()) )
    
    for couple in candidates.collect():
        first = min_hash_lists.filter(lambda x: x[0]==couple[0]).collect()[0][1]
        second = min_hash_lists.filter(lambda x: x[0]==couple[1]).collect()[0][1]
        print("Similarity between "+str(couple[0])+" and "+str(couple[1])+" is "+str(getSimilarity(first, second)))
        

In [15]:
# Create class shinglings to convert document into shingles and then into number
shinglingSize = 7

data = []
for d in range(numberOfDocuments):
    document = ""
    for s in documents[d].collect():
        document = document + s
    shinglings = []
    for i in range(shinglingSize - 1, len(document)):
        # get the hash of the shingling and append to the new vector of integers
        shinglings.append(hashShingling(document[i - shinglingSize + 1:i]))
    data.append(shinglings)

data = np.array(data)

# create RDD and create LSH class
dataRDD = sc.parallelize(data)
dataWithIndex = dataRDD.zipWithIndex().map(lambda x: (x[1], x[0]))

LSH(dataWithIndex, 100, numberOfDocuments)


PythonRDD[52] at RDD at PythonRDD.scala:53
0: 1.0
1: 0.9862
2: 0.9461
3: 0.9227
4: 0.7943
5: 0.5493
6: 0.4472
7: 0.1414
Which threshold would you like to select? Please insert their index id:6
Threshold is set to 0.4472
Candidates are: [(8, 9)]
Similarity between 8 and 9 is 0.65


In [29]:
#3 funzioni

# data = [ [3214, 31431, 134134, ...], [...]]

"""
    Computes the Jaccard Similarity for each document
    documents are already split in hash shingles and saved 
    global vsriable data
"""
def jaccard_test():
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            print("Jaccard similarity between document {} and {} is {}".format(i, j, computeJaccard(data[i], data[j])))
    
"""
    Computes the MinHashing Similarity for each document
    global vsriable ...
    signatures_lists = datawithindex...
    
"""
def min_hashing_test(signatures_lists, min_hash_lists):
    
    for couple in signatures_lists.collect():
        first = min_hash_lists.filter(lambda x: x[0]==couple[0]).collect()[0]
        second = min_hash_lists.filter(lambda x: x[0]==couple[1]).collect()[0]
        print("Similarity between "+str(couple[0])+" and "+str(couple[1])+" is "+str(getSimilarity(first, second)))


    
    
# min-hashing senza lsh con i 10 documenti


In [13]:
jaccard_test()

Jaccard similarity between document 0 and 1 is 0.03
Jaccard similarity between document 0 and 2 is 0.03
Jaccard similarity between document 0 and 3 is 0.02
Jaccard similarity between document 0 and 4 is 0.01
Jaccard similarity between document 0 and 5 is 0.0
Jaccard similarity between document 0 and 6 is 0.0
Jaccard similarity between document 0 and 7 is 0.0
Jaccard similarity between document 0 and 8 is 0.0
Jaccard similarity between document 0 and 9 is 0.0
Jaccard similarity between document 1 and 2 is 0.03
Jaccard similarity between document 1 and 3 is 0.02
Jaccard similarity between document 1 and 4 is 0.03
Jaccard similarity between document 1 and 5 is 0.01
Jaccard similarity between document 1 and 6 is 0.01
Jaccard similarity between document 1 and 7 is 0.0
Jaccard similarity between document 1 and 8 is 0.0
Jaccard similarity between document 1 and 9 is 0.0
Jaccard similarity between document 2 and 3 is 0.01
Jaccard similarity between document 2 and 4 is 0.01
Jaccard similarity b

In [30]:
min_hashing_test(dataWithIndex, min_hashing(dataWithIndex, hasher))

IndexError: list index out of range