In [None]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext("local", "lhs")
import numpy as np

In [1]:
import os 
print(os.getcwd())

/Users/danmontesi/Desktop/Data-Mining-KTH


In [3]:
# Read the documents and import as string
numberOfDocuments = 10
documents = []
for i in range(numberOfDocuments):
    documents.append(sc.textFile("data/"+str(i)+".txt"))
    documents[i] = documents[i].map(lambda x: cleanDocument(x))

In [4]:
"""
    Remove basic punctuation and clean the document
"""
import string
def cleanDocument(document):
    document = document.lower()
    document = document.translate(str.maketrans('','',string.punctuation))
    return document

In [5]:
""" 
 Create function Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, 
 computes a hash value for each unique shingle, and represents the document in the form of an ordered set of 
 its hashed k-shingles.
"""

def hashShingling(shingle, mod=2 ** 32 - 1):
    """
    Given a shingle, returns its hashed value
    the hash function is defined as below.
    hash(char) = val*26 + getAscii(char)%mod

    :param shingle: input characters sequence
    :return val: hashed value for the string
    """

    val = 0
    for c in shingle:
        val = (val * 26 + ord(c)) % mod
    return val



def get_shinglings(document, k):
    """
    Get list of shinglings from a document as a list
    :param document: Document in form of String
    """

    shinglings = []
    for i in range(k - 1, len(document)):
        shinglings.append(hashShingling(document[i - k + 1:i]))

    return np.array(shinglings)

In [6]:
import numpy as np

"""
Function MinHashing that builds a minHash signature (in the form of a vector or a set)
of a given length n from a given set of integers (a set of hashed shingles).
"""

class VectorWrapper():
    def __init__(self, vector):
        self.vector = vector

n = 1000
coefficient = np.random.randint(2 ** 32 - 1, size=n)
bias = np.random.randint(2 ** 32 - 1, size=n)
mod = np.ones(n) * 2 ** 32 - 1

def hashValue(value, signature):
    return (value * coefficient[signature] + bias[signature]) % mod[signature]

def hashVector(vector, signature):
    return np.vectorize(hashValue)(vector, signature)

def minHashVector(signature, vectorWrapper):
    return np.amin(hashVector(vectorWrapper.vector, signature))

def MinHashing(vector):
    return np.vectorize(minHashVector)(np.arange(n), VectorWrapper(vector))

In [7]:
"""
Class CompareSets:
It computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.
"""

def computeJaccard(set1, set2):
    #To be sure that the user inputted a set:
    a = set(set1)
    b = set(set2)

    union = a.union(b)
    intersection = a.intersection(b)
    print("Similarity between the 2 sets is " + str(round(len(intersection)/len(union), 2)))


In [8]:
"""
CompareSignatures: estimates similarity of two integer vectors – minhash signatures –
as a fraction of components, in which they agree.
"""

def getSimilarity(list1, list2):
    comparison = np.dstack((list1, list2))[0]
    print("Similarity between the 2 vectors is " + str(
    len(list(filter(lambda x: x[0] == x[1], comparison))) / len(comparison)))

In [9]:
"""
Bonus task:
Class LSH that implements the LSH technique: given a collection of minhash signatures
(integer vectors) and a similarity threshold t, the LSH class (using banding and hashing)
finds all candidate pairs of signatures that agree on at least fraction t of their components.
"""

signatures_lists = []
bands_num = 25
signatures = 100
doc_num = numberOfDocuments

def vectorHash(vector):
    """
    Computes an hash value given a vector
    :return: hashed value
    """
    return np.sum(vector) % 2 ** 32 - 1

def generateCandidates(vector):
    """
    Given a vector, states whether it is a candidate or not
    """

    candidates = []
    for x in vector:
        for y in vector:
            if x[0] < y[0] and x[1] == y[1]:
                candidates.append((x[0], y[0]))
    return candidates

def LSH(signatures_lists,
        bands_num = 25,
        signatures = 100,
        doc_num = len(signatures_lists)):

    # Computer threshold from given bands_num and signatures
    t = (1 / bands_num) ** (bands_num / signatures)
    print("Threshold is set to " + str(t))

    # Generate signatures of a given list of integers
    min_hash_lists = signatures_lists.map(lambda x: (x[0], MinHashing(x[1])))

    # Create a tuples having as elements (bandId, (documentId, hashOverTheBand))
    bands0 = min_hash_lists.flatMap(lambda x: np.arange(bands_num))
    bands1 = min_hash_lists.flatMap(lambda x: np.ones(bands_num, dtype=int) * x[0])
    bands2 = min_hash_lists.flatMap(lambda x: np.array(np.split(x[1], bands_num)))
    bands2Hashed = bands2.map(lambda x: vectorHash(x))
    bands12Hashed = bands1.zip(bands2Hashed)
    bands = bands0.zip(bands12Hashed)

    bandsInGroup = bands.groupByKey()

    candidates = bandsInGroup.flatMap(lambda x: generateCandidates(x[1])).map(
        lambda x: (x[0] * len(doc_num) + x[1], x)).values().distinct()
    print("Candidates are: " + str(candidates.collect()) )





In [11]:
# Create class shinglings to convert document into shingles and then into number
shinglingSize = 3

data = []
for d in range(numberOfDocuments):
    document = ""
    for s in documents[d].collect():
        document = document + s
    shinglings = []
    for i in range(shinglingSize - 1, len(document)):
        # get the hash of the shingling and append to the new vector of integers
        shinglings.append(hashShingling(document[i - shinglingSize + 1:i]))
    data.append(shinglings)

data = np.array(data)

# create RDD and create LSH class
dataRDD = sc.parallelize(data)
dataWithIndex = dataRDD.zipWithIndex().map(lambda x: (x[1], x[0]))

LSH(dataWithIndex, 25, 100, numberOfDocuments)


Threshold is set to 0.4472135954999579
Candidates are: []
