In [1]:
import pyspark
from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkFiles
import re
import random


path = "../data/covid_news_truncated.json"

conf = SparkConf()
conf.getAll()

sc = SparkContext(appName="lsh")
    
spark = SparkSession(sc)
sc.setLogLevel("ERROR")

textfile = sc.textFile(path)

#sc.addFile(path)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/23 11:16:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [11]:
# Large prime value to use in hashing
LARGE_PRIME = 75874811

# number of hash functions
R_VALUE = 13
B_VALUE = 11
K_VALUE = R_VALUE * B_VALUE

# generate random array to use in hashing
RANDOM_ARRAY = [(random.randint(1,2**31 - 1), random.randint(1,2**31 - 1)) for i in range(0,K_VALUE)]
#print((421*value + 16) % 1013)



# gets document and returns k-shingles
def shingles(document, k=9):

    #initial set
    shingles_set = set()

    # getting rid of punctuation, etc
    document[1] = re.sub(r'[^\w\s]', '', document[1].lower())

    # split to chars
    chars_list = re.split('', document[1].lower())

    # create shingles with length k
    for i in range(len(chars_list) - k):
        chars = chars_list[i:i + k]
        shingle = ''.join(chars)
        shingles_set.add(shingle)

    # sort shingles
    shingles_set = sorted(shingles_set)

    # returns: doc_id, set_shingles
    return document[0], shingles_set


# hash a value
def hash_function(x, a, b, N): 
    return (((a * hash(x) + b) % LARGE_PRIME ) % N)

# get signature matrix from min hashing
def min_hash(document, total_size):
    
    # shingles
    x = document[1]

    # initial matrix
    signature_matrix = []

    # iterate through the defined number of hash functions (hi)
    for i in range(0, K_VALUE):

        # start value is infinite
        minhash = float('inf')

        # get random integers
        a,b = RANDOM_ARRAY[i]

        # for each shingle
        for value in x:
            # hash shingle
            h = hash_function(value,a,b, len(x))
            # if lower, replace with current value
            if h < minhash:
                minhash = h

        # append the lowest number
        signature_matrix.append(minhash)

    #print(signature_matrix)
    
    # returns: doc_id, signature matrix
    return document[0],signature_matrix


# ---------------------------------------------------------------------- #

# gets a band and hashes it to a bucket
def hash_lsh(band): 

    # intial array
    h1_array = []

    # for each row within the band
    for value in band:
        # hash
        h1_array.append((value * len(band)) % LARGE_PRIME)

    #returns: min value
    return min(h1_array)

# gets signature and returns bucket values for each band
def get_bucket_values(signature):

    # list of bucket values
    bucket_values = []

    # for the entire signature, iterate over each band with r rows and hash it
    for idx in range(0, B_VALUE):

        # if there is no more bands to hash
        if idx * R_VALUE > len(signature): 
            break 

        # get end of band
        max_id = min(idx * R_VALUE + R_VALUE, len(signature))

        # hash the band to a bucket
        bucket = hash_lsh(signature[idx * R_VALUE : max_id])

        # append bucket value
        bucket_values.append(bucket)

    # returns: doc_id, bucket values
    return bucket_values

# given the signatures, returns candidate pairs
def lsh_algorithm(signatures_matrix):

    # dict with buckets for each document
    k_buckets = {}

    # initial candidates list
    candidates = []

    # iterate over the signatures
    for doc in signatures_matrix:

        # get the bucket values for each signature
        bucket = get_bucket_values(signatures_matrix[doc])

        # iterate over the other signatures bucket values
        for b_doc in k_buckets:

            # iterate over the bucket values
            for i in range(len(bucket)):

                # if at least 1 bucket value is the same, then at least 1 band hashes to the same bucket -> candidate 
                if k_buckets[b_doc][i] == bucket[i]:

                    # because it is candidate, compare with jaccard similarity and append to candidates list
                    similar_pair = jaccard_similarity(signatures_matrix[doc], signatures_matrix[b_doc])
                    candidates.append((doc, b_doc, similar_pair))

                    # because we only need 1 hash value in the same bucket, no need to continue
                    break

        # add the bucket values for each signature
        k_buckets[doc] = bucket

    # returns: candidates
    return candidates

# calculate jaccard similarity
def jaccard_similarity(sig_matrix_1, sig_matrix_2):
    print(sig_matrix_1)
    print(sig_matrix_2)
    # get intersection of the 2 matrices
    intersection = len([sig_matrix_1[i] for i in range(0, len(sig_matrix_1)) if (sig_matrix_1[i] == sig_matrix_2[i])])
    # get union of the 2 matrices
    union = (len(sig_matrix_1) + len(sig_matrix_2)) - intersection
    # calculate jaccard similarity
    jaccard_sim = intersection / union

    return jaccard_sim


def shingles_jaccard(shingles_1, shingles_2):
    intersection = len(list(set(shingles_1).intersection(shingles_2)))
    union = (len(set(shingles_1)) + len(set(shingles_2))) - intersection
    return float(intersection) / union


def article_shingles_similarity(doc_id, lsh_candidates, filtered_shingles):
    for x in filtered_shingles:
        if x[0] == str(doc_id):
            doc_shingles = x[1]

    print(doc_id)
    print(doc_shingles)

    for candidate in filtered_shingles:
        if str(doc_id) != candidate[0]:
            jacc = shingles_jaccard(doc_shingles, candidate[1])
            print(str(doc_id) + " jaccard sim with " + candidate[0] + ": " + str(jacc))




final_shingles = textfile.map(lambda line: eval(line)) \
                .map(lambda dict: [dict["tweet_id"], dict["text"]]) \
                .map(shingles)

total_size = final_shingles.count()
final_minhash = final_shingles.map(lambda k: min_hash(k, total_size))

#print("shingles and minhash done")

signatures_matrix = { doc: sig_matrix for doc, sig_matrix in final_minhash.collect() }
#
#print("collect done")
#
#print(signatures_matrix)

candidates = lsh_algorithm(signatures_matrix)
#
#print("similar pairs found")
#
sorted_candidates = sc.parallelize(candidates).sortBy(lambda pair: - pair[2])
#
#print("final pairs done")
#
final_results = sorted_candidates.collect()
#
#for x in final_results[:10]:
    #print(x[0])
    #print(type(x[0]))

    #if x[0] == 1349048668570189824:
    #    print("entrou")
    #print(x)

for x in final_results[:10]:
    print(x)


#article = 1349045380101648384
#filtered_candidates = sorted_candidates.filter(lambda x: x[0] == str(article)).collect()
#print(filtered_candidates)
#
#candidates_ids = [filtered_candidates[0][0]] + [x[1] for x in filtered_candidates]
##print(candidates_ids)
#
#filtered_shingles = final_shingles.filter(lambda x: x[0] in candidates_ids).collect()
#
#article_shingles_similarity(article, filtered_candidates, filtered_shingles)


[0, 15, 4, 0, 2, 1, 5, 2, 1, 0, 2, 1, 2, 1, 3, 2, 2, 13, 0, 1, 6, 4, 0, 5, 4, 0, 2, 5, 0, 2, 1, 0, 0, 0, 9, 5, 9, 0, 0, 4, 0, 0, 5, 7, 1, 3, 1, 1, 0, 3, 0, 1, 2, 1, 1, 12, 0, 1, 1, 5, 3, 8, 0, 4, 1, 2, 0, 16, 8, 2, 3, 0, 4, 8, 1, 1, 4, 5, 5, 8, 7, 3, 9, 5, 1, 1, 20, 1, 0, 0, 8, 2, 2, 0, 0, 2, 3, 0, 5, 3, 1, 1, 3, 1, 6, 1, 2, 2, 2, 1, 5, 1, 0, 2, 1, 0, 3, 2, 6, 0, 0, 0, 2, 0, 1, 1, 2, 0, 4, 2, 2, 0, 0, 0, 13, 4, 6, 2, 0, 1, 2, 6, 5]
[5, 2, 1, 1, 0, 2, 1, 1, 5, 8, 0, 1, 0, 1, 0, 2, 0, 0, 1, 5, 2, 1, 0, 1, 3, 1, 1, 1, 1, 5, 1, 0, 1, 0, 2, 0, 3, 6, 7, 0, 5, 0, 0, 8, 9, 3, 2, 0, 0, 0, 12, 5, 0, 1, 3, 1, 4, 3, 6, 1, 1, 4, 3, 0, 0, 2, 1, 8, 0, 3, 9, 1, 0, 9, 2, 0, 0, 3, 1, 0, 0, 4, 0, 0, 0, 5, 1, 3, 4, 3, 0, 0, 4, 1, 0, 4, 5, 0, 5, 6, 0, 0, 1, 0, 0, 6, 1, 2, 0, 5, 3, 0, 2, 16, 0, 8, 7, 11, 1, 3, 4, 2, 2, 10, 2, 2, 1, 4, 4, 6, 2, 0, 2, 2, 6, 3, 0, 4, 11, 0, 9, 0, 1]
[0, 15, 4, 0, 2, 1, 5, 2, 1, 0, 2, 1, 2, 1, 3, 2, 2, 13, 0, 1, 6, 4, 0, 5, 4, 0, 2, 5, 0, 2, 1, 0, 0, 0, 9, 5, 9, 0, 0, 4, 0, 0, 