## **MinHashLSH**

### In-memory (local system) implementation:

In [133]:
import pandas as pd
import numpy as np

In [134]:
class InMemoryMinHashLSH:
    """MinHash + banded LSH over a small document set held in local memory.

    The pipeline is: character k-shingling -> MinHash signatures built from
    random row permutations -> banded locality-sensitive hashing into buckets.
    Documents are expected in a pandas DataFrame with a ``"text"`` column and
    are identified by their position (0, 1, 2, ...) in that column.
    """

    # Signature value stored for a document that contains no shingle at all
    # (text shorter than k). Real signature values are row indices >= 0.
    EMPTY_SIGNATURE = -1

    def __init__(self, documents, k=5):
        """Store the documents and the shingle length used by ``run``."""
        self.documents = documents
        self.k = k
        self.shingles = None
        self.signatures = None
        self.buckets = None

    @staticmethod
    def _is_provided(documents):
        """Return False for the "argument not given" placeholders (None, "")."""
        if documents is None:
            return False
        # Only compare to "" when it is a string: `DataFrame == ""` is
        # element-wise and cannot be used as a truth value.
        return not (isinstance(documents, str) and documents == "")

    def shingling(self, documents=None, k=5):
        """Build the boolean shingle/document matrix.

        Args:
            documents: optional DataFrame with a ``"text"`` column. When given
                it replaces the documents stored on the instance.
            k: shingle length in characters. Note that ``run`` passes the
                value given to the constructor.

        Returns:
            DataFrame with one row per distinct shingle (index = shingle,
            sorted so the layout is reproducible) and one column per document
            position; a cell is True when the document contains the shingle.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        if self._is_provided(documents):
            self.documents = documents

        texts = list(self.documents["text"])
        # One shingle set per document; documents shorter than k get an empty set.
        per_doc = [
            {text[start:start + k] for start in range(len(text) - k + 1)}
            for text in texts
        ]
        vocabulary = sorted(set().union(*per_doc))
        row_of = {shingle: row for row, shingle in enumerate(vocabulary)}

        matrix = np.zeros((len(vocabulary), len(texts)), dtype=bool)
        for col, present in enumerate(per_doc):
            for shingle in present:
                matrix[row_of[shingle], col] = True
        return pd.DataFrame(matrix, index=vocabulary)

    def minhashing(self, shingles_bvs, num_perm=128):
        """Compute MinHash signatures from the shingle/document matrix.

        For each of ``num_perm`` random permutations of the shingle rows, the
        signature entry of a document is the row index of the first shingle
        (in permuted order) that the document contains.

        Args:
            shingles_bvs: boolean matrix, rows = shingles, columns = documents.
            num_perm: number of random permutations (signature length).

        Returns:
            DataFrame of shape ``(num_perm, n_documents)``. Documents without
            any shingle get ``EMPTY_SIGNATURE`` in every row, so every column
            always lines up with its document position.
        """
        matrix = np.asarray(shingles_bvs, dtype=bool)
        n_shingles, n_docs = matrix.shape
        signatures = np.full((num_perm, n_docs), self.EMPTY_SIGNATURE, dtype=np.int64)
        if n_shingles == 0 or n_docs == 0:
            return pd.DataFrame(signatures)

        has_shingles = matrix.any(axis=0)
        for perm_idx in range(num_perm):
            order = np.random.permutation(n_shingles)
            # argmax on booleans yields the first True per column, i.e. the
            # first permuted row that the document contains.
            first_hit = matrix[order].argmax(axis=0)
            signatures[perm_idx, has_shingles] = order[first_hit[has_shingles]]
        return pd.DataFrame(signatures)

    def locality_sensitive_hashing(self, signature, num_bands=8, num_rows=16):
        """Group documents whose signatures agree on at least one whole band.

        Args:
            signature: signature matrix, rows = permutations, columns = documents.
            num_bands: number of bands to cut the signature into.
            num_rows: number of signature rows per band.

        Returns:
            dict mapping an integer bucket key to the list of document
            positions that fell into that bucket.
        """
        values = np.asarray(signature)
        buckets = {}
        for band_idx in range(num_bands):
            band = values[band_idx * num_rows:(band_idx + 1) * num_rows]
            if band.shape[0] == 0:
                # Signature is shorter than num_bands * num_rows: an empty band
                # would hash every document to the same bucket.
                break
            for doc_idx in range(band.shape[1]):
                # The band index is part of the key so that equal tuples from
                # different bands never share a bucket.
                key = hash((band_idx, tuple(band[:, doc_idx].tolist())))
                buckets.setdefault(key, []).append(doc_idx)
        return buckets

    def run(self, documents=""):
        """Run shingling, minhashing and LSH; return the resulting buckets.

        Args:
            documents: optional DataFrame with a ``"text"`` column replacing
                the stored documents. ``""`` or ``None`` keeps the stored ones.

        Returns:
            The bucket dict, which is also stored in ``self.buckets``.
        """
        if self._is_provided(documents):
            self.documents = documents
        self.shingles = self.shingling(self.documents, k=self.k)
        self.signatures = self.minhashing(self.shingles)
        self.buckets = self.locality_sensitive_hashing(self.signatures)
        return self.buckets

    def __jaccard_similarity(self, a, b):
        """Return |a & b| / |a | b| for two sets; two empty sets count as 1.0."""
        union = a | b
        if not union:
            return 1.0
        return len(a & b) / len(union)

    def approximateNearestNeighbors(self, key, n):
        """Placeholder: not implemented yet, currently returns None."""
        pass

In [135]:
# Tiny smoke-test corpus: entries 0 and 2 are the same text, entry 3 has no
# 5-character substring in common with the others.
test_docs = [
    "This is a test document",
    "This document is another test document",
    "This is a test document",
    "Hello word",
]
docs_df = pd.DataFrame({"text": test_docs})
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
in_memory_lsh = InMemoryMinHashLSH(docs_df)

# Step-by-step variant of the pipeline, kept for debugging:
# bool_vecs = in_memory_lsh.shingling()
# sigs = in_memory_lsh.minhashing(bool_vecs, 128)
# buckets = in_memory_lsh.locality_sensitive_hashing(sigs)
buckets = in_memory_lsh.run()
print(buckets)

{1379140599642863165: [0, 2], 9040997445242030420: [1], -2011539986988155481: [3], 5927102016616960046: [0, 2], -7896971630727833918: [1], 4934936327892699789: [3], 244299711004002593: [0, 2], 7666908281629136108: [1], -9161432108047039538: [3], 8331251141106009057: [0, 2], 7309846408532718783: [1], -4169070300030851549: [3], 538062161962066744: [0, 2], -8568018872202180160: [1], -6376555837399552003: [3], 5925910462200784584: [0, 2], -4604407864863100061: [1], 6162459667513325389: [3], -2954536926530557813: [0, 2], 6257829669100608064: [1], 7776162716060853749: [3], 6853229475000010218: [0, 2], 6361141334233927601: [1], -8053711553699312521: [3]}


In [136]:
from pyspark.sql import functions as F 
class SparkMinHashLSH:
    """Work-in-progress Spark counterpart of the in-memory MinHash LSH class.

    Only document storage is functional; ``shingling`` has no implementation
    yet and returns None.
    """

    def __init__(self, documents, k=5):
        """Store the documents and the shingle length ``k``."""
        self.documents = documents
        # Keep k so it is not silently lost before shingling is implemented.
        self.k = k

    def shingling(self, documents="", k=5):
        """Replace the stored documents if new ones are given.

        Args:
            documents: optional replacement documents; ``""`` or ``None``
                keeps the ones already stored.
            k: shingle length; unused until the draft below is finished.

        Returns:
            None. The shingling logic itself is not implemented yet.
        """
        provided = documents is not None and not (
            isinstance(documents, str) and documents == ""
        )
        if provided:
            self.documents = documents

        # Unfinished draft of the Spark implementation, kept for reference:
        # documents_rdd = self.documents.rdd.map(lambda x: x[0])
        # shingles = documents_rdd.flatMap(lambda x: list(set([x[i:i+k] for i in range(len(x) - k + 1)])))
        # doc_shingles = documents_rdd.map(lambda x: list(set([x[i:i+k] for i in range(len(x) - k + 1)])))
        # bool_vecs = doc_shingles.leftOuterJoin(shingles)
        # bool_vecs = shingles.join(doc_shingles, on="shingles", how="left").fillna(0)
        # return shingles
        # ohe = OneHotEncoder(inputCol="shingles", outputCol="shingles_ohe")
        # return bool_vecs

In [137]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one named "minHashLSH".
spark = (
    SparkSession.builder
    .appName("minHashLSH")
    .getOrCreate()
)
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [138]:
# Sample corpus as one-field rows, loaded into a Spark DataFrame with a single "text" column.
data = [
    ("This is a test document",),
    ("This document is another test document",),
    ("This is a test document",),
    ("Hello wordello ",),
]
df = spark.createDataFrame(data, ["text"])
# Disabled until SparkMinHashLSH.shingling is implemented:
# spark_lsh = SparkMinHashLSH(df)
# spark_lsh.shingling().take(5)

In [139]:
import re, hashlib, math, time
from random import randint, seed
# Seed the standard-library `random` generator so later random draws are repeatable.
seed(16)


class hashFamily:
    """One member of a salted SHA-1 hash family, selected by the integer ``i``.

    The salt is ``i`` rendered in decimal, zero-padded to ``maxLen`` characters
    (only the last ``maxLen`` characters are kept when ``i`` is longer).
    """

    def __init__(self, i):
        # Number of trailing hex digits of the digest that form the result
        # (8 hex digits = 32 bits).
        self.resultSize = 8
        # Fixed width, in decimal digits, of the salt derived from i.
        self.maxLen = 20
        padded = str(i).zfill(self.maxLen)
        self.salt = padded[-self.maxLen:]
        self.id = i

    def get_hash_value(self, el_to_hash):
        """Return the salted hash of ``str(el_to_hash)`` as a non-negative int."""
        hasher = hashlib.sha1()
        hasher.update(str(el_to_hash).encode('utf-8'))
        hasher.update(self.salt.encode('utf-8'))
        tail = hasher.hexdigest()[-self.resultSize:]
        return int(tail, 16)
    

# Sanity check: hashing the same element twice with the same family member gives the same value.
hash_family = hashFamily(0)
assert hash_family.get_hash_value("test") == hash_family.get_hash_value("test")