## **MinHashLSH**

### In local system memory implementation:

In [1]:
import pandas as pd
import numpy as np

In [2]:
class InMemoryMinHashLSH:
    def __init__(self, documents, k=5):
        self.documents = documents
        self.shingles = None
        self.signatures = None
        self.buckets = None
        self.num_bands = 8
        self.k = k
        self.num_perms = 128
    
    def shingling(self, documents=pd.DataFrame([""]), k=5):
        if documents.any().any() == "":
            self.documents = documents

        shingles = set()
        doc_shingles = set()
        for doc in self.documents["text"]:
            for i in range(len(doc) - k + 1):
                shingle = doc[i:i+k]
                shingles.add(shingle)
                doc_shingles.add(shingle)
        shingles = list(shingles)
        
        boolean_vectors = np.full((len(self.documents), len(shingles)), False, dtype=bool)
        for i, doc in enumerate(self.documents["text"]):
            for j, shingle in enumerate(shingles):
                if shingle in doc:
                    boolean_vectors[i, j] = True
        return pd.DataFrame(boolean_vectors, columns=shingles).transpose()
        
    def minhashing(self, shingles_bvs, num_perm=128):
        signatures = []
        for _ in range(0, num_perm):
            hash_funcs = np.random.permutation(shingles_bvs.shape[0])
            signature_row = []
            for j in range(0, shingles_bvs.shape[1]):
                for hash in hash_funcs:
                    if shingles_bvs.iloc[hash, j]:
                        signature_row.append(hash)
                        break
            signatures.append(signature_row)
        return pd.DataFrame(signatures)
    
    def locality_sensitive_hashing(self, signatures, num_bands=8):
        self.num_bands = num_bands
        buckets = {}
        for doc_id in signatures:
            sig = signatures[doc_id]
            for i in range(0, len(sig), self.num_bands):
                band = hash(tuple(sig[i:i+self.num_bands]))
                if band in buckets:
                    buckets[band].add(doc_id)
                else:
                    buckets[band] = {doc_id}
        return buckets

    def run(self, **kwargs):
        if "documents" in kwargs:
            self.documents = kwargs["documents"]
        if "bands" in kwargs:
            self.num_bands = kwargs["bands"]
        if "num_perms" in kwargs:
            self.num_perms = kwargs["num_perms"]
        if "k" in kwargs:
            self.k = kwargs["k"]
        bitvecs = self.shingling(self.documents, self.k)

        # The regular permutation variant of the minhashing algorithm is used
        # assuming that the amount of data process is relatively small to fit in memory
        # of course we'll utilize the row hashing variant in the spark implementation
        self.signatures = self.minhashing(bitvecs, self.num_perms) 
        self.buckets = self.locality_sensitive_hashing(self.signatures, self.num_bands)
        return self.buckets
    
    def __jaccard_similarity(self, a, b):
        return len(a & b) / len(a | b)
    
    def approximateNearestNeighbors(self, key, n):
        n = 1 if n > 1 else n
        sig = self.signatures[key]
        similar_docs = {}
        for i in range(0, len(sig), self.num_bands):
            band_hash = hash(tuple(sig[i:i+self.num_bands]))
            if band_hash in self.buckets:
                for doc_id in self.buckets[band_hash]:
                    if doc_id != key:
                        if doc_id in similar_docs:
                            similar_docs[doc_id] += 1
                        else:
                            similar_docs[doc_id] = 1
        similar_docs = {k: v for k, v in sorted(similar_docs.items(), key=lambda item: item[1], reverse=True)}

        most_similar_docs = []
        for doc_id in similar_docs:
            jac_sim = self.__jaccard_similarity(set(self.signatures[key]), set(self.signatures[doc_id]))
            if jac_sim > n:
                most_similar_docs.append((doc_id, jac_sim))
        return most_similar_docs

In [3]:
test_docs = ["This is a test document", "This document is another test document", "This is a test document","This is a test","This is a document", "Hello word"]
docs_df = pd.DataFrame(test_docs, columns=["text"])
in_memory_lsh = InMemoryMinHashLSH(docs_df)
pd.set_option('display.max_columns', None)

# bool_vecs = in_memory_lsh.shingling()
# sigs = in_memory_lsh.minhashing(bool_vecs, 128)
# buckets = in_memory_lsh.locality_sensitive_hashing(sigs)
buckets = in_memory_lsh.run(k=3, bands=4, num_perms=256, documents=docs_df)
# print(buckets)
in_memory_lsh.approximateNearestNeighbors(0, 0.5)

[(2, 1.0), (4, 0.6666666666666666), (3, 0.55), (1, 0.5483870967741935)]

### Spark implementation:

In [150]:
from pyspark.sql import functions as F 
from pyspark.sql import window as W
from pyspark.sql.types import ArrayType, IntegerType

def hash_gen(num_hashes=128):
   hashes = []
   for i in range(num_hashes):
       hash = (np.random.randint(1, 1000), np.random.randint(1, 1000), np.random.randint(1, 1000))
       hashes.append(hash)
   return hashes

# This follows the row hashing algorithm with the input as a squashed list of the lists of the boolean vectors
def minhash_udf(row, sig_length=128):

    # Generating the fixed parameters for the hash functions
    hash_funcs = hash_gen(sig_length)
    
    # Generating the "infinite" matrix (can't use np.inf as it's not supported by spark, class error w/e)
    final_ans = [[-2 for _ in range(len(row[0][1]))] for _ in range(len(hash_funcs))]
    for row in row:
        row_id = row[0]
        row_vals = row[1]
        for i in range(len(hash_funcs)):
            # Hashing the row id
            curr_hash = ((hash_funcs[i][0] * row_id) + hash_funcs[i][1]) % hash_funcs[i][2]
            for j in range(len(row_vals)):
                if row_vals[j]:
                    # Minhashing with the "infinite" matrix
                    final_ans[i][j] = curr_hash if final_ans[i][j] == -2 else min(final_ans[i][j], curr_hash)
    return final_ans

class SparkMinHashLSH:
    def __init__(self, documents, k=5):
        self.documents = documents
    
    
    def shingling(self, **kwargs):
        if "documents" in kwargs:
            self.documents = kwargs["documents"]
        
        # High k values would result in document skipping if the document is smaller than the shingle size
        shingle_size = 5 if "k" not in kwargs else kwargs["k"]
        # The monotonically increasing ID function only generates random unique IDs
        # Thus requiring us to use the row_number function instead
        # The following code is essentially creating a "window" of ordered from 1, the "lit"(literal) function acts the same way as passing an arg to a function 
        documents = self.documents.withColumn("docID", F.row_number().over(W.Window.orderBy(F.lit(1))))
        shingles = (documents.rdd.map(lambda x: (x[0],x[1])) 
                   .map(lambda x: list(set([(x[1], x[0][i:i+shingle_size]) for i in range(len(x[0]) - shingle_size + 1)]))) # Shingling with k size shingle
                   .flatMap(lambda x: x) # DF formatting, from shape [(docID, shingle), ...] to (docID, shingle), aka unknown data shape to 2 columns
                   .toDF(["docID", "shingles"])
                    )
        return (shingles.groupBy("shingles")
                .pivot("docID") # Rotating the DF based on docIDs
                .agg(F.lit(True)) # Create a true column and aggregate on each present intersected shingles corresponding to the docID
                .fillna(False)) # Fill the NaNs left by the aggregation process with False
    
    def minhashing(self, sc, **kwargs):
        if "bool_vecs" not in kwargs:
            raise ValueError("Boolean Vectors not provided")
        bool_vecs = kwargs["bool_vecs"]
        sig_length = 128 if "sig_length" not in kwargs else kwargs["sig_length"]
        
        bool_vecs = bool_vecs.withColumn("id", F.row_number().over(W.Window.orderBy(F.lit(1)))) # Col numbering nothing exciting

        # Grouping the bool vecs into lists for processing
        bool_vecs = bool_vecs.select("id", F.array([F.col(x) for x in bool_vecs.columns if x != "id" and x != "shingles"]).alias("vals")) 
        typed_minhash_udf = F.udf(minhash_udf, ArrayType(ArrayType(IntegerType()))) # Defining the return type so it doesn't become string cuz spark doesnt know how to process lol   
        sigs = bool_vecs.agg(typed_minhash_udf(F.collect_list(F.struct(F.col("id"), F.col("vals"))), F.lit(sig_length)).alias("sigs")) # Black magic, jk, continue reading at the above udf func

        return sigs.select(F.explode("sigs")).rdd.flatMap(lambda x: x).toDF() # Convert it to a more "expected" and "familiar" format of signatures per col

In [40]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("minHashLSH").getOrCreate()
sc = spark.sparkContext

In [153]:
data = [("This is a test document",), ("This document is another test document",), ("This is a test document",), ("Hello wordello ",), ("Word Hello world",), ("hello", )]
df = spark.createDataFrame(data, ["text"])
spark_lsh = SparkMinHashLSH(df)
bool_vecs = spark_lsh.shingling(k=2)
sigs = spark_lsh.minhashing(sc, bool_vecs=bool_vecs, sig_length=16)
sigs.show()

+---+---+---+---+---+---+
| _1| _2| _3| _4| _5| _6|
+---+---+---+---+---+---+
| 10| 10| 10| 55| 55| 27|
|171|171|171|171|171|171|
| 25| 25| 25|  5| 65|184|
| 23| 22| 23| 83| 83| 22|
| 19| 19| 19|  3|  3| 75|
| 24| 24| 24| 30|  0| 30|
| 23| 23| 23|197|197|197|
|  1|  1|  1| 19|  0|106|
|  8|  8|  8| 80| 80| 26|
| 40| 40| 40| 65| 60|199|
|  3|  3|  3|331| 57|536|
| 19|  9| 19| 81| 51|193|
|145|145|145|150|144|152|
| 14| 14| 14| 58|  6| 58|
|  2|  2|  2|  0|  0|  0|
| 34| 34| 34|252|  8|418|
+---+---+---+---+---+---+

