## **MinHashLSH**

### In-memory (local) implementation:

In [3]:
import pandas as pd
import numpy as np

In [105]:
class InMemoryMinHashLSH:
    """MinHash pipeline over a small corpus held in local memory.

    The corpus is any object whose ``["text"]`` item iterates over document
    strings (in this notebook: a pandas DataFrame with a ``text`` column).
    Shingling and MinHash signatures are implemented; the LSH banding step
    and nearest-neighbour lookup are still stubs.
    """

    # Signature value recorded for a document that has no shingles at all
    # (i.e. it is shorter than the shingle length). Real signature values are
    # row positions >= 0, so -1 can never collide with them.
    EMPTY_SIGNATURE = -1

    def __init__(self, documents, k=5):
        """Store the corpus and the shingle length used by ``run``.

        Args:
            documents: Corpus exposing ``documents["text"]``.
            k: Shingle length (in characters) that ``run`` passes to
                ``shingling``.
        """
        self.documents = documents
        self.k = k

    def shingling(self, documents="", k=5):
        """Build the shingle/document boolean membership matrix.

        Args:
            documents: Optional replacement corpus. The default ``""`` (or
                ``None``) means "keep the corpus already stored".
            k: Shingle length in characters.

        Returns:
            A boolean DataFrame with one row per distinct shingle (labelled by
            the shingle, in sorted order) and one column per document
            (labelled 0..n-1). A cell is True when the document contains the
            shingle.
        """
        # Compare against the sentinel explicitly: `documents != ""` on a
        # DataFrame is element-wise and cannot be used as an `if` condition.
        keep_current = documents is None or (
            isinstance(documents, str) and documents == ""
        )
        if not keep_current:
            self.documents = documents

        per_doc = [
            {doc[i:i + k] for i in range(len(doc) - k + 1)}
            for doc in self.documents["text"]
        ]
        # Sorted so that row order (and hence signature values) does not
        # depend on set iteration order, which varies between interpreter runs.
        vocabulary = sorted(set().union(*per_doc))
        row_of = {shingle: pos for pos, shingle in enumerate(vocabulary)}

        boolean_vectors = np.zeros((len(per_doc), len(vocabulary)), dtype=bool)
        for doc_pos, present in enumerate(per_doc):
            if present:
                boolean_vectors[doc_pos, [row_of[s] for s in present]] = True
        return pd.DataFrame(boolean_vectors, columns=vocabulary).transpose()

    def minhashing(self, shingles_bvs, num_perm=128):
        """Compute MinHash signatures from the boolean membership matrix.

        For each of ``num_perm`` random permutations of the shingle rows, the
        signature of a document is the row position of the first shingle, in
        permuted order, that the document contains.

        Args:
            shingles_bvs: Boolean DataFrame shaped (shingles, documents), as
                returned by ``shingling``.
            num_perm: Number of random permutations (signature length).

        Returns:
            An integer DataFrame shaped (num_perm, documents). Documents with
            no shingles get ``EMPTY_SIGNATURE`` in every row so that columns
            stay aligned with documents.
        """
        membership = shingles_bvs.to_numpy(dtype=bool)
        n_shingles, n_docs = membership.shape
        signatures = np.full(
            (num_perm, n_docs), self.EMPTY_SIGNATURE, dtype=np.int64
        )
        if n_shingles == 0:
            return pd.DataFrame(signatures)

        has_shingle = membership.any(axis=0)
        for perm_pos in range(num_perm):
            permutation = np.random.permutation(n_shingles)
            # argmax on booleans yields the first True along the permuted
            # rows; map that position back to the original row number.
            first_hit = membership[permutation].argmax(axis=0)
            signatures[perm_pos, has_shingle] = permutation[first_hit[has_shingle]]
        return pd.DataFrame(signatures)

    def locality_sensitive_hashing(self, signature, num_bands=8, num_rows=16):
        """Placeholder for the LSH banding step; not implemented, returns None."""
        # TODO: split `signature` into bands and hash each band into buckets.
        return None

    def run(self):
        """Run shingling, MinHash and LSH on the stored corpus.

        Returns:
            Whatever ``locality_sensitive_hashing`` returns (currently None,
            because that step is a stub).
        """
        shingles = self.shingling(self.documents, k=self.k)
        signature = self.minhashing(shingles)
        buckets = self.locality_sensitive_hashing(signature)
        return buckets

    def __jaccard_similarity(self, a, b):
        """Return the Jaccard similarity |a & b| / |a | b| of two sets.

        Two empty sets have an empty union; 0.0 is returned in that case
        instead of dividing by zero.
        """
        union = a | b
        if not union:
            return 0.0
        return len(a & b) / len(union)

    def approximateNearestNeighbors(self, key, n):
        """Placeholder for nearest-neighbour lookup; not implemented, returns None."""
        # TODO: depends on the buckets from locality_sensitive_hashing.
        return None

In [107]:
# Small demo corpus: documents 0 and 2 are identical, so their signature
# columns should match exactly.
test_docs = [
    "This is a test document",
    "This document is another test document",
    "This is a test document",
    "Hello word",
]
docs_df = pd.DataFrame({"text": test_docs})
in_memory_lsh = InMemoryMinHashLSH(docs_df)

# Show every column when the notebook renders a DataFrame.
pd.set_option("display.max_columns", None)

bool_vecs = in_memory_lsh.shingling()
sigs = in_memory_lsh.minhashing(bool_vecs, num_perm=128)
sigs

(42, 4)


Unnamed: 0,0,1,2,3
0,13,29,13,27
1,18,1,18,40
2,36,36,36,40
3,15,19,15,40
4,2,1,2,40
...,...,...,...,...
123,34,19,34,5
124,4,14,4,5
125,0,24,0,9
126,23,23,23,40


In [68]:
from pyspark.sql import functions as F 
class SparkMinHashLSH:
    """Spark-based counterpart of the in-memory MinHash pipeline.

    Only the shingling step exists so far; it produces the shingles of every
    document as an RDD.
    """

    def __init__(self, documents, k=5):
        """Store the corpus and the requested shingle length.

        Args:
            documents: Spark DataFrame holding the documents.
            k: Shingle length in characters. Stored for reference only;
                ``shingling`` takes its own ``k`` argument.
        """
        self.documents = documents
        self.k = k

    def shingling(self, documents="", k=5):
        """Return an RDD with the k-character shingles of every document.

        Each document contributes its distinct shingles once; a shingle that
        occurs in several documents appears once per document in the result.

        Args:
            documents: Optional replacement corpus. The default ``""`` (or
                ``None``) means "keep the corpus already stored".
            k: Shingle length in characters.

        Returns:
            An RDD of shingle strings.
        """
        keep_current = documents is None or (
            isinstance(documents, str) and documents == ""
        )
        if not keep_current:
            self.documents = documents

        # Defined locally so the closure shipped to the executors captures
        # only `k`, not `self` (which holds the DataFrame).
        def distinct_shingles(text):
            return list({text[i:i + k] for i in range(len(text) - k + 1)})

        # Takes the first column of each row as the document text
        # (assumes the text column comes first - TODO confirm for other inputs).
        documents_rdd = self.documents.rdd.map(lambda row: row[0])
        # TODO: build per-document boolean vectors over the shingle vocabulary.
        return documents_rdd.flatMap(distinct_shingles)

In [62]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one
# named "minHashLSH"; `sc` is its SparkContext.
spark = (
    SparkSession.builder
    .appName("minHashLSH")
    .getOrCreate()
)
sc = spark.sparkContext

In [66]:
# One single-field tuple per document, so each becomes a one-column row.
data = [
    ("This is a test document",),
    ("This document is another test document",),
    ("This is a test document",),
    ("Hello wordello ",),
]
df = spark.createDataFrame(data, schema=["text"])
spark_lsh = SparkMinHashLSH(df)

# Peek at the first five shingles.
spark_lsh.shingling().take(5)

['est d', 'test ', 'cumen', ' test', 'his i']

In [74]:
import re, hashlib, math, time
from random import randint, seed
# Seed the standard-library `random` generator so that draws from it are
# repeatable between runs.
# NOTE: this does not seed NumPy's separate global generator (np.random).
seed(16)


class hashFamily:
    """One member of a family of salted SHA-1 hash functions.

    The integer ``i`` selects the member: it is turned into a fixed-width
    decimal salt that is appended to every value before hashing, so different
    ``i`` give different hash functions.
    """

    def __init__(self, i):
        """Derive the salt for member ``i`` of the family."""
        # Number of trailing hex digits kept from the SHA-1 digest.
        # 8 hex digits are 32 bits, so results lie in [0, 16**8).
        self.resultSize = 8
        # Width of the decimal salt; longer values of i keep only their
        # last maxLen digits.
        self.maxLen = 20
        zero_padded = str(i).zfill(self.maxLen)
        self.salt = zero_padded[-self.maxLen:]
        self.id = i

    def get_hash_value(self, el_to_hash):
        """Return the salted hash of ``el_to_hash`` as a non-negative int.

        The element is converted with ``str()``, UTF-8 encoded, followed by
        the salt, hashed with SHA-1, and the last ``resultSize`` hex digits of
        the digest are parsed as an integer.
        """
        hasher = hashlib.sha1()
        hasher.update(str(el_to_hash).encode('utf-8'))
        hasher.update(self.salt.encode('utf-8'))
        tail = hasher.hexdigest()[-self.resultSize:]
        return int(tail, 16)
    

# Member 0 of the hash family, i.e. the hash function salted with i = 0.
hash_family = hashFamily(0)

True
