# In [5]:
import hashlib
import random
import numpy as np

# Step 1: Extract k-shingles from text
def get_shingles(text, k):
    """Return the set of distinct k-shingles (length-``k`` substrings) of ``text``.

    Args:
        text: The string to shingle.
        k: Shingle length in characters; must be at least 1.

    Returns:
        A set containing every contiguous substring of ``text`` of length
        ``k``. The set is empty when ``text`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    # A non-positive k would otherwise yield empty or wrong-length slices
    # instead of real shingles, silently corrupting later comparisons.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return {text[start:start + k] for start in range(len(text) - k + 1)}

# Step 2: Hashing a shingle (for MinHash)
def hash_shingle(shingle, seed):
    """Map a shingle to an integer using MD5, varied by ``seed``.

    The seed's string form is appended to the shingle before hashing, so
    each seed value acts as a different hash function over the same shingle.

    Args:
        shingle: The shingle text.
        seed: Value whose ``str()`` form is appended to the shingle.

    Returns:
        The 128-bit MD5 digest of the UTF-8 encoded salted text, as an int.
    """
    salted = shingle + str(seed)
    digest = hashlib.md5(salted.encode('utf8')).digest()
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    return int.from_bytes(digest, 'big')

# Step 3: Apply MinHash
def minhash(shingles, num_hashes=100):
    """Compute a MinHash signature for a collection of shingles.

    For every seed in ``range(num_hashes)`` the signature stores the
    smallest ``hash_shingle(shingle, seed)`` value over all shingles.

    Args:
        shingles: Sized collection of shingle strings (e.g. the set returned
            by ``get_shingles``).
        num_hashes: Number of seeded hash functions, i.e. signature length.

    Returns:
        A list of ``num_hashes`` integers, one minimum per seed.

    Raises:
        ValueError: If ``shingles`` is empty while ``num_hashes`` is
            positive; a minimum over no shingles is undefined.
    """
    # Fail with a descriptive message instead of the bare "min() ... empty"
    # error that would otherwise surface from inside the loop.
    if num_hashes > 0 and not shingles:
        raise ValueError(
            "cannot compute a MinHash signature for an empty shingle set"
        )
    return [
        min(hash_shingle(shingle, seed) for shingle in shingles)
        for seed in range(num_hashes)
    ]

# Step 4: Calculate Jaccard similarity using MinHash signatures
def calculate_jaccard_similarity(minhash1, minhash2):
    """Estimate Jaccard similarity from two MinHash signatures.

    The estimate is the fraction of positions at which the two signatures
    hold the same value.

    Args:
        minhash1: First signature (sequence of hash values).
        minhash2: Second signature; must have the same length as the first.

    Returns:
        A float in ``[0.0, 1.0]``: matching positions divided by the
        signature length.

    Raises:
        ValueError: If the signatures differ in length or are empty.
    """
    size = len(minhash1)
    # Position-wise comparison is only meaningful when both signatures were
    # built with the same number of hash functions.
    if size != len(minhash2):
        raise ValueError(
            f"signature lengths differ: {size} != {len(minhash2)}"
        )
    if size == 0:
        raise ValueError("cannot estimate similarity from empty signatures")
    matches = sum(1 for left, right in zip(minhash1, minhash2) if left == right)
    return matches / size

def load_text_from_file(filename):
    """Read a UTF-8 text file and return its entire contents as a string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Main function to calculate similarity between 3 documents
def compare_documents(file1, file2, file3, k=5, num_hashes=100):
    """Estimate pairwise similarity between three text files via MinHash.

    All three files are read first, then shingled, then reduced to MinHash
    signatures; finally the signatures are compared pair by pair.

    Args:
        file1: Path of the first document.
        file2: Path of the second document.
        file3: Path of the third document.
        k: Shingle length passed to ``get_shingles``.
        num_hashes: Signature length passed to ``minhash``.

    Returns:
        A tuple ``(similarity12, similarity13, similarity23)`` with the
        estimated similarity of documents 1-2, 1-3 and 2-3 respectively.
    """
    paths = (file1, file2, file3)

    # Each stage finishes for every document before the next one starts.
    texts = [load_text_from_file(path) for path in paths]
    shingle_sets = [get_shingles(text, k) for text in texts]
    sig1, sig2, sig3 = [minhash(shingles, num_hashes) for shingles in shingle_sets]

    return (
        calculate_jaccard_similarity(sig1, sig2),
        calculate_jaccard_similarity(sig1, sig3),
        calculate_jaccard_similarity(sig2, sig3),
    )

# Example usage: compare three documents given by relative paths.
file1, file2, file3 = "doc1.txt", "doc2.txt", "doc3.txt"

similarity12, similarity13, similarity23 = compare_documents(file1, file2, file3)

# One print call, one report line per document pair.
print(
    f"Similarity between doc1 and doc2: {similarity12}",
    f"Similarity between doc1 and doc3: {similarity13}",
    f"Similarity between doc2 and doc3: {similarity23}",
    sep="\n",
)

# Output:
# Similarity between doc1 and doc2: 1.0
# Similarity between doc1 and doc3: 0.56
# Similarity between doc2 and doc3: 0.56
