# Imports

In [61]:
import os
import glob
import re
import time
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Custom modules
import Shingling
import CompareSets
import MinHashing
import CompareSignatures

# Setup

In [62]:
# --- Input location ---------------------------------------------------------
# Directory that is searched for the '*.txt' documents to compare.
DATA_DIR = '../data/'

# --- Pipeline parameters ----------------------------------------------------
# k: length of each shingle handed to the Shingling stage.
SHINGLE_LENGTH = 10

# s: minimum similarity (exact Jaccard or MinHash estimate) for a pair of
# documents to be reported as similar.
SIMILARITY_THRESHOLD = 0.8

# n: number of hash functions, i.e. the length of every MinHash signature.
# Larger n gives a tighter similarity estimate at the cost of more hashing work.
MINHASH_SIGNATURE_SIZE = 100

# Load Documents

In [63]:
# Collect the candidate files in a deterministic (sorted) order.
doc_files = sorted(glob.glob(os.path.join(DATA_DIR, '*.txt')))
if not doc_files:
    # Without this, a wrong DATA_DIR silently produces "0 similar pairs".
    print(f"Warning: no '*.txt' files found in {DATA_DIR!r}.")

# `docs[i]` is the raw text of the i-th loaded document and `doc_names[i]` is
# its file name. Both are appended together, and only after a successful read,
# so the two lists stay index-aligned even when some file cannot be read.
docs = []
doc_names = []
for doc_path in doc_files:
    try:
        with open(doc_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        # Best effort: report the unreadable/undecodable file and move on.
        print(f"Could not read {doc_path}: {e}")
        continue
    docs.append(content)
    doc_names.append(os.path.basename(doc_path))

# Count the documents actually loaded (not the files found): later cells
# index `docs` and `doc_names` with range(num_docs).
num_docs = len(docs)

print(f"\nSuccessfully loaded {len(docs)} documents.")

def clean_text(text: str) -> str:
    """Normalise raw document text before shingling.

    The text is lowercased, every character that is not an ASCII letter
    ``a``-``z`` or whitespace is deleted (punctuation, digits, accented
    letters, ...), each run of whitespace is collapsed to one space, and
    leading/trailing whitespace is stripped.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised text; an empty string if nothing survives.
    """
    lowered = text.lower()

    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()


Successfully loaded 10 documents.


# Shingling

In [64]:
print(f"\n--- Stage 1: Shingling (k={SHINGLE_LENGTH}) ---")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock, which can be adjusted
# while the cell is running.
start_time = time.perf_counter()

shingler = Shingling.Shingling(k=SHINGLE_LENGTH)

# doc_shingle_sets[i] is the set of hashed shingles of docs[i].
doc_shingle_sets = []

for i in range(num_docs):
    # Normalise the raw text first so the shingles are built from cleaned text.
    cleaned_doc = clean_text(docs[i])

    # set() drops repeated shingles; only membership matters downstream.
    shingles_set = set(shingler.get_hashed_shingles(cleaned_doc))

    doc_shingle_sets.append(shingles_set)
    print(f"  '{doc_names[i]}' -> {len(shingles_set)} unique shingles.")

print(f"\nShingling all documents took {time.perf_counter() - start_time:.4f} seconds.")


--- Stage 1: Shingling (k=10) ---
  '041.txt' -> 1098 unique shingles.
  '052.txt' -> 752 unique shingles.
  '053.txt' -> 824 unique shingles.
  '140.txt' -> 2077 unique shingles.
  '141.txt' -> 816 unique shingles.
  '180.txt' -> 3387 unique shingles.
  '207.txt' -> 2112 unique shingles.
  '288.txt' -> 1145 unique shingles.
  '299.txt' -> 2357 unique shingles.
  '438.txt' -> 1278 unique shingles.

Shingling all documents took 0.0089 seconds.


# Jaccard Similarity

In [65]:
print(f"\n--- Stage 2: Jaccard Similarity (Threshold s={SIMILARITY_THRESHOLD}) ---")
# perf_counter() is the monotonic clock intended for elapsed-time measurement.
start_time = time.perf_counter()

# Each entry is ((name_a, name_b), jaccard_similarity) for one pair that
# reached the threshold; the two names are stored in sorted order.
ground_truth_similar_pairs = []

# Bind the comparison function once under a descriptive name. (Assigning it to
# `_` would overwrite IPython's "last output" variable and serve no purpose.)
calculate_jaccard = CompareSets.CompareSets.calculate_jaccard

# Visit every unordered pair (i, j) with i < j exactly once.
for i in range(num_docs):
    for j in range(i + 1, num_docs):
        j_sim = calculate_jaccard(doc_shingle_sets[i], doc_shingle_sets[j])

        # If similarity is at or above our threshold, record it
        if j_sim >= SIMILARITY_THRESHOLD:
            pair = tuple(sorted((doc_names[i], doc_names[j])))
            ground_truth_similar_pairs.append((pair, j_sim))
            print(f"  {pair} | Jaccard = {j_sim:.4f}")

print(f"\nFound {len(ground_truth_similar_pairs)} true similar pairs.")
print(f"Comparison took {time.perf_counter() - start_time:.4f} seconds.")


--- Stage 2: Jaccard Similarity (Threshold s=0.8) ---

Found 0 true similar pairs.
Comparison took 0.0103 seconds.


# MinHashing

In [66]:
print(f"\n--- Stage 3: Minhashing (Signature Size n={MINHASH_SIGNATURE_SIZE}) ---")
# perf_counter() is the monotonic clock intended for elapsed-time measurement.
# The timed span includes constructing the MinHashing object.
start_time = time.perf_counter()

# NOTE(review): no random seed is set anywhere in this notebook. If MinHashing
# draws its hash functions from a global RNG, signatures (and therefore the
# estimated pairs) will differ between runs -- confirm and seed if so.
minhasher = MinHashing.MinHashing(num_hashes=MINHASH_SIGNATURE_SIZE)

# doc_signatures[i] is the MinHash signature of doc_shingle_sets[i].
doc_signatures = [
    minhasher.get_signature(doc_shingle_sets[i]) for i in range(num_docs)
]

print(f"\nMinhashing all documents took {time.perf_counter() - start_time:.4f} seconds.")


--- Stage 3: Minhashing (Signature Size n=100) ---

Minhashing all documents took 0.3364 seconds.


# Signature Comparison

In [67]:
print(f"\n--- Stage 4: Signature Comparison (Threshold s={SIMILARITY_THRESHOLD}) ---")
# perf_counter() is the monotonic clock intended for elapsed-time measurement.
start_time = time.perf_counter()

# Each entry is ((name_a, name_b), estimated_similarity) for one pair whose
# signature-based estimate reached the threshold; names are stored sorted.
minhash_similar_pairs = []

# Bind the comparison function once under a descriptive name. (Assigning it to
# `_` would overwrite IPython's "last output" variable and serve no purpose.)
calculate_similarity = CompareSignatures.CompareSignatures.calculate_similarity

# Visit every unordered pair (i, j) with i < j exactly once.
for i in range(num_docs):
    for j in range(i + 1, num_docs):
        est_sim = calculate_similarity(doc_signatures[i], doc_signatures[j])

        # If estimated similarity is at or above our threshold, record it
        if est_sim >= SIMILARITY_THRESHOLD:
            pair = tuple(sorted((doc_names[i], doc_names[j])))
            minhash_similar_pairs.append((pair, est_sim))
            print(f" {pair} | Est. Sim = {est_sim:.4f}")

print(f"\nFound {len(minhash_similar_pairs)} estimated similar pairs.")
print(f"Signature comparison took {time.perf_counter() - start_time:.4f} seconds.")


--- Stage 4: Signature Comparison (Threshold s=0.8) ---

Found 0 estimated similar pairs.
Signature comparison took 0.0003 seconds.


# Evaluation

In [68]:
print("\n--- Stage 5: Evaluation ---")

# Keep only the (name_a, name_b) key of every recorded (pair, similarity) entry.
true_pairs_set = {entry[0] for entry in ground_truth_similar_pairs}
est_pairs_set = {entry[0] for entry in minhash_similar_pairs}

if true_pairs_set or est_pairs_set:
    # Confusion counts, treating the exact-Jaccard pairs as ground truth.
    true_positives = len(true_pairs_set & est_pairs_set)
    false_positives = len(est_pairs_set - true_pairs_set)
    false_negatives = len(true_pairs_set - est_pairs_set)

    print(f"True Positives:   {true_positives}")
    print(f"False Positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")

    # Precision: fraction of the estimated pairs that are truly similar.
    estimated_total = true_positives + false_positives
    if estimated_total == 0:
        print("Precision: N/A (no positive estimations)")
    else:
        precision = true_positives / estimated_total
        print(f"Precision: {precision:.4f}")

    # Recall: fraction of the truly similar pairs that were estimated.
    actual_total = true_positives + false_negatives
    if actual_total == 0:
        print("Recall:    N/A (no true pairs to find)")
    else:
        recall = true_positives / actual_total
        print(f"Recall:    {recall:.4f}")
else:
    # Both sets are empty, so there is nothing to score.
    print("No similar pairs found by either method.")

print("\n--- Summary ---")
print(f"Ground Truth (Jaccard): {len(true_pairs_set)} pairs")
print(f"MinHash Estimate:       {len(est_pairs_set)} pairs")


--- Stage 5: Evaluation ---
No similar pairs found by either method.

--- Summary ---
Ground Truth (Jaccard): 0 pairs
MinHash Estimate:       0 pairs
