# Daft MinHash Deduplication (Common Crawl)

This notebook builds an end-to-end deduplication pipeline over web pages with:
- Robust parsing for inconsistent robots/HTTP records
- Text extraction and normalization
- Tokenization and k-shingle generation
- MinHash signatures using Daft expressions
- Lightweight LSH banding for candidate duplicate grouping

Reference: [Daft Expressions API](https://docs.daft.ai/en/stable/api/expressions/)


In [None]:
# Setup
import os
from typing import Iterable, List, Optional, Tuple

import daft
from daft import col, lit
from daft.context import get_context

# Run on the local (native) runner while iterating on small samples; switch to
# a distributed runner once the pipeline has been validated.
# NOTE: the runner is selected through Daft's documented `set_runner_native`
# entry point; the object returned by `get_context()` does not document a
# `set_runner_local` method. It must run before any DataFrame is executed.
daft.set_runner_native()

print(daft.__version__)


In [None]:
# Demo input: two hand-written byte payloads shaped like WARC record contents.
# A real pipeline would read WARC files (e.g. with warcio) and project out the
# headers, URL, and payload of every record instead.
sample_records = [
    {
        "record_id": "rec-1",
        "raw_bytes": b"robots: classic\r\nhostname: ip-10-158-89-8.ec2.internal\r\nsoftware: Nutch 1.6 (CC)\r\nisPartOf: CC-MAIN-2018-17\r\noperator: Common Crawl Admin\r\ndescription: Wide crawl of the web for April 2018\r\npublisher: Common Crawl\r\nformat: WARC File Format 1.0\r\nconformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n",
        "url": "about:robots"
    },
    {
        "record_id": "rec-2",
        "raw_bytes": b"GET /news-ed-eventi/news-seat HTTP/1.0\r\nHost: 00064.dealerseat.com\r\nAccept-Encoding: x-gzip, gzip, deflate\r\nUser-Agent: CCBot/2.0 (http://commoncrawl.org/faq/)\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nIf-Modified-Since: Fri, 19 Jan 2018 05:39:59 GMT\r\n\r\n<html><body>SEAT News</body></html>",
        "url": "http://00064.dealerseat.com/news-ed-eventi/news-seat"
    },
]

# Pivot the row-oriented records into one list per field, keyed by the field
# names of the first record, which is the column layout `from_pydict` takes.
_sample_columns = {
    field: [record[field] for record in sample_records]
    for field in sample_records[0]
}

df = daft.from_pydict(_sample_columns)
df.show()


In [None]:
# Parser UDF: separate HTTP headers from body; ignore robots-like records
import re
from html import unescape

# Byte markers that identify crawl metadata / robots-style records.
ROBOTS_HINTS = (b"robots:", b"isPartOf:", b"WARC File Format", b"Common Crawl")

# The hints are matched against a lowercased payload prefix, so they must be
# lowercased as well; otherwise the mixed-case entries can never match.
_ROBOTS_HINTS_LOWER = tuple(hint.lower() for hint in ROBOTS_HINTS)

# First blank line (LF or CRLF) separates HTTP headers from the body.
HEADER_BODY_SPLIT = re.compile(rb"\r?\n\r?\n", re.MULTILINE)
# Anything that looks like a markup tag.
TAG_CLEAN = re.compile(r"<[^>]+>")
# Any run of whitespace.
WS = re.compile(r"\s+")


def parse_payload(raw_bytes: Optional[bytes]) -> dict:
    """Extract plain text from one raw record payload.

    Args:
        raw_bytes: Raw record bytes (optionally HTTP headers followed by a
            body), or ``None``.

    Returns:
        A dict with two keys:

        * ``"is_html"`` -- ``True`` when the body contains ``<html`` or
          ``<body`` (case-insensitive), else ``False``.
        * ``"text"`` -- the body with tags replaced by spaces, HTML entities
          unescaped and whitespace collapsed, or ``None`` when the input is
          ``None``, looks like a robots/metadata record, or yields no text.
    """
    if raw_bytes is None:
        return {"is_html": False, "text": None}

    # Heuristic: only the first 200 bytes are inspected for metadata markers.
    prefix = raw_bytes[:200].lower()
    if any(hint in prefix for hint in _ROBOTS_HINTS_LOWER):
        return {"is_html": False, "text": None}

    # Everything after the first blank line is the body; if there is no blank
    # line the whole payload is treated as the body.
    parts = HEADER_BODY_SPLIT.split(raw_bytes, maxsplit=1)
    body = parts[1] if len(parts) == 2 else parts[0]

    body_lower = body.lower()
    is_html = b"<html" in body_lower or b"<body" in body_lower

    # errors="ignore" drops undecodable bytes instead of raising, so decoding
    # needs no exception handling. Text is extracted the same way whether or
    # not the body was recognised as HTML; only the flag differs.
    decoded = body.decode("utf-8", errors="ignore")
    text = unescape(TAG_CLEAN.sub(" ", decoded))
    text = WS.sub(" ", text).strip()
    return {"is_html": is_html, "text": text or None}


# `parse_payload` returns a dict with an "is_html" flag and a nullable "text"
# string; Daft needs that shape declared as the struct dtype of the new column.
_PARSED_DTYPE = daft.DataType.struct(
    {"is_html": daft.DataType.bool(), "text": daft.DataType.string()}
)

df_parsed = (
    df
    # `Expression.apply` maps a plain Python function over each value of the
    # column (expressions do not document a `.udf` method).
    .with_columns(
        {"parsed": col("raw_bytes").apply(parse_payload, return_dtype=_PARSED_DTYPE)}
    )
    # Flatten the struct into two ordinary columns.
    .with_columns(
        {
            "is_html": col("parsed").struct.get("is_html"),
            "text": col("parsed").struct.get("text"),
        }
    )
    # Keep only the flattened result; the raw bytes and the intermediate
    # struct column are no longer needed downstream.
    .select("record_id", "url", "is_html", "text")
)

df_parsed.show()


In [None]:
# Normalize text, tokenize, and build k-shingles
# We'll use Daft string expressions where possible, with small UDFs where needed
from typing import List

K = 5  # shingle size, in tokens

# Token separators: any run of characters that are neither letters nor digits.
# `\W` is Unicode-aware for str patterns, so accented and non-Latin text is
# kept; `_` is added because `\w` would otherwise treat it as a word character.
_TOKEN_SEPARATORS = re.compile(r"[\W_]+")


def tokenize(text: Optional[str]) -> List[str]:
    """Split text into lowercase alphanumeric tokens.

    Args:
        text: Text to tokenize; ``None`` and ``""`` are accepted.

    Returns:
        The lowercased tokens in order of appearance, or ``[]`` for empty
        input. For ASCII input this matches splitting on ``[^a-z0-9]+``;
        letters and digits from other scripts are preserved rather than
        being treated as separators.
    """
    if not text:
        return []
    return [tok for tok in _TOKEN_SEPARATORS.split(text.lower()) if tok]


def shingles(tokens: List[str], k: int = K) -> List[str]:
    """Build the overlapping k-token shingles of a token list.

    Args:
        tokens: Tokens in document order.
        k: Number of tokens per shingle.

    Returns:
        One space-joined string per window of ``k`` consecutive tokens
        (``len(tokens) - k + 1`` of them). Returns ``[]`` when ``tokens`` is
        empty, ``k`` is not positive, or there are fewer than ``k`` tokens --
        so inputs shorter than ``k`` tokens produce no shingles at all.
    """
    if not tokens or k <= 0 or len(tokens) < k:
        return []
    return [" ".join(tokens[i : i + k]) for i in range(len(tokens) - k + 1)]


# Both UDF outputs are lists of strings.
_STRING_LIST_DTYPE = daft.DataType.list(daft.DataType.string())

df_tokens = (
    df_parsed
    # `tokenize` lowercases its input itself, so no separate lowercasing
    # column is needed.
    .with_columns(
        {"tokens": col("text").apply(tokenize, return_dtype=_STRING_LIST_DTYPE)}
    )
    # Added in a second step because this expression reads the "tokens"
    # column, which only exists once the previous step has been applied.
    .with_columns(
        {
            "shingles": col("tokens").apply(
                lambda toks: shingles(toks, K), return_dtype=_STRING_LIST_DTYPE
            )
        }
    )
)

df_tokens.select("record_id", "is_html", "url", "tokens", "shingles").show()


In [None]:
# MinHash signature (UDF fallback). Daft also exposes Expression.minhash; we can swap later.
import hashlib
from typing import Sequence

NUM_PERM = 128
ROWS_PER_BAND = 4  # 128/4 = 32 bands

# Largest unsigned 64-bit value; the starting value of every signature slot.
_EMPTY_SLOT = 2**64 - 1


def _hash64(b: bytes) -> int:
    """Return the first 8 bytes of SHA-1(b) as an unsigned big-endian int."""
    d = hashlib.sha1(b).digest()
    return int.from_bytes(d[:8], byteorder="big", signed=False)


# Precompute per-permutation salts
_SALTS = [f"mh{seed}".encode("utf-8") for seed in range(NUM_PERM)]


def _salts_for(num_perm: int) -> List[bytes]:
    """Return the first ``num_perm`` salts, generating extras beyond the cache."""
    if num_perm <= len(_SALTS):
        return _SALTS[:num_perm]
    extra = [f"mh{seed}".encode("utf-8") for seed in range(len(_SALTS), num_perm)]
    return _SALTS + extra


def minhash_signature(shingles: Optional[Sequence[str]], num_perm: int = NUM_PERM) -> List[int]:
    """Compute a MinHash signature over a collection of shingles.

    Slot ``i`` holds the minimum of ``_hash64(salt_i + b"|" + shingle)`` over
    all shingles, where ``salt_i`` is ``b"mh<i>"``.

    Args:
        shingles: Shingle strings; ``None`` or empty is allowed.
        num_perm: Number of signature slots. Values above ``NUM_PERM`` are
            supported; the additional salts are generated on demand.

    Returns:
        A list of ``num_perm`` unsigned 64-bit ints. When there are no
        shingles every slot is ``2**64 - 1``, so all shingle-less inputs share
        one identical signature.
    """
    if not shingles or num_perm <= 0:
        return [_EMPTY_SLOT] * num_perm

    # Repeated shingles cannot change a minimum, so hash each distinct one once.
    encoded = {sh.encode("utf-8", errors="ignore") for sh in shingles}
    return [
        min(_hash64(salt + b"|" + sh_b) for sh_b in encoded)
        for salt in _salts_for(num_perm)
    ]


def lsh_bands(signature: Sequence[int], rows_per_band: int = ROWS_PER_BAND) -> List[dict]:
    """Split a signature into fixed-size bands and hash each band to a bucket.

    Args:
        signature: Signature values; each must fit in 8 unsigned bytes.
        rows_per_band: Number of consecutive signature values per band.

    Returns:
        One ``{"band": index, "bucket": hash}`` dict per complete band, where
        ``bucket`` is the first 8 bytes of the SHA-1 over the band's values
        (each encoded as 8 big-endian bytes). Trailing values that do not fill
        a whole band are ignored. Returns ``[]`` for an empty signature or a
        non-positive ``rows_per_band``.
    """
    if not signature or rows_per_band <= 0:
        return []

    # Trim to the nearest multiple of the band size.
    usable = len(signature) - len(signature) % rows_per_band

    bands = []
    for band_index, start in enumerate(range(0, usable, rows_per_band)):
        hasher = hashlib.sha1()
        for value in signature[start : start + rows_per_band]:
            hasher.update(value.to_bytes(8, byteorder="big", signed=False))
        bucket = int.from_bytes(hasher.digest()[:8], byteorder="big", signed=False)
        bands.append({"band": band_index, "bucket": bucket})
    return bands


# Signature slots go up to 2**64 - 1, which does not fit a signed 64-bit
# integer, so the column is declared as a list of unsigned 64-bit ints.
_SIGNATURE_DTYPE = daft.DataType.list(daft.DataType.uint64())

df_sig = df_tokens.with_columns({
    # `Expression.apply` maps the Python function over each row's shingle list.
    "signature": col("shingles").apply(minhash_signature, return_dtype=_SIGNATURE_DTYPE),
    # Kept alongside the signature so shingle-less records can be recognised.
    "num_shingles": col("shingles").list.length(),
})

df_sig.select("record_id", "num_shingles", "signature").show()


In [None]:
# LSH banding and candidate grouping
from daft import functions as F

# `lsh_bands` returns a list of {"band": int, "bucket": unsigned 64-bit int}.
_BANDS_DTYPE = daft.DataType.list(
    daft.DataType.struct(
        {"band": daft.DataType.int64(), "bucket": daft.DataType.uint64()}
    )
)

# Produce (band, bucket) pairs
bands_df = (
    df_sig
    # Records with no shingles all carry the same placeholder signature and
    # would land in the same bucket of every band, which would report
    # unrelated empty/short records as duplicates of each other.
    .where(col("num_shingles") > lit(0))
    .with_columns({
        "bands": col("signature").apply(
            lambda sig: lsh_bands(sig, ROWS_PER_BAND), return_dtype=_BANDS_DTYPE
        )
    })
    # One row per (record, band).
    .explode("bands")
)

bands_df = bands_df.with_columns({
    "band": col("bands").struct.get("band"),
    "bucket": col("bands").struct.get("bucket"),
}).exclude("bands")

# Group by (band, bucket) and collect record_ids per bucket
bucketed = (
    bands_df
    .groupby("band", "bucket")
    # The aggregated list keeps the "record_id" column name.
    .agg(col("record_id").agg_list())
    .with_columns({"bucket_size": col("record_id").list.length()})
)

# Keep only buckets with at least 2 docs
candidates = bucketed.where(col("bucket_size") >= lit(2))
candidates.show()


In [None]:
# Candidate pairs (within each bucket, make unique unordered pairs)
from itertools import combinations

def pairs_from_list(ids: List[str]) -> List[Tuple[str, str]]:
    """Return every unordered pair of ids as a sorted 2-tuple.

    The ids are sorted first, so each pair is emitted as ``(smaller, larger)``
    and the output order is stable. Inputs that are ``None``, empty, or hold a
    single id produce ``[]``.
    """
    if ids and len(ids) >= 2:
        return list(combinations(sorted(ids), 2))
    return []

# `pairs_from_list` returns a list of 2-element (id, id) pairs.
_PAIRS_DTYPE = daft.DataType.list(daft.DataType.list(daft.DataType.string()))

pairs_df = (
    candidates
    # `Expression.apply` maps the Python function over each bucket's id list.
    .with_columns({
        "pairs": col("record_id").apply(pairs_from_list, return_dtype=_PAIRS_DTYPE)
    })
    # One row per candidate pair within a bucket.
    .explode("pairs")
    .with_columns({
        "id_a": col("pairs").list.get(0),
        "id_b": col("pairs").list.get(1),
    })
    .exclude("pairs")
)

# NOTE: pairs are unique within a bucket only; two records that share several
# bands appear once per shared (band, bucket) row.
pairs_df.show()
