In [1]:
"""
PREPROCESSING ABLATION + EVALUATION (SentenceTransformer version)
- Fair sampling (same indices for all variants, fixed seed)
- SentenceTransformer embeddings (better than RoBERTa mean pooling for similarity/diversity)
- Clean, report-ready table (CSV export + pretty console)
- No fine-tuning required
"""

import re
import difflib
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# SentenceTransformer (preferred for sentence/document embeddings)
from sentence_transformers import SentenceTransformer

# =====================================================
# PREPROCESSING FUNCTIONS
# =====================================================

def extract_text_from_html(html):
    """Return the visible text of an HTML fragment as a single stripped string.

    Missing values (anything ``pd.isna`` reports as missing) yield ``""``.
    Text nodes are joined with a single space so adjacent elements do not
    run together.
    """
    # Guard first: BeautifulSoup cannot parse NaN/None cells.
    if pd.isna(html):
        return ""
    parsed = BeautifulSoup(html, "html.parser")
    visible = parsed.get_text(separator=" ")
    return visible.strip()

def basic_clean(text):
    """Collapse whitespace in *text* and trim both ends.

    ``None`` becomes ``""``; any other value is converted with ``str()``.
    Non-breaking spaces are turned into regular spaces before every run of
    whitespace is squeezed down to one space.
    """
    if text is None:
        return ""
    without_nbsp = str(text).replace("\xa0", " ")
    squeezed = re.sub(r"\s+", " ", without_nbsp)
    return squeezed.strip()

def normalize_sentence(s):
    """Return a canonical form of sentence *s* used only for duplicate detection.

    The sentence is lower-cased, runs of three or more digits are blanked
    (phones/ids), every character outside ``a-z 0-9 whitespace . , ; : ! ? -``
    is replaced by a space, and whitespace is collapsed and trimmed.
    """
    s = s.lower()
    s = re.sub(r"\d{3,}", " ", s)  # remove long numbers (phones/ids)
    s = re.sub(r"[^a-z0-9\s.,;:!?\-]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()


def advanced_deduplicate_sentences(
    text, min_chars=10, fuzzy_threshold=0.95, fuzzy_window=50
):
    """Drop repeated and near-repeated sentences from *text*.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
    Each sentence is compared in its normalized form (see
    ``normalize_sentence``) but the original wording is what gets kept.

    Args:
        text: Input string. ``None`` or an empty string yields ``""``.
        min_chars: Sentences whose normalized form is shorter than this are
            discarded.
        fuzzy_threshold: ``difflib.SequenceMatcher`` ratio at or above which
            two normalized sentences count as duplicates.
        fuzzy_window: How many of the most recently kept sentences are
            checked for near-duplicates. ``0`` (or a negative value)
            disables fuzzy matching; exact duplicates are still removed.

    Returns:
        The kept sentences, in original order, joined by single spaces.
    """
    if not text:
        return ""

    sentences = re.split(r"(?<=[.!?])\s+", text)
    kept = []
    kept_norms = []
    seen = set()

    for sentence in sentences:
        raw = sentence.strip()
        if not raw:
            continue

        norm = normalize_sentence(raw)
        if len(norm) < min_chars or norm in seen:
            continue

        # A plain `kept_norms[-fuzzy_window:]` would return the WHOLE list for
        # a window of 0 (because -0 == 0), so the disabled case is explicit.
        recent = kept_norms[-fuzzy_window:] if fuzzy_window > 0 else []
        if any(
            difflib.SequenceMatcher(None, norm, prev).ratio() >= fuzzy_threshold
            for prev in recent
        ):
            continue

        kept.append(raw)
        seen.add(norm)
        kept_norms.append(norm)

    return " ".join(kept)

def deduplicate_sentences(text):
    """Deduplicate the sentences of *text* with the default settings.

    Thin convenience wrapper around ``advanced_deduplicate_sentences`` so it
    can be passed directly to ``Series.apply``.
    """
    deduplicated = advanced_deduplicate_sentences(text)
    return deduplicated

def normalize_entities(text):
    """Rewrite common pathogen spellings in *text* to one canonical name.

    Matching is case-insensitive and the replacement is always lower-case.
    Each pattern is fenced with letter lookarounds so it only fires on a
    standalone term: without them the trailing ``e`` of the preceding word
    plus "coli..." (e.g. "the coliform count") was rewritten to
    "thescherichia coliform count".

    Args:
        text: String to normalize.

    Returns:
        The text with every listed variant replaced.
    """
    replacements = {
        r"(?<![a-z])e\.?\s*coli(?![a-z])": "escherichia coli",
        r"(?<![a-z])listeria spp(?![a-z])": "listeria monocytogenes",
        r"(?<![a-z])salmonella spp(?![a-z])": "salmonella",
    }
    for pattern, canonical in replacements.items():
        text = re.sub(pattern, canonical, text, flags=re.IGNORECASE)
    return text

# =====================================================
# BUILD PREPROCESSING VARIANTS (ABLATION)
# =====================================================

def build_preprocess_variants(df: pd.DataFrame):
    """Build every preprocessing variant compared in the ablation.

    Args:
        df: Frame with a ``title`` and a ``text`` column.

    Returns:
        Dict mapping variant name to a list of lower-cased documents, one
        per row of *df* and in row order:

        - ``v0_raw``: title + text, missing values replaced by ``""``.
        - ``v1_html``: HTML stripped from each column.
        - ``v2_basic``: ``v1_html`` plus whitespace cleanup per column.
        - ``v3_dedup``: sentence deduplication applied to the merged
          ``v2_basic`` document.
        - ``v4_entities``: entity normalization applied to the merged
          ``v2_basic`` document.
        - ``v5_full``: html -> basic -> dedup -> entities applied to title
          and text separately, then merged.
    """
    # raw
    v0_raw = df["title"].fillna("").astype(str) + " " + df["text"].fillna("").astype(str)

    # HTML extraction and basic cleaning are the expensive shared prefix of
    # v1, v2 and v5, so each is computed once per column and reused.
    title_extracted = df["title"].apply(extract_text_from_html)
    text_extracted = df["text"].apply(extract_text_from_html)
    title_basic = title_extracted.apply(basic_clean)
    text_basic = text_extracted.apply(basic_clean)

    # html only
    v1_html = title_extracted.astype(str) + " " + text_extracted.astype(str)

    # html + basic
    v2_basic = title_basic.astype(str) + " " + text_basic.astype(str)

    # html + basic + dedup (on the merged document)
    v3_dedup = v2_basic.apply(deduplicate_sentences)

    # html + basic + entities (on the merged document)
    v4_entities = v2_basic.apply(normalize_entities)

    # full pipeline, per column, then merge
    title_full = title_basic.apply(deduplicate_sentences).apply(normalize_entities)
    text_full = text_basic.apply(deduplicate_sentences).apply(normalize_entities)
    v5_full = title_full.astype(str) + " " + text_full.astype(str)

    variants = {
        "v0_raw": v0_raw,
        "v1_html": v1_html,
        "v2_basic": v2_basic,
        "v3_dedup": v3_dedup,
        "v4_entities": v4_entities,
        "v5_full": v5_full,
    }
    # Lower-case everything so variants differ only by the steps under test.
    return {name: series.str.lower().tolist() for name, series in variants.items()}

# =====================================================
# FAIR SAMPLING (same indices for every variant)
# =====================================================

def get_fair_sample_indices(n_total: int, sample_size: int = 500, seed: int = 42):
    """Draw a reproducible set of distinct row positions in ``[0, n_total)``.

    The requested size is capped at ``n_total``. Because the generator is
    seeded, the same arguments always give the same indices, which lets
    every variant be scored on identical rows.
    """
    n_draw = min(n_total, sample_size)
    generator = np.random.default_rng(seed)
    return generator.choice(n_total, size=n_draw, replace=False)

# =====================================================
# METRICS HELPERS (report-friendly)
# =====================================================

def trigram_dup_ratio(texts):
    """Return the share of word trigrams in the corpus that are repeats.

    Trigrams are built per document from whitespace-separated tokens and
    pooled over all documents. The result is ``1 - unique / total``, or
    ``0.0`` when no document has at least three tokens.
    """
    total = 0
    distinct = set()
    for doc in texts:
        tokens = doc.split()
        # zip over three shifted views yields each consecutive token triple
        for triple in zip(tokens, tokens[1:], tokens[2:]):
            distinct.add(" ".join(triple))
            total += 1
    if total == 0:
        return 0.0
    return 1.0 - (len(distinct) / total)

def avg_doc_ttr(texts):
    """Return the mean per-document type-token ratio.

    Each document's ratio is ``unique tokens / total tokens`` over
    whitespace-separated tokens. Documents with no tokens are skipped, and
    ``0.0`` is returned when nothing is left to average. Averaging per
    document is more stable than a corpus-level TTR.
    """
    tokenized = (doc.split() for doc in texts)
    ratios = [len(set(tokens)) / len(tokens) for tokens in tokenized if tokens]
    if not ratios:
        return 0.0
    return float(np.mean(ratios))

def noise_counts(texts):
    """Count leftover markup noise across all documents.

    Returns:
        A ``(html_tags, urls, whitespace_runs)`` tuple: total matches of
        ``<...>`` tags, ``http://`` / ``https://`` URLs, and runs of three
        or more whitespace characters.
    """
    def total_matches(pattern):
        # One full pass over `texts` per pattern.
        return sum(len(re.findall(pattern, doc)) for doc in texts)

    tag_total = total_matches(r"<[^>]+>")
    link_total = total_matches(r"http[s]?://\S+")
    gap_total = total_matches(r"\s{3,}")
    return tag_total, link_total, gap_total

# =====================================================
# EVALUATOR (SentenceTransformer)
# =====================================================

class PreprocessingEvaluatorST:
    """
    Evaluate preprocessing quality without fine-tuning using:
    - redundancy (text dup, trigram dup)
    - informativeness (avg length, avg doc TTR)
    - noise removal (html/url/multispace)
    - embedding diversity (variance) & semantic redundancy (avg cosine sim)
    """

    def __init__(self, st_model_name: str = "sentence-transformers/all-mpnet-base-v2", device: str = None):
        """Load the SentenceTransformer model.

        Args:
            st_model_name: Model identifier passed to ``SentenceTransformer``.
            device: Passed through unchanged; ``None`` leaves device
                selection to the library.
        """
        self.st_model = SentenceTransformer(st_model_name, device=device)

    def compute_embeddings(self, texts, batch_size=64, normalize=True):
        """Encode *texts* and return the embeddings as a NumPy array.

        Args:
            texts: List of strings to encode.
            batch_size: Encoding batch size.
            normalize: Whether to L2-normalize the embeddings, which makes
                a dot product equal to cosine similarity.
        """
        return self.st_model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=normalize,
        )

    def embedding_metrics(self, embs):
        """Summarize spread and pairwise similarity of an embedding matrix.

        Args:
            embs: Array with one embedding per row. The similarity values
                are cosine similarities only if the rows are L2-normalized.

        Returns:
            Dict with ``emb_variance`` (per-dimension variance averaged over
            dimensions), ``avg_pairwise_sim`` and ``std_pairwise_sim`` (mean
            and standard deviation of the dot product over all ordered pairs
            of distinct rows). With fewer than two rows there are no pairs,
            so both similarity values are reported as ``0.0`` instead of NaN;
            with no rows at all every value is ``0.0``.
        """
        embs = np.asarray(embs)
        if embs.size == 0:
            return {
                "emb_variance": 0.0,
                "avg_pairwise_sim": 0.0,
                "std_pairwise_sim": 0.0,
            }

        # variance across dims (diversity)
        emb_var = float(np.mean(np.var(embs, axis=0)))

        if embs.shape[0] < 2:
            # No off-diagonal entries: mean/std of an empty array would be NaN
            # and would turn the final quality score into NaN as well.
            return {
                "emb_variance": emb_var,
                "avg_pairwise_sim": 0.0,
                "std_pairwise_sim": 0.0,
            }

        sims = np.matmul(embs, embs.T)
        # Exclude the diagonal (self-similarity) from the statistics.
        off_diagonal = sims[~np.eye(sims.shape[0], dtype=bool)]
        return {
            "emb_variance": emb_var,
            "avg_pairwise_sim": float(np.mean(off_diagonal)),
            "std_pairwise_sim": float(np.std(off_diagonal)),
        }

    def evaluate(self, texts, sample_indices=None, batch_size=64):
        """Compute all metrics and the quality score for one text variant.

        Args:
            texts: Documents of one preprocessing variant.
            sample_indices: Optional positions into *texts*; when given only
                those documents are evaluated, in that order.
            batch_size: Encoding batch size.

        Returns:
            ``(metrics, score)`` where *metrics* is a dict of raw
            measurements and *score* is the value of ``quality_score``.
        """
        if sample_indices is not None:
            texts = [texts[i] for i in sample_indices]

        # Non-string entries are treated as empty documents.
        texts = [t.strip() if isinstance(t, str) else "" for t in texts]
        n = len(texts)

        # basic stats
        word_lens = [len(t.split()) for t in texts]
        avg_len = float(np.mean(word_lens)) if word_lens else 0.0
        std_len = float(np.std(word_lens)) if word_lens else 0.0

        # duplication
        text_dup = 1.0 - (len(set(texts)) / n) if n else 0.0
        tri_dup = trigram_dup_ratio(texts)

        # vocab (per-doc TTR)
        doc_ttr = avg_doc_ttr(texts)

        # noise
        html_cnt, url_cnt, multi_space = noise_counts(texts)

        # embeddings (skip the model entirely when there is nothing to encode)
        if n:
            embs = self.compute_embeddings(texts, batch_size=batch_size, normalize=True)
        else:
            embs = np.empty((0, 0), dtype=np.float32)
        emb_m = self.embedding_metrics(embs)

        metrics = {
            "n_sample": n,
            "avg_word_count": avg_len,
            "std_word_count": std_len,
            "avg_doc_ttr": doc_ttr,
            "text_dup_ratio": text_dup,
            "trigram_dup_ratio": tri_dup,
            "html_tag_count": html_cnt,
            "url_count": url_cnt,
            "multi_space_count": multi_space,
            **emb_m,
        }

        score = self.quality_score(metrics)
        return metrics, score

    def quality_score(self, m):
        """
        Report-friendly score in [0, 100] (relative ranking).
        Designed to NOT punish reasonable domain similarity too harshly.

        - Redundancy reduction (30)
        - Noise reduction (15)
        - Semantic redundancy (avg sim) (20)
        - Embedding diversity (variance) (15)
        - Length sanity (10)
        - Lexical diversity (doc TTR) (10)

        Args:
            m: Metrics dict as built by ``evaluate``.

        Returns:
            The summed component scores, clipped to [0, 100], as a float.
        """
        score = 0.0

        # 1) Redundancy (lower dup is better): 15 points each for exact-text
        # and trigram duplication.
        red = (1 - m["text_dup_ratio"]) * 15 + (1 - m["trigram_dup_ratio"]) * 15
        score += max(0.0, min(30.0, red))

        # 2) Noise: counts are divided by n_sample so samples of different
        # size are comparable; whitespace runs weigh a tenth of a tag/URL.
        n = max(1, m["n_sample"])
        noise_raw = (m["html_tag_count"] + m["url_count"] + 0.1 * m["multi_space_count"]) / n
        # Linear map: 0 noise items per doc -> 15 points, >= 2 per doc -> 0.
        noise = max(0.0, 15.0 * (1.0 - min(1.0, noise_raw / 2.0)))
        score += noise

        # 3) Semantic redundancy (avg cosine sim): lower is better, but
        # in-domain reports can be similar, so anything <= 0.80 gets full
        # marks and the score falls linearly to 0 at 0.95.
        sim = m["avg_pairwise_sim"]
        sim_score = 20.0 * (1.0 - np.clip((sim - 0.80) / (0.95 - 0.80), 0.0, 1.0))
        score += sim_score

        # 4) Embedding diversity: higher variance is better; full marks at a
        # mean per-dimension variance of 0.05.
        # NOTE(review): for L2-normalized embeddings the per-dimension
        # variances sum to at most 1, so their mean cannot exceed 1/dim
        # (about 0.0013 for 768 dimensions). With normalized embeddings this
        # component therefore stays near 0 of its 15 points; the 0.05
        # reference probably needs rescaling -- confirm before changing, as
        # it shifts every reported score.
        var = m["emb_variance"]
        var_score = 15.0 * np.clip(var / 0.05, 0.0, 1.0)
        score += var_score

        # 5) Length sanity: full marks for an average of 30-500 words,
        # proportionally less below 30 (over-aggressive cleaning) or
        # above 500.
        avg_len = m["avg_word_count"]
        if avg_len < 30:
            len_score = 10.0 * (avg_len / 30.0)
        elif avg_len > 500:
            len_score = 10.0 * (500.0 / avg_len)
        else:
            len_score = 10.0
        score += float(np.clip(len_score, 0.0, 10.0))

        # 6) Lexical diversity (avg doc TTR): triangular score peaking at
        # 0.45 and reaching 0 at 0.20 and 0.70; very high TTR can be noise.
        ttr = m["avg_doc_ttr"]
        ttr_score = 10.0 * (1.0 - min(1.0, abs(ttr - 0.45) / 0.25))
        score += float(np.clip(ttr_score, 0.0, 10.0))

        return float(np.clip(score, 0.0, 100.0))

# =====================================================
# RUN ABLATION + REPORT TABLE (pretty + CSV)
# =====================================================

def run_ablation_report(
    csv_path: str,
    st_model: str = "sentence-transformers/all-mpnet-base-v2",
    sample_size: int = 3000,
    seed: int = 42,
    output_csv: str = "preprocess_ablation_report.csv",
):
    """Score every preprocessing variant and write/print a comparison table.

    Args:
        csv_path: CSV file to load; it is handed to
            ``build_preprocess_variants``.
        st_model: SentenceTransformer model name used for the embeddings.
        sample_size: Number of rows to evaluate per variant.
        seed: Seed for the row sample, shared by all variants.
        output_csv: Path the rounded report table is written to.

    Returns:
        The rounded report DataFrame, sorted by score (best first).
    """
    frame = pd.read_csv(csv_path)
    variant_texts = build_preprocess_variants(frame)

    # One shared set of row positions so all variants see the same documents.
    shared_idx = get_fair_sample_indices(len(frame), sample_size=sample_size, seed=seed)

    scorer = PreprocessingEvaluatorST(st_model_name=st_model)

    records = []
    for variant_name, docs in variant_texts.items():
        variant_metrics, variant_score = scorer.evaluate(docs, sample_indices=shared_idx)
        records.append({"variant": variant_name, "score": variant_score, **variant_metrics})

    full_report = pd.DataFrame(records)

    # Columns shown in the report, in display order.
    shown_columns = [
        "variant", "score", "avg_word_count", "avg_doc_ttr",
        "text_dup_ratio", "trigram_dup_ratio",
        "html_tag_count", "url_count",
        "emb_variance", "avg_pairwise_sim", "std_pairwise_sim",
        "n_sample"
    ]
    # Decimal places per float column.
    decimals = {
        "score": 2,
        "avg_word_count": 2,
        "avg_doc_ttr": 4,
        "text_dup_ratio": 4,
        "trigram_dup_ratio": 4,
        "emb_variance": 6,
        "avg_pairwise_sim": 4,
        "std_pairwise_sim": 4,
    }
    report_view = full_report[shown_columns].copy()
    for column, places in decimals.items():
        report_view[column] = report_view[column].round(places)

    # Best variant first.
    report_view = report_view.sort_values("score", ascending=False).reset_index(drop=True)

    report_view.to_csv(output_csv, index=False)

    rule = "=" * 110
    print("\nPREPROCESSING ABLATION REPORT (SentenceTransformer)")
    print(rule)
    print(f"Model: {st_model} | sample_size={sample_size} | seed={seed}")
    print(f"Saved: {output_csv}")
    print(rule)
    print(report_view.to_string(index=False))
    print(rule)

    return report_view

# =====================================================
# MAIN
# =====================================================

if __name__ == "__main__":
    # Example run against a Kaggle input dataset.
    report = run_ablation_report(
        "/kaggle/input/foodhazard/aug_data1.csv",
        output_csv="preprocess_ablation_report.csv",
        seed=42,
        # Rows evaluated per variant. Raising it costs more encoding time and
        # a larger pairwise-similarity matrix.
        sample_size=3000,
        st_model="sentence-transformers/all-mpnet-base-v2",
    )


2025-12-31 15:55:19.896174: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767196520.201579      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767196520.290053      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767196521.068190      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767196521.068245      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767196521.068248      17 computation_placer.cc:177] computation placer alr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


PREPROCESSING ABLATION REPORT (SentenceTransformer)
Model: sentence-transformers/all-mpnet-base-v2 | sample_size=3000 | seed=42
Saved: preprocess_ablation_report.csv
    variant  score  avg_word_count  avg_doc_ttr  text_dup_ratio  trigram_dup_ratio  html_tag_count  url_count  emb_variance  avg_pairwise_sim  std_pairwise_sim  n_sample
v4_entities  67.48          290.35       0.6094          0.0023             0.6767               0        477      0.000769            0.4092            0.1217      3000
   v2_basic  67.48          290.33       0.6094          0.0023             0.6766               0        477      0.000769            0.4093            0.1217      3000
   v3_dedup  67.08          266.23       0.6297          0.0023             0.6497               0        474      0.000770            0.4087            0.1220      3000
    v5_full  66.83          263.36       0.6370          0.0023             0.6467               0        474      0.000773            0.4062            