In [1]:
# ============================================================
# ONE-CELL: FULL, CLEAN, CORRECT (VALID/TEST per model + VALID-tuned ensemble + TEST grid per group)
# - Preprocess EXACTLY like the reference ("standard") training code
# - Fixes DataLoader/collate_fn to avoid: 'list' object has no attribute 'to'
# - For EACH GROUP:
#     (1) Print single-model VALID + TEST (Product/Hazard/Avg macro-F1)
#     (2) Ensemble: tune best weight on VALID (by Avg F1) -> evaluate on TEST with that tuned weight
#     (3) Print TEST grid: scores for EACH weight (like you want)
# - Save CSVs: summary_results.csv, test_grid_results.csv, valid_grid_results.csv
# ============================================================

# =========================
# 0) IMPORTS + CONFIG
# =========================
import os, re, json, difflib
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from torch import nn
from bs4 import BeautifulSoup
from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.metrics import f1_score
from tqdm import tqdm
from huggingface_hub import hf_hub_download

# Set before any tokenizer is used; presumably silences the HF tokenizers
# fork/parallelism warning -- TODO confirm that is the intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Seed the NumPy and PyTorch global RNGs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Single global device used by model loading and inference below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------- Paths ----------
# TRAIN_JSON_PATH is only read to rebuild the label space (section 1);
# the two CSVs are the evaluation splits.
TRAIN_JSON_PATH = "/kaggle/input/chunked_deberta_512.json"
VALID_CSV_PATH  = "/kaggle/input/incidents_valid.csv"
TEST_CSV_PATH   = "/kaggle/input/incidents_test.csv"

# ---------- Chunking ----------
# MAX_TOKENS is both the chunk window passed to chunk_by_tokens and the
# max_length used when the chunks are tokenized for the model.
# CHUNK_OVERLAP / MIN_CHARS are forwarded to chunk_by_tokens.
# BATCH_SIZE is the DataLoader batch size used during inference.
MAX_TOKENS    = 512
CHUNK_OVERLAP = 64
MIN_CHARS     = 30
BATCH_SIZE    = 16

# ---------- Ensemble grid ----------
# Candidate weights for the first model of a pair (the second gets 1 - w).
WEIGHT_STEPS = np.linspace(0.0, 1.0, 21)  # 0.0..1.0 step 0.05

# ============================================================
# MODEL GROUPS (each group = 2 models to ensemble)
# ============================================================
# Each entry maps a role ("deberta" / "roberta") to a
# (fine-tuned Hub repo id, base backbone name) tuple, as unpacked by run_group.
MODEL_GROUPS = {
    "Original": {
        "deberta": ("tringooo/DeBERTa-original", "microsoft/deberta-v3-large"),
        "roberta": ("tringooo/RoBERTa-original", "FacebookAI/roberta-large"),
    },
    "Aug1": {
        "deberta": ("tringooo/DeBERTa-FoodHazardDetection", "microsoft/deberta-v3-large"),
        "roberta": ("tringooo/RoBERTa-FoodHazardDetection", "FacebookAI/roberta-large"),
    },
    "Aug2": {
        "deberta": ("tringooo/DeBERTa-aug2", "microsoft/deberta-v3-large"),
        "roberta": ("tringooo/RoBERTa-aug2", "FacebookAI/roberta-large"),
    },
}

# ============================================================
# 1) LABEL SPACE (same label-to-id mapping as at training time)
# ============================================================
# Rebuild the label vocabularies from the training JSON so that class ids
# used at evaluation time line up with the heads of the fine-tuned models.
with open(TRAIN_JSON_PATH, "r", encoding="utf-8") as f:
    train_raw = json.load(f)
train_df = pd.DataFrame(train_raw)

# Class ids follow the order of pandas' categorical `.cat.categories`.
# NOTE(review): this must be the same ordering the training script used --
# confirm against the training code.
prod_cat_type = train_df["product_category"].astype("category")
haz_cat_type  = train_df["hazard_category"].astype("category")

product_categories = list(prod_cat_type.cat.categories)
hazard_categories  = list(haz_cat_type.cat.categories)

# label string -> integer class id
product_label2id = {c: i for i, c in enumerate(product_categories)}
hazard_label2id  = {c: i for i, c in enumerate(hazard_categories)}

# Head sizes; also used to split the concatenated logits at inference time.
N_PRODUCT = len(product_categories)
N_HAZARD  = len(hazard_categories)

print("N_PRODUCT:", N_PRODUCT, "| N_HAZARD:", N_HAZARD)

# ============================================================
# 2) PREPROCESS (EXACTLY like the reference training code)
# ============================================================
def extract_text_from_html(html_content):
    """Return the visible text of an HTML fragment.

    Values that ``pd.isna`` reports as missing yield ``""``. Anything else is
    stringified, parsed with BeautifulSoup's ``html.parser`` backend, and its
    text nodes are joined with single spaces and stripped at both ends.
    """
    if not pd.isna(html_content):
        parsed = BeautifulSoup(str(html_content), "html.parser")
        visible = parsed.get_text(separator=" ")
        return visible.strip()
    return ""

def basic_clean(text: str) -> str:
    """Replace non-breaking spaces, collapse whitespace runs, and trim.

    Every run of whitespace (including newlines and tabs) becomes a single
    space; leading and trailing whitespace is removed.
    """
    without_nbsp = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()

# Regexes for boilerplate to blank out; currently empty, so only the
# whitespace normalization below has any effect.
NOISE_PATTERNS = []


def remove_recall_boilerplate(text: str) -> str:
    """Blank out every ``NOISE_PATTERNS`` match, then normalize whitespace.

    Each pattern is applied case-insensitively and its matches are replaced by
    a single space. The result has whitespace runs collapsed and is stripped.
    """
    stripped = text
    for noise_regex in NOISE_PATTERNS:
        stripped = re.sub(noise_regex, " ", stripped, flags=re.IGNORECASE)
    collapsed = re.sub(r"\s+", " ", stripped)
    return collapsed.strip()

def normalize_sentence_for_dedupe(s: str) -> str:
    """Build a canonical comparison key for a sentence.

    The sentence is lower-cased, digit runs of three or more are blanked,
    every character outside ``a-z 0-9 whitespace . , ; : ! ? - "`` is blanked,
    and whitespace is collapsed and trimmed.
    """
    # Applied in order; each rule is (pattern, replacement).
    rules = (
        (r"\d{3,}", " "),
        (r'[^a-z0-9\s.,;:!?\-"]+', " "),
        (r"\s+", " "),
    )
    key = s.lower()
    for pattern, replacement in rules:
        key = re.sub(pattern, replacement, key)
    return key.strip()

def advanced_deduplicate_sentences(text: str) -> str:
    """Drop repeated and near-repeated sentences, keeping first occurrences.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace. A
    sentence is discarded when its normalized key is shorter than 10
    characters, when the same key was already kept, or when its
    ``difflib.SequenceMatcher`` ratio against any of the 50 most recently kept
    keys is at least 0.95. Surviving sentences are joined with single spaces
    in their original (un-normalized) form.
    """
    kept_sentences = []
    kept_keys = []        # normalized keys, in acceptance order
    exact_keys = set()    # same keys, for O(1) exact-duplicate lookup

    for piece in re.split(r"(?<=[.!?])\s+", text):
        candidate = piece.strip()
        if not candidate:
            continue

        key = normalize_sentence_for_dedupe(candidate)
        if len(key) < 10 or key in exact_keys:
            continue

        # Fuzzy check only against a bounded window of recent sentences.
        near_duplicate = any(
            difflib.SequenceMatcher(None, key, earlier).ratio() >= 0.95
            for earlier in kept_keys[-50:]
        )
        if near_duplicate:
            continue

        kept_sentences.append(candidate)
        exact_keys.add(key)
        kept_keys.append(key)

    return " ".join(kept_sentences).strip()

# Case-insensitive regex -> canonical entity name.
ENTITY_REPLACEMENTS = {
    r"\be\.?\s*coli\b": "escherichia coli",
    r"\bc\.?\s*botulinum\b": "clostridium botulinum",
    r"\blisteria\s+spp\b": "listeria monocytogenes",
    r"\bsoy\s+proteins?\b": "soybeans",
}


def normalize_entities(text: str) -> str:
    """Rewrite known entity abbreviations/variants to their canonical names.

    Every pattern in ``ENTITY_REPLACEMENTS`` is applied in dictionary order,
    ignoring case; text outside the matches is left untouched.
    """
    result = text
    for pattern in ENTITY_REPLACEMENTS:
        canonical = ENTITY_REPLACEMENTS[pattern]
        result = re.sub(pattern, canonical, result, flags=re.IGNORECASE)
    return result

def clean_foodhazard_text(text: str) -> str:
    """Run the full text-cleaning pipeline on one value.

    Non-string input is coerced first: missing values become ``""`` and
    anything else is stringified. The text then goes through ``basic_clean``,
    ``remove_recall_boilerplate``, ``advanced_deduplicate_sentences`` and
    ``normalize_entities`` in that order, followed by a final whitespace
    collapse and trim.
    """
    if isinstance(text, str):
        current = text
    elif pd.isna(text):
        current = ""
    else:
        current = str(text)

    pipeline = (
        basic_clean,
        remove_recall_boilerplate,
        advanced_deduplicate_sentences,
        normalize_entities,
    )
    for stage in pipeline:
        current = stage(current)

    return re.sub(r"\s+", " ", current).strip()

# ============================================================
# 3) CHUNKING + SOFTMAX (EXACTLY like the reference training code)
# ============================================================
def chunk_by_tokens(text, tokenizer, max_tokens=512, overlap=64, min_chars=30):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is tokenized without special tokens, then a window of
    ``max_tokens`` ids is slid over the ids with a stride of
    ``max_tokens - overlap``. Each window is decoded back to text,
    whitespace-normalized, and kept only if it has at least ``min_chars``
    characters.

    Args:
        text: Value to chunk; it is stringified and stripped first.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            exposing ``decode(ids, skip_special_tokens=True)``.
        max_tokens: Window size in tokens; must be positive.
        overlap: Tokens shared by consecutive windows; must be smaller than
            ``max_tokens``.
        min_chars: Minimum decoded length for a chunk to be kept.

    Returns:
        List of chunk strings (empty for blank text).

    Raises:
        ValueError: If ``max_tokens`` is not positive or the stride
            ``max_tokens - overlap`` is not positive. The original loop never
            advanced in that case and spun forever.
    """
    stride = max_tokens - overlap
    if max_tokens <= 0 or stride <= 0:
        raise ValueError(
            f"chunk_by_tokens needs max_tokens > 0 and overlap < max_tokens "
            f"(got max_tokens={max_tokens}, overlap={overlap})"
        )

    text = str(text).strip()
    if not text:
        return []

    token_ids = tokenizer(
        text,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )["input_ids"]

    chunks = []
    # Window starts are 0, stride, 2*stride, ... exactly as before, so a short
    # trailing window made only of overlap tokens is still emitted when it
    # passes the min_chars filter (kept for parity with existing chunking).
    for start in range(0, len(token_ids), stride):
        window = token_ids[start:start + max_tokens]
        decoded = tokenizer.decode(window, skip_special_tokens=True)
        decoded = re.sub(r"\s+", " ", decoded).strip()
        if len(decoded) >= min_chars:
            chunks.append(decoded)
    return chunks

def softmax_np(x):
    """Numerically stable softmax over the last axis of ``x``.

    The per-row maximum is subtracted before exponentiation so large logits
    do not overflow; the result sums to 1 along the last axis.
    """
    row_max = np.max(x, axis=-1, keepdims=True)
    exp_shifted = np.exp(x - row_max)
    normaliser = np.sum(exp_shifted, axis=-1, keepdims=True)
    return exp_shifted / normaliser

# ============================================================
# 4) MODEL MULTI-TASK + LOADER (supports safetensors)
# ============================================================
class MultiTaskClassifier(nn.Module):
    """Shared transformer backbone with two linear classification heads.

    ``forward`` returns a single tensor holding the product logits followed by
    the hazard logits along the last dimension, so callers split it at
    ``n_product``.
    """

    def __init__(self, base_model_name, n_product, n_hazard, dropout=0.1):
        """Load the backbone named ``base_model_name`` and create both heads.

        Submodule names (``backbone``, ``dropout``, ``product_head``,
        ``hazard_head``) are part of the checkpoint key layout.
        """
        super().__init__()
        from transformers import AutoModel  # imported lazily, only when a model is built

        encoder = AutoModel.from_pretrained(base_model_name)
        width = encoder.config.hidden_size

        self.backbone = encoder
        self.dropout = nn.Dropout(dropout)
        self.product_head = nn.Linear(width, n_product)
        self.hazard_head = nn.Linear(width, n_hazard)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Return concatenated ``[product_logits, hazard_logits]``.

        Only ``input_ids`` and ``attention_mask`` reach the backbone; extra
        keyword arguments are accepted and ignored. The hidden state at
        sequence position 0 is used as the pooled representation.
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.last_hidden_state[:, 0])
        per_task_logits = [head(pooled) for head in (self.product_head, self.hazard_head)]
        return torch.cat(per_task_logits, dim=-1)

def load_multitask_from_hub(repo_id: str, base_model: str):
    """Load a fine-tuned tokenizer + ``MultiTaskClassifier`` from the HF Hub.

    The weights file is looked up as ``model.safetensors`` first and
    ``pytorch_model.bin`` second. The state dict is loaded non-strictly into
    a freshly built ``MultiTaskClassifier`` on top of ``base_model``; the
    model is then moved to the global ``device`` and put in eval mode.

    Args:
        repo_id: Hub repository holding the tokenizer and fine-tuned weights.
        base_model: Backbone name passed to ``MultiTaskClassifier``.

    Returns:
        ``(tokenizer, model)``.

    Raises:
        FileNotFoundError: If neither weights file could be downloaded. The
            last underlying download error is chained as ``__cause__`` so
            network/auth problems are not mistaken for a missing file.
    """
    # NOTE(review): trust_remote_code=True lets the repo execute its own code
    # while building the tokenizer -- only use with repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    weights_path = None
    last_error = None
    for fname in ("model.safetensors", "pytorch_model.bin"):
        try:
            weights_path = hf_hub_download(repo_id=repo_id, filename=fname)
        except Exception as err:
            # Best effort: remember why this candidate failed, try the next.
            last_error = err
        else:
            break
    if weights_path is None:
        raise FileNotFoundError(f"Không tìm thấy weights trong repo: {repo_id}") from last_error

    model = MultiTaskClassifier(base_model_name=base_model, n_product=N_PRODUCT, n_hazard=N_HAZARD)

    if weights_path.endswith(".safetensors"):
        from safetensors.torch import load_file as safe_load_file
        state_dict = safe_load_file(weights_path)
    else:
        # NOTE(review): torch.load unpickles the file; a .bin checkpoint from
        # an untrusted repo can run arbitrary code. Prefer safetensors.
        state_dict = torch.load(weights_path, map_location="cpu")

    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print(f"[{repo_id}] loaded. missing={len(missing)} unexpected={len(unexpected)}")
    # strict=False hides key mismatches; name them so a randomly initialised
    # head or backbone does not go unnoticed.
    if missing:
        print(f"  missing keys (left at fresh initialisation): {list(missing)[:10]}")
    if unexpected:
        print(f"  unexpected keys (ignored): {list(unexpected)[:10]}")

    model.to(device).eval()
    return tokenizer, model

# ============================================================
# 5) DATASET BUILD FROM CSV (EXACTLY like the reference training code)
# ============================================================
# Cleaned frames keyed by CSV path. Cleaning does not depend on the tokenizer,
# so the same split is cleaned once even though it is chunked per tokenizer.
_CLEANED_CSV_CACHE = {}


def _load_cleaned_csv(csv_path: str):
    """Read ``csv_path`` and add title_clean / text_clean / merged_text (cached)."""
    if csv_path not in _CLEANED_CSV_CACHE:
        df = pd.read_csv(csv_path)
        df["title_clean"] = df["title"].apply(extract_text_from_html).apply(clean_foodhazard_text)
        df["text_clean"] = df["text"].apply(extract_text_from_html).apply(clean_foodhazard_text)
        df["merged_text"] = (
            df["title_clean"].fillna("") + " " + df["text_clean"].fillna("")
        ).str.lower().str.strip()
        _CLEANED_CSV_CACHE[csv_path] = df
    return _CLEANED_CSV_CACHE[csv_path]


def build_chunk_dataset_from_csv(csv_path: str, tokenizer):
    """Turn an incidents CSV into a tokenized chunk dataset plus gold labels.

    Each row's cleaned title and text are merged and lower-cased, split with
    ``chunk_by_tokens`` and tokenized to ``MAX_TOKENS`` with max-length
    padding. Rows whose ``product-category`` or ``hazard-category`` is not in
    the training label space are skipped. Rows that yield no chunk get a
    single ``"[EMPTY]"`` placeholder chunk so they still receive a prediction.

    Args:
        csv_path: Path of the CSV to load.
        tokenizer: Tokenizer used both for chunking and for encoding.

    Returns:
        ``(tokenized, y_prod, y_haz)``: the tokenized ``Dataset`` with a
        ``doc_id`` per chunk, and two dicts mapping doc id to class id. The
        doc id is the ``stt`` column when present, otherwise the row index.

    Raises:
        ValueError: If no row produced a chunk (for example every label is
            unknown). Previously this surfaced as an opaque ``KeyError`` on
            the empty chunk frame.
    """
    df = _load_cleaned_csv(csv_path)
    stt_col = "stt" if "stt" in df.columns else None

    chunk_records = []
    y_prod = {}
    y_haz = {}
    skipped_unknown = 0

    for idx, row in df.iterrows():
        doc_id = int(row[stt_col]) if stt_col is not None else int(idx)

        prod_cat = str(row.get("product-category", "")).strip()
        haz_cat = str(row.get("hazard-category", "")).strip()
        if (prod_cat not in product_label2id) or (haz_cat not in hazard_label2id):
            skipped_unknown += 1
            continue

        y_prod[doc_id] = product_label2id[prod_cat]
        y_haz[doc_id] = hazard_label2id[haz_cat]

        text = str(row.get("merged_text", "") or "").strip()
        chunks = chunk_by_tokens(text, tokenizer, MAX_TOKENS, CHUNK_OVERLAP, MIN_CHARS) if text else []
        if not chunks:
            # Keep the document in the evaluation set with a placeholder chunk.
            chunks = ["[EMPTY]"]
        chunk_records.extend({"doc_id": doc_id, "text": ch} for ch in chunks)

    if not chunk_records:
        raise ValueError(
            f"No usable rows in {csv_path}: {skipped_unknown} row(s) skipped "
            f"because their labels are outside the training label space"
        )

    chunk_df = pd.DataFrame(chunk_records, columns=["doc_id", "text"])
    ds = Dataset.from_pandas(chunk_df[["doc_id", "text"]])

    def tok_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_TOKENS)

    tokenized = ds.map(tok_fn, batched=True)

    print(f"[{os.path.basename(csv_path)}] docs={len(y_prod)} chunks={len(chunk_records)} skipped_unknown={skipped_unknown}")
    return tokenized, y_prod, y_haz

# ============================================================
# 6) INFERENCE -> DOC PROBS (mean pooling) (FIXED collate_fn)
# ============================================================
@torch.no_grad()
def infer_doc_probs(tokenized_ds: Dataset, model: nn.Module):
    """Run ``model`` over every chunk and mean-pool probabilities per document.

    For each chunk the concatenated logits are split at ``N_PRODUCT`` into
    product and hazard parts, each is soft-maxed, and the per-chunk
    probabilities are averaged over all chunks that share a ``doc_id``.

    Args:
        tokenized_ds: Dataset with ``doc_id``, ``input_ids``,
            ``attention_mask`` and ``text`` columns.
        model: Module returning ``[product_logits, hazard_logits]``
            concatenated on the last axis. It is switched to eval mode here
            so dropout can never perturb the probabilities, whatever state
            the caller left it in.

    Returns:
        ``(doc_prod, doc_haz)``: dicts mapping doc id to the averaged
        probability vector for each task.
    """
    from torch.utils.data import DataLoader

    # Only these columns are consumed below; tensorising anything else
    # (token_type_ids, pandas index columns, ...) is wasted work and breaks on
    # non-numeric columns.
    needed_columns = ("doc_id", "input_ids", "attention_mask")

    def collate_fn(features):
        return {
            key: torch.tensor([feat[key] for feat in features], dtype=torch.long)
            for key in needed_columns
        }

    model.eval()

    ds_no_text = tokenized_ds.remove_columns(["text"])
    loader = DataLoader(ds_no_text, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    prod_sum = defaultdict(lambda: np.zeros(N_PRODUCT, dtype=np.float64))
    haz_sum = defaultdict(lambda: np.zeros(N_HAZARD, dtype=np.float64))
    cnt = defaultdict(int)

    for batch in tqdm(loader, desc="Infer", leave=False):
        doc_ids = batch["doc_id"].tolist()

        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # .float() is a no-op for fp32 models and keeps the softmax out of
        # half precision otherwise.
        logits = logits.float().cpu().numpy()

        prod_probs = softmax_np(logits[:, :N_PRODUCT])
        haz_probs = softmax_np(logits[:, N_PRODUCT:])

        for d, pp, hp in zip(doc_ids, prod_probs, haz_probs):
            prod_sum[d] += pp
            haz_sum[d] += hp
            cnt[d] += 1

    # Every key in cnt has been incremented at least once, so cnt[d] >= 1.
    doc_prod = {d: prod_sum[d] / cnt[d] for d in cnt}
    doc_haz = {d: haz_sum[d] / cnt[d] for d in cnt}
    return doc_prod, doc_haz

# ============================================================
# 7) EVAL + ENSEMBLE UTILS
# ============================================================
def eval_probs(doc_probs: dict, y_true: dict):
    """Macro-F1 of arg-max predictions over documents present in both dicts.

    Args:
        doc_probs: doc id -> probability (or score) vector.
        y_true: doc id -> gold class id.

    Returns:
        Macro-averaged F1 as a float. When the two dicts share no document
        there is nothing to score: a warning is printed and ``0.0`` is
        returned instead of handing empty lists to ``f1_score``.
    """
    doc_ids = sorted(set(doc_probs.keys()) & set(y_true.keys()))
    if not doc_ids:
        print("[eval_probs] WARNING: no overlapping doc ids between predictions and labels; returning 0.0")
        return 0.0

    y_t = [int(y_true[d]) for d in doc_ids]
    y_p = [int(np.argmax(doc_probs[d])) for d in doc_ids]
    return float(f1_score(y_t, y_p, average="macro"))

def avg_task_f1(prod_probs: dict, haz_probs: dict, y_prod: dict, y_haz: dict):
    """Score both tasks with ``eval_probs`` and average them.

    Returns:
        ``(product_f1, hazard_f1, mean_of_the_two)``.
    """
    task_inputs = ((prod_probs, y_prod), (haz_probs, y_haz))
    product_f1, hazard_f1 = [eval_probs(probs, gold) for probs, gold in task_inputs]
    return product_f1, hazard_f1, 0.5 * (product_f1 + hazard_f1)

def ensemble_predict_2models(doc_probs_a: dict, doc_probs_b: dict, w_a: float):
    """Blend two models' per-document probabilities with weights w_a / 1 - w_a.

    Only documents present in both inputs are returned. Each blended vector
    is renormalized to sum to 1 unless its sum is not positive, in which case
    it is returned as is.
    """
    weight_b = 1.0 - w_a

    def _blend(doc_id):
        mixed = w_a * doc_probs_a[doc_id] + weight_b * doc_probs_b[doc_id]
        total = float(mixed.sum())
        return mixed / total if total > 0 else mixed

    shared_ids = set(doc_probs_a.keys()) & set(doc_probs_b.keys())
    return {doc_id: _blend(doc_id) for doc_id in shared_ids}

def tune_best_weight_on_valid(vp_a, vp_b, vh_a, vh_b, vy_p, vy_h):
    """Grid-search the ensemble weight of model A on the validation split.

    Every weight in ``WEIGHT_STEPS`` is used to blend the product and hazard
    probabilities of the two models; the blend is scored with
    ``avg_task_f1``. The winner is the first weight with the strictly highest
    average F1.

    Returns:
        ``(best, grid)``: ``best`` is a dict with keys ``w``, ``prod_f1``,
        ``haz_f1``, ``avg_f1``; ``grid`` is a DataFrame with one such row per
        weight, sorted by ``w``.
    """
    records = []
    for weight in WEIGHT_STEPS:
        blended_prod = ensemble_predict_2models(vp_a, vp_b, weight)
        blended_haz = ensemble_predict_2models(vh_a, vh_b, weight)
        prod_f1, haz_f1, mean_f1 = avg_task_f1(blended_prod, blended_haz, vy_p, vy_h)
        records.append({"w": float(weight), "prod_f1": prod_f1, "haz_f1": haz_f1, "avg_f1": mean_f1})

    # Strict ">" keeps the earliest (smallest) weight on ties.
    winner = {"w": None, "prod_f1": -1, "haz_f1": -1, "avg_f1": -1}
    for record in records:
        if record["avg_f1"] > winner["avg_f1"]:
            winner = dict(record)

    return winner, pd.DataFrame(records).sort_values("w")

def test_grid(tp_a, tp_b, th_a, th_b, ty_p, ty_h):
    """Score the two-model ensemble on the test split for every grid weight.

    Returns:
        DataFrame sorted by ``w_deberta`` with columns ``w_deberta``,
        ``w_roberta``, ``TEST_Product_F1``, ``TEST_Hazard_F1`` and
        ``TEST_Avg_F1``; one row per value in ``WEIGHT_STEPS``.
    """

    def _score(weight):
        blended_prod = ensemble_predict_2models(tp_a, tp_b, weight)
        blended_haz = ensemble_predict_2models(th_a, th_b, weight)
        prod_f1, haz_f1, mean_f1 = avg_task_f1(blended_prod, blended_haz, ty_p, ty_h)
        return {
            "w_deberta": float(weight),
            "w_roberta": float(1.0 - weight),
            "TEST_Product_F1": float(prod_f1),
            "TEST_Hazard_F1":  float(haz_f1),
            "TEST_Avg_F1":     float(mean_f1),
        }

    grid = pd.DataFrame([_score(weight) for weight in WEIGHT_STEPS])
    return grid.sort_values("w_deberta")

# ============================================================
# 8) RUN ONE GROUP (prints EXACTly what you asked)
# ============================================================
def run_group(group_name: str, deberta_spec, roberta_spec):
    """Evaluate one DeBERTa/RoBERTa pair end to end and print the report.

    Steps, in order: load both models; build + infer on VALID; print
    single-model VALID scores; tune the ensemble weight on VALID; build +
    infer on TEST; print single-model TEST scores; print the TEST score of
    the VALID-tuned ensemble; print the TEST score for every grid weight.

    Args:
        group_name: Label used in the printed report and result tables.
        deberta_spec: ``(hub_repo_id, base_model_name)`` for the DeBERTa model.
        roberta_spec: ``(hub_repo_id, base_model_name)`` for the RoBERTa model.

    Returns:
        ``(summary_df, valid_grid, test_grid_df)``: a three-row summary
        (two single models + tuned ensemble), the VALID weight grid and the
        TEST weight grid, the two grids carrying a leading ``Group`` column.
    """

    def banner(title, fill="-", width=80):
        # Blank line, rule, title, rule.
        rule = fill * width
        print("\n" + rule)
        print(title)
        print(rule)

    def build_and_infer(csv_path):
        # Chunk the split once per tokenizer, keep labels for docs both
        # builds agree on, then run each model on its own chunks.
        ds_d, labels_p_d, labels_h_d = build_chunk_dataset_from_csv(csv_path, tok_d)
        ds_r, labels_p_r, _labels_h_r = build_chunk_dataset_from_csv(csv_path, tok_r)

        shared_docs = sorted(set(labels_p_d.keys()) & set(labels_p_r.keys()))
        gold_p = {d: labels_p_d[d] for d in shared_docs}
        gold_h = {d: labels_h_d[d] for d in shared_docs}

        probs_d = infer_doc_probs(ds_d, mod_d)
        probs_r = infer_doc_probs(ds_r, mod_r)
        return probs_d, probs_r, gold_p, gold_h

    def print_single_scores(split, deberta_scores, roberta_scores):
        banner(f"{split}: SINGLE MODEL PERFORMANCE")
        for tag, (pf, hf, af) in (("DeBERTa", deberta_scores), ("RoBERTa", roberta_scores)):
            print(f"[{split}][{tag}] Product F1: {pf:.4f} | Hazard F1: {hf:.4f} | Avg: {af:.4f}")

    def summary_row(model_label, valid_scores, test_scores):
        v_prod, v_haz, v_avg = valid_scores
        t_prod, t_haz, t_avg = test_scores
        return {
            "Group": group_name,
            "Model": model_label,
            "VALID_Product_F1": v_prod, "VALID_Hazard_F1": v_haz, "VALID_Avg_F1": v_avg,
            "TEST_Product_F1":  t_prod, "TEST_Hazard_F1":  t_haz, "TEST_Avg_F1":  t_avg,
        }

    banner(f"GROUP: {group_name}", fill="=", width=100)

    # ---- load both models of the pair
    repo_d, base_d = deberta_spec
    repo_r, base_r = roberta_spec

    print("\nLoading DeBERTa:", repo_d)
    tok_d, mod_d = load_multitask_from_hub(repo_d, base_d)

    print("\nLoading RoBERTa:", repo_r)
    tok_r, mod_r = load_multitask_from_hub(repo_r, base_r)

    # ---- VALID: inference + single-model scores
    banner("VALID: BUILD + INFER")
    (vp_d, vh_d), (vp_r, vh_r), vy_p, vy_h = build_and_infer(VALID_CSV_PATH)

    valid_scores_d = avg_task_f1(vp_d, vh_d, vy_p, vy_h)
    valid_scores_r = avg_task_f1(vp_r, vh_r, vy_p, vy_h)
    print_single_scores("VALID", valid_scores_d, valid_scores_r)

    # ---- VALID: pick the ensemble weight
    banner("ENSEMBLE: TUNE BEST WEIGHT ON VALID (by Avg F1)")
    best_valid, valid_grid = tune_best_weight_on_valid(vp_d, vp_r, vh_d, vh_r, vy_p, vy_h)
    tuned_w = best_valid["w"]
    print(f"Best VALID weight: w_DeBERTa={tuned_w:.2f}, w_RoBERTa={1.0-tuned_w:.2f}")
    print(f"VALID Ensemble Product F1: {best_valid['prod_f1']:.4f}")
    print(f"VALID Ensemble Hazard  F1: {best_valid['haz_f1']:.4f}")
    print(f"VALID Ensemble Avg     F1: {best_valid['avg_f1']:.4f}")

    # ---- TEST: inference + single-model scores
    banner("TEST: BUILD + INFER")
    (tp_d, th_d), (tp_r, th_r), ty_p, ty_h = build_and_infer(TEST_CSV_PATH)

    test_scores_d = avg_task_f1(tp_d, th_d, ty_p, ty_h)
    test_scores_r = avg_task_f1(tp_r, th_r, ty_p, ty_h)
    print_single_scores("TEST", test_scores_d, test_scores_r)

    # ---- TEST: ensemble at the weight chosen on VALID
    banner("TEST: ENSEMBLE EVAL WITH VALID-TUNED WEIGHT")
    tuned_scores = avg_task_f1(
        ensemble_predict_2models(tp_d, tp_r, tuned_w),
        ensemble_predict_2models(th_d, th_r, tuned_w),
        ty_p,
        ty_h,
    )
    tuned_pf, tuned_hf, tuned_af = tuned_scores
    print(f"TEST (tuned) Product F1: {tuned_pf:.4f}")
    print(f"TEST (tuned) Hazard  F1: {tuned_hf:.4f}")
    print(f"TEST (tuned) Avg     F1: {tuned_af:.4f}")

    # ---- TEST: full weight grid (one row per weight)
    banner("TEST: ENSEMBLE GRID (score for EACH weight)")
    test_grid_df = test_grid(tp_d, tp_r, th_d, th_r, ty_p, ty_h)
    print(test_grid_df.to_string(index=False))

    # ---- compact summary: two single models + tuned ensemble
    ensemble_valid_scores = (best_valid["prod_f1"], best_valid["haz_f1"], best_valid["avg_f1"])
    summary_df = pd.DataFrame([
        summary_row("DeBERTa (single)", valid_scores_d, test_scores_d),
        summary_row("RoBERTa (single)", valid_scores_r, test_scores_r),
        summary_row(f"Ensemble(VALID-tuned w={tuned_w:.2f})", ensemble_valid_scores, tuned_scores),
    ])

    # ---- tag both grids with the group name (on copies)
    valid_grid = valid_grid.copy()
    valid_grid.insert(0, "Group", group_name)

    test_grid_df = test_grid_df.copy()
    test_grid_df.insert(0, "Group", group_name)

    return summary_df, valid_grid, test_grid_df


# ============================================================
# 9) RUN ALL GROUPS + FINAL TABLES + SAVE CSV
# ============================================================
# Per-group result tables, collected in MODEL_GROUPS order.
all_summary, all_valid_grids, all_test_grids = [], [], []

for gname, specs in MODEL_GROUPS.items():
    group_tables = run_group(gname, specs["deberta"], specs["roberta"])
    for bucket, table in zip((all_summary, all_valid_grids, all_test_grids), group_tables):
        bucket.append(table)

final_summary_df = pd.concat(all_summary, ignore_index=True)
final_valid_grid_df = pd.concat(all_valid_grids, ignore_index=True)
final_test_grid_df  = pd.concat(all_test_grids,  ignore_index=True)


def _round_metric_columns(frame, label_columns):
    """Cast every non-label column of ``frame`` to float, rounded to 4 places (in place)."""
    for col in frame.columns:
        if col not in label_columns:
            frame[col] = frame[col].astype(float).round(4)


# rounding
_round_metric_columns(final_summary_df, ["Group", "Model"])
_round_metric_columns(final_valid_grid_df, ["Group"])
_round_metric_columns(final_test_grid_df, ["Group"])

# Full summary, but only the first 10 rows of each grid, are echoed here;
# the complete grids go to the CSV files below.
for heading, table in (
    ("FINAL SUMMARY (single models + VALID-tuned ensemble per group)", final_summary_df),
    ("FINAL VALID GRID (ALL GROUPS, ALL WEIGHTS)", final_valid_grid_df.head(10)),
    ("FINAL TEST GRID (ALL GROUPS, ALL WEIGHTS)", final_test_grid_df.head(10)),
):
    print("\n" + "="*100)
    print(heading)
    print("="*100)
    print(table.to_string(index=False))

# save
saved_tables = (
    (final_summary_df, "summary_results.csv"),
    (final_valid_grid_df, "valid_grid_results.csv"),
    (final_test_grid_df, "test_grid_results.csv"),
)
for table, filename in saved_tables:
    table.to_csv(filename, index=False)

print("\nSaved files:")
for _table, filename in saved_tables:
    print(f" - {filename}")


Device: cuda
N_PRODUCT: 22 | N_HAZARD: 10

GROUP: Original

Loading DeBERTa: tringooo/DeBERTa-original


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

2026-01-02 13:19:17.105950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767359957.288930      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767359957.341726      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767359957.765264      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767359957.765296      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767359957.765299      24 computation_placer.cc:177] computation placer alr

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

[tringooo/DeBERTa-original] loaded. missing=0 unexpected=0

Loading RoBERTa: tringooo/RoBERTa-original


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[tringooo/RoBERTa-original] loaded. missing=0 unexpected=0

--------------------------------------------------------------------------------
VALID: BUILD + INFER
--------------------------------------------------------------------------------


Map:   0%|          | 0/813 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=813 skipped_unknown=0


Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/858 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=858 skipped_unknown=0


                                                      


--------------------------------------------------------------------------------
VALID: SINGLE MODEL PERFORMANCE
--------------------------------------------------------------------------------
[VALID][DeBERTa] Product F1: 0.6721 | Hazard F1: 0.8093 | Avg: 0.7407
[VALID][RoBERTa] Product F1: 0.7023 | Hazard F1: 0.8117 | Avg: 0.7570

--------------------------------------------------------------------------------
ENSEMBLE: TUNE BEST WEIGHT ON VALID (by Avg F1)
--------------------------------------------------------------------------------
Best VALID weight: w_DeBERTa=0.55, w_RoBERTa=0.45
VALID Ensemble Product F1: 0.7177
VALID Ensemble Hazard  F1: 0.8233
VALID Ensemble Avg     F1: 0.7705

--------------------------------------------------------------------------------
TEST: BUILD + INFER
--------------------------------------------------------------------------------


Map:   0%|          | 0/1469 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1469 skipped_unknown=0


Map:   0%|          | 0/1540 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1540 skipped_unknown=0


                                                      


--------------------------------------------------------------------------------
TEST: SINGLE MODEL PERFORMANCE
--------------------------------------------------------------------------------
[TEST][DeBERTa] Product F1: 0.7763 | Hazard F1: 0.7465 | Avg: 0.7614
[TEST][RoBERTa] Product F1: 0.7731 | Hazard F1: 0.7574 | Avg: 0.7653

--------------------------------------------------------------------------------
TEST: ENSEMBLE EVAL WITH VALID-TUNED WEIGHT
--------------------------------------------------------------------------------
TEST (tuned) Product F1: 0.8128
TEST (tuned) Hazard  F1: 0.7746
TEST (tuned) Avg     F1: 0.7937

--------------------------------------------------------------------------------
TEST: ENSEMBLE GRID (score for EACH weight)
--------------------------------------------------------------------------------
 w_deberta  w_roberta  TEST_Product_F1  TEST_Hazard_F1  TEST_Avg_F1
      0.00       1.00         0.773125        0.757380     0.765253
      0.05       0.95 

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

[tringooo/DeBERTa-FoodHazardDetection] loaded. missing=0 unexpected=0

Loading RoBERTa: tringooo/RoBERTa-FoodHazardDetection


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[tringooo/RoBERTa-FoodHazardDetection] loaded. missing=0 unexpected=0

--------------------------------------------------------------------------------
VALID: BUILD + INFER
--------------------------------------------------------------------------------


Map:   0%|          | 0/813 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=813 skipped_unknown=0


Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/858 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=858 skipped_unknown=0


                                                      


--------------------------------------------------------------------------------
VALID: SINGLE MODEL PERFORMANCE
--------------------------------------------------------------------------------
[VALID][DeBERTa] Product F1: 0.7340 | Hazard F1: 0.8186 | Avg: 0.7763
[VALID][RoBERTa] Product F1: 0.7413 | Hazard F1: 0.8455 | Avg: 0.7934

--------------------------------------------------------------------------------
ENSEMBLE: TUNE BEST WEIGHT ON VALID (by Avg F1)
--------------------------------------------------------------------------------
Best VALID weight: w_DeBERTa=0.55, w_RoBERTa=0.45
VALID Ensemble Product F1: 0.7735
VALID Ensemble Hazard  F1: 0.8783
VALID Ensemble Avg     F1: 0.8259

--------------------------------------------------------------------------------
TEST: BUILD + INFER
--------------------------------------------------------------------------------


Map:   0%|          | 0/1469 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1469 skipped_unknown=0


Map:   0%|          | 0/1540 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1540 skipped_unknown=0


                                                      


--------------------------------------------------------------------------------
TEST: SINGLE MODEL PERFORMANCE
--------------------------------------------------------------------------------
[TEST][DeBERTa] Product F1: 0.7453 | Hazard F1: 0.7246 | Avg: 0.7349
[TEST][RoBERTa] Product F1: 0.7896 | Hazard F1: 0.8159 | Avg: 0.8027

--------------------------------------------------------------------------------
TEST: ENSEMBLE EVAL WITH VALID-TUNED WEIGHT
--------------------------------------------------------------------------------
TEST (tuned) Product F1: 0.8161
TEST (tuned) Hazard  F1: 0.7524
TEST (tuned) Avg     F1: 0.7842

--------------------------------------------------------------------------------
TEST: ENSEMBLE GRID (score for EACH weight)
--------------------------------------------------------------------------------
 w_deberta  w_roberta  TEST_Product_F1  TEST_Hazard_F1  TEST_Avg_F1
      0.00       1.00         0.789575        0.815907     0.802741
      0.05       0.95 

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

[tringooo/DeBERTa-aug2] loaded. missing=0 unexpected=0

Loading RoBERTa: tringooo/RoBERTa-aug2


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[tringooo/RoBERTa-aug2] loaded. missing=0 unexpected=0

--------------------------------------------------------------------------------
VALID: BUILD + INFER
--------------------------------------------------------------------------------


Map:   0%|          | 0/813 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=813 skipped_unknown=0


Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/858 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=858 skipped_unknown=0


                                                      


--------------------------------------------------------------------------------
VALID: SINGLE MODEL PERFORMANCE
--------------------------------------------------------------------------------
[VALID][DeBERTa] Product F1: 0.7820 | Hazard F1: 0.8460 | Avg: 0.8140
[VALID][RoBERTa] Product F1: 0.7733 | Hazard F1: 0.8142 | Avg: 0.7938

--------------------------------------------------------------------------------
ENSEMBLE: TUNE BEST WEIGHT ON VALID (by Avg F1)
--------------------------------------------------------------------------------
Best VALID weight: w_DeBERTa=0.60, w_RoBERTa=0.40
VALID Ensemble Product F1: 0.7748
VALID Ensemble Hazard  F1: 0.8625
VALID Ensemble Avg     F1: 0.8186

--------------------------------------------------------------------------------
TEST: BUILD + INFER
--------------------------------------------------------------------------------


Map:   0%|          | 0/1469 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1469 skipped_unknown=0


Map:   0%|          | 0/1540 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1540 skipped_unknown=0


                                                      


--------------------------------------------------------------------------------
TEST: SINGLE MODEL PERFORMANCE
--------------------------------------------------------------------------------
[TEST][DeBERTa] Product F1: 0.7687 | Hazard F1: 0.7741 | Avg: 0.7714
[TEST][RoBERTa] Product F1: 0.7682 | Hazard F1: 0.7205 | Avg: 0.7443

--------------------------------------------------------------------------------
TEST: ENSEMBLE EVAL WITH VALID-TUNED WEIGHT
--------------------------------------------------------------------------------
TEST (tuned) Product F1: 0.7741
TEST (tuned) Hazard  F1: 0.7441
TEST (tuned) Avg     F1: 0.7591

--------------------------------------------------------------------------------
TEST: ENSEMBLE GRID (score for EACH weight)
--------------------------------------------------------------------------------
 w_deberta  w_roberta  TEST_Product_F1  TEST_Hazard_F1  TEST_Avg_F1
      0.00       1.00         0.768204        0.720488     0.744346
      0.05       0.95 