In [1]:
# List what is available under /kaggle/input before any path is configured.
!ls /kaggle/input


aug_data1.csv		  cleaned_aug_data1_DS310.csv  incidents_valid.csv
chunked_deberta_512.json  cleaned_aug_data1_final.csv  RoBERTa
chunked_roberta_512.json  DeBERTa
cleaned_aug_data1.csv	  incidents_test.csv


In [2]:
# =========================
# CELL 1: SETUP
# =========================
import os
import re
import json
import difflib
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from torch import nn
from bs4 import BeautifulSoup
from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

from huggingface_hub import hf_hub_download

# Seed the NumPy and PyTorch global RNGs with the same value.
SEED = 42
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Device:", device)

# ---------- Paths ----------
# Root folder of the input files. The default is the Kaggle mount this notebook
# was written against; set the FOODHAZARD_DATA_DIR environment variable to run
# the notebook from another location without editing the three paths below.
DATA_DIR = os.environ.get("FOODHAZARD_DATA_DIR", "/kaggle/input").rstrip("/")
TRAIN_JSON_PATH = f"{DATA_DIR}/chunked_deberta_512.json"  # only used to rebuild the label space
VALID_CSV_PATH  = f"{DATA_DIR}/incidents_valid.csv"
TEST_CSV_PATH   = f"{DATA_DIR}/incidents_test.csv"

# ---------- HF repos ----------
# One entry per ensemble member:
#   name       - label used in printed reports
#   repo_id    - Hub repository holding the tokenizer and the fine-tuned weights
#   base_model - backbone architecture instantiated before the weights are loaded
MODEL_SPECS = [
    {
        "name": "DeBERTa",
        "repo_id": "tringooo/DeBERTa-FoodHazardDetection",
        "base_model": "microsoft/deberta-v3-large",
    },
    {
        "name": "RoBERTa",
        "repo_id": "tringooo/RoBERTa-FoodHazardDetection",
        "base_model": "FacebookAI/roberta-large",
    },
]

# ---------- Chunking ----------
MAX_TOKENS    = 512  # tokens per chunk and padded/truncated sequence length
CHUNK_OVERLAP = 64   # tokens shared between consecutive chunks
MIN_CHARS     = 30   # decoded chunks shorter than this are dropped
BATCH_SIZE    = 16   # chunks per forward pass

# ---------- Ensemble grid ----------
WEIGHT_STEPS = np.linspace(0.0, 1.0, 21)  # 0.0..1.0 step 0.05

# ============================================================
# 1) LABEL SPACE (same mapping as at training time)
# ============================================================
# Rebuild the label vocabularies from the training JSON so that the integer ids
# used below line up with the category order derived from that file.
with open(TRAIN_JSON_PATH, "r", encoding="utf-8") as f:
    train_raw = json.load(f)
train_df = pd.DataFrame(train_raw)

# The categorical dtype supplies the ordered list of distinct labels per task.
prod_cat_type, haz_cat_type = (
    train_df[column].astype("category")
    for column in ("product_category", "hazard_category")
)

product_categories = list(prod_cat_type.cat.categories)
hazard_categories  = list(haz_cat_type.cat.categories)

# label -> integer id, following the position in the category lists above.
product_label2id = dict(zip(product_categories, range(len(product_categories))))
hazard_label2id  = dict(zip(hazard_categories,  range(len(hazard_categories))))

N_PRODUCT, N_HAZARD = len(product_categories), len(hazard_categories)

print("N_PRODUCT:", N_PRODUCT, "| N_HAZARD:", N_HAZARD)

# ============================================================
# 2) PREPROCESS (same as your code)
# ============================================================
def extract_text_from_html(html_content):
    """Return the text content of an HTML fragment.

    Missing values (anything `pd.isna` reports as NA) yield "". Everything else
    is converted to `str`, parsed with BeautifulSoup's "html.parser", and its
    text nodes are joined with single spaces and stripped.
    """
    if pd.isna(html_content):
        return ""
    markup = str(html_content)
    visible_text = BeautifulSoup(markup, "html.parser").get_text(separator=" ")
    return visible_text.strip()

def basic_clean(text: str) -> str:
    """Replace non-breaking spaces, collapse whitespace runs to one space, and strip."""
    without_nbsp = text.replace("\xa0", " ")
    single_spaced = re.sub(r"\s+", " ", without_nbsp)
    return single_spaced.strip()

# Regex patterns to blank out of every document (matched case-insensitively).
# Empty here, so remove_recall_boilerplate currently only normalizes whitespace.
NOISE_PATTERNS = []

def remove_recall_boilerplate(text: str) -> str:
    """Replace every NOISE_PATTERNS match with a space, then collapse whitespace and strip."""
    stripped = text
    for pattern in NOISE_PATTERNS:
        stripped = re.sub(pattern, " ", stripped, flags=re.IGNORECASE)
    collapsed = re.sub(r"\s+", " ", stripped)
    return collapsed.strip()

def normalize_sentence_for_dedupe(s: str) -> str:
    """Build the comparison key used for sentence de-duplication.

    Lower-cases the sentence, blanks out digit runs of length three or more,
    blanks out every character other than a-z, 0-9, whitespace and the
    punctuation . , ; : ! ? - ", then collapses whitespace and strips.
    """
    key = s.lower()
    for pattern in (r"\d{3,}", r'[^a-z0-9\s.,;:!?\-"]+'):
        key = re.sub(pattern, " ", key)
    return re.sub(r"\s+", " ", key).strip()

def advanced_deduplicate_sentences(text: str) -> str:
    """Drop repeated and near-repeated sentences, keeping first occurrences.

    The text is split after '.', '!' or '?' followed by whitespace. A sentence
    is discarded when its normalized key is shorter than 10 characters, was
    already seen exactly, or has a difflib similarity ratio >= 0.95 against any
    of the 50 most recently kept keys. Kept sentences are returned in their
    original form, joined by single spaces.
    """
    kept_sentences = []
    exact_keys = set()
    kept_keys = []

    for piece in re.split(r"(?<=[.!?])\s+", text):
        sentence = piece.strip()
        if not sentence:
            continue

        key = normalize_sentence_for_dedupe(sentence)
        if len(key) < 10 or key in exact_keys:
            continue

        # Fuzzy comparison is limited to a sliding window of recent sentences.
        is_near_duplicate = any(
            difflib.SequenceMatcher(None, key, earlier).ratio() >= 0.95
            for earlier in kept_keys[-50:]
        )
        if is_near_duplicate:
            continue

        kept_sentences.append(sentence)
        exact_keys.add(key)
        kept_keys.append(key)

    return " ".join(kept_sentences).strip()

# regex (applied case-insensitively) -> canonical entity spelling
ENTITY_REPLACEMENTS = {
    r"\be\.?\s*coli\b": "escherichia coli",
    r"\bc\.?\s*botulinum\b": "clostridium botulinum",
    r"\blisteria\s+spp\b": "listeria monocytogenes",
    r"\bsoy\s+proteins?\b": "soybeans",
}

def normalize_entities(text: str) -> str:
    """Rewrite entity mentions to the canonical spellings in ENTITY_REPLACEMENTS.

    Patterns are applied one after another in dictionary order, each on the
    output of the previous substitution.
    """
    result = text
    for pattern, canonical in ENTITY_REPLACEMENTS.items():
        result = re.compile(pattern, re.IGNORECASE).sub(canonical, result)
    return result

def clean_foodhazard_text(text: str) -> str:
    """Run the full text-cleaning pipeline on one field.

    Non-string input becomes "" when `pd.isna` reports it as missing and
    `str(text)` otherwise. The value then passes through basic_clean,
    remove_recall_boilerplate, advanced_deduplicate_sentences and
    normalize_entities, in that order, followed by a final whitespace collapse.
    """
    if isinstance(text, str):
        current = text
    elif pd.isna(text):
        current = ""
    else:
        current = str(text)

    pipeline = (
        basic_clean,
        remove_recall_boilerplate,
        advanced_deduplicate_sentences,
        normalize_entities,
    )
    for step in pipeline:
        current = step(current)
    return re.sub(r"\s+", " ", current).strip()

# ============================================================
# 3) CHUNKING + SOFTMAX
# ============================================================
def chunk_by_tokens(text, tokenizer, max_tokens=512, overlap=64, min_chars=30):
    """Split text into overlapping chunks measured in tokenizer tokens.

    The text is tokenized once without special tokens. Windows of `max_tokens`
    token ids are taken every `max_tokens - overlap` ids, decoded back to text
    and whitespace-normalized.

    Args:
        text: value to chunk; converted with `str()` and stripped first.
        tokenizer: callable returning a mapping with an "input_ids" list, and
            providing `decode(ids, skip_special_tokens=True)`.
        max_tokens: window length in tokens; must be positive.
        overlap: tokens shared by consecutive windows; must satisfy
            0 <= overlap < max_tokens.
        min_chars: decoded chunks shorter than this many characters are dropped.

    Returns:
        List of chunk strings; empty for empty text or when every chunk is
        shorter than `min_chars`.

    Raises:
        ValueError: if `max_tokens` / `overlap` would give a non-positive
            stride. Such values previously made the loop run forever.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    text = str(text).strip()
    if not text:
        return []

    token_ids = tokenizer(
        text,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )["input_ids"]

    stride = max_tokens - overlap
    chunks = []
    # Window starts are 0, stride, 2*stride, ... while still inside the sequence.
    # NOTE(review): the final window can lie entirely inside the previous one
    # (e.g. 500 tokens with the defaults yields [0:500] and [448:500]); this is
    # left unchanged here — confirm it matches how the training chunks were built.
    for start in range(0, len(token_ids), stride):
        window = token_ids[start:start + max_tokens]
        sub_text = tokenizer.decode(window, skip_special_tokens=True)
        sub_text = re.sub(r"\s+", " ", sub_text).strip()
        if len(sub_text) >= min_chars:
            chunks.append(sub_text)
    return chunks

def softmax_np(x):
    """Softmax over the last axis; the row maximum is subtracted before exponentiating."""
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    normalizer = np.sum(exps, axis=-1, keepdims=True)
    return exps / normalizer

# ============================================================
# 4) MODEL MULTI-TASK
# ============================================================
class MultiTaskClassifier(nn.Module):
    """Shared transformer backbone with two linear heads (product and hazard).

    `forward` returns a single tensor: product logits in the first `n_product`
    columns, hazard logits in the remaining `n_hazard` columns.
    """

    def __init__(self, base_model_name, n_product, n_hazard, dropout=0.1):
        super().__init__()
        from transformers import AutoModel

        self.backbone = AutoModel.from_pretrained(base_model_name)
        width = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.product_head = nn.Linear(width, n_product)
        self.hazard_head  = nn.Linear(width, n_hazard)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Encode the batch and classify from the hidden state at position 0.

        Extra keyword arguments are accepted and ignored.
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        first_token = self.dropout(encoded.last_hidden_state[:, 0])
        return torch.cat(
            (self.product_head(first_token), self.hazard_head(first_token)),
            dim=-1,
        )

def load_multitask_from_hub(repo_id: str, base_model: str):
    """Load the tokenizer and fine-tuned multi-task weights from a Hub repo.

    Args:
        repo_id: Hub repository holding the tokenizer files and a weight file
            named "model.safetensors" or "pytorch_model.bin" (tried in that order).
        base_model: backbone name passed to MultiTaskClassifier.

    Returns:
        (tokenizer, model) with the model moved to `device` and set to eval mode.

    Raises:
        FileNotFoundError: if neither weight file could be downloaded. The last
            download error is chained as `__cause__` so that network or
            authentication failures are not mistaken for a missing file.
    """
    # NOTE(review): trust_remote_code=True allows the repo to execute its own
    # Python while the tokenizer loads. Kept for compatibility; drop it if the
    # repo ships only standard tokenizer files.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    weights_path = None
    last_error = None
    for fname in ("model.safetensors", "pytorch_model.bin"):
        try:
            weights_path = hf_hub_download(repo_id=repo_id, filename=fname)
            break
        except Exception as err:
            # Best effort: fall through to the next candidate, but remember why.
            last_error = err
    if weights_path is None:
        raise FileNotFoundError(f"Không tìm thấy weights trong repo: {repo_id}") from last_error

    model = MultiTaskClassifier(base_model_name=base_model, n_product=N_PRODUCT, n_hazard=N_HAZARD)

    if weights_path.endswith(".safetensors"):
        from safetensors.torch import load_file as safe_load_file
        state_dict = safe_load_file(weights_path)
    else:
        # A .bin checkpoint is a pickle; weights_only=True restricts unpickling
        # to tensors and plain containers instead of arbitrary objects.
        state_dict = torch.load(weights_path, map_location="cpu", weights_only=True)

    # strict=False tolerates key mismatches, so report them by name: a missing
    # head would otherwise leave randomly initialized weights in use unnoticed.
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print(f"[{repo_id}] missing={len(missing)} unexpected={len(unexpected)}")
    if missing:
        print(f"[{repo_id}] missing keys: {list(missing)}")
    if unexpected:
        print(f"[{repo_id}] unexpected keys: {list(unexpected)}")

    model.to(device)
    model.eval()
    return tokenizer, model

# ============================================================
# 5) DATASET BUILD FROM CSV
# ============================================================
def build_chunk_dataset_from_csv(csv_path: str, tokenizer):
    """Read an incidents CSV and turn it into a tokenized chunk dataset.

    The "title" and "text" columns are HTML-stripped, cleaned, merged and
    lower-cased. Rows whose "product-category" or "hazard-category" is not in
    the training label space are skipped and counted. Each remaining document
    is chunked with `tokenizer`; a document with no usable chunk contributes a
    single "[EMPTY]" chunk so that it still receives a prediction.

    Args:
        csv_path: path of the CSV file to read.
        tokenizer: tokenizer used both for chunking and for the final encoding.

    Returns:
        (tokenized, y_prod, y_haz): the tokenized chunk dataset with `doc_id`
        and `text` columns, plus dicts doc_id -> product id and doc_id ->
        hazard id. doc_id is the "stt" column when present, else the row index.
    """
    df = pd.read_csv(csv_path)

    df["title_clean"] = df["title"].apply(extract_text_from_html).apply(clean_foodhazard_text)
    df["text_clean"]  = df["text"].apply(extract_text_from_html).apply(clean_foodhazard_text)
    df["merged_text"] = (df["title_clean"].fillna("") + " " + df["text_clean"].fillna("")).str.lower().str.strip()

    stt_col = "stt" if "stt" in df.columns else None

    chunk_records = []
    y_prod = {}
    y_haz  = {}

    skipped_unknown = 0
    for idx, row in df.iterrows():
        doc_id = int(row[stt_col]) if stt_col is not None else int(idx)

        prod_cat = str(row.get("product-category", "")).strip()
        haz_cat  = str(row.get("hazard-category", "")).strip()

        if (prod_cat not in product_label2id) or (haz_cat not in hazard_label2id):
            skipped_unknown += 1
            continue

        y_prod[doc_id] = product_label2id[prod_cat]
        y_haz[doc_id]  = hazard_label2id[haz_cat]

        text = str(row.get("merged_text", "") or "").strip()
        chunks = chunk_by_tokens(text, tokenizer, MAX_TOKENS, CHUNK_OVERLAP, MIN_CHARS) if text else []

        # One placeholder chunk covers both "empty text" and "no chunk long enough".
        for ch in chunks or ["[EMPTY]"]:
            chunk_records.append({"doc_id": doc_id, "text": ch})

    # Naming the columns keeps the frame well-formed when every row was skipped;
    # selecting ["doc_id", "text"] from a column-less frame raised KeyError.
    chunk_df = pd.DataFrame(chunk_records, columns=["doc_id", "text"])
    ds = Dataset.from_pandas(chunk_df)

    def tok_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_TOKENS)

    tokenized = ds.map(tok_fn, batched=True)

    print(f"[{os.path.basename(csv_path)}] docs={len(y_prod)} chunks={len(chunk_records)} skipped_unknown={skipped_unknown}")
    return tokenized, y_prod, y_haz

# ============================================================
# 6) INFERENCE -> DOC PROBS (mean pooling)
# ============================================================
@torch.no_grad()
def infer_doc_probs(tokenized_ds: Dataset, model: nn.Module):
    """
    Run the model over every chunk and average the chunk probabilities per document.

    tokenized_ds: has columns [doc_id, text, input_ids, attention_mask, ...];
                  `text` is dropped before batching.
    model:        called as model(input_ids=..., attention_mask=...); its output
                  is split at column N_PRODUCT into product and hazard logits.
    returns:
      doc_prod: dict(doc_id -> np.array(N_PRODUCT)), mean chunk softmax
      doc_haz : dict(doc_id -> np.array(N_HAZARD)),  mean chunk softmax
    """
    from torch.utils.data import DataLoader

    def collate_fn(features):
        # Every column except "__index_level_0__" becomes one long tensor.
        return {
            key: torch.tensor([row[key] for row in features], dtype=torch.long)
            for key in features[0].keys()
            if key != "__index_level_0__"
        }

    loader = DataLoader(
        tokenized_ds.remove_columns(["text"]),
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
    )

    # Running per-document sums of chunk probabilities, and chunk counts.
    prod_sum = defaultdict(lambda: np.zeros(N_PRODUCT, dtype=np.float64))
    haz_sum  = defaultdict(lambda: np.zeros(N_HAZARD,  dtype=np.float64))
    cnt      = defaultdict(int)

    for batch in tqdm(loader, desc="Infer", leave=False):
        batch_doc_ids = batch["doc_id"].cpu().numpy().astype(int)

        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).detach().cpu().numpy()

        prod_probs = softmax_np(logits[:, :N_PRODUCT])
        haz_probs  = softmax_np(logits[:, N_PRODUCT:])

        for row, doc_id in enumerate(batch_doc_ids):
            prod_sum[doc_id] += prod_probs[row]
            haz_sum[doc_id]  += haz_probs[row]
            cnt[doc_id] += 1

    doc_prod = {d: prod_sum[d] / max(n, 1) for d, n in cnt.items()}
    doc_haz  = {d: haz_sum[d] / max(n, 1) for d, n in cnt.items()}
    return doc_prod, doc_haz
# ============================================================
# 7) ENSEMBLE + EVAL UTILS (MISSING FUNCTIONS)
# ============================================================
def ensemble_predict_2models(doc_probs_a: dict, doc_probs_b: dict, w_a: float):
    """
    Blend two models' per-document probabilities as w_a * a + (1 - w_a) * b.

    doc_probs_*: dict(doc_id -> np.array(C,))
    returns dict(doc_id -> np.array(C,)) for doc ids present in both inputs;
    each blended vector is rescaled to sum to 1 when its sum is positive.
    """
    w_b = 1.0 - w_a
    shared_ids = set(doc_probs_a.keys()) & set(doc_probs_b.keys())

    blended = {}
    for doc_id in shared_ids:
        mix = w_a * doc_probs_a[doc_id] + w_b * doc_probs_b[doc_id]
        total = float(mix.sum())
        blended[doc_id] = mix / total if total > 0 else mix
    return blended

def eval_probs(doc_probs: dict, y_true: dict):
    """
    Score arg-max predictions with macro F1 on the doc ids present in both inputs.

    doc_probs: dict(doc_id -> prob vector)
    y_true:    dict(doc_id -> int label)
    returns (macro_f1, doc_ids, y_true_list, y_pred_list), lists ordered by sorted doc id
    """
    doc_ids = sorted(set(doc_probs.keys()) & set(y_true.keys()))

    y_t, y_p = [], []
    for doc_id in doc_ids:
        y_t.append(int(y_true[doc_id]))
        y_p.append(int(np.argmax(doc_probs[doc_id])))

    macro_f1 = f1_score(y_t, y_p, average="macro")
    return macro_f1, doc_ids, y_t, y_p

def _sweep_ensemble_weights(prod_a, prod_b, haz_a, haz_b, y_prod, y_haz):
    """Score every weight in WEIGHT_STEPS; return (best_row, all_rows).

    Each row is {"w", "prod_f1", "haz_f1", "avg_f1"} with plain floats.
    `best_row` is the first row with the highest avg_f1 (ties keep the earlier
    weight). If WEIGHT_STEPS is empty, `best_row` is the sentinel with
    w=None and all scores -1.
    """
    rows = []
    best = {"w": None, "prod_f1": -1, "haz_f1": -1, "avg_f1": -1}

    for w in WEIGHT_STEPS:
        pf1 = eval_probs(ensemble_predict_2models(prod_a, prod_b, w), y_prod)[0]
        hf1 = eval_probs(ensemble_predict_2models(haz_a, haz_b, w), y_haz)[0]
        af1 = 0.5 * (pf1 + hf1)

        row = {"w": float(w), "prod_f1": float(pf1), "haz_f1": float(hf1), "avg_f1": float(af1)}
        rows.append(row)

        if af1 > best["avg_f1"]:
            best = dict(row)  # copy so the returned best is independent of rows

    return best, rows

def tune_best_weight_on_valid(valid_prod_a, valid_prod_b, valid_haz_a, valid_haz_b,
                             valid_y_prod: dict, valid_y_haz: dict):
    """
    Grid search w_a in WEIGHT_STEPS on VALID. Return best by avg_f1.

    The same weight is applied to the product and the hazard task.
    returns dict with keys "w", "prod_f1", "haz_f1", "avg_f1".
    """
    best, _ = _sweep_ensemble_weights(
        valid_prod_a, valid_prod_b, valid_haz_a, valid_haz_b, valid_y_prod, valid_y_haz
    )
    return best

def grid_search_on_test_upper_bound(test_prod_a, test_prod_b, test_haz_a, test_haz_b,
                                   test_y_prod: dict, test_y_haz: dict):
    """
    Upper bound analysis only: tune w on TEST.
    returns (best_dict, rows), where rows is a DataFrame with one line per
    weight, sorted by avg_f1 in descending order.
    """
    best, rows = _sweep_ensemble_weights(
        test_prod_a, test_prod_b, test_haz_a, test_haz_b, test_y_prod, test_y_haz
    )
    return best, pd.DataFrame(rows).sort_values("avg_f1", ascending=False)



Device: cuda
N_PRODUCT: 22 | N_HAZARD: 10


In [3]:
import os

# Turn off tokenizer-internal parallelism for this process.
# NOTE(review): presumably meant to avoid the tokenizers fork warning — confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")


In [4]:
# =========================
# CELL 2: TRAINING-FREE EVAL (VALID->TUNE->TEST)
# =========================

def _banner(title):
    """Print a blank line, a 60-character '=' rule, the title, and a closing rule."""
    print("\n" + "="*60)
    print(title)
    print("="*60)

# 1) Load both ensemble members from the HuggingFace Hub
spec_a, spec_b = MODEL_SPECS[0], MODEL_SPECS[1]
print("\nLoading:", spec_a["name"], spec_a["repo_id"])
tokenizer_a, model_a = load_multitask_from_hub(spec_a["repo_id"], spec_a["base_model"])

print("\nLoading:", spec_b["name"], spec_b["repo_id"])
tokenizer_b, model_b = load_multitask_from_hub(spec_b["repo_id"], spec_b["base_model"])

# 2) VALID: chunk with each model's own tokenizer, then infer
_banner("VALID: BUILD + INFER")

valid_tok_a, valid_y_prod_a, valid_y_haz_a = build_chunk_dataset_from_csv(VALID_CSV_PATH, tokenizer_a)
valid_tok_b, valid_y_prod_b, valid_y_haz_b = build_chunk_dataset_from_csv(VALID_CSV_PATH, tokenizer_b)

# Gold labels restricted to documents present in both builds (values taken from build A).
common_valid_docs = sorted(valid_y_prod_a.keys() & valid_y_prod_b.keys())
valid_y_prod = dict((d, valid_y_prod_a[d]) for d in common_valid_docs)
valid_y_haz  = dict((d, valid_y_haz_a[d])  for d in common_valid_docs)

valid_prod_a, valid_haz_a = infer_doc_probs(valid_tok_a, model_a)
valid_prod_b, valid_haz_b = infer_doc_probs(valid_tok_b, model_b)

# 3) Pick the ensemble weight on VALID only
_banner("TUNE WEIGHT ON VALID")

best_valid = tune_best_weight_on_valid(
    valid_prod_a=valid_prod_a, valid_prod_b=valid_prod_b,
    valid_haz_a=valid_haz_a,   valid_haz_b=valid_haz_b,
    valid_y_prod=valid_y_prod, valid_y_haz=valid_y_haz,
)

tuned_w = best_valid["w"]
print(f"Best VALID weight: w_{spec_a['name']}={tuned_w:.2f}, w_{spec_b['name']}={1.0-tuned_w:.2f}")
print(f"VALID Product F1: {best_valid['prod_f1']:.4f}")
print(f"VALID Hazard  F1: {best_valid['haz_f1']:.4f}")
print(f"VALID Avg     F1: {best_valid['avg_f1']:.4f}")

# 4) TEST: same build + infer steps as VALID
_banner("TEST: BUILD + INFER")

test_tok_a, test_y_prod_a, test_y_haz_a = build_chunk_dataset_from_csv(TEST_CSV_PATH, tokenizer_a)
test_tok_b, test_y_prod_b, test_y_haz_b = build_chunk_dataset_from_csv(TEST_CSV_PATH, tokenizer_b)

common_test_docs = sorted(test_y_prod_a.keys() & test_y_prod_b.keys())
test_y_prod = dict((d, test_y_prod_a[d]) for d in common_test_docs)
test_y_haz  = dict((d, test_y_haz_a[d])  for d in common_test_docs)

test_prod_a, test_haz_a = infer_doc_probs(test_tok_a, model_a)
test_prod_b, test_haz_b = infer_doc_probs(test_tok_b, model_b)

# 5) TEST score using the weight chosen on VALID (the reportable number)
_banner("TEST EVAL WITH VALID-TUNED WEIGHT")

ens_test_prod = ensemble_predict_2models(test_prod_a, test_prod_b, tuned_w)
ens_test_haz  = ensemble_predict_2models(test_haz_a,  test_haz_b,  tuned_w)

prod_f1, doc_ids_p, y_true_p, y_pred_p = eval_probs(ens_test_prod, test_y_prod)
haz_f1,  doc_ids_h, y_true_h, y_pred_h = eval_probs(ens_test_haz,  test_y_haz)
avg_f1 = 0.5 * (prod_f1 + haz_f1)

print(f"TEST (tuned) Product F1: {prod_f1:.4f}")
print(f"TEST (tuned) Hazard  F1: {haz_f1:.4f}")
print(f"TEST (tuned) Avg     F1: {avg_f1:.4f}")

# Reports list only the classes that occur in the gold labels or the predictions.
_banner("CLASSIFICATION REPORT - PRODUCT (TEST, tuned)")
labels_p = sorted({*y_true_p, *y_pred_p})
names_p = list(map(product_categories.__getitem__, labels_p))
print(classification_report(y_true_p, y_pred_p, labels=labels_p, target_names=names_p, digits=4))

_banner("CLASSIFICATION REPORT - HAZARD (TEST, tuned)")
labels_h = sorted({*y_true_h, *y_pred_h})
names_h = list(map(hazard_categories.__getitem__, labels_h))
print(classification_report(y_true_h, y_pred_h, labels=labels_h, target_names=names_h, digits=4))

# 6) Weight sweep on TEST itself: an optimistic upper bound, for analysis only
_banner("TEST GRID SEARCH (UPPER-BOUND, ANALYSIS ONLY)")

best_test, rows = grid_search_on_test_upper_bound(
    test_prod_a=test_prod_a, test_prod_b=test_prod_b,
    test_haz_a=test_haz_a,   test_haz_b=test_haz_b,
    test_y_prod=test_y_prod, test_y_haz=test_y_haz,
)

print(f"Best TEST weight (upper bound): w_{spec_a['name']}={best_test['w']:.2f}, w_{spec_b['name']}={1.0-best_test['w']:.2f}")
print(f"TEST Product F1: {best_test['prod_f1']:.4f}")
print(f"TEST Hazard  F1: {best_test['haz_f1']:.4f}")
print(f"TEST Avg     F1: {best_test['avg_f1']:.4f}")
print(f"Potential Avg improvement vs tuned: {best_test['avg_f1'] - avg_f1:+.4f}")

# Keep the tuned weight and model names for the interactive inference cell
ensemble_state = dict(
    tuned_w=tuned_w,
    model_a_name=spec_a["name"],
    model_b_name=spec_b["name"],
)
print("\nSaved ensemble_state (in notebook memory):", ensemble_state)



Loading: DeBERTa tringooo/DeBERTa-FoodHazardDetection


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

2025-12-29 05:47:50.517708: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766987270.714242      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766987270.768890      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766987271.221626      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766987271.221655      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766987271.221658      55 computation_placer.cc:177] computation placer alr

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

[tringooo/DeBERTa-FoodHazardDetection] missing=0 unexpected=0

Loading: RoBERTa tringooo/RoBERTa-FoodHazardDetection


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[tringooo/RoBERTa-FoodHazardDetection] missing=0 unexpected=0

VALID: BUILD + INFER


Map:   0%|          | 0/813 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=813 skipped_unknown=0


Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/858 [00:00<?, ? examples/s]

[incidents_valid.csv] docs=565 chunks=858 skipped_unknown=0


                                                      


TUNE WEIGHT ON VALID
Best VALID weight: w_DeBERTa=0.55, w_RoBERTa=0.45
VALID Product F1: 0.7735
VALID Hazard  F1: 0.8783
VALID Avg     F1: 0.8259

TEST: BUILD + INFER


Map:   0%|          | 0/1469 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1469 skipped_unknown=0


Map:   0%|          | 0/1540 [00:00<?, ? examples/s]

[incidents_test.csv] docs=997 chunks=1540 skipped_unknown=0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



TEST EVAL WITH VALID-TUNED WEIGHT
TEST (tuned) Product F1: 0.8161
TEST (tuned) Hazard  F1: 0.7524
TEST (tuned) Avg     F1: 0.7842

CLASSIFICATION REPORT - PRODUCT (TEST, tuned)
                                                   precision    recall  f1-score   support

                              alcoholic beverages     1.0000    1.0000    1.0000        16
                      cereals and bakery products     0.8403    0.8264    0.8333       121
     cocoa and cocoa preparations, coffee and tea     0.7959    0.9286    0.8571        42
                                    confectionery     0.7500    0.7273    0.7385        33
dietetic foods, food supplements, fortified foods     0.6562    0.8077    0.7241        26
                                    fats and oils     1.0000    0.8333    0.9091         6
                   food additives and flavourings     0.5000    0.5000    0.5000         4
                            fruits and vegetables     0.8252    0.8252    0.8252       103
  

In [5]:
# ==============================
# VALID: SINGLE MODEL EVAL
# ==============================
print("\n" + "="*60)
print("VALID: SINGLE MODEL PERFORMANCE")
print("="*60)

# ----- Model A -----
# Only the F1 scores are kept. Unpacking the label lists into
# y_true_p / y_pred_p / y_true_h / y_pred_h here would overwrite the TEST-set
# values assigned by the ensemble evaluation cell with unused VALID values.
prod_f1_a = eval_probs(valid_prod_a, valid_y_prod)[0]
haz_f1_a  = eval_probs(valid_haz_a,  valid_y_haz)[0]
avg_f1_a = 0.5 * (prod_f1_a + haz_f1_a)

print(f"[VALID][{spec_a['name']}] Product F1: {prod_f1_a:.4f}")
print(f"[VALID][{spec_a['name']}] Hazard  F1: {haz_f1_a:.4f}")
print(f"[VALID][{spec_a['name']}] Avg     F1: {avg_f1_a:.4f}")

# ----- Model B -----
prod_f1_b = eval_probs(valid_prod_b, valid_y_prod)[0]
haz_f1_b  = eval_probs(valid_haz_b,  valid_y_haz)[0]
avg_f1_b = 0.5 * (prod_f1_b + haz_f1_b)

print(f"[VALID][{spec_b['name']}] Product F1: {prod_f1_b:.4f}")
print(f"[VALID][{spec_b['name']}] Hazard  F1: {haz_f1_b:.4f}")
print(f"[VALID][{spec_b['name']}] Avg     F1: {avg_f1_b:.4f}")



VALID: SINGLE MODEL PERFORMANCE
[VALID][DeBERTa] Product F1: 0.7340
[VALID][DeBERTa] Hazard  F1: 0.8186
[VALID][DeBERTa] Avg     F1: 0.7763
[VALID][RoBERTa] Product F1: 0.7413
[VALID][RoBERTa] Hazard  F1: 0.8455
[VALID][RoBERTa] Avg     F1: 0.7934


In [7]:
# ==============================
# TEST: SINGLE MODEL PERFORMANCE
# ==============================
print("\n" + "="*60)
print("TEST: SINGLE MODEL PERFORMANCE")
print("="*60)

# ----- Model A: each model scored on its own, without ensembling -----
prod_f1_a = eval_probs(test_prod_a, test_y_prod)[0]
haz_f1_a  = eval_probs(test_haz_a,  test_y_haz)[0]
avg_f1_a = (prod_f1_a + haz_f1_a) * 0.5

name_a = spec_a['name']
print(f"[TEST][{name_a}] Product F1: {prod_f1_a:.4f}")
print(f"[TEST][{name_a}] Hazard  F1: {haz_f1_a:.4f}")
print(f"[TEST][{name_a}] Avg     F1: {avg_f1_a:.4f}")

# ----- Model B -----
prod_f1_b = eval_probs(test_prod_b, test_y_prod)[0]
haz_f1_b  = eval_probs(test_haz_b,  test_y_haz)[0]
avg_f1_b = (prod_f1_b + haz_f1_b) * 0.5

name_b = spec_b['name']
print(f"[TEST][{name_b}] Product F1: {prod_f1_b:.4f}")
print(f"[TEST][{name_b}] Hazard  F1: {haz_f1_b:.4f}")
print(f"[TEST][{name_b}] Avg     F1: {avg_f1_b:.4f}")



TEST: SINGLE MODEL PERFORMANCE
[TEST][DeBERTa] Product F1: 0.7453
[TEST][DeBERTa] Hazard  F1: 0.7246
[TEST][DeBERTa] Avg     F1: 0.7349
[TEST][RoBERTa] Product F1: 0.7896
[TEST][RoBERTa] Hazard  F1: 0.8159
[TEST][RoBERTa] Avg     F1: 0.8027


In [6]:
# =========================
# CELL 3: USER INFERENCE
# =========================
@torch.no_grad()
def predict_single_incident(title: str, text: str, w_a: float, topk: int = 3):
    """Predict product and hazard category for one incident with the two-model ensemble.

    The title and text go through the same HTML stripping, cleaning, merging
    and lower-casing as the CSV pipeline. Each model sees chunks produced by
    its own tokenizer; chunk probabilities are averaged per model and the two
    models are blended as w_a * A + (1 - w_a) * B.

    Args:
        title: incident title (may contain HTML).
        text: incident body (may contain HTML).
        w_a: weight of model A; model B gets 1 - w_a.
        topk: number of highest-probability classes to return per task.

    Returns:
        dict with "pred_product", "pred_hazard" (label strings) and
        "topk_product", "topk_hazard" (lists of (label, probability) pairs,
        most probable first).
    """
    # preprocess
    title_clean = clean_foodhazard_text(extract_text_from_html(title))
    text_clean  = clean_foodhazard_text(extract_text_from_html(text))
    merged = (f"{title_clean} {text_clean}").lower().strip()

    # chunks per tokenizer; a placeholder keeps the model input non-empty
    chunks_a = chunk_by_tokens(merged, tokenizer_a, MAX_TOKENS, CHUNK_OVERLAP, MIN_CHARS) or ["[EMPTY]"]
    chunks_b = chunk_by_tokens(merged, tokenizer_b, MAX_TOKENS, CHUNK_OVERLAP, MIN_CHARS) or ["[EMPTY]"]

    def infer_chunks(tokenizer, model, chunks):
        # Feed at most BATCH_SIZE chunks per forward pass. Sending every chunk
        # of a long document in one batch can exhaust GPU memory.
        prod_parts, haz_parts = [], []
        for start in range(0, len(chunks), BATCH_SIZE):
            enc = tokenizer(
                chunks[start:start + BATCH_SIZE],
                truncation=True,
                padding="max_length",
                max_length=MAX_TOKENS,
                return_tensors="pt",
            ).to(device)

            logits = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"]).detach().cpu().numpy()
            prod_parts.append(softmax_np(logits[:, :N_PRODUCT]))
            haz_parts.append(softmax_np(logits[:, N_PRODUCT:]))

        # Mean over all chunks of the document, independent of the batching.
        prod_probs = np.concatenate(prod_parts, axis=0).mean(axis=0)
        haz_probs  = np.concatenate(haz_parts,  axis=0).mean(axis=0)

        prod_probs = prod_probs / max(prod_probs.sum(), 1e-12)
        haz_probs  = haz_probs  / max(haz_probs.sum(),  1e-12)
        return prod_probs, haz_probs

    prod_a, haz_a = infer_chunks(tokenizer_a, model_a, chunks_a)
    prod_b, haz_b = infer_chunks(tokenizer_b, model_b, chunks_b)

    w_b = 1.0 - w_a
    prod = w_a * prod_a + w_b * prod_b
    haz  = w_a * haz_a  + w_b * haz_b

    prod = prod / max(prod.sum(), 1e-12)
    haz  = haz  / max(haz.sum(),  1e-12)

    pred_prod = int(np.argmax(prod))
    pred_haz  = int(np.argmax(haz))

    # argsort on the negated vector yields indices by descending probability
    topk_p = np.argsort(-prod)[:topk]
    topk_h = np.argsort(-haz)[:topk]

    return {
        "pred_product": product_categories[pred_prod],
        "pred_hazard": hazard_categories[pred_haz],
        "topk_product": [(product_categories[i], float(prod[i])) for i in topk_p],
        "topk_hazard":  [(hazard_categories[i],  float(haz[i]))  for i in topk_h],
    }

# Use the weight tuned in Cell 2
w = ensemble_state["tuned_w"]
print(f"Using tuned weight from VALID: w_{ensemble_state['model_a_name']}={w:.2f} | w_{ensemble_state['model_b_name']}={1.0-w:.2f}")

# Interactive loop: runs until interrupted. EOFError is handled as well, so a
# closed or exhausted stdin ends the loop cleanly instead of with a traceback.
try:
    while True:
        t = input("\nTitle: ").strip()
        c = input("Content: ").strip()
        out = predict_single_incident(t, c, w_a=w, topk=3)

        print("\n--- PREDICTION ---")
        print("Product category:", out["pred_product"])
        print("Hazard  category:", out["pred_hazard"])
        print("Top-3 Product:", out["topk_product"])
        print("Top-3 Hazard :", out["topk_hazard"])
except (KeyboardInterrupt, EOFError):
    print("\nStopped.")


Using tuned weight from VALID: w_DeBERTa=0.55 | w_RoBERTa=0.45

Stopped.
