In [1]:
############################################
# 0. CONFIG
############################################
MODEL_NAME = "FacebookAI/roberta-large"
DATA_PATH  = "/kaggle/input/foodhazard/chunked_original.json"

############################################
# 1. IMPORTS
############################################
import json, os, torch, pandas as pd
import numpy as np
from torch import nn
import torch.nn.functional as F
from datasets import Dataset, Features, Value
from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer
)
from sklearn.metrics import precision_recall_fscore_support

os.environ["WANDB_DISABLED"] = "true"



2025-12-31 07:49:35.819200: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767167375.997702      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767167376.048791      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
############################################
# 2. FOCAL LOSS
############################################
class FocalLoss(nn.Module):
    def __init__(self, alpha=1.0, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """
        inputs: [B, C]
        targets: [B]
        """
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return loss.mean()

############################################
# 3. LOAD DATA
############################################
with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw = json.load(f)
df = pd.DataFrame(raw)

# text field: Æ°u tiÃªn chunk_text, fallback sang text
if "chunk_text" in df.columns:
    df["text"] = df["chunk_text"]
elif "merged_text" in df.columns:
    df["text"] = df["merged_text"]
else:
    df["text"] = df["text"]

# encode product & hazard labels
df["product_labels"] = df["product_category"].astype("category").cat.codes
df["hazard_labels"]  = df["hazard_category"].astype("category").cat.codes

N_PRODUCT = df["product_category"].nunique()
N_HAZARD  = df["hazard_category"].nunique()
print("N_PRODUCT:", N_PRODUCT, "| N_HAZARD:", N_HAZARD)

############################################
# 4. MODEL: MULTI-TASK
############################################
class MultiTaskClassifier(nn.Module):
    def __init__(self, base_model_name, n_product, n_hazard):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(base_model_name)
        hidden_size = self.backbone.config.hidden_size

        self.dropout = nn.Dropout(0.1)
        self.product_head = nn.Linear(hidden_size, n_product)
        self.hazard_head  = nn.Linear(hidden_size, n_hazard)

        self.focal = FocalLoss()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        product_labels=None,
        hazard_labels=None,
        **kwargs
    ):
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls = outputs.last_hidden_state[:, 0]  # [CLS]
        cls = self.dropout(cls)

        product_logits = self.product_head(cls)
        hazard_logits  = self.hazard_head(cls)

        loss = None
        if (product_labels is not None) and (hazard_labels is not None):
            loss_product = self.focal(product_logits, product_labels)
            loss_hazard  = self.focal(hazard_logits, hazard_labels)
            loss = 0.5 * loss_product + 0.5 * loss_hazard

        # Trainer cáº§n "logits" Ä‘á»ƒ khÃ´ng crash, ta concat hai head
        logits_concat = torch.cat([product_logits, hazard_logits], dim=-1)

        return {
            "loss": loss,
            "logits": logits_concat,          # Trainer dÃ¹ng cÃ¡i nÃ y
            "product_logits": product_logits, # Ä‘á»ƒ sau nÃ y infer tay
            "hazard_logits": hazard_logits,
        }

############################################
# 5. DATASET + TOKENIZER
############################################
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

features = Features({
    "text": Value("string"),
    "product_labels": Value("int64"),
    "hazard_labels": Value("int64"),
})

dataset = Dataset.from_pandas(
    df[["text", "product_labels", "hazard_labels"]],
    features=features
)

ds = dataset.train_test_split(test_size=0.2, seed=42)
tokenized = ds.map(tokenize_function, batched=True)

############################################
# 6. (Táº M THá»œI) KHÃ”NG COMPUTE_METRICS TRONG TRAINER
############################################
# Náº¿u muá»‘n giá»¯ khung:
# def compute_metrics(pred):
#     return {}

############################################
# 7. INIT MODEL
############################################
model = MultiTaskClassifier(
    base_model_name=MODEL_NAME,
    n_product=N_PRODUCT,
    n_hazard=N_HAZARD,
)

############################################
# 8. TRAINING ARGS 
############################################
args = TrainingArguments(
    output_dir="./result_multi_task",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,

    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=10,
    fp16=True,

    eval_strategy="epoch",     
    save_strategy="epoch",
    do_eval=True,
    load_best_model_at_end=True,

    metric_for_best_model="eval_loss",  
    greater_is_better=False,

    save_total_limit=2,
)

############################################
# 9. TRAINER
############################################
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
)

trainer.train()
print("DONE TRAINING ðŸš€")


N_PRODUCT: 22 | N_HAZARD: 10


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/6775 [00:00<?, ? examples/s]

Map:   0%|          | 0/1694 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.5366,0.615143
2,0.5271,0.481878
3,0.3742,0.439942
4,0.2789,0.438319
5,0.2113,0.476337
6,0.1449,0.487166
7,0.1195,0.505631
8,0.0856,0.521326
9,0.0763,0.528399
10,0.0658,0.529368


DONE TRAINING ðŸš€


In [3]:
# ============================================================
# EVAL TRAINED RoBERTa (multi-task, chunk-level -> doc-level) ON VALID + TEST
# - Uses your preprocessing + token-chunking + mean pooling over chunks
# - Computes:
#   (1) Hazard macro-F1 (doc-level)
#   (2) Product macro-F1 BUT ONLY on docs where hazard is predicted correctly (SemEval-style)
#   (3) Final Score = (Hazard_F1 + Product_F1_conditional)/2
# ============================================================

import os, re, json, difflib
import numpy as np
import pandas as pd
import torch
from torch import nn
from bs4 import BeautifulSoup
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score
from tqdm import tqdm

# ----------------------------
# CONFIG (adjust paths)
# ----------------------------
BASE_MODEL_NAME   = "FacebookAI/roberta-large"              # backbone used in MultiTaskClassifier
TRAIN_JSON_PATH   = "/kaggle/input/foodhazard/chunked_deberta_512.json"  # only for label mapping (categories)
VALID_CSV_PATH    = "/kaggle/input/incidents_valid.csv"
TEST_CSV_PATH     = "/kaggle/input/incidents_test.csv"

# If you just trained with Trainer:
#   best_ckpt = trainer.state.best_model_checkpoint
# else set manually to the folder created by Trainer (contains pytorch_model.bin or model.safetensors)
CHECKPOINT_DIR    = "./result_multi_task"   # <-- change if needed
# If trainer exists, uncomment this:
# CHECKPOINT_DIR = trainer.state.best_model_checkpoint

MAX_TOKENS    = 512
CHUNK_OVERLAP = 64
MIN_CHARS     = 30
BATCH_SIZE    = 16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ============================================================
# 1) LABEL SPACE: build mapping exactly like training used
# ============================================================
with open(TRAIN_JSON_PATH, "r", encoding="utf-8") as f:
    train_raw = json.load(f)
train_df = pd.DataFrame(train_raw)

prod_cat_type = train_df["product_category"].astype("category")
haz_cat_type  = train_df["hazard_category"].astype("category")

product_categories = list(prod_cat_type.cat.categories)
hazard_categories  = list(haz_cat_type.cat.categories)

product_label2id = {c: i for i, c in enumerate(product_categories)}
hazard_label2id  = {c: i for i, c in enumerate(hazard_categories)}

N_PRODUCT = len(product_categories)
N_HAZARD  = len(hazard_categories)
print("N_PRODUCT:", N_PRODUCT, "| N_HAZARD:", N_HAZARD)

# ============================================================
# 2) PREPROCESS (same as your pipeline)
# ============================================================
def extract_text_from_html(html_content):
    if pd.isna(html_content):
        return ""
    soup = BeautifulSoup(str(html_content), "html.parser")
    return soup.get_text(separator=" ").strip()

def basic_clean(text: str) -> str:
    text = text.replace("\xa0", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

NOISE_PATTERNS = []  # keep as you used

def remove_recall_boilerplate(text: str) -> str:
    t = text
    for p in NOISE_PATTERNS:
        t = re.sub(p, " ", t, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", t).strip()

def normalize_sentence_for_dedupe(s: str) -> str:
    s = s.lower()
    s = re.sub(r"\d{3,}", " ", s)
    s = re.sub(r'[^a-z0-9\s.,;:!?\-"]+', " ", s)
    return re.sub(r"\s+", " ", s).strip()

def advanced_deduplicate_sentences(text: str) -> str:
    sentences = re.split(r"(?<=[.!?])\s+", text)
    cleaned, seen, buf = [], set(), []
    for s in sentences:
        raw = s.strip()
        if not raw:
            continue
        norm = normalize_sentence_for_dedupe(raw)
        if len(norm) < 10:
            continue
        if norm in seen:
            continue

        dup = False
        for prev in buf[-50:]:
            if difflib.SequenceMatcher(None, norm, prev).ratio() >= 0.95:
                dup = True
                break
        if dup:
            continue

        cleaned.append(raw)
        seen.add(norm)
        buf.append(norm)

    return " ".join(cleaned).strip()

ENTITY_REPLACEMENTS = {
    r"\be\.?\s*coli\b": "escherichia coli",
    r"\bc\.?\s*botulinum\b": "clostridium botulinum",
    r"\blisteria\s+spp\b": "listeria monocytogenes",
    r"\bsoy\s+proteins?\b": "soybeans",
}

def normalize_entities(text: str) -> str:
    t = text
    for pat, rep in ENTITY_REPLACEMENTS.items():
        t = re.sub(pat, rep, t, flags=re.IGNORECASE)
    return t

def clean_foodhazard_text(text: str) -> str:
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    t = basic_clean(text)
    t = remove_recall_boilerplate(t)
    t = advanced_deduplicate_sentences(t)
    t = normalize_entities(t)
    return re.sub(r"\s+", " ", t).strip()

# ============================================================
# 3) CHUNKING + SOFTMAX
# ============================================================
def chunk_by_tokens(text, tokenizer, max_tokens=512, overlap=64, min_chars=30):
    text = str(text).strip()
    if not text:
        return []
    enc = tokenizer(
        text,
        add_special_tokens=False,
        return_attention_mask=False,
        return_token_type_ids=False,
    )["input_ids"]

    chunks = []
    start = 0
    while start < len(enc):
        end = start + max_tokens
        sub_ids = enc[start:end]
        sub_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
        sub_text = re.sub(r"\s+", " ", sub_text).strip()
        if len(sub_text) >= min_chars:
            chunks.append(sub_text)
        start += max_tokens - overlap
    return chunks

def softmax_np(x):
    x = x - np.max(x, axis=-1, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=-1, keepdims=True)

# ============================================================
# 4) MODEL (same architecture as training)
# ============================================================
class MultiTaskClassifier(nn.Module):
    def __init__(self, base_model_name, n_product, n_hazard, dropout=0.1):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(base_model_name)
        hidden_size = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.product_head = nn.Linear(hidden_size, n_product)
        self.hazard_head  = nn.Linear(hidden_size, n_hazard)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0]
        cls = self.dropout(cls)
        product_logits = self.product_head(cls)
        hazard_logits  = self.hazard_head(cls)
        return torch.cat([product_logits, hazard_logits], dim=-1)

def load_trained_checkpoint(checkpoint_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

    model = MultiTaskClassifier(
        base_model_name=BASE_MODEL_NAME,
        n_product=N_PRODUCT,
        n_hazard=N_HAZARD,
        dropout=0.1,
    )

    # Find weights
    weights_path = None
    for fname in ["model.safetensors", "pytorch_model.bin"]:
        cand = os.path.join(checkpoint_dir, fname)
        if os.path.exists(cand):
            weights_path = cand
            break
    # Trainer sometimes saves under checkpoint-* subfolder; handle that
    if weights_path is None and os.path.isdir(checkpoint_dir):
        # try find latest checkpoint-* that contains weights
        subs = [os.path.join(checkpoint_dir, d) for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]
        subs = sorted(subs, key=lambda p: int(p.split("-")[-1]) if p.split("-")[-1].isdigit() else -1, reverse=True)
        for sub in subs:
            for fname in ["model.safetensors", "pytorch_model.bin"]:
                cand = os.path.join(sub, fname)
                if os.path.exists(cand):
                    checkpoint_dir = sub
                    weights_path = cand
                    break
            if weights_path is not None:
                break

    if weights_path is None:
        raise FileNotFoundError(f"Cannot find model.safetensors or pytorch_model.bin in {checkpoint_dir}")

    if weights_path.endswith(".safetensors"):
        from safetensors.torch import load_file as safe_load
        state_dict = safe_load(weights_path)
    else:
        state_dict = torch.load(weights_path, map_location="cpu")

    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print(f"Loaded weights from: {weights_path}")
    print(f"Missing keys: {len(missing)} | Unexpected keys: {len(unexpected)}")

    model.to(device)
    model.eval()
    return tokenizer, model

# ============================================================
# 5) BUILD CHUNK DATASET FROM CSV (doc -> chunks)
# ============================================================
def build_chunk_dataset_from_csv(csv_path: str, tokenizer):
    df = pd.read_csv(csv_path)

    df["title_clean"] = df["title"].apply(extract_text_from_html).apply(clean_foodhazard_text)
    df["text_clean"]  = df["text"].apply(extract_text_from_html).apply(clean_foodhazard_text)
    df["merged_text"] = (df["title_clean"].fillna("") + " " + df["text_clean"].fillna("")).str.lower().str.strip()

    stt_col = "stt" if "stt" in df.columns else None

    chunk_records = []
    y_prod = {}
    y_haz  = {}
    skipped_unknown = 0

    for idx, row in df.iterrows():
        doc_id = int(row[stt_col]) if stt_col is not None else int(idx)

        prod_cat = str(row.get("product-category", "")).strip()
        haz_cat  = str(row.get("hazard-category", "")).strip()

        if (prod_cat not in product_label2id) or (haz_cat not in hazard_label2id):
            skipped_unknown += 1
            continue

        y_prod[doc_id] = product_label2id[prod_cat]
        y_haz[doc_id]  = hazard_label2id[haz_cat]

        text = str(row.get("merged_text", "") or "").strip()
        if not text:
            chunk_records.append({"doc_id": doc_id, "text": "[EMPTY]"})
            continue

        chunks = chunk_by_tokens(text, tokenizer, MAX_TOKENS, CHUNK_OVERLAP, MIN_CHARS)
        if not chunks:
            chunk_records.append({"doc_id": doc_id, "text": "[EMPTY]"})
            continue

        for ch in chunks:
            chunk_records.append({"doc_id": doc_id, "text": ch})

    chunk_df = pd.DataFrame(chunk_records)
    ds = Dataset.from_pandas(chunk_df[["doc_id", "text"]])

    def tok_fn(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_TOKENS)

    tokenized = ds.map(tok_fn, batched=True)
    print(f"[{os.path.basename(csv_path)}] docs={len(y_prod)} chunks={len(chunk_records)} skipped_unknown={skipped_unknown}")
    return tokenized, y_prod, y_haz

# ============================================================
# 6) INFER -> DOC PROBS (mean pooling over chunks)
# ============================================================
@torch.no_grad()
def infer_doc_probs(tokenized_ds: Dataset, model: nn.Module):
    from torch.utils.data import DataLoader

    ds_no_text = tokenized_ds.remove_columns(["text"])

    def collate_fn(features):
        batch = {}
        for k in features[0].keys():
            if k == "__index_level_0__":
                continue
            vals = [f[k] for f in features]
            batch[k] = torch.tensor(vals, dtype=torch.long)
        return batch

    loader = DataLoader(ds_no_text, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    prod_sum = {}
    haz_sum  = {}
    cnt      = {}

    for batch in tqdm(loader, desc="Infer", leave=False):
        doc_ids = batch["doc_id"].cpu().numpy().astype(int)

        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attn_mask).detach().cpu().numpy()

        prod_logits = logits[:, :N_PRODUCT]
        haz_logits  = logits[:, N_PRODUCT:]

        prod_probs = softmax_np(prod_logits)
        haz_probs  = softmax_np(haz_logits)

        for d, pp, hp in zip(doc_ids, prod_probs, haz_probs):
            if d not in prod_sum:
                prod_sum[d] = np.zeros(N_PRODUCT, dtype=np.float64)
                haz_sum[d]  = np.zeros(N_HAZARD,  dtype=np.float64)
                cnt[d]      = 0
            prod_sum[d] += pp
            haz_sum[d]  += hp
            cnt[d]      += 1

    doc_prod = {d: prod_sum[d] / max(cnt[d], 1) for d in cnt.keys()}
    doc_haz  = {d: haz_sum[d]  / max(cnt[d], 1) for d in cnt.keys()}
    return doc_prod, doc_haz

# ============================================================
# 7) SCORE (SemEval-style)
#   - hazard macro-F1 on all docs
#   - product macro-F1 only on docs where hazard predicted correct
#   - final score = mean of two
# ============================================================
def eval_hazard_macro_f1(doc_haz_probs: dict, y_haz: dict):
    doc_ids = sorted(set(doc_haz_probs.keys()) & set(y_haz.keys()))
    y_true = [int(y_haz[d]) for d in doc_ids]
    y_pred = [int(np.argmax(doc_haz_probs[d])) for d in doc_ids]
    return f1_score(y_true, y_pred, average="macro"), doc_ids, y_true, y_pred

def eval_product_macro_f1_conditional(doc_prod_probs: dict, y_prod: dict,
                                      doc_ids_all: list, haz_true: list, haz_pred: list):
    # only keep docs where hazard predicted correctly
    keep = [doc_ids_all[i] for i in range(len(doc_ids_all)) if haz_true[i] == haz_pred[i]]
    y_true = [int(y_prod[d]) for d in keep if d in doc_prod_probs and d in y_prod]
    y_pred = [int(np.argmax(doc_prod_probs[d])) for d in keep if d in doc_prod_probs and d in y_prod]
    if len(y_true) == 0:
        return 0.0, len(keep)
    return f1_score(y_true, y_pred, average="macro"), len(keep)

def semeval_score(doc_prod_probs: dict, doc_haz_probs: dict, y_prod: dict, y_haz: dict):
    haz_f1, doc_ids, haz_true, haz_pred = eval_hazard_macro_f1(doc_haz_probs, y_haz)
    prod_f1_cond, n_kept = eval_product_macro_f1_conditional(
        doc_prod_probs, y_prod, doc_ids, haz_true, haz_pred
    )
    score = 0.5 * (haz_f1 + prod_f1_cond)
    return score, haz_f1, prod_f1_cond, n_kept, len(doc_ids)

# ============================================================
# RUN VALID + TEST
# ============================================================
tokenizer, model = load_trained_checkpoint(CHECKPOINT_DIR)

def run_split(csv_path: str, split_name: str):
    tok_ds, y_prod, y_haz = build_chunk_dataset_from_csv(csv_path, tokenizer)
    doc_prod, doc_haz = infer_doc_probs(tok_ds, model)

    score, haz_f1, prod_f1_cond, n_kept, n_docs = semeval_score(doc_prod, doc_haz, y_prod, y_haz)

    print("\n" + "="*70)
    print(f"{split_name} RESULTS (RoBERTa trained checkpoint)")
    print("="*70)
    print(f"Docs evaluated                 : {n_docs}")
    print(f"Docs with correct hazard (cond): {n_kept}")
    print(f"Hazard macro-F1                : {haz_f1:.4f}")
    print(f"Product macro-F1 (conditional) : {prod_f1_cond:.4f}")
    print(f"FINAL SCORE                    : {score:.4f}")
    return {
        "split": split_name,
        "n_docs": n_docs,
        "n_kept": n_kept,
        "haz_f1": haz_f1,
        "prod_f1_cond": prod_f1_cond,
        "score": score,
    }

valid_res = run_split(VALID_CSV_PATH, "VALID")
test_res  = run_split(TEST_CSV_PATH,  "TEST")
print("\nSummary:", valid_res, test_res)


Device: cuda
N_PRODUCT: 22 | N_HAZARD: 10


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded weights from: ./result_multi_task/checkpoint-8470/model.safetensors
Missing keys: 0 | Unexpected keys: 0


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/incidents_valid.csv'