In [3]:
############################################
# 0. CONFIG
############################################
# Hugging Face model identifier; used below to load both the tokenizer and
# the encoder backbone of the multi-task classifier.
MODEL_NAME = "microsoft/deberta-v3-large"
# JSON file loaded into a DataFrame in the data-loading section.
# NOTE(review): the file name suggests documents pre-chunked to 512 tokens —
# confirm against the preprocessing step that produced it.
DATA_PATH  = "/kaggle/input/foodhazard/chunked_deberta_512.json"

############################################
# 1. IMPORTS
############################################
import json, os, torch, pandas as pd
import numpy as np
from torch import nn
import torch.nn.functional as F
from datasets import Dataset, Features, Value
from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer
)
from sklearn.metrics import precision_recall_fscore_support

# Switch off the Weights & Biases integration before any Trainer is created.
# NOTE(review): the captured run log reports this environment variable as
# deprecated (removal announced for v5) and recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"

############################################
# 2. FOCAL LOSS
############################################
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from per-sample cross-entropy.

    Each sample contributes ``alpha * (1 - p_t) ** gamma * CE`` where
    ``p_t = exp(-CE)`` is the probability assigned to the target class.
    The mean over the batch is returned.
    """

    def __init__(self, alpha=1.0, gamma=2.0):
        super().__init__()
        # alpha: global scale factor; gamma: focusing exponent.
        self.alpha, self.gamma = alpha, gamma

    def forward(self, inputs, targets):
        """
        inputs: [B, C]
        targets: [B]

        Returns a scalar tensor (batch mean of the focal loss).
        """
        per_sample_ce = F.cross_entropy(inputs, targets, reduction="none")
        # exp(-CE) recovers the softmax probability of the target class.
        prob_true = torch.exp(-per_sample_ce)
        # Down-weights samples the model already classifies confidently.
        modulating = (1 - prob_true) ** self.gamma
        focal = self.alpha * modulating * per_sample_ce
        return focal.mean()

############################################
# 3. LOAD DATA
############################################
# Read the JSON records into a DataFrame.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw = json.load(f)
df = pd.DataFrame(raw)

# Text field: prefer chunk_text, then merged_text, otherwise an existing
# text column. Fail with an explicit message when none of them is present.
for text_column in ("chunk_text", "merged_text", "text"):
    if text_column in df.columns:
        df["text"] = df[text_column]
        break
else:
    raise KeyError(
        "No text column found; expected one of 'chunk_text', 'merged_text', 'text'"
    )

# A missing label would be encoded as -1 by `cat.codes`, which is not a valid
# class index for the loss, so reject such data up front.
missing_labels = df[["product_category", "hazard_category"]].isna().sum()
if missing_labels.any():
    raise ValueError(
        f"Rows with missing labels found: {missing_labels.to_dict()}"
    )

# Encode product & hazard labels as integer codes.
product_categorical = df["product_category"].astype("category")
hazard_categorical = df["hazard_category"].astype("category")
df["product_labels"] = product_categorical.cat.codes
df["hazard_labels"] = hazard_categorical.cat.codes

# Keep the code -> name mapping (list index == integer label) so predictions
# can be decoded back to category names after training.
PRODUCT_CLASSES = list(product_categorical.cat.categories)
HAZARD_CLASSES = list(hazard_categorical.cat.categories)

N_PRODUCT = len(PRODUCT_CLASSES)
N_HAZARD = len(HAZARD_CLASSES)
print("N_PRODUCT:", N_PRODUCT, "| N_HAZARD:", N_HAZARD)

############################################
# 4. MODEL: MULTI-TASK
############################################
class MultiTaskClassifier(nn.Module):
    """Shared encoder with two linear classification heads.

    One head scores product classes and the other hazard classes; both read
    the same dropout-regularised hidden state taken at sequence position 0.
    When both label tensors are supplied, the training loss is the equally
    weighted sum of the two focal losses.
    """

    def __init__(self, base_model_name, n_product, n_hazard):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(base_model_name)
        encoder_width = self.backbone.config.hidden_size

        self.dropout = nn.Dropout(0.1)
        self.product_head = nn.Linear(encoder_width, n_product)
        self.hazard_head = nn.Linear(encoder_width, n_hazard)

        self.focal = FocalLoss()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        product_labels=None,
        hazard_labels=None,
        **kwargs
    ):
        """Run the encoder and both heads.

        Extra keyword arguments are accepted and ignored. Returns a dict with
        keys ``loss`` (``None`` unless both label tensors are given),
        ``logits`` (product and hazard logits concatenated on the last axis),
        ``product_logits`` and ``hazard_logits``.
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary ([CLS]).
        pooled = self.dropout(encoded.last_hidden_state[:, 0])

        product_logits = self.product_head(pooled)
        hazard_logits = self.hazard_head(pooled)

        if product_labels is None or hazard_labels is None:
            loss = None
        else:
            product_term = self.focal(product_logits, product_labels)
            hazard_term = self.focal(hazard_logits, hazard_labels)
            loss = 0.5 * product_term + 0.5 * hazard_term

        # Trainer needs a "logits" entry so it does not crash; the two heads
        # are concatenated to provide one.
        combined_logits = torch.cat((product_logits, hazard_logits), dim=-1)

        return {
            "loss": loss,
            "logits": combined_logits,         # consumed by Trainer
            "product_logits": product_logits,  # kept for manual inference later
            "hazard_logits": hazard_logits,
        }

############################################
# 5. DATASET + TOKENIZER
############################################
# Tokenizer matching the pretrained checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(batch):
    """Tokenize the ``text`` entries of a batch.

    Every sequence is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call produces for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Explicit schema: raw text plus the two integer label columns (stored as int64).
label_columns = ["product_labels", "hazard_labels"]
features = Features({
    "text": Value("string"),
    **{column: Value("int64") for column in label_columns},
})

# Only the columns needed for training are carried into the Dataset.
training_frame = df[["text", *label_columns]]
dataset = Dataset.from_pandas(training_frame, features=features)

# Fixed-seed 80/20 split, then tokenize both splits in batches.
# NOTE(review): if rows are chunks of longer documents, a row-level random
# split can place chunks of one document in both splits — confirm whether a
# document-level split is needed.
ds = dataset.train_test_split(test_size=0.2, seed=42)
tokenized = ds.map(tokenize_function, batched=True)

############################################
# 6. (TEMPORARY) NO COMPUTE_METRICS IN TRAINER
############################################
# If you want to keep the skeleton:
# def compute_metrics(pred):
#     return {}

############################################
# 7. INIT MODEL
############################################
# Instantiate the two-head classifier: arguments are, in order, the backbone
# checkpoint name, the number of product classes and the number of hazard
# classes.
model = MultiTaskClassifier(MODEL_NAME, N_PRODUCT, N_HAZARD)

############################################
# 8. TRAINING ARGS 
############################################
# Training configuration for the multi-task fine-tuning run.
args = TrainingArguments(
    output_dir="./result_multi_task",
    # 2 samples per step x 4 accumulation steps = 8 samples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,

    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=10,
    # fp16 mixed precision requires a CUDA device; fall back to full
    # precision elsewhere instead of failing at start-up.
    fp16=torch.cuda.is_available(),

    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    do_eval=True,
    load_best_model_at_end=True,

    # Lower validation loss is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    save_total_limit=2,

    # Disable external logging integrations explicitly rather than relying
    # only on the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

############################################
# 9. TRAINER
############################################
# Build the Trainer on the tokenized train/test splits and run fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` is
    # its replacement.
    processing_class=tokenizer,
)

trainer.train()
print("DONE TRAINING ðŸš€")


2025-11-30 12:51:19.740693: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764507079.894719      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764507079.936057      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

N_PRODUCT: 22 | N_HAZARD: 10


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/6850 [00:00<?, ? examples/s]

Map:   0%|          | 0/1713 [00:00<?, ? examples/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.7876,0.946002
2,0.6306,0.523983
3,0.3749,0.426807
4,0.2676,0.414693
5,0.1819,0.421368
6,0.1316,0.437468
7,0.1009,0.446358
8,0.0655,0.466989
9,0.053,0.471808
10,0.0455,0.4734


DONE TRAINING ðŸš€
