In [1]:
%%capture 
!pip install transformers datasets torch scikit-learn pandas numpy protobuf==3.20.3

In [None]:
import os, random, re, html, unicodedata
from dataclasses import dataclass
from typing import Dict, Tuple, Optional, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, classification_report, hamming_loss, 
    precision_recall_fscore_support
)
from tqdm.auto import tqdm
from sklearn.metrics import classification_report

# -------------------------
# Cấu hình chung
# -------------------------
SEED = 42
TEXT_COL = "comment_text"
LABEL_COLS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
SHARED_TOKENIZER = "distilbert-base-uncased"

THR_BIN = 0.8
KEEP_NORMAL = 100_000
VAL_RATIO = 0.1
OUT_DIR = "./ckpt"
os.makedirs(OUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

try:
    _STOPWORDS = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    _STOPWORDS = set(stopwords.words("english"))

# -------------------------
# Tiền xử lý & Làm sạch dữ liệu
# -------------------------
_COMBINED_RE = re.compile(
    r"(https?://\S+|www\.\S+)|(\b[\w\.-]+@[\w\.-]+\.\w+\b)|(@\w+)|(<[^>]+>)|([\x00-\x1f\x7f-\x9f])",
    re.IGNORECASE
)
_WS_RE = re.compile(r"\s+")

def fast_clean(s: str) -> str:
    def _replacer(match):
        if match.group(1): return " <URL> "
        if match.group(2): return " <EMAIL> "
        if match.group(3): return " <USER> "
        return " "
    s = html.unescape(str(s))
    s = unicodedata.normalize("NFKC", s.replace("\u200b", "").replace("\ufeff", ""))
    s = _COMBINED_RE.sub(_replacer, s)
    return _WS_RE.sub(" ", s).strip()

def preprocess_text_nltk(s: str, lower=True, keep_punct=True) -> str:
    s = fast_clean(s)
    tk = TweetTokenizer(preserve_case=not lower, reduce_len=True, strip_handles=False)
    tokens = tk.tokenize(s)
    if not keep_punct:
        tokens = [t for t in tokens if t in {"<URL>", "<EMAIL>", "<USER>"} or any(c.isalnum() for c in t)]
    return " ".join(tokens)

def apply_text_preprocess_nltk(texts: list, n_jobs=-1, **kwargs):
    processed = Parallel(n_jobs=n_jobs)(delayed(preprocess_text_nltk)(str(t), **kwargs) for t in texts)
    return processed

def load_and_prepare_splits():
    print("--- Đang tải dữ liệu và làm sạch (Drop None) ---")
    ds = load_dataset("nqdhocai/toxic-comment-detection")
    train_df = ds["train"].to_pandas()
    test_df = ds["test"].to_pandas()

    train_df = train_df.dropna(subset=[TEXT_COL])
    test_df = test_df.dropna(subset=[TEXT_COL])
    train_df = train_df[train_df[TEXT_COL].astype(str).str.strip() != ""]
    test_df = test_df[test_df[TEXT_COL].astype(str).str.strip() != ""]

    print("--- Tiền xử lý văn bản song song ---")
    train_df["comment_text_proc"] = apply_text_preprocess_nltk(train_df[TEXT_COL].tolist())
    test_df["comment_text_proc"] = apply_text_preprocess_nltk(test_df[TEXT_COL].tolist())

    train_df = train_df[train_df["comment_text_proc"].str.strip() != ""]
    test_df = test_df[test_df["comment_text_proc"].str.strip() != ""]

    for c in LABEL_COLS:
        train_df[c] = (pd.to_numeric(train_df[c], errors='coerce').fillna(0) >= THR_BIN).astype(int)
        test_df[c] = (pd.to_numeric(test_df[c], errors='coerce').fillna(0) >= THR_BIN).astype(int)

    mask_normal = (train_df[LABEL_COLS].sum(axis=1) == 0)
    train_toxic = train_df[~mask_normal]
    train_normal = train_df[mask_normal].sample(n=min(KEEP_NORMAL, sum(mask_normal)), random_state=SEED)
    train_df = pd.concat([train_toxic, train_normal]).sample(frac=1, random_state=SEED).reset_index(drop=True)

    X_train, X_val, y_train, y_val = train_test_split(
        train_df["comment_text_proc"].values, train_df[LABEL_COLS].values,
        test_size=VAL_RATIO, random_state=SEED, stratify=train_df[LABEL_COLS].values.sum(axis=1)
    )
    return X_train, y_train, X_val, y_val, test_df["comment_text_proc"].values, test_df[LABEL_COLS].values, test_df[TEXT_COL].values

X_train, y_train, X_val, y_val, X_test, y_test, X_test_raw = load_and_prepare_splits()

pos = y_train.sum(axis=0)
neg = len(y_train) - pos
pos_weight = torch.sqrt(torch.tensor(neg / (pos + 1e-6), device=device, dtype=torch.float32))

# -------------------------
# Định nghĩa Models 
# -------------------------
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, pad_id, embed_dim=256, hidden=256, num_labels=6, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.rnn = nn.RNN(embed_dim, hidden, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden * 2, num_labels)
    def forward(self, input_ids, lengths):
        x = self.emb(input_ids)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.rnn(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        x_pool, _ = torch.max(out, dim=1) # Max Pooling
        return self.fc(self.drop(x_pool))

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, pad_id, embed_dim=256, hidden=256, num_labels=6, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.lstm = nn.LSTM(embed_dim, hidden, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden * 2, num_labels)
    def forward(self, input_ids, lengths):
        x = self.emb(input_ids)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        x_pool, _ = torch.max(out, dim=1) # Max Pooling
        return self.fc(self.drop(x_pool))

class EncoderMultiLabel(nn.Module):
    def __init__(self, model_name, num_labels=6, dropout=0.3, pooling="cls"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.pooling = pooling
        h_size = self.encoder.config.hidden_size
        self.fc = nn.Linear(h_size, num_labels)
        self.drop = nn.Dropout(dropout)
    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)[0]
        emb = out[:, 0] if self.pooling == "cls" else out.mean(dim=1)
        return self.fc(self.drop(emb))

# -------------------------
# Build, Loaders & Utils
# -------------------------
@dataclass
class ModelSpec:
    name: str; kind: str; tokenizer_name: str; max_len: int; batch_train: int; 
    batch_eval: int; epochs: int; lr: float; warmup_ratio: float = 0.1; 
    dropout: float = 0.3; embed_dim: int = 256; hidden: int = 256; pooling: str = "cls"

def build_model(spec, tokenizer):
    if spec.kind == "encoder":
        return EncoderMultiLabel(spec.tokenizer_name, num_labels=len(LABEL_COLS), dropout=spec.dropout, pooling=spec.pooling)
    else:
        cls = BiLSTMClassifier if spec.name == "bilstm" else RNNClassifier
        return cls(tokenizer.vocab_size, tokenizer.pad_token_id, spec.embed_dim, spec.hidden, num_labels=len(LABEL_COLS), dropout=spec.dropout)

def make_loaders(spec, tokenizer):
    class TextDS(Dataset):
        def __init__(self, texts, labels): self.texts, self.labels = texts, labels
        def __len__(self): return len(self.texts)
        def __getitem__(self, i): return self.texts[i], self.labels[i]

    def collate(batch):
        texts, labels = zip(*batch)
        texts = [str(t) for t in texts]
        enc = tokenizer(texts, padding=True, truncation=True, max_length=spec.max_len, return_tensors="pt")
        yb = torch.tensor(np.array(labels), dtype=torch.float32)
        if spec.kind == "encoder": return enc["input_ids"], enc["attention_mask"], yb
        return enc["input_ids"], enc["attention_mask"].sum(dim=1).long(), yb

    return (DataLoader(TextDS(X_train, y_train), batch_size=spec.batch_train, shuffle=True, collate_fn=collate),
            DataLoader(TextDS(X_val, y_val), batch_size=spec.batch_eval, collate_fn=collate),
            DataLoader(TextDS(X_test, y_test), batch_size=spec.batch_eval, collate_fn=collate))

# -------------------------
# Training Loop & Visualization
# -------------------------
def eval_loop(model, loader, spec, criterion):
    model.eval()
    total_loss, all_probs, all_trues = 0.0, [], []
    with torch.no_grad():
        for batch in loader:
            inputs = [b.to(device) for b in batch]
            logits = model(*inputs[:-1])
            loss = criterion(logits, inputs[-1])
            total_loss += loss.item() * inputs[-1].size(0)
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            all_trues.append(inputs[-1].cpu().numpy())
    return total_loss / len(loader.dataset), roc_auc_score(np.vstack(all_trues), np.vstack(all_probs), average="macro"), np.vstack(all_trues), np.vstack(all_probs)

def find_best_thresholds_per_label(y_true, y_prob, label_cols):
    """Tìm ngưỡng tối ưu cho từng nhãn trên tập Validation"""
    grid = np.linspace(0.05, 0.95, 19)
    thr_map = {}
    for j, col in enumerate(label_cols):
        yt, pr = y_true[:, j], y_prob[:, j]
        best_t, best_f1 = 0.5, -1.0
        for t in grid:
            yp = (pr >= t).astype(int)
            p, r, f1, _ = precision_recall_fscore_support(yt, yp, average='binary', zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        thr_map[col] = best_t
    return thr_map

def binarize_with_thresholds(y_prob, thr_map, label_cols):
    """Chuyển xác suất sang 0/1 dựa trên map ngưỡng"""
    y_pred = np.zeros_like(y_prob, dtype=int)
    for i, col in enumerate(label_cols):
        y_pred[:, i] = (y_prob[:, i] >= thr_map[col]).astype(int)
    return y_pred
    
def train_one_model(spec: ModelSpec):
    print(f"\n{'='*20} HUẤN LUYỆN: {spec.name.upper()} {'='*20}")
    tokenizer = AutoTokenizer.from_pretrained(spec.tokenizer_name)
    train_loader, val_loader, test_loader = make_loaders(spec, tokenizer)
    model = build_model(spec, tokenizer).to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=spec.lr, weight_decay=0.01 if spec.kind=="encoder" else 0)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    scheduler = get_linear_schedule_with_warmup(optimizer, int(len(train_loader)*spec.epochs*spec.warmup_ratio), len(train_loader)*spec.epochs)
    
    history = {"epochs": [], "train_loss": [], "val_loss": [], "val_auc": []}
    best_auc, best_path = -1.0, f"{OUT_DIR}/{spec.name}_best.pt"

    for ep in range(1, spec.epochs + 1):
        model.train()
        total_tr_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {ep}", leave=False)
        for batch in pbar:
            optimizer.zero_grad()
            inputs = [b.to(device) for b in batch]
            logits = model(*inputs[:-1])
            loss = criterion(logits, inputs[-1])
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_tr_loss += loss.item() * inputs[-1].size(0)

        va_loss, va_auc, _, _ = eval_loop(model, val_loader, spec, criterion)
        history["epochs"].append(ep); history["train_loss"].append(total_tr_loss/len(train_loader.dataset))
        history["val_loss"].append(va_loss); history["val_auc"].append(va_auc)
        print(f"E{ep} | Tr-Loss: {total_tr_loss/len(train_loader.dataset):.4f} | Val-Loss: {va_loss:.4f} | Val-AUC: {va_auc:.4f}")

        if va_auc > best_auc:
            best_auc = va_auc
            torch.save(model.state_dict(), best_path)

    print(f"\n[Final Evaluation] Loading best model from {best_path}...")
    model.load_state_dict(torch.load(best_path))
    
    _, _, y_val_true, y_val_prob = eval_loop(model, val_loader, spec, criterion)
    best_thrs = find_best_thresholds_per_label(y_val_true, y_val_prob, LABEL_COLS)
    
    te_loss, te_auc, y_test_true, y_test_prob = eval_loop(model, test_loader, spec, criterion)
    
    y_test_pred = binarize_with_thresholds(y_test_prob, best_thrs, LABEL_COLS)
    
    print(f"\n--- TEST REPORT: {spec.name.upper()} ---")
    print(f"Test AUC (Macro): {te_auc:.4f}")
    print("\nClassification Report (Tuned Thresholds):")
    print(classification_report(y_test_true, y_test_pred, target_names=LABEL_COLS, digits=2))
            
    return history

def plot_learning_curves(all_results):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    colors = {'rnn': 'blue', 'bilstm': 'green', 'distilbert': 'red'}
    for name, hist in all_results.items():
        c = colors.get(name)
        ax1.plot(hist["epochs"], hist["train_loss"], label=f"{name}-Tr", ls='-', color=c, alpha=0.3)
        ax1.plot(hist["epochs"], hist["val_loss"], label=f"{name}-Val", ls='--', color=c)
        ax2.plot(hist["epochs"], hist["val_auc"], label=name, marker='o', color=c)
    ax1.set_title("Loss Curve (Train vs Val)"); ax1.legend(); ax1.grid(True)
    ax2.set_title("Validation AUC Curve"); ax2.legend(); ax2.grid(True)
    plt.show()

# -------------------------
# Thực thi
# -------------------------
MODEL_SPECS = [
    ModelSpec("rnn", "rnn_like", SHARED_TOKENIZER, 256, 128, 256, 5, 1e-3),
    ModelSpec("bilstm", "rnn_like", SHARED_TOKENIZER, 256, 128, 256, 5, 1e-3),
    ModelSpec("distilbert", "encoder", SHARED_TOKENIZER, 128, 16, 32, 3, 1e-5)
]

for spec in MODEL_SPECS:
    final_results = {}
    final_results[spec.name] = train_one_model(spec)

    plot_learning_curves(final_results)

--- Đang tải dữ liệu và làm sạch (Drop None) ---


README.md:   0%|          | 0.00/919 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2223065 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/399 [00:00<?, ? examples/s]

--- Tiền xử lý văn bản song song ---



Epoch 1:   0%|          | 0/1109 [00:00<?, ?it/s]

E1 | Tr-Loss: 0.3811 | Val-Loss: 0.2464 | Val-AUC: 0.9512


Epoch 2:   0%|          | 0/1109 [00:00<?, ?it/s]

E2 | Tr-Loss: 0.2332 | Val-Loss: 0.2153 | Val-AUC: 0.9628


Epoch 3:   0%|          | 0/1109 [00:00<?, ?it/s]

E3 | Tr-Loss: 0.2080 | Val-Loss: 0.2077 | Val-AUC: 0.9662


Epoch 4:   0%|          | 0/1109 [00:00<?, ?it/s]

E4 | Tr-Loss: 0.1896 | Val-Loss: 0.2049 | Val-AUC: 0.9673


Epoch 5:   0%|          | 0/1109 [00:00<?, ?it/s]

E5 | Tr-Loss: 0.1760 | Val-Loss: 0.2034 | Val-AUC: 0.9679



Epoch 1:   0%|          | 0/1109 [00:00<?, ?it/s]

E1 | Tr-Loss: 0.3567 | Val-Loss: 0.2243 | Val-AUC: 0.9591


Epoch 2:   0%|          | 0/1109 [00:00<?, ?it/s]