# 03 — Baseline Model Training

Trains a `roberta-base` multi-label toxicity classifier on the unified Kaggle dataset.

**Targets:** `label_toxicity`, `label_hate_racism`, `label_harassment`  
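**Architecture:** RoBERTa-base → dropout → linear head → raw logits, trained with `BCEWithLogitsLoss` (the sigmoid is applied inside the loss during training and explicitly at evaluation/inference to obtain probabilities)  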

**Modes:**
- `QUICK_TEST = True` → 300 labeled samples (plus at most 100 negatives per source), 1 epoch; runs in ~10 min on CPU. Use it to verify the pipeline end to end.
- `QUICK_TEST = False` → full dataset, 3 epochs. Run on Vertex AI / Colab GPU.

In [1]:
# ── CONFIG ─────────────────────────────────────────────────────────
# Master switch: True = tiny smoke run, False = full training (GPU recommended).
QUICK_TEST = False

# Model / optimisation settings shared by both modes.
MODEL_NAME = "roberta-base"
MAX_LENGTH = 128    # tokenizer max_length (longer texts are truncated)
LR = 2e-5
SEED = 42

# Settings that shrink in quick-test mode.
#   MAX_LABELED — cap on labeled rows (None = use all ~16k)
#   MAX_NEG     — cap on negative rows, applied per source
if QUICK_TEST:
    BATCH_SIZE, EPOCHS = 8, 1
    MAX_LABELED, MAX_NEG = 300, 100
else:
    BATCH_SIZE, EPOCHS = 16, 3
    MAX_LABELED, MAX_NEG = None, 5000
# ───────────────────────────────────────────────────────────────────

In [2]:
import sys, logging, warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, average_precision_score
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaModel

# Silence every warning category and make INFO-level log records print as
# bare messages (no timestamp / logger name).
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format="%(message)s")

# The notebook runs one level below the project root: put the parent directory
# on sys.path first so the project-local `src` package can be imported.
sys.path.insert(0, str(Path("..").resolve()))
from src.data.dataset_loader import load_training_data

# Project root and the directory that receives checkpoints, tokenizer files
# and model_meta.json (created here if it does not exist yet).
ROOT      = Path("..").resolve()
MODEL_DIR = ROOT / "models"
MODEL_DIR.mkdir(exist_ok=True)

# Seed PyTorch's RNG; the train/val split is seeded separately through
# random_state=SEED where it is created.
torch.manual_seed(SEED)
# Prefer the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")
print(f"PyTorch: {torch.__version__}")
print(f"Quick test: {QUICK_TEST}")

Device: cpu
PyTorch: 2.10.0+cpu
Quick test: False


## 1. Load & Split Data

In [3]:
# Build the unified training frame; the caps come from the CONFIG cell
# (MAX_LABELED=None means "no cap on labeled rows").
df = load_training_data(max_neg_per_source=MAX_NEG, max_labeled=MAX_LABELED)

# The three binary target columns, in the order the model emits its logits.
LABEL_COLS = ["label_toxicity", "label_hate_racism", "label_harassment"]

print(f"\nTotal rows: {len(df):,}")
print("\nClass balance:")
for col in LABEL_COLS:
    # Sum of the label column = number of positive rows for that label.
    pos = df[col].sum()
    print(f"  {col}: {pos:,} positive ({pos/len(df)*100:.1f}%)")

# 85/15 split, stratified on label_toxicity only — the two rarer labels are
# NOT stratified, so their train/val ratios may differ slightly.
train_df, val_df = train_test_split(df, test_size=0.15, random_state=SEED, stratify=df["label_toxicity"])
print(f"\nTrain: {len(train_df):,}  |  Val: {len(val_df):,}")

toxic-comments-detection: 16554 rows
open-assistant: 5000 rows (tox < 0.10)
VADER filter skipped: No module named 'nltk'
yt-us-comments: 15000 rows
Total training rows: 36554  (labeled=16554  neg=20000)



Total rows: 36,554

Class balance:
  label_toxicity: 16,554 positive (45.3%)
  label_hate_racism: 2,729 positive (7.5%)
  label_harassment: 5,538 positive (15.2%)

Train: 31,070  |  Val: 5,484


In [None]:
# ── Export training CSV for Vertex AI ────────────────────────────────────────
# Save the full training DataFrame so Vertex AI can read it from GCS.
# This file is small (~15 MB) and is the only data the training job needs.
import json as _json

_csv_path = ROOT / "data" / "processed" / "training_data.csv"
# to_csv does not create missing directories; make sure data/processed/ exists
# so the export also works on a fresh checkout.
_csv_path.parent.mkdir(parents=True, exist_ok=True)

# Re-join the two splits so the export contains every row (train rows first,
# then validation rows); the training job is expected to do its own split.
train_df_export = pd.concat([train_df, val_df], ignore_index=True)
train_df_export.to_csv(_csv_path, index=False)

_size_mb = _csv_path.stat().st_size / 1024 / 1024
print(f"Exported {len(train_df_export):,} rows → {_csv_path}  ({_size_mb:.1f} MB)")
print("\nTo upload to GCS and submit Vertex AI job:")
print("  bash vertex_ai/upload_data.sh    # uploads data/processed/training_data.csv")
print("  bash vertex_ai/submit_training.sh  # submits GPU training job (~1 hour)")

## 2. Tokenizer & Dataset

In [4]:
# Fetch (or load from the local Hugging Face cache) the fast tokenizer that
# matches the pretrained checkpoint named in MODEL_NAME.
print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: DataFrame with a ``"text"`` column and one column per label.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"`` and must return a mapping containing
            ``"input_ids"`` and ``"attention_mask"`` with a leading batch
            dimension of 1.
        max_length: Fixed sequence length every item is padded/truncated to.
        label_cols: Names of the label columns to read from ``df``.
            Defaults to the notebook-level ``LABEL_COLS`` when omitted, which
            keeps existing three-argument call sites working unchanged.

    Each item is a dict with ``input_ids``, ``attention_mask`` (both with the
    batch dimension squeezed away) and ``labels`` (float32 vector).
    """

    def __init__(self, df, tokenizer, max_length, label_cols=None):
        if label_cols is None:
            # Backward-compatible default: the global label list.
            label_cols = LABEL_COLS
        self.texts  = df["text"].tolist()
        # float32 because BCEWithLogitsLoss expects floating-point targets.
        self.labels = df[list(label_cols)].values.astype("float32")
        self.tok    = tokenizer
        self.max_len = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize lazily, one text at a time; padding to max_length gives
        # every item the same shape so the default collate can stack them.
        enc = self.tok(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # The tokenizer returns shape (1, max_len); drop the batch axis.
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(self.labels[idx]),
        }

# Wrap both splits; tokenization happens lazily inside __getitem__.
train_ds = ToxicityDataset(train_df, tokenizer, MAX_LENGTH)
val_ds   = ToxicityDataset(val_df,   tokenizer, MAX_LENGTH)

# Only the training loader shuffles; validation keeps a fixed order.
# num_workers=0 loads batches in the main process.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# len(loader) is the number of batches per epoch, not the number of rows.
print(f"Train batches: {len(train_loader)}  |  Val batches: {len(val_loader)}")

Loading tokenizer: roberta-base


HTTP Request: HEAD https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json "HTTP/1.1 200 OK"
HTTP Request: GET https://huggingface.co/api/models/roberta-base/tree/main/additional_chat_templates?recursive=false&expand=false "HTTP/1.1 307 Temporary Redirect"
HTTP Request: GET https://huggingface.co/api/models/FacebookAI/roberta-base/tree/main/additional_chat_templates?recursive=false&expand=false "HTTP/1.1 404 Not Found"
HTTP Request: GET https://huggingface.co/api/models/roberta-base/tree/main?recursive=true&expand=false "HTTP/1.1 307 Temporary Redirect"
HTTP Request: GET https://huggingface.co/api/models/FacebookAI/roberta-base/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"


Train batches: 1942  |  Val batches: 343


## 3. Model — RoBERTa + Multi-Label Head + MC Dropout

In [5]:
class ToxicityClassifier(nn.Module):
    """
    Multi-label classification head on top of a pretrained RoBERTa encoder.

    Pipeline: encoder → hidden state of the first token (the [CLS]-style
    position) → dropout → linear layer → one raw logit per label.

    MC Dropout: to sample predictions with dropout still active, put the
    model in training mode (model.train()) while running inference.
    """
    def __init__(self, model_name: str, num_labels: int = 3, dropout: float = 0.3):
        super().__init__()
        # Submodules are registered in the order roberta → dropout → classifier.
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=dropout)
        # Head input width follows the encoder config (768 for roberta-base).
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis serves as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0]
        regularised = self.dropout(first_token)
        # No sigmoid here: callers receive raw logits.
        return self.classifier(regularised)

# Instantiate the classifier with its default head (3 labels, dropout 0.3)
# and move it to the selected device.
print(f"Loading model: {MODEL_NAME}")
model = ToxicityClassifier(MODEL_NAME).to(DEVICE)

# Class-weighted loss (positives are rare → up-weight them)
# Per-label weight = (#negatives / #positives) in the TRAINING split only;
# the 1e-6 term avoids a division by zero for a label with no positives.
pos_counts  = train_df[LABEL_COLS].sum().values
neg_counts  = len(train_df) - pos_counts
pos_weights = torch.tensor(neg_counts / (pos_counts + 1e-6), dtype=torch.float32).to(DEVICE)
print(f"pos_weights: {pos_weights.tolist()}")

# BCEWithLogitsLoss consumes raw logits, which is why the model's forward()
# does not apply a sigmoid. AdamW updates every parameter (encoder + head).
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)

HTTP Request: HEAD https://huggingface.co/roberta-base/resolve/main/config.json "HTTP/1.1 200 OK"
HTTP Request: HEAD https://huggingface.co/roberta-base/resolve/main/model.safetensors "HTTP/1.1 302 Found"


Loading model: roberta-base


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaModel LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


pos_weights: [1.2082444429397583, 12.409581184387207, 5.623321056365967]


## 4. Training Loop

In [6]:
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Sized iterable of dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        input_ids, attention_mask, targets = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), targets)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(loader)


@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without tracking gradients.

    Returns a dict holding ``val_loss`` (mean batch loss) plus, for every
    column in LABEL_COLS, ``f1_<name>`` (at a 0.5 probability threshold) and
    ``prauc_<name>`` (average precision), where ``<name>`` is the column name
    without its ``label_`` prefix. A label with no positive example in the
    evaluated data gets 0.0 for both metrics.
    """
    model.eval()
    running_loss = 0
    prob_chunks, label_chunks = [], []
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        running_loss += criterion(logits, targets).item()
        # Store probabilities (sigmoid of the logits), not the logits themselves.
        prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
        label_chunks.append(targets.cpu().numpy())

    probs = np.vstack(prob_chunks)
    labels = np.vstack(label_chunks)
    preds = (probs >= 0.5).astype(int)

    metrics = {"val_loss": running_loss / len(loader)}
    for idx, col in enumerate(LABEL_COLS):
        name = col.replace("label_", "")
        y_true = labels[:, idx]
        if y_true.sum() > 0:
            f1_value = f1_score(y_true, preds[:, idx], zero_division=0)
            prauc_value = average_precision_score(y_true, probs[:, idx])
        else:
            # No positives for this label: report zeros instead of scoring.
            f1_value, prauc_value = 0.0, 0.0
        metrics[f"f1_{name}"] = f1_value
        metrics[f"prauc_{name}"] = prauc_value
    return metrics


print("Starting training...\n")
best_f1 = 0.0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE)
    metrics    = evaluate(model, val_loader, criterion, DEVICE)

    # Model-selection metric: unweighted mean of the per-label F1 scores.
    avg_f1 = float(np.mean([metrics[f"f1_{c.replace('label_', '')}"]
                            for c in LABEL_COLS]))

    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"  train_loss={train_loss:.4f}  val_loss={metrics['val_loss']:.4f}")
    for col in LABEL_COLS:
        name = col.replace("label_", "")
        print(f"  {name:20s} F1={metrics[f'f1_{name}']:.3f}  PR-AUC={metrics[f'prauc_{name}']:.3f}")
    print(f"  avg_f1={avg_f1:.3f}")
    print()

    # Always checkpoint the first epoch: with a strict ">" against the 0.0
    # starting value, a run whose avg F1 stays at 0.0 would never write the
    # file, and the cells that load "roberta_toxicity_best.pt" afterwards
    # would fail or silently pick up weights from an older run.
    if epoch == 1 or avg_f1 > best_f1:
        best_f1 = avg_f1
        torch.save(model.state_dict(), MODEL_DIR / "roberta_toxicity_best.pt")
        print(f"  >> Saved best model (avg_f1={best_f1:.3f})")

print(f"Training complete. Best avg F1: {best_f1:.3f}")

Starting training...



KeyboardInterrupt: 

## 5. Save Full Model & Tokenizer

In [None]:
import json

# Restore the best checkpoint into the in-memory model and store the
# tokenizer files next to it.
best_state = torch.load(MODEL_DIR / "roberta_toxicity_best.pt", map_location=DEVICE)
model.load_state_dict(best_state)
tokenizer.save_pretrained(MODEL_DIR / "tokenizer")

# Metadata describing this run.
meta = {
    "model_name": MODEL_NAME,
    "label_cols": LABEL_COLS,
    "max_length": MAX_LENGTH,
    "best_avg_f1": round(best_f1, 4),
    "quick_test": QUICK_TEST,
}
# Serialise once; the same text is written to disk and echoed below.
meta_json = json.dumps(meta, indent=2)
with open(MODEL_DIR / "model_meta.json", "w") as f:
    f.write(meta_json)

print("Saved:")
print(f"  {MODEL_DIR}/roberta_toxicity_best.pt")
print(f"  {MODEL_DIR}/tokenizer/")
print(f"  {MODEL_DIR}/model_meta.json")
print()
print(meta_json)

Saved:
  C:\Users\owner\Downloads\data_scientist_porfolio\YoutubeCommentSection\models/roberta_toxicity_best.pt
  C:\Users\owner\Downloads\data_scientist_porfolio\YoutubeCommentSection\models/tokenizer/
  C:\Users\owner\Downloads\data_scientist_porfolio\YoutubeCommentSection\models/model_meta.json

{
  "model_name": "roberta-base",
  "label_cols": [
    "label_toxicity",
    "label_hate_racism",
    "label_harassment"
  ],
  "max_length": 128,
  "best_avg_f1": 0.5781,
  "quick_test": true
}


## 5b. Calibration Metrics — Brier Score + ECE

In [None]:
def brier_score_multilabel(probs, labels, label_cols):
    """Return the Brier score (mean squared probability error) for each label.

    Column ``i`` of ``probs`` and ``labels`` belongs to ``label_cols[i]``.
    Result keys are ``brier_<name>``, with ``<name>`` being the column name
    stripped of ``label_``; values are plain Python floats. Lower is better.
    """
    scores = {}
    for idx, col in enumerate(label_cols):
        squared_error = (probs[:, idx] - labels[:, idx]) ** 2
        scores[f"brier_{col.replace('label_', '')}"] = float(np.mean(squared_error))
    return scores


def expected_calibration_error(probs, labels, label_cols, n_bins=10):
    """Per-label Expected Calibration Error over equal-width probability bins.

    For each label the probabilities are bucketed into ``n_bins`` bins on
    [0, 1]; ECE is the sum over non-empty bins of
    ``(bin size / N) * |mean predicted probability - observed positive rate|``.
    Lower is better.

    Args:
        probs: 2-D array of predicted probabilities, one column per label.
        labels: 2-D array of 0/1 targets with the same layout as ``probs``.
        label_cols: Column names; ``label_`` is stripped to build the keys.
        n_bins: Number of equal-width bins.

    Returns:
        Dict ``{"ece_<name>": value}`` with values rounded to 4 decimals.
        Empty input yields 0.0 for every label.
    """
    ece = {}
    edges = np.linspace(0, 1, n_bins + 1)
    last_bin = n_bins - 1
    for i, col in enumerate(label_cols):
        name = col.replace("label_", "")
        p, y = probs[:, i], labels[:, i]
        total = 0.0
        for b, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
            # Bins are [lo, hi) except the last one, which is closed on the
            # right so that probabilities of exactly 1.0 are counted rather
            # than silently dropped.
            upper = (p <= hi) if b == last_bin else (p < hi)
            in_bin = (p >= lo) & upper
            count = in_bin.sum()
            if count == 0:
                continue
            total += count / len(p) * abs(p[in_bin].mean() - y[in_bin].mean())
        ece[f"ece_{name}"] = round(float(total), 4)
    return ece


# Run on validation set using best saved weights
model.load_state_dict(torch.load(MODEL_DIR / "roberta_toxicity_best.pt", map_location=DEVICE))
# eval() switches dropout off, so the probabilities below are deterministic.
model.eval()

# NOTE: despite its name, val_logits_list collects PROBABILITIES (sigmoid of
# the logits), one array per batch.
val_logits_list, val_labels_list = [], []
with torch.no_grad():
    for batch in val_loader:
        ids   = batch["input_ids"].to(DEVICE)
        mask  = batch["attention_mask"].to(DEVICE)
        logits = model(ids, mask)
        val_logits_list.append(torch.sigmoid(logits).cpu().numpy())
        # Labels never leave the CPU, so no device round-trip is needed.
        val_labels_list.append(batch["labels"].numpy())

# Stack the per-batch arrays into (n_val_rows, n_labels) matrices.
val_probs  = np.vstack(val_logits_list)
val_labels = np.vstack(val_labels_list)

brier = brier_score_multilabel(val_probs, val_labels, LABEL_COLS)
ece   = expected_calibration_error(val_probs, val_labels, LABEL_COLS)

print("Calibration Metrics (Validation Set):")
print("\nBrier Score (lower = better, 0 = perfect):")
for k, v in brier.items():
    print(f"  {k}: {v:.4f}")
print("\nExpected Calibration Error (lower = better, 0 = perfectly calibrated):")
for k, v in ece.items():
    print(f"  {k}: {v:.4f}")

# Persist into model_meta.json
# Read-modify-write: the file must already exist (it is written by the
# "Save Full Model & Tokenizer" cell); existing keys are kept and the
# brier_* / ece_* entries are added or overwritten.
import json
with open(MODEL_DIR / "model_meta.json") as f:
    meta = json.load(f)
meta.update(brier)
meta.update(ece)
with open(MODEL_DIR / "model_meta.json", "w") as f:
    json.dump(meta, f, indent=2)
print("\nUpdated model_meta.json with calibration metrics.")
# Echo only the calibration entries.
print(json.dumps({k: v for k, v in meta.items() if k.startswith(("brier", "ece"))}, indent=2))

Calibration Metrics (Validation Set):

Brier Score (lower = better, 0 = perfect):
  brier_toxicity: 0.0457
  brier_hate_racism: 0.2265
  brier_harassment: 0.1996

Expected Calibration Error (lower = better, 0 = perfectly calibrated):
  ece_toxicity: 0.0587
  ece_hate_racism: 0.2624
  ece_harassment: 0.2187

Updated model_meta.json with calibration metrics.
{
  "brier_toxicity": 0.04568375647068024,
  "brier_hate_racism": 0.22646364569664001,
  "brier_harassment": 0.19959355890750885,
  "ece_toxicity": 0.0587,
  "ece_hate_racism": 0.2624,
  "ece_harassment": 0.2187
}


## 6. Quick Inference Test

In [None]:
def predict(texts: list[str], threshold: float = 0.5) -> pd.DataFrame:
    """Score raw comments with the notebook's global model and tokenizer.

    Args:
        texts: Comments to classify.
        threshold: Probability at or above which a label is flagged.

    Returns:
        DataFrame with a ``text`` column, one probability column per entry in
        LABEL_COLS, and a matching 0/1 ``flag_*`` column for each label.
    """
    model.eval()
    # padding=True pads only up to the longest text in this batch.
    encoded = tokenizer(
        texts,
        max_length=MAX_LENGTH,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(DEVICE)
    attention_mask = encoded["attention_mask"].to(DEVICE)
    with torch.no_grad():
        scores = torch.sigmoid(model(input_ids, attention_mask)).cpu().numpy()

    frame = pd.DataFrame(scores, columns=LABEL_COLS)
    frame.insert(0, "text", texts)
    for col in LABEL_COLS:
        flag_col = col.replace("label_", "flag_")
        frame[flag_col] = (frame[col] >= threshold).astype(int)
    return frame


# Hand-written sanity inputs: two benign comments and three abusive ones.
test_comments = [
    "This is a great video, thanks for sharing!",
    "You are an absolute idiot and should shut up",
    "I hate people like you, get out of this country",
    "Great explanation, very helpful for my project.",
    "Kill yourself you worthless piece of garbage",
]

results = predict(test_comments)
# Display tweaks only: clip long text cells and show three decimals.
pd.set_option("display.max_colwidth", 50)
pd.set_option("display.float_format", "{:.3f}".format)
# Bare expression on the last line so the notebook renders the table.
results

Unnamed: 0,text,label_toxicity,label_hate_racism,label_harassment,flag_toxicity,flag_hate_racism,flag_harassment
0,"This is a great video, thanks for sharing!",0.005,0.005,0.005,0,0,0
1,You are an absolute idiot and should shut up,0.005,0.005,0.005,0,0,0
2,"I hate people like you, get out of this country",0.005,0.004,0.005,0,0,0
3,"Great explanation, very helpful for my project.",0.004,0.004,0.005,0,0,0
4,Kill yourself you worthless piece of garbage,0.004,0.004,0.005,0,0,0


## Next Steps

**If quick test passed:**
1. Re-run with `QUICK_TEST = False` on **Vertex AI or Google Colab** (GPU) for full training
2. **Notebook 04** — run the trained model on `comments_clean.parquet` to generate silver labels
3. MC Dropout inference: call `model.train()` during inference, run T=10 forward passes → epistemic uncertainty

**To run on Colab (free GPU):**
```python
# In Colab:
!git clone https://github.com/dehiska/YoutubeCommentSection
# Upload kaggle datasets to Colab or mount Drive
# Set QUICK_TEST = False, run all cells
```