# Task 1A — Fine-tuning BERT for **GoEmotions** (Multi-label Text Classification) — Full Run

This notebook fine-tunes a BERT-family encoder model for **multi-label emotion classification** on:
- Dataset: `google-research-datasets/go_emotions` (config: `raw`)
- Model: `bert-base-uncased`

It is designed for **Google Colab** and saves outputs under your Google Drive folder.

**Recommended repo:** `finetuning-bert-text-classification`  
**Notebook path:** `notebooks/02_finetune_bert_go_emotions_fullrun_drive.ipynb`


## 0) Mount Google Drive (required)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1) Project directory on Drive (required)

In [2]:
# Required path (as requested)
PROJECT_DIR = "/content/drive/MyDrive/finetuning-bert-text-classification"

In [3]:
from pathlib import Path

PROJECT_PATH = Path(PROJECT_DIR)
REPORTS_DIR = PROJECT_PATH / "reports"
NOTEBOOKS_DIR = PROJECT_PATH / "notebooks"
MODELS_DIR = PROJECT_PATH / "models"
OUTPUTS_DIR = PROJECT_PATH / "outputs"

for d in [REPORTS_DIR, NOTEBOOKS_DIR, MODELS_DIR, OUTPUTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("PROJECT_PATH:", PROJECT_PATH)

PROJECT_PATH: /content/drive/MyDrive/finetuning-bert-text-classification


## 2) Install dependencies

In [4]:
!pip -q install -U transformers datasets evaluate accelerate scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/512.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m165.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## 3) Imports & reproducibility

In [5]:
import os, random
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)
from sklearn.metrics import f1_score, precision_score, recall_score

SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
print("Torch:", torch.__version__)

Device: cuda
Torch: 2.9.0+cu126


## 4) Configuration (FULL RUN)

In [6]:
MODEL_CHECKPOINT = "bert-base-uncased"
DATASET_NAME = "google-research-datasets/go_emotions"
DATASET_CONFIG = "raw"

MAX_LENGTH = 128
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
WEIGHT_DECAY = 0.01

OUTPUT_DIR = str(OUTPUTS_DIR / "bert_go_emotions")

print("MODEL_CHECKPOINT:", MODEL_CHECKPOINT)
print("DATASET:", DATASET_NAME, DATASET_CONFIG)

MODEL_CHECKPOINT: bert-base-uncased
DATASET: google-research-datasets/go_emotions raw


## 5) Load dataset

In [7]:
ds = load_dataset(DATASET_NAME, DATASET_CONFIG)
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

raw/train-00000-of-00001.parquet:   0%|          | 0.00/24.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})

### 5.1 Inspect columns

In [8]:
print(ds)
print("Train columns:", ds["train"].column_names)
pd.DataFrame(ds["train"][:3])

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})
Train columns: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1


## 6) Label setup (multi-label -> multi-hot vectors)

In [19]:
# GoEmotions uses a list of label ids per example.
# We'll convert it to a multi-hot vector of length num_labels.

# Identify the label columns by inspecting the dataset's column names.
# Based on the output of ds["train"].column_names in the previous cell,
# the emotion labels start from the 10th column (index 9).
all_dataset_columns = ds["train"].column_names
# Filter out non-label columns (metadata and text related)
non_label_columns = ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear']
label_names = [col for col in all_dataset_columns if col not in non_label_columns]
num_labels = len(label_names)

print("num_labels:", num_labels)
print("First 10 labels:", label_names[:10])

def to_multi_hot(batch):
    multi_hot_vectors = []
    # Iterate through each example in the batch
    for i in range(len(batch['text'])):
        vec = [0.0] * num_labels  # Initialize with floats
        for j, label_name in enumerate(label_names):
            # For each label_name, check its value (0 or 1) for the current example
            if batch[label_name][i] == 1:
                vec[j] = 1.0  # Assign float
        multi_hot_vectors.append(vec)
    return {"labels": multi_hot_vectors}

# Split the original 'train' dataset into train, validation, and test sets
# First, split into train and a temporary validation/test set (80/20 split)
train_temp_split = ds["train"].train_test_split(test_size=0.2, seed=SEED)
train_dataset = train_temp_split["train"]

# Then, split the temporary set into validation and test sets (50/50 split of the 20% temp)
val_test_split = train_temp_split["test"].train_test_split(test_size=0.5, seed=SEED)
validation_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]

splits = DatasetDict(
    train=train_dataset,
    validation=validation_dataset,
    test=test_dataset,
)

splits = splits.map(to_multi_hot, batched=True)

print("Example label count (first row):", sum(splits["train"][0]["labels"]))


num_labels: 28
First 10 labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment']


Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/21122 [00:00<?, ? examples/s]

Map:   0%|          | 0/21123 [00:00<?, ? examples/s]

Example label count (first row): 1.0


## 7) Tokenization

In [20]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

# Remove raw columns to save memory (keep 'labels')
remove_cols = [c for c in ["text", "id"] if c in splits["train"].column_names]
tokenized = splits.map(tokenize_batch, batched=True, remove_columns=remove_cols)

print("Tokenized columns:", tokenized["train"].column_names)
print("Sizes:", {k: len(v) for k, v in tokenized.items()})

Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/21122 [00:00<?, ? examples/s]

Map:   0%|          | 0/21123 [00:00<?, ? examples/s]

Tokenized columns: ['author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']
Sizes: {'train': 168980, 'validation': 21122, 'test': 21123}


## 8) Data collator

In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 9) Metrics (Micro-F1, Macro-F1, Precision, Recall)

In [22]:
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)

    y_true = labels.astype(int)
    y_pred = preds.astype(int)

    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    micro_p = precision_score(y_true, y_pred, average="micro", zero_division=0)
    micro_r = recall_score(y_true, y_pred, average="micro", zero_division=0)

    return {
        "f1_micro": micro_f1,
        "f1_macro": macro_f1,
        "precision_micro": micro_p,
        "recall_micro": micro_r,
    }

## 10) Load model (multi-label classification)

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    problem_type="multi_label_classification",
).to(device)

print("Loaded model with num_labels:", model.config.num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model with num_labels: 28


## 11) TrainingArguments + Trainer (version-compatible)

In [24]:
import inspect
from transformers import TrainingArguments, Trainer

use_fp16 = torch.cuda.is_available()

ta_kwargs = dict(
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    fp16=use_fp16,
    report_to="none",
    seed=SEED,
)

ta_params = inspect.signature(TrainingArguments.__init__).parameters
if "eval_strategy" in ta_params:
    ta_kwargs["eval_strategy"] = "epoch"
else:
    ta_kwargs["evaluation_strategy"] = "epoch"

training_args = TrainingArguments(**ta_kwargs)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_params = inspect.signature(Trainer.__init__).parameters
if "processing_class" in trainer_params:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)

training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False,

## 12) Train

In [25]:
train_result = trainer.train()
train_result

Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro
1,0.113,0.111906,0.323583,0.227581,0.648985,0.215521
2,0.1076,0.109747,0.355855,0.252513,0.632431,0.247582
3,0.1001,0.110476,0.376015,0.286707,0.594756,0.274909


TrainOutput(global_step=31686, training_loss=0.11130163229106992, metrics={'train_runtime': 2172.2692, 'train_samples_per_second': 233.369, 'train_steps_per_second': 14.587, 'total_flos': 8950232767844352.0, 'train_loss': 0.11130163229106992, 'epoch': 3.0})

## 13) Evaluate (Validation + Test)

In [26]:
val_metrics = trainer.evaluate(eval_dataset=tokenized["validation"])
test_metrics = trainer.evaluate(eval_dataset=tokenized["test"])

print("Validation metrics:", val_metrics)
print("Test metrics:", test_metrics)

Validation metrics: {'eval_loss': 0.11047551780939102, 'eval_f1_micro': 0.37601536772777167, 'eval_f1_macro': 0.28670724907584694, 'eval_precision_micro': 0.5947564892785832, 'eval_recall_micro': 0.2749087115284298, 'eval_runtime': 13.4978, 'eval_samples_per_second': 1564.843, 'eval_steps_per_second': 48.971, 'epoch': 3.0}
Test metrics: {'eval_loss': 0.1108538955450058, 'eval_f1_micro': 0.37708974923009236, 'eval_f1_macro': 0.28463548166023117, 'eval_precision_micro': 0.5989169359769412, 'eval_recall_micro': 0.2751715558409246, 'eval_runtime': 13.8403, 'eval_samples_per_second': 1526.19, 'eval_steps_per_second': 47.759, 'epoch': 3.0}


## 14) Save model + tokenizer

In [27]:
save_path = MODELS_DIR / "bert_go_emotions_best"
save_path.mkdir(parents=True, exist_ok=True)

trainer.save_model(str(save_path))
tokenizer.save_pretrained(str(save_path))

print("Saved to:", save_path)

Saved to: /content/drive/MyDrive/finetuning-bert-text-classification/models/bert_go_emotions_best


## 15) Write report to `reports/summary_go_emotions.md`

In [28]:
from datetime import datetime

report_path = REPORTS_DIR / "summary_go_emotions.md"
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

val_loss = val_metrics.get("eval_loss", None)
val_f1_micro = val_metrics.get("eval_f1_micro", None)
val_f1_macro = val_metrics.get("eval_f1_macro", None)
val_p_micro = val_metrics.get("eval_precision_micro", None)
val_r_micro = val_metrics.get("eval_recall_micro", None)

test_loss = test_metrics.get("eval_loss", None)
test_f1_micro = test_metrics.get("eval_f1_micro", None)
test_f1_macro = test_metrics.get("eval_f1_macro", None)
test_p_micro = test_metrics.get("eval_precision_micro", None)
test_r_micro = test_metrics.get("eval_recall_micro", None)

report_text = f"""# Task 1A — Fine-tuning BERT for GoEmotions (Multi-label)

Generated: {now}

## Setup
- Dataset: {DATASET_NAME} ({DATASET_CONFIG})
- Model: {MODEL_CHECKPOINT}
- Max length: {MAX_LENGTH}
- Epochs: {NUM_EPOCHS}
- Learning rate: {LEARNING_RATE}
- Train batch size: {TRAIN_BATCH_SIZE}
- Eval batch size: {EVAL_BATCH_SIZE}
- Weight decay: {WEIGHT_DECAY}

## Data sizes used
- Train: {len(tokenized["train"])}
- Validation: {len(tokenized["validation"])}
- Test: {len(tokenized["test"])}

## Results
| Split | Loss | Micro-F1 | Macro-F1 | Micro-Precision | Micro-Recall |
|------|------|----------|----------|-----------------|--------------|
| Validation | {val_loss:.4f} | {val_f1_micro:.4f} | {val_f1_macro:.4f} | {val_p_micro:.4f} | {val_r_micro:.4f} |
| Test | {test_loss:.4f} | {test_f1_micro:.4f} | {test_f1_macro:.4f} | {test_p_micro:.4f} | {test_r_micro:.4f} |
"""

report_path.write_text(report_text, encoding="utf-8")
print("Wrote:", report_path)

Wrote: /content/drive/MyDrive/finetuning-bert-text-classification/reports/summary_go_emotions.md


In [29]:
print("Train:", len(tokenized["train"]))
print("Validation:", len(tokenized["validation"]))
print("Test:", len(tokenized["test"]))


Train: 168980
Validation: 21122
Test: 21123


In [30]:
print(val_metrics)
print(test_metrics)


{'eval_loss': 0.11047551780939102, 'eval_f1_micro': 0.37601536772777167, 'eval_f1_macro': 0.28670724907584694, 'eval_precision_micro': 0.5947564892785832, 'eval_recall_micro': 0.2749087115284298, 'eval_runtime': 13.4978, 'eval_samples_per_second': 1564.843, 'eval_steps_per_second': 48.971, 'epoch': 3.0}
{'eval_loss': 0.1108538955450058, 'eval_f1_micro': 0.37708974923009236, 'eval_f1_macro': 0.28463548166023117, 'eval_precision_micro': 0.5989169359769412, 'eval_recall_micro': 0.2751715558409246, 'eval_runtime': 13.8403, 'eval_samples_per_second': 1526.19, 'eval_steps_per_second': 47.759, 'epoch': 3.0}
