In [None]:
! pip install transformers datasets accelerate evaluate optuna

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna, evaluate
Successfully installed colorlog-6.10.1 evaluate-0.4.6 optuna-4.6.0


In [None]:
# Log in to the Hugging Face Hub. train_model() below builds its Trainer with
# push_to_hub=True and finishes with trainer.push_to_hub(), so credentials
# must be available before training starts.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
import transformers
import pandas as pd
from datasets import load_dataset
import evaluate # load_metric from datasets is removed
import numpy as np
import random
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import set_seed
import torch

In [None]:
def set_all_seeds(seed: int = 42):
    """Seed every random number generator this notebook touches.

    Args:
        seed: value handed to Python's ``random``, NumPy, PyTorch (CPU and,
            when CUDA is available, every GPU) and the Hugging Face
            ``set_seed`` helper.
    """
    # Python, NumPy and PyTorch CPU generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # GPU generators only exist when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Hugging Face helper, called last.
    set_seed(seed)

In [None]:
# Fix all RNG seeds once, before any data loading or model initialisation.
set_all_seeds(seed=42)

In [None]:
# GoEmotions train/validation/test splits, read from CSV files in the working directory.
# Source of the CSVs: https://github.com/inflaton/Explainable-Sentiment-Analysis-with-DeepSeek-R1/tree/main/dataset
data = load_dataset(
    path="csv",
    data_files=dict(
        [
            ("train", "GoEmotions-train.csv"),
            ("validation", "GoEmotions-val.csv"),
            ("test", "GoEmotions-test.csv"),
        ]
    ),
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
def train_model(model_checkpoint="distilbert-base-uncased",
                dataset=data,
                n_hpo_trials=0,
                batch_size=16,
                learning_rate=2e-5,
                num_train_epochs=5):
    """Fine-tune a sequence classifier on an emotion dataset and push it to the Hub.

    The dataset's "Text" column is tokenized and its "Emotion" column is
    encoded to integer labels (label set taken from the "train" split). The
    model is trained on the "train" split and evaluated on the "validation"
    split after every epoch; the checkpoint with the best weighted F1 is
    reloaded at the end and pushed to a private Hub repository named
    ``<model_name>-finetuned-GoEmotions``.

    If ``n_hpo_trials > 0``, a hyperparameter search is run first (in a
    separate ``...-hpo`` output directory that is never pushed); the best
    hyperparameters are then applied to one final training run whose result
    is pushed.

    Args:
        model_checkpoint: one of the Hugging Face model checkpoints.
        dataset: a dataset object, e.g. from ``load_dataset``, with "train"
            and "validation" splits and "Text" / "Emotion" columns. The
            default is the module-level ``data`` object as it existed when
            this function was defined.
        n_hpo_trials: number of trials in the hyperparameter optimization;
            0 (or less) skips the search.
        batch_size: per-device train/eval batch size before hyperparameter tuning.
        learning_rate: learning rate before hyperparameter tuning.
        num_train_epochs: number of training epochs before hyperparameter tuning.

    Returns:
        None. The trained model is pushed to the Hub as a side effect.
    """
    import inspect  # function-scope import: only used for the Trainer signature probe below

    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    sentence1_key = "Text"
    metric_name = "f1_weighted"
    model_name = model_checkpoint.split("/")[-1]

    # 1. Label maps (sorted so the label -> id assignment is stable across runs)
    labels = sorted(set(dataset["train"]["Emotion"]))
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for label, i in label2id.items()}
    num_labels = len(labels)

    # 2. Encode string labels to ints
    def encode_labels(example):
        example["labels"] = label2id[example["Emotion"]]
        return example

    dataset = dataset.map(encode_labels)

    # 3. Tokenization
    def preprocess_function(examples):
        return tokenizer(examples[sentence1_key], truncation=True)

    encoded_dataset = dataset.map(preprocess_function, batched=True)

    def load_model():
        # Single place that builds the classifier, so the final model and every
        # HPO trial model carry the same label maps in their config.
        return AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
        )

    def build_args(output_dir, **overrides):
        # Settings shared by the final run and the HPO trials; `overrides`
        # carries the few options that differ between the two.
        settings = dict(
            eval_strategy="epoch",
            save_strategy="epoch",  # determines how often a model can be pushed
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model=metric_name,
            # Mixed precision needs a CUDA device; fall back to fp32 elsewhere
            # instead of failing when the arguments are built.
            fp16=torch.cuda.is_available(),
            report_to="none",
        )
        settings.update(overrides)
        return TrainingArguments(output_dir, **settings)

    def compute_metrics(eval_pred):
        logits, references = eval_pred
        predictions = np.argmax(logits, axis=1)
        macro = {"predictions": predictions, "references": references, "average": "macro"}

        return {
            "accuracy": accuracy.compute(predictions=predictions, references=references)["accuracy"],
            "f1_macro": f1.compute(**macro)["f1"],
            "f1_weighted": f1.compute(
                predictions=predictions, references=references, average="weighted"
            )["f1"],
            "recall_macro": recall.compute(**macro)["recall"],
            "precision_macro": precision.compute(**macro)["precision"],
        }

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate `tokenizer`; use whichever keyword the installed Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    def build_trainer(training_args, **model_kwargs):
        # `model_kwargs` is either model=<instance> or model_init=<callable>.
        return Trainer(
            args=training_args,
            train_dataset=encoded_dataset["train"],
            eval_dataset=encoded_dataset["validation"],
            compute_metrics=compute_metrics,
            **{tokenizer_kwarg: tokenizer},
            **model_kwargs,
        )

    model = load_model()

    args = build_args(
        f"{model_name}-finetuned-GoEmotions",
        push_to_hub=True,  # pushing to hugging face hub automatically is very helpful here
        hub_private_repo=True,
        logging_steps=100,
        disable_tqdm=False,
    )

    trainer = build_trainer(args, model=model)

    if n_hpo_trials > 0:
        args_hpo = build_args(
            f"{model_name}-finetuned-GoEmotions-hpo",  # separate dir
            push_to_hub=False,
            logging_strategy="no",
            # disabling progress bar - seems to be an occasional rare bug in the
            # progress bar where it divides by None
            disable_tqdm=True,
            # Every trial checkpoints each epoch; cap what is kept so the search
            # does not fill the disk (the best checkpoint is still retained for
            # load_best_model_at_end).
            save_total_limit=1,
        )

        # model_init gives every trial a freshly initialised model.
        hpo_trainer = build_trainer(args_hpo, model_init=load_model)

        def hp_space(trial):
            return {
                # BERT LR range, log scale
                "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
                "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 8),
                # Batch sizes
                "per_device_train_batch_size": trial.suggest_categorical(
                    "per_device_train_batch_size", [8, 16, 32]
                ),
                # weight decay range for AdamW
                "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
                # Warmup fraction of total steps
                "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.2),
            }

        def compute_objective(metrics):
            # Same metric that selects the best checkpoint, with the Trainer's "eval_" prefix.
            return metrics[f"eval_{metric_name}"]

        best_run = hpo_trainer.hyperparameter_search(
            n_trials=n_hpo_trials,
            direction="maximize",
            hp_space=hp_space,
            compute_objective=compute_objective,
        )

        # Rebuild the pushing trainer after the search, then overwrite its
        # arguments with the best hyperparameters before the single final run.
        trainer = build_trainer(args, model=model)
        for name, value in best_run.hyperparameters.items():
            setattr(trainer.args, name, value)

    trainer.train()
    trainer.push_to_hub()  # making SURE it pushes to hub
    return None











In [None]:
%%time
train_model(model_checkpoint="distilbert-base-uncased", n_hpo_trials = 10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/23485 [00:00<?, ? examples/s]

Map:   0%|          | 0/2956 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/23485 [00:00<?, ? examples/s]

Map:   0%|          | 0/2956 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-12-01 23:06:56,302] A new study created in memory with name: no-name-e9acfc71-52f9-4f3b-9ed4-e0cfb6130ebe
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newl

{'eval_loss': 1.5990452766418457, 'eval_accuracy': 0.5798376184032477, 'eval_f1_macro': 0.37937689230837746, 'eval_f1_weighted': 0.5429733062100628, 'eval_recall_macro': 0.3880330777408326, 'eval_precision_macro': 0.44169229157814516, 'eval_runtime': 1.6587, 'eval_samples_per_second': 1782.1, 'eval_steps_per_second': 111.532, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2863004207611084, 'eval_accuracy': 0.6461434370771313, 'eval_f1_macro': 0.5100316896126897, 'eval_f1_weighted': 0.6315718052657252, 'eval_recall_macro': 0.5119626240608945, 'eval_precision_macro': 0.5298536289695079, 'eval_runtime': 1.6376, 'eval_samples_per_second': 1805.056, 'eval_steps_per_second': 112.969, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2536022663116455, 'eval_accuracy': 0.6468200270635994, 'eval_f1_macro': 0.5189034114806186, 'eval_f1_weighted': 0.6366453002050094, 'eval_recall_macro': 0.5182631085372589, 'eval_precision_macro': 0.5290071922073604, 'eval_runtime': 1.6442, 'eval_samples_per_second': 1797.822, 'eval_steps_per_second': 112.516, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2776684761047363, 'eval_accuracy': 0.6447902571041949, 'eval_f1_macro': 0.5281333891018024, 'eval_f1_weighted': 0.6390934534593704, 'eval_recall_macro': 0.5273868956860522, 'eval_precision_macro': 0.5593367262216332, 'eval_runtime': 1.641, 'eval_samples_per_second': 1801.36, 'eval_steps_per_second': 112.737, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2995392084121704, 'eval_accuracy': 0.6451285520974289, 'eval_f1_macro': 0.5278171541407775, 'eval_f1_weighted': 0.6379461779808838, 'eval_recall_macro': 0.5250567413394345, 'eval_precision_macro': 0.5608433699941864, 'eval_runtime': 1.6744, 'eval_samples_per_second': 1765.402, 'eval_steps_per_second': 110.487, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3098821640014648, 'eval_accuracy': 0.6481732070365359, 'eval_f1_macro': 0.5280646682889838, 'eval_f1_weighted': 0.6414078658888422, 'eval_recall_macro': 0.527414521183826, 'eval_precision_macro': 0.5423471217233298, 'eval_runtime': 1.6322, 'eval_samples_per_second': 1811.057, 'eval_steps_per_second': 113.344, 'epoch': 6.0}
{'train_runtime': 183.4512, 'train_samples_per_second': 768.106, 'train_steps_per_second': 24.006, 'train_loss': 1.2656775534315394, 'epoch': 6.0}


[I 2025-12-01 23:10:01,258] Trial 0 finished with value: 0.6414078658888422 and parameters: {'learning_rate': 1.909749750147754e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 32, 'weight_decay': 0.05453030151954725, 'warmup_ratio': 0.17922054085114544}. Best is trial 0 with value: 0.6414078658888422.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2698469161987305, 'eval_accuracy': 0.6407307171853857, 'eval_f1_macro': 0.504865208269642, 'eval_f1_weighted': 0.6281578068816297, 'eval_recall_macro': 0.5040305887133111, 'eval_precision_macro': 0.5200483869471598, 'eval_runtime': 1.6331, 'eval_samples_per_second': 1810.079, 'eval_steps_per_second': 113.283, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2293998003005981, 'eval_accuracy': 0.6498646820027063, 'eval_f1_macro': 0.5184056290389608, 'eval_f1_weighted': 0.6399454136285933, 'eval_recall_macro': 0.5215491422289512, 'eval_precision_macro': 0.5251926137802476, 'eval_runtime': 1.6321, 'eval_samples_per_second': 1811.178, 'eval_steps_per_second': 113.352, 'epoch': 2.0}
{'train_runtime': 111.2818, 'train_samples_per_second': 422.082, 'train_steps_per_second': 26.383, 'train_loss': 1.4594500383174387, 'epoch': 2.0}


[I 2025-12-01 23:11:54,041] Trial 1 finished with value: 0.6399454136285933 and parameters: {'learning_rate': 3.558897011356488e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.13715570066156374, 'warmup_ratio': 0.1358590183247894}. Best is trial 0 with value: 0.6414078658888422.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2749245166778564, 'eval_accuracy': 0.6458051420838972, 'eval_f1_macro': 0.5135608637604699, 'eval_f1_weighted': 0.6358157372922403, 'eval_recall_macro': 0.5157507989041025, 'eval_precision_macro': 0.5263672903665758, 'eval_runtime': 1.6343, 'eval_samples_per_second': 1808.775, 'eval_steps_per_second': 113.201, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2453659772872925, 'eval_accuracy': 0.6471583220568335, 'eval_f1_macro': 0.5256454737117955, 'eval_f1_weighted': 0.6388915608282781, 'eval_recall_macro': 0.5270700268232701, 'eval_precision_macro': 0.5362866351458591, 'eval_runtime': 1.6264, 'eval_samples_per_second': 1817.474, 'eval_steps_per_second': 113.746, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4152772426605225, 'eval_accuracy': 0.6370094722598105, 'eval_f1_macro': 0.5406315167922998, 'eval_f1_weighted': 0.6350197433691696, 'eval_recall_macro': 0.5248749726096906, 'eval_precision_macro': 0.5870189655787713, 'eval_runtime': 1.6767, 'eval_samples_per_second': 1762.951, 'eval_steps_per_second': 110.334, 'epoch': 3.0}
{'eval_loss': 1.525698184967041, 'eval_accuracy': 0.6359945872801083, 'eval_f1_macro': 0.5469695018290364, 'eval_f1_weighted': 0.6322675085326991, 'eval_recall_macro': 0.5400445672934884, 'eval_precision_macro': 0.5820107644915543, 'eval_runtime': 1.6085, 'eval_samples_per_second': 1837.698, 'eval_steps_per_second': 115.012, 'epoch': 4.0}
{'train_runtime': 433.4325, 'train_samples_per_second': 216.735, 'train_steps_per_second': 27.095, 'train_loss': 0.9986994688777248, 'epoch': 4.0}


[I 2025-12-01 23:19:08,953] Trial 2 finished with value: 0.6322675085326991 and parameters: {'learning_rate': 4.281649131801574e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'weight_decay': 0.2281235947935052, 'warmup_ratio': 0.055616653234893024}. Best is trial 0 with value: 0.6414078658888422.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.411995768547058, 'eval_accuracy': 0.6200947225981055, 'eval_f1_macro': 0.46470086506893477, 'eval_f1_weighted': 0.6057653183018481, 'eval_recall_macro': 0.4583426535310939, 'eval_precision_macro': 0.4980080750873966, 'eval_runtime': 1.646, 'eval_samples_per_second': 1795.873, 'eval_steps_per_second': 112.394, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2359739542007446, 'eval_accuracy': 0.6495263870094723, 'eval_f1_macro': 0.5127356365771614, 'eval_f1_weighted': 0.6366330644850162, 'eval_recall_macro': 0.5178041008689059, 'eval_precision_macro': 0.5218418893174638, 'eval_runtime': 1.6307, 'eval_samples_per_second': 1812.697, 'eval_steps_per_second': 113.447, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2570393085479736, 'eval_accuracy': 0.6505412719891746, 'eval_f1_macro': 0.5291807168649649, 'eval_f1_weighted': 0.6450587257861079, 'eval_recall_macro': 0.5189275499723409, 'eval_precision_macro': 0.5726496986846655, 'eval_runtime': 1.6388, 'eval_samples_per_second': 1803.783, 'eval_steps_per_second': 112.889, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3311147689819336, 'eval_accuracy': 0.6441136671177267, 'eval_f1_macro': 0.5370152397031942, 'eval_f1_weighted': 0.6410641051459042, 'eval_recall_macro': 0.5330164512772201, 'eval_precision_macro': 0.5459200058382011, 'eval_runtime': 1.6425, 'eval_samples_per_second': 1799.643, 'eval_steps_per_second': 112.63, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4441347122192383, 'eval_accuracy': 0.6403924221921515, 'eval_f1_macro': 0.5310952878497683, 'eval_f1_weighted': 0.63525432817284, 'eval_recall_macro': 0.5274536492418979, 'eval_precision_macro': 0.5396577377420606, 'eval_runtime': 1.6072, 'eval_samples_per_second': 1839.22, 'eval_steps_per_second': 115.107, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4909889698028564, 'eval_accuracy': 0.6393775372124493, 'eval_f1_macro': 0.5422075041087623, 'eval_f1_weighted': 0.6355975037552405, 'eval_recall_macro': 0.5344770230914311, 'eval_precision_macro': 0.5726378764302072, 'eval_runtime': 1.6359, 'eval_samples_per_second': 1806.903, 'eval_steps_per_second': 113.084, 'epoch': 6.0}
{'train_runtime': 646.7749, 'train_samples_per_second': 217.866, 'train_steps_per_second': 27.237, 'train_loss': 1.07365452128037, 'epoch': 6.0}


[I 2025-12-01 23:29:57,196] Trial 3 finished with value: 0.6355975037552405 and parameters: {'learning_rate': 1.802745278118134e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 8, 'weight_decay': 0.2650242710753668, 'warmup_ratio': 0.19290674021672577}. Best is trial 0 with value: 0.6414078658888422.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3368124961853027, 'eval_accuracy': 0.6326116373477673, 'eval_f1_macro': 0.5025289507727521, 'eval_f1_weighted': 0.620092544969325, 'eval_recall_macro': 0.5081302515547095, 'eval_precision_macro': 0.5163229084855091, 'eval_runtime': 1.6383, 'eval_samples_per_second': 1804.31, 'eval_steps_per_second': 112.922, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2448272705078125, 'eval_accuracy': 0.6508795669824087, 'eval_f1_macro': 0.5226712134718509, 'eval_f1_weighted': 0.6404126517074397, 'eval_recall_macro': 0.5200521217720281, 'eval_precision_macro': 0.5644033721302436, 'eval_runtime': 1.7021, 'eval_samples_per_second': 1736.649, 'eval_steps_per_second': 108.687, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.28984797000885, 'eval_accuracy': 0.6481732070365359, 'eval_f1_macro': 0.5326765006425537, 'eval_f1_weighted': 0.6410908130034326, 'eval_recall_macro': 0.5274285974840836, 'eval_precision_macro': 0.5561874622472988, 'eval_runtime': 1.6492, 'eval_samples_per_second': 1792.424, 'eval_steps_per_second': 112.178, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.368521809577942, 'eval_accuracy': 0.631935047361299, 'eval_f1_macro': 0.530696067278395, 'eval_f1_weighted': 0.6301568494130698, 'eval_recall_macro': 0.5254905380811975, 'eval_precision_macro': 0.5679933324624847, 'eval_runtime': 1.6773, 'eval_samples_per_second': 1762.38, 'eval_steps_per_second': 110.298, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4935139417648315, 'eval_accuracy': 0.6349797023004059, 'eval_f1_macro': 0.5285087180000146, 'eval_f1_weighted': 0.6318346675436666, 'eval_recall_macro': 0.5251147473326704, 'eval_precision_macro': 0.5640897744753798, 'eval_runtime': 1.6587, 'eval_samples_per_second': 1782.125, 'eval_steps_per_second': 111.534, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.570556640625, 'eval_accuracy': 0.6299052774018945, 'eval_f1_macro': 0.5287305047461626, 'eval_f1_weighted': 0.6291760919506476, 'eval_recall_macro': 0.5231674106141468, 'eval_precision_macro': 0.5479760380435935, 'eval_runtime': 1.6198, 'eval_samples_per_second': 1824.881, 'eval_steps_per_second': 114.209, 'epoch': 6.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.608222246170044, 'eval_accuracy': 0.6275372124492558, 'eval_f1_macro': 0.5212239706893764, 'eval_f1_weighted': 0.6262449737212523, 'eval_recall_macro': 0.5185014444839576, 'eval_precision_macro': 0.5361926346601943, 'eval_runtime': 1.6066, 'eval_samples_per_second': 1839.963, 'eval_steps_per_second': 115.153, 'epoch': 7.0}
{'train_runtime': 214.1628, 'train_samples_per_second': 767.617, 'train_steps_per_second': 23.991, 'train_loss': 0.8478216079943558, 'epoch': 7.0}


[I 2025-12-01 23:33:33,279] Trial 4 finished with value: 0.6262449737212523 and parameters: {'learning_rate': 3.9977152589007314e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 32, 'weight_decay': 0.11728672447859195, 'warmup_ratio': 0.09201733548743436}. Best is trial 0 with value: 0.6414078658888422.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-01 23:35:21,203] Trial 5 pruned. 


{'eval_loss': 1.267921805381775, 'eval_accuracy': 0.6447902571041949, 'eval_f1_macro': 0.5111331981412012, 'eval_f1_weighted': 0.6353634536567672, 'eval_recall_macro': 0.5109215580647058, 'eval_precision_macro': 0.5229780027404366, 'eval_runtime': 1.6365, 'eval_samples_per_second': 1806.281, 'eval_steps_per_second': 113.045, 'epoch': 1.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3920832872390747, 'eval_accuracy': 0.6261840324763194, 'eval_f1_macro': 0.4665183480559232, 'eval_f1_weighted': 0.6078661442942535, 'eval_recall_macro': 0.47564313490816806, 'eval_precision_macro': 0.4803739854224752, 'eval_runtime': 1.668, 'eval_samples_per_second': 1772.231, 'eval_steps_per_second': 110.914, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2550238370895386, 'eval_accuracy': 0.6512178619756428, 'eval_f1_macro': 0.5138444228909367, 'eval_f1_weighted': 0.6386507526754767, 'eval_recall_macro': 0.51757425500086, 'eval_precision_macro': 0.5248717403452505, 'eval_runtime': 1.6229, 'eval_samples_per_second': 1821.395, 'eval_steps_per_second': 113.991, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-01 23:36:51,613] Trial 6 pruned. 


{'eval_loss': 1.2557450532913208, 'eval_accuracy': 0.6478349120433018, 'eval_f1_macro': 0.5182706179740032, 'eval_f1_weighted': 0.6386203449748262, 'eval_recall_macro': 0.5166617487655101, 'eval_precision_macro': 0.5282454396310953, 'eval_runtime': 1.6449, 'eval_samples_per_second': 1797.032, 'eval_steps_per_second': 112.466, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-01 23:37:21,569] Trial 7 pruned. 


{'eval_loss': 1.585952639579773, 'eval_accuracy': 0.5825439783491204, 'eval_f1_macro': 0.38029479425210483, 'eval_f1_weighted': 0.5462329294192089, 'eval_recall_macro': 0.38739134743343734, 'eval_precision_macro': 0.4751149349502517, 'eval_runtime': 1.6344, 'eval_samples_per_second': 1808.62, 'eval_steps_per_second': 113.192, 'epoch': 1.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3275424242019653, 'eval_accuracy': 0.6305818673883626, 'eval_f1_macro': 0.4986190541662871, 'eval_f1_weighted': 0.6165530177684927, 'eval_recall_macro': 0.5047830098111824, 'eval_precision_macro': 0.5117250883417931, 'eval_runtime': 1.6175, 'eval_samples_per_second': 1827.516, 'eval_steps_per_second': 114.374, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.236153483390808, 'eval_accuracy': 0.6481732070365359, 'eval_f1_macro': 0.512429877604337, 'eval_f1_weighted': 0.6364457585722147, 'eval_recall_macro': 0.515406788049845, 'eval_precision_macro': 0.5224129188322633, 'eval_runtime': 1.6388, 'eval_samples_per_second': 1803.725, 'eval_steps_per_second': 112.885, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.271156668663025, 'eval_accuracy': 0.648849797023004, 'eval_f1_macro': 0.5315145258392519, 'eval_f1_weighted': 0.6415362778839151, 'eval_recall_macro': 0.5262339649109725, 'eval_precision_macro': 0.5535851094774246, 'eval_runtime': 1.6361, 'eval_samples_per_second': 1806.722, 'eval_steps_per_second': 113.073, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3399631977081299, 'eval_accuracy': 0.6363328822733424, 'eval_f1_macro': 0.5284743117319407, 'eval_f1_weighted': 0.6332095764933062, 'eval_recall_macro': 0.525156840568844, 'eval_precision_macro': 0.5716823262931336, 'eval_runtime': 1.6517, 'eval_samples_per_second': 1789.677, 'eval_steps_per_second': 112.006, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4236361980438232, 'eval_accuracy': 0.6336265223274695, 'eval_f1_macro': 0.529075147840116, 'eval_f1_weighted': 0.6286730490632354, 'eval_recall_macro': 0.5215892734288948, 'eval_precision_macro': 0.5712814816958726, 'eval_runtime': 1.6355, 'eval_samples_per_second': 1807.374, 'eval_steps_per_second': 113.114, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4778831005096436, 'eval_accuracy': 0.6332882273342354, 'eval_f1_macro': 0.5301768890599728, 'eval_f1_weighted': 0.631798316038956, 'eval_recall_macro': 0.527074671006928, 'eval_precision_macro': 0.5637100863628056, 'eval_runtime': 1.6574, 'eval_samples_per_second': 1783.525, 'eval_steps_per_second': 111.621, 'epoch': 6.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.5429763793945312, 'eval_accuracy': 0.631935047361299, 'eval_f1_macro': 0.5252687382094957, 'eval_f1_weighted': 0.6293580422107989, 'eval_recall_macro': 0.5257641603191234, 'eval_precision_macro': 0.5556246056546114, 'eval_runtime': 1.6395, 'eval_samples_per_second': 1802.991, 'eval_steps_per_second': 112.839, 'epoch': 7.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.5669541358947754, 'eval_accuracy': 0.6278755074424899, 'eval_f1_macro': 0.5283941991421272, 'eval_f1_weighted': 0.6263288439634478, 'eval_recall_macro': 0.5268158988245896, 'eval_precision_macro': 0.5512902319167704, 'eval_runtime': 1.7016, 'eval_samples_per_second': 1737.141, 'eval_steps_per_second': 108.718, 'epoch': 8.0}
{'train_runtime': 243.1886, 'train_samples_per_second': 772.569, 'train_steps_per_second': 24.146, 'train_loss': 0.8080007776577401, 'epoch': 8.0}


[I 2025-12-01 23:41:26,290] Trial 8 finished with value: 0.6263288439634478 and parameters: {'learning_rate': 2.9676486507310917e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 32, 'weight_decay': 0.2274192429721497, 'warmup_ratio': 0.026869857159658173}. Best is trial 0 with value: 0.6414078658888422.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-01 23:43:14,312] Trial 9 pruned. 
  trainer = Trainer(


{'eval_loss': 1.2897783517837524, 'eval_accuracy': 0.6359945872801083, 'eval_f1_macro': 0.5047948939959491, 'eval_f1_weighted': 0.6267741509953956, 'eval_recall_macro': 0.5024724408402583, 'eval_precision_macro': 0.5214151635325399, 'eval_runtime': 1.6219, 'eval_samples_per_second': 1822.567, 'eval_steps_per_second': 114.065, 'epoch': 1.0}


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Recall Macro,Precision Macro
1,1.7864,1.599045,0.579838,0.379377,0.542973,0.388033,0.441692
2,1.3226,1.2863,0.646143,0.510032,0.631572,0.511963,0.529854
3,1.0922,1.253602,0.64682,0.518903,0.636645,0.518263,0.529007
4,0.943,1.277668,0.64479,0.528133,0.639093,0.527387,0.559337
5,0.7991,1.299539,0.645129,0.527817,0.637946,0.525057,0.560843
6,0.7384,1.309882,0.648173,0.528065,0.641408,0.527415,0.542347


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...motions/training_args.bin: 100%|##########| 5.91kB / 5.91kB            

  ...motions/model.safetensors:  16%|#5        | 41.9MB /  268MB            

CPU times: user 39min 13s, sys: 38.4 s, total: 39min 52s
Wall time: 40min 5s


In [None]:
%%time
train_model(model_checkpoint="bert-base-uncased", n_hpo_trials = 10)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/23485 [00:00<?, ? examples/s]

Map:   0%|          | 0/2956 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-12-01 23:53:44,266] A new study created in memory with name: no-name-f3f21e3e-8207-4f0c-945c-3d8a57e28bfe
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use

{'eval_loss': 1.3758805990219116, 'eval_accuracy': 0.6231393775372125, 'eval_f1_macro': 0.4983077106639139, 'eval_f1_weighted': 0.6125383303165653, 'eval_recall_macro': 0.4854831347051538, 'eval_precision_macro': 0.547492088164136, 'eval_runtime': 2.9062, 'eval_samples_per_second': 1017.132, 'eval_steps_per_second': 63.657, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.287972331047058, 'eval_accuracy': 0.63531799729364, 'eval_f1_macro': 0.5520231808906547, 'eval_f1_weighted': 0.6280813480451887, 'eval_recall_macro': 0.5447537652124672, 'eval_precision_macro': 0.5876077718507741, 'eval_runtime': 2.8858, 'eval_samples_per_second': 1024.323, 'eval_steps_per_second': 64.107, 'epoch': 2.0}
{'eval_loss': 1.3404685258865356, 'eval_accuracy': 0.648849797023004, 'eval_f1_macro': 0.5637730228886741, 'eval_f1_weighted': 0.6473095367364134, 'eval_recall_macro': 0.5516455525144126, 'eval_precision_macro': 0.5987913674989244, 'eval_runtime': 2.8878, 'eval_samples_per_second': 1023.614, 'eval_steps_per_second': 64.062, 'epoch': 3.0}
{'eval_loss': 1.6015583276748657, 'eval_accuracy': 0.6295669824086604, 'eval_f1_macro': 0.5572586191193073, 'eval_f1_weighted': 0.6311752661169178, 'eval_recall_macro': 0.5429862417758782, 'eval_precision_macro': 0.6140516538360773, 'eval_runtime': 2.8237, 'eval_samples_per_second': 1046.862, 'eval_steps_per_second': 65.

[I 2025-12-02 00:13:26,909] Trial 0 finished with value: 0.6293509761984211 and parameters: {'learning_rate': 4.8324838549133024e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 8, 'weight_decay': 0.21595467904741572, 'warmup_ratio': 0.19480636375813354}. Best is trial 0 with value: 0.6293509761984211.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3981564044952393, 'eval_accuracy': 0.6268606224627875, 'eval_f1_macro': 0.4779726420379446, 'eval_f1_weighted': 0.6123323862332662, 'eval_recall_macro': 0.4707305964718941, 'eval_precision_macro': 0.5210328367718199, 'eval_runtime': 2.919, 'eval_samples_per_second': 1012.685, 'eval_steps_per_second': 63.378, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2676897048950195, 'eval_accuracy': 0.6468200270635994, 'eval_f1_macro': 0.5079219734189446, 'eval_f1_weighted': 0.6333939930732474, 'eval_recall_macro': 0.5095932204144682, 'eval_precision_macro': 0.5278179205774465, 'eval_runtime': 2.8347, 'eval_samples_per_second': 1042.802, 'eval_steps_per_second': 65.263, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2484201192855835, 'eval_accuracy': 0.6549391069012178, 'eval_f1_macro': 0.5233169380538131, 'eval_f1_weighted': 0.6459705217802503, 'eval_recall_macro': 0.5185216978227348, 'eval_precision_macro': 0.5353215531303179, 'eval_runtime': 2.8879, 'eval_samples_per_second': 1023.592, 'eval_steps_per_second': 64.061, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.263810157775879, 'eval_accuracy': 0.6451285520974289, 'eval_f1_macro': 0.5177110815013863, 'eval_f1_weighted': 0.6400053528554859, 'eval_recall_macro': 0.515720445421303, 'eval_precision_macro': 0.5246818097732362, 'eval_runtime': 2.8291, 'eval_samples_per_second': 1044.856, 'eval_steps_per_second': 65.392, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.296676754951477, 'eval_accuracy': 0.6512178619756428, 'eval_f1_macro': 0.5218012617208021, 'eval_f1_weighted': 0.6440136871914959, 'eval_recall_macro': 0.5195664988221921, 'eval_precision_macro': 0.5293618830268918, 'eval_runtime': 2.8457, 'eval_samples_per_second': 1038.758, 'eval_steps_per_second': 65.01, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3270846605300903, 'eval_accuracy': 0.6481732070365359, 'eval_f1_macro': 0.5335119966763491, 'eval_f1_weighted': 0.641866490047323, 'eval_recall_macro': 0.5285210364668637, 'eval_precision_macro': 0.5728983857738823, 'eval_runtime': 2.8602, 'eval_samples_per_second': 1033.494, 'eval_steps_per_second': 64.681, 'epoch': 6.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.335085153579712, 'eval_accuracy': 0.6525710419485792, 'eval_f1_macro': 0.5314836948079379, 'eval_f1_weighted': 0.6462863270355393, 'eval_recall_macro': 0.52779995005767, 'eval_precision_macro': 0.5666447568701467, 'eval_runtime': 2.8307, 'eval_samples_per_second': 1044.258, 'eval_steps_per_second': 65.354, 'epoch': 7.0}
{'train_runtime': 731.7814, 'train_samples_per_second': 224.65, 'train_steps_per_second': 14.042, 'train_loss': 1.0636136955953193, 'epoch': 7.0}


[I 2025-12-02 00:25:40,561] Trial 1 finished with value: 0.6462863270355393 and parameters: {'learning_rate': 1.0192550903105173e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 16, 'weight_decay': 0.10639103762148287, 'warmup_ratio': 0.019871578070085064}. Best is trial 1 with value: 0.6462863270355393.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3057868480682373, 'eval_accuracy': 0.6363328822733424, 'eval_f1_macro': 0.5059216297318399, 'eval_f1_weighted': 0.6269615911409481, 'eval_recall_macro': 0.5015628476171531, 'eval_precision_macro': 0.5209476728139936, 'eval_runtime': 2.8121, 'eval_samples_per_second': 1051.182, 'eval_steps_per_second': 65.788, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.233575701713562, 'eval_accuracy': 0.6532476319350473, 'eval_f1_macro': 0.5401143389466414, 'eval_f1_weighted': 0.6435128741380428, 'eval_recall_macro': 0.5397510439093718, 'eval_precision_macro': 0.5502850569661991, 'eval_runtime': 2.8312, 'eval_samples_per_second': 1044.075, 'eval_steps_per_second': 65.343, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3013924360275269, 'eval_accuracy': 0.6461434370771313, 'eval_f1_macro': 0.5349728433134578, 'eval_f1_weighted': 0.6420536807750502, 'eval_recall_macro': 0.5248148536312267, 'eval_precision_macro': 0.5786929193201332, 'eval_runtime': 2.9638, 'eval_samples_per_second': 997.36, 'eval_steps_per_second': 62.419, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4220937490463257, 'eval_accuracy': 0.6359945872801083, 'eval_f1_macro': 0.5503992968638578, 'eval_f1_weighted': 0.6350033081111864, 'eval_recall_macro': 0.5367989340205297, 'eval_precision_macro': 0.5913095551215442, 'eval_runtime': 2.8077, 'eval_samples_per_second': 1052.818, 'eval_steps_per_second': 65.89, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.5782625675201416, 'eval_accuracy': 0.6434370771312584, 'eval_f1_macro': 0.5557408176556636, 'eval_f1_weighted': 0.6404094528759592, 'eval_recall_macro': 0.5439701400857755, 'eval_precision_macro': 0.5859070521308968, 'eval_runtime': 2.7839, 'eval_samples_per_second': 1061.811, 'eval_steps_per_second': 66.453, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6755684614181519, 'eval_accuracy': 0.642083897158322, 'eval_f1_macro': 0.5524810359584575, 'eval_f1_weighted': 0.6380615659522549, 'eval_recall_macro': 0.5421918963900239, 'eval_precision_macro': 0.5809660317598353, 'eval_runtime': 2.7949, 'eval_samples_per_second': 1057.659, 'eval_steps_per_second': 66.193, 'epoch': 6.0}
{'train_runtime': 1176.5664, 'train_samples_per_second': 119.764, 'train_steps_per_second': 14.972, 'train_loss': 0.9129632573902986, 'epoch': 6.0}


[I 2025-12-02 00:45:18,747] Trial 2 finished with value: 0.6380615659522549 and parameters: {'learning_rate': 1.7287929282847327e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 8, 'weight_decay': 0.1736140647029599, 'warmup_ratio': 0.10522548768130247}. Best is trial 1 with value: 0.6462863270355393.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3616563081741333, 'eval_accuracy': 0.6346414073071719, 'eval_f1_macro': 0.49508757694137656, 'eval_f1_weighted': 0.6182421438486992, 'eval_recall_macro': 0.5006736184092724, 'eval_precision_macro': 0.5075174267863657, 'eval_runtime': 2.8221, 'eval_samples_per_second': 1047.431, 'eval_steps_per_second': 65.553, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.238152265548706, 'eval_accuracy': 0.6539242219215156, 'eval_f1_macro': 0.5169519008536926, 'eval_f1_weighted': 0.6426564151580554, 'eval_recall_macro': 0.517231317554673, 'eval_precision_macro': 0.535556052087799, 'eval_runtime': 2.8443, 'eval_samples_per_second': 1039.257, 'eval_steps_per_second': 65.041, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.249719262123108, 'eval_accuracy': 0.6518944519621109, 'eval_f1_macro': 0.531025009481983, 'eval_f1_weighted': 0.6434085386569303, 'eval_recall_macro': 0.5250362806808767, 'eval_precision_macro': 0.5551772528953979, 'eval_runtime': 2.8083, 'eval_samples_per_second': 1052.596, 'eval_steps_per_second': 65.876, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.289950966835022, 'eval_accuracy': 0.6407307171853857, 'eval_f1_macro': 0.5245616612018926, 'eval_f1_weighted': 0.6359881103410167, 'eval_recall_macro': 0.5208876409346066, 'eval_precision_macro': 0.535943570931955, 'eval_runtime': 2.8219, 'eval_samples_per_second': 1047.518, 'eval_steps_per_second': 65.558, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3207199573516846, 'eval_accuracy': 0.648849797023004, 'eval_f1_macro': 0.5354279998288004, 'eval_f1_weighted': 0.6435164889259282, 'eval_recall_macro': 0.5356208384157051, 'eval_precision_macro': 0.5396932110562443, 'eval_runtime': 2.8138, 'eval_samples_per_second': 1050.527, 'eval_steps_per_second': 65.747, 'epoch': 5.0}
{'train_runtime': 277.5135, 'train_samples_per_second': 423.133, 'train_steps_per_second': 13.225, 'train_loss': 1.1360901045214578, 'epoch': 5.0}


[I 2025-12-02 00:49:57,867] Trial 3 finished with value: 0.6435164889259282 and parameters: {'learning_rate': 2.511632548583993e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.17547739271812754, 'warmup_ratio': 0.11912611617985136}. Best is trial 1 with value: 0.6462863270355393.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3548314571380615, 'eval_accuracy': 0.631596752368065, 'eval_f1_macro': 0.49956046746248634, 'eval_f1_weighted': 0.6216196365131268, 'eval_recall_macro': 0.48851210568035197, 'eval_precision_macro': 0.5278472706217451, 'eval_runtime': 2.8188, 'eval_samples_per_second': 1048.676, 'eval_steps_per_second': 65.631, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2465935945510864, 'eval_accuracy': 0.6447902571041949, 'eval_f1_macro': 0.5468448499138523, 'eval_f1_weighted': 0.6352496122589331, 'eval_recall_macro': 0.5389053928972312, 'eval_precision_macro': 0.5894584016783837, 'eval_runtime': 2.8653, 'eval_samples_per_second': 1031.665, 'eval_steps_per_second': 64.566, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2892272472381592, 'eval_accuracy': 0.6471583220568335, 'eval_f1_macro': 0.5465612996197637, 'eval_f1_weighted': 0.6441478243304787, 'eval_recall_macro': 0.5325717378986441, 'eval_precision_macro': 0.5987912247723772, 'eval_runtime': 2.8036, 'eval_samples_per_second': 1054.343, 'eval_steps_per_second': 65.986, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4636121988296509, 'eval_accuracy': 0.6380243572395129, 'eval_f1_macro': 0.5544182277305183, 'eval_f1_weighted': 0.6380768675131492, 'eval_recall_macro': 0.542840477458573, 'eval_precision_macro': 0.5883053084032005, 'eval_runtime': 2.8026, 'eval_samples_per_second': 1054.748, 'eval_steps_per_second': 66.011, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6884807348251343, 'eval_accuracy': 0.6370094722598105, 'eval_f1_macro': 0.5409367590435843, 'eval_f1_weighted': 0.633778970997123, 'eval_recall_macro': 0.5322538813761469, 'eval_precision_macro': 0.5816516269663926, 'eval_runtime': 2.8021, 'eval_samples_per_second': 1054.92, 'eval_steps_per_second': 66.022, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.8375860452651978, 'eval_accuracy': 0.6383626522327469, 'eval_f1_macro': 0.5514229049534349, 'eval_f1_weighted': 0.6345189479885246, 'eval_recall_macro': 0.5412530559318678, 'eval_precision_macro': 0.5790738564392287, 'eval_runtime': 2.7871, 'eval_samples_per_second': 1060.602, 'eval_steps_per_second': 66.377, 'epoch': 6.0}
{'train_runtime': 1169.3152, 'train_samples_per_second': 120.506, 'train_steps_per_second': 15.065, 'train_loss': 0.9266364628569198, 'epoch': 6.0}


[I 2025-12-02 01:09:28,759] Trial 4 finished with value: 0.6345189479885246 and parameters: {'learning_rate': 2.1892349262036315e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 8, 'weight_decay': 0.047703687343287816, 'warmup_ratio': 0.19718959584727003}. Best is trial 1 with value: 0.6462863270355393.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 01:10:20,132] Trial 5 pruned. 


{'eval_loss': 1.3984328508377075, 'eval_accuracy': 0.6278755074424899, 'eval_f1_macro': 0.47559274696406817, 'eval_f1_weighted': 0.6102499778949119, 'eval_recall_macro': 0.4687363724626795, 'eval_precision_macro': 0.5340587317247921, 'eval_runtime': 2.8112, 'eval_samples_per_second': 1051.499, 'eval_steps_per_second': 65.808, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 01:11:11,861] Trial 6 pruned. 


{'eval_loss': 1.3669488430023193, 'eval_accuracy': 0.6288903924221921, 'eval_f1_macro': 0.49659247594498257, 'eval_f1_weighted': 0.6142542615149038, 'eval_recall_macro': 0.4941267992691123, 'eval_precision_macro': 0.5173284502162765, 'eval_runtime': 2.8375, 'eval_samples_per_second': 1041.769, 'eval_steps_per_second': 65.199, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3562694787979126, 'eval_accuracy': 0.6332882273342354, 'eval_f1_macro': 0.4959678279150363, 'eval_f1_weighted': 0.6185995278632277, 'eval_recall_macro': 0.4977781007927846, 'eval_precision_macro': 0.508065374103777, 'eval_runtime': 2.8854, 'eval_samples_per_second': 1024.478, 'eval_steps_per_second': 64.117, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.234368920326233, 'eval_accuracy': 0.645466847090663, 'eval_f1_macro': 0.5185903600735478, 'eval_f1_weighted': 0.6351891005804017, 'eval_recall_macro': 0.5153417136314205, 'eval_precision_macro': 0.546938982452962, 'eval_runtime': 2.8225, 'eval_samples_per_second': 1047.301, 'eval_steps_per_second': 65.545, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2603657245635986, 'eval_accuracy': 0.6518944519621109, 'eval_f1_macro': 0.5481479137772582, 'eval_f1_weighted': 0.6449555795974479, 'eval_recall_macro': 0.5342635686329064, 'eval_precision_macro': 0.6301745098440326, 'eval_runtime': 2.8287, 'eval_samples_per_second': 1045.003, 'eval_steps_per_second': 65.401, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3346903324127197, 'eval_accuracy': 0.638700947225981, 'eval_f1_macro': 0.5412425109105224, 'eval_f1_weighted': 0.6355549993242559, 'eval_recall_macro': 0.5298730759872327, 'eval_precision_macro': 0.5927452425812745, 'eval_runtime': 2.8061, 'eval_samples_per_second': 1053.43, 'eval_steps_per_second': 65.928, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3887091875076294, 'eval_accuracy': 0.6430987821380244, 'eval_f1_macro': 0.5429418134266556, 'eval_f1_weighted': 0.6395361813847563, 'eval_recall_macro': 0.539416503880569, 'eval_precision_macro': 0.5787010568491575, 'eval_runtime': 2.8525, 'eval_samples_per_second': 1036.301, 'eval_steps_per_second': 64.856, 'epoch': 5.0}
{'train_runtime': 282.7827, 'train_samples_per_second': 415.248, 'train_steps_per_second': 12.978, 'train_loss': 1.070638397798876, 'epoch': 5.0}


[I 2025-12-02 01:15:56,341] Trial 7 finished with value: 0.6395361813847563 and parameters: {'learning_rate': 3.553442170698851e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.29332174055621846, 'warmup_ratio': 0.17397316952921948}. Best is trial 1 with value: 0.6462863270355393.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 01:17:36,573] Trial 8 pruned. 


{'eval_loss': 1.2710233926773071, 'eval_accuracy': 0.6424221921515562, 'eval_f1_macro': 0.5100371694748062, 'eval_f1_weighted': 0.6323130787751259, 'eval_recall_macro': 0.5111062291437839, 'eval_precision_macro': 0.5173417392139261, 'eval_runtime': 2.9644, 'eval_samples_per_second': 997.182, 'eval_steps_per_second': 62.408, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2403312921524048, 'eval_accuracy': 0.6471583220568335, 'eval_f1_macro': 0.5246459009130521, 'eval_f1_weighted': 0.6390429340697735, 'eval_recall_macro': 0.5178379448775559, 'eval_precision_macro': 0.5568008907446814, 'eval_runtime': 2.8389, 'eval_samples_per_second': 1041.254, 'eval_steps_per_second': 65.166, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.225367546081543, 'eval_accuracy': 0.6552774018944519, 'eval_f1_macro': 0.5551693327810769, 'eval_f1_weighted': 0.6476731395501788, 'eval_recall_macro': 0.5500174649116881, 'eval_precision_macro': 0.5885228739005854, 'eval_runtime': 2.9276, 'eval_samples_per_second': 1009.694, 'eval_steps_per_second': 63.191, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3064032793045044, 'eval_accuracy': 0.6491880920162382, 'eval_f1_macro': 0.550885332628077, 'eval_f1_weighted': 0.6434919462977681, 'eval_recall_macro': 0.5444750339431287, 'eval_precision_macro': 0.5889088463168302, 'eval_runtime': 2.8408, 'eval_samples_per_second': 1040.568, 'eval_steps_per_second': 65.124, 'epoch': 3.0}
{'train_runtime': 589.8307, 'train_samples_per_second': 119.45, 'train_steps_per_second': 14.933, 'train_loss': 1.1251822739271118, 'epoch': 3.0}


[I 2025-12-02 01:27:28,046] Trial 9 finished with value: 0.6434919462977681 and parameters: {'learning_rate': 2.948143663196088e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.15869804179475797, 'warmup_ratio': 0.03747122765997653}. Best is trial 1 with value: 0.6462863270355393.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Recall Macro,Precision Macro
1,1.511,1.417936,0.619418,0.447473,0.598821,0.442609,0.49555
2,1.2943,1.26954,0.647497,0.508472,0.6343,0.503252,0.536452
3,1.0111,1.259128,0.650203,0.521918,0.641656,0.516302,0.536274
4,0.8999,1.269555,0.648173,0.522452,0.64327,0.521286,0.528789
5,0.8201,1.295601,0.648512,0.527464,0.64255,0.52472,0.537644
6,0.6941,1.32471,0.647497,0.521948,0.641231,0.52341,0.525589
7,0.6408,1.336051,0.647158,0.524708,0.64032,0.524969,0.529704


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...motions/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...motions/model.safetensors:  10%|9         | 41.8MB /  438MB            

CPU times: user 1h 41min 49s, sys: 1min 1s, total: 1h 42min 51s
Wall time: 1h 48min 48s


In [None]:
%%time
train_model(model_checkpoint="bert-large-uncased", n_hpo_trials = 10)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/23485 [00:00<?, ? examples/s]

Map:   0%|          | 0/2956 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-12-02 01:49:07,966] A new study created in memory with name: no-name-ae759725-8e60-4b25-9c77-d173fa272ed1
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to 

{'eval_loss': 1.336988091468811, 'eval_accuracy': 0.6343031123139378, 'eval_f1_macro': 0.4894767010283804, 'eval_f1_weighted': 0.6177078006292896, 'eval_recall_macro': 0.4945269035720379, 'eval_precision_macro': 0.524228754636353, 'eval_runtime': 4.9913, 'eval_samples_per_second': 592.229, 'eval_steps_per_second': 37.064, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2058900594711304, 'eval_accuracy': 0.6539242219215156, 'eval_f1_macro': 0.5441831958904616, 'eval_f1_weighted': 0.6437272195186203, 'eval_recall_macro': 0.5406771282989293, 'eval_precision_macro': 0.5881373403165073, 'eval_runtime': 5.1019, 'eval_samples_per_second': 579.388, 'eval_steps_per_second': 36.261, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2454075813293457, 'eval_accuracy': 0.6576454668470907, 'eval_f1_macro': 0.5454525793222079, 'eval_f1_weighted': 0.6512849694774981, 'eval_recall_macro': 0.5373984856322556, 'eval_precision_macro': 0.595172435590457, 'eval_runtime': 5.0036, 'eval_samples_per_second': 590.771, 'eval_steps_per_second': 36.973, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3055858612060547, 'eval_accuracy': 0.6515561569688768, 'eval_f1_macro': 0.5560767729294444, 'eval_f1_weighted': 0.648088006032109, 'eval_recall_macro': 0.5466126025339164, 'eval_precision_macro': 0.5901166974882863, 'eval_runtime': 5.0125, 'eval_samples_per_second': 589.726, 'eval_steps_per_second': 36.908, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.359083652496338, 'eval_accuracy': 0.6498646820027063, 'eval_f1_macro': 0.556501420155784, 'eval_f1_weighted': 0.6454263540896593, 'eval_recall_macro': 0.550252597190572, 'eval_precision_macro': 0.5737294164762333, 'eval_runtime': 4.9735, 'eval_samples_per_second': 594.349, 'eval_steps_per_second': 37.197, 'epoch': 5.0}
{'train_runtime': 1181.9319, 'train_samples_per_second': 99.35, 'train_steps_per_second': 3.105, 'train_loss': 1.045034498999489, 'epoch': 5.0}


[I 2025-12-02 02:08:51,957] Trial 0 finished with value: 0.6454263540896593 and parameters: {'learning_rate': 1.9838771148156116e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.11518960434101935, 'warmup_ratio': 0.09772013950541869}. Best is trial 0 with value: 0.6454263540896593.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3428362607955933, 'eval_accuracy': 0.6271989174560216, 'eval_f1_macro': 0.4698728376186355, 'eval_f1_weighted': 0.6078463845158859, 'eval_recall_macro': 0.4818264715064082, 'eval_precision_macro': 0.4800584584546895, 'eval_runtime': 4.9956, 'eval_samples_per_second': 591.725, 'eval_steps_per_second': 37.033, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2065743207931519, 'eval_accuracy': 0.6579837618403248, 'eval_f1_macro': 0.5350683375991082, 'eval_f1_weighted': 0.6475514234221361, 'eval_recall_macro': 0.5391910627774037, 'eval_precision_macro': 0.5434995052514393, 'eval_runtime': 5.0453, 'eval_samples_per_second': 585.896, 'eval_steps_per_second': 36.668, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2243683338165283, 'eval_accuracy': 0.6549391069012178, 'eval_f1_macro': 0.548571648571701, 'eval_f1_weighted': 0.6477588908399027, 'eval_recall_macro': 0.5384301935343265, 'eval_precision_macro': 0.5931380884204311, 'eval_runtime': 5.0327, 'eval_samples_per_second': 587.359, 'eval_steps_per_second': 36.76, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2647384405136108, 'eval_accuracy': 0.6593369418132612, 'eval_f1_macro': 0.5565514979444233, 'eval_f1_weighted': 0.6540078155051587, 'eval_recall_macro': 0.5533483810281514, 'eval_precision_macro': 0.5838384583670204, 'eval_runtime': 4.9746, 'eval_samples_per_second': 594.224, 'eval_steps_per_second': 37.189, 'epoch': 4.0}
{'train_runtime': 1035.862, 'train_samples_per_second': 90.688, 'train_steps_per_second': 2.834, 'train_loss': 1.2589574496817524, 'epoch': 4.0}


[I 2025-12-02 02:26:09,986] Trial 1 finished with value: 0.6540078155051587 and parameters: {'learning_rate': 1.841484674121251e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.0699444122359867, 'warmup_ratio': 0.19320077734578317}. Best is trial 1 with value: 0.6540078155051587.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.3477240800857544, 'eval_accuracy': 0.6312584573748309, 'eval_f1_macro': 0.4691678968831165, 'eval_f1_weighted': 0.6135378062745634, 'eval_recall_macro': 0.477513597801044, 'eval_precision_macro': 0.4843704981626865, 'eval_runtime': 5.0432, 'eval_samples_per_second': 586.139, 'eval_steps_per_second': 36.683, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2523558139801025, 'eval_accuracy': 0.6562922868741543, 'eval_f1_macro': 0.509637324617134, 'eval_f1_weighted': 0.6429659100926854, 'eval_recall_macro': 0.512572024339464, 'eval_precision_macro': 0.5396036857034683, 'eval_runtime': 4.9844, 'eval_samples_per_second': 593.055, 'eval_steps_per_second': 37.116, 'epoch': 2.0}
{'train_runtime': 538.1495, 'train_samples_per_second': 87.281, 'train_steps_per_second': 2.728, 'train_loss': 1.6995613451549727, 'epoch': 2.0}


[I 2025-12-02 02:35:10,264] Trial 2 finished with value: 0.6429659100926854 and parameters: {'learning_rate': 1.1789261542050777e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'weight_decay': 0.10012789788034385, 'warmup_ratio': 0.16984971700233972}. Best is trial 1 with value: 0.6540078155051587.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.4282467365264893, 'eval_accuracy': 0.6048714479025711, 'eval_f1_macro': 0.4765993306306271, 'eval_f1_weighted': 0.5926469262026708, 'eval_recall_macro': 0.46869784531669983, 'eval_precision_macro': 0.5110608134516897, 'eval_runtime': 5.1052, 'eval_samples_per_second': 579.013, 'eval_steps_per_second': 36.237, 'epoch': 1.0}
{'eval_loss': 1.2657490968704224, 'eval_accuracy': 0.6478349120433018, 'eval_f1_macro': 0.5464712794447668, 'eval_f1_weighted': 0.6358203513914392, 'eval_recall_macro': 0.5399582074495766, 'eval_precision_macro': 0.609000735039496, 'eval_runtime': 5.0864, 'eval_samples_per_second': 581.163, 'eval_steps_per_second': 36.372, 'epoch': 2.0}
{'eval_loss': 1.3641000986099243, 'eval_accuracy': 0.6529093369418133, 'eval_f1_macro': 0.5721981583588277, 'eval_f1_weighted': 0.6505350773005592, 'eval_recall_macro': 0.5543341274281565, 'eval_precision_macro': 0.62140734802115, 'eval_runtime': 5.1164, 'eval_samples_per_second': 577.747, 'eval_steps_per_second': 36.1

[I 2025-12-02 03:24:40,915] Trial 3 finished with value: 0.6449378518344951 and parameters: {'learning_rate': 3.6007065845060886e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'weight_decay': 0.2694397700040253, 'warmup_ratio': 0.1590820328782426}. Best is trial 1 with value: 0.6540078155051587.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2907615900039673, 'eval_accuracy': 0.6397158322056834, 'eval_f1_macro': 0.5104094554221261, 'eval_f1_weighted': 0.6263839591055748, 'eval_recall_macro': 0.5113930035359855, 'eval_precision_macro': 0.5230571459537496, 'eval_runtime': 5.0477, 'eval_samples_per_second': 585.613, 'eval_steps_per_second': 36.65, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.1847317218780518, 'eval_accuracy': 0.6623815967523681, 'eval_f1_macro': 0.55682207524078, 'eval_f1_weighted': 0.655899574725719, 'eval_recall_macro': 0.5508764017728236, 'eval_precision_macro': 0.621034244273241, 'eval_runtime': 5.2356, 'eval_samples_per_second': 564.599, 'eval_steps_per_second': 35.335, 'epoch': 2.0}
{'train_runtime': 530.0594, 'train_samples_per_second': 88.613, 'train_steps_per_second': 2.77, 'train_loss': 1.4466566039041213, 'epoch': 2.0}


[I 2025-12-02 03:33:33,133] Trial 4 finished with value: 0.655899574725719 and parameters: {'learning_rate': 4.9524317251066464e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'weight_decay': 0.17294172007325087, 'warmup_ratio': 0.1869990038412411}. Best is trial 4 with value: 0.655899574725719.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 03:40:55,204] Trial 5 pruned. 


{'eval_loss': 3.0147488117218018, 'eval_accuracy': 0.11028416779431664, 'eval_f1_macro': 0.007357753853792854, 'eval_f1_weighted': 0.021908981536226217, 'eval_recall_macro': 0.037037037037037035, 'eval_precision_macro': 0.004084598807196913, 'eval_runtime': 5.0341, 'eval_samples_per_second': 587.2, 'eval_steps_per_second': 36.75, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 03:48:15,052] Trial 6 pruned. 


{'eval_loss': 1.2637687921524048, 'eval_accuracy': 0.6437753721244925, 'eval_f1_macro': 0.5219586994621322, 'eval_f1_weighted': 0.6361938761958653, 'eval_recall_macro': 0.512955821625641, 'eval_precision_macro': 0.5471118541991498, 'eval_runtime': 5.0573, 'eval_samples_per_second': 584.5, 'eval_steps_per_second': 36.581, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 03:52:02,314] Trial 7 pruned. 


{'eval_loss': 1.3020578622817993, 'eval_accuracy': 0.6468200270635994, 'eval_f1_macro': 0.5116054091998797, 'eval_f1_weighted': 0.6347282465313697, 'eval_recall_macro': 0.5080388400728919, 'eval_precision_macro': 0.5290325804686101, 'eval_runtime': 5.0717, 'eval_samples_per_second': 582.843, 'eval_steps_per_second': 36.477, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 03:55:46,412] Trial 8 pruned. 


{'eval_loss': 1.4027280807495117, 'eval_accuracy': 0.6204330175913396, 'eval_f1_macro': 0.49089560669291815, 'eval_f1_weighted': 0.6087505781077877, 'eval_recall_macro': 0.47611424271221814, 'eval_precision_macro': 0.562352639029029, 'eval_runtime': 4.9917, 'eval_samples_per_second': 592.182, 'eval_steps_per_second': 37.061, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-12-02 03:57:46,976] Trial 9 pruned. 
  trainer = Trainer(


{'eval_loss': 1.3292512893676758, 'eval_accuracy': 0.628213802435724, 'eval_f1_macro': 0.49121384788851646, 'eval_f1_weighted': 0.6127496017466286, 'eval_recall_macro': 0.5002546227401647, 'eval_precision_macro': 0.5079285303270134, 'eval_runtime': 5.0046, 'eval_samples_per_second': 590.653, 'eval_steps_per_second': 36.966, 'epoch': 1.0}


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Recall Macro,Precision Macro
1,1.3606,1.303221,0.630582,0.500672,0.615093,0.501487,0.515234
2,0.9737,1.192364,0.658322,0.546876,0.651442,0.542822,0.576698


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...motions/training_args.bin: 100%|##########| 5.91kB / 5.91kB            

  ...motions/model.safetensors:   3%|3         | 41.9MB / 1.34GB            

CPU times: user 1h 32min 59s, sys: 1min 20s, total: 1h 34min 19s
Wall time: 2h 21min 38s


In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached.
DRIVE_MOUNT_POINT = '/content/drive'

# Mounting is interactive on first use (Colab asks for authorization).
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from transformers import AutoModelForSequenceClassification

# Hub repo id of the fine-tuned bert-large-uncased sequence-classification checkpoint.
BERT_LARGE_REPO_ID = "virtual12345/bert-large-uncased-finetuned-GoEmotions"

# from_pretrained resolves the repo id and loads the config and weights; the
# message below is only reached if that call returns without raising.
model = AutoModelForSequenceClassification.from_pretrained(BERT_LARGE_REPO_ID)
print("Model loaded successfully!")

config.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Model loaded successfully!


In [None]:
from transformers import AutoModelForSequenceClassification

# Hub repo id of the fine-tuned bert-base-uncased sequence-classification checkpoint.
BERT_BASE_REPO_ID = "virtual12345/bert-base-uncased-finetuned-GoEmotions"

# NOTE: rebinds the notebook-level name `model`; whatever it held before this
# cell ran is no longer reachable through that name.
model = AutoModelForSequenceClassification.from_pretrained(BERT_BASE_REPO_ID)
print("Model loaded successfully!")

config.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Model loaded successfully!


In [None]:
from transformers import AutoModelForSequenceClassification

# Hub repo id of the fine-tuned distilbert-base-uncased sequence-classification checkpoint.
DISTILBERT_REPO_ID = "virtual12345/distilbert-base-uncased-finetuned-GoEmotions"

# NOTE: rebinds the notebook-level name `model`; the print only confirms that
# from_pretrained returned without raising, not anything about model quality.
model = AutoModelForSequenceClassification.from_pretrained(DISTILBERT_REPO_ID)
print("Model loaded successfully!")

config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model loaded successfully!
