In [12]:
import os
import sqlite3
import time
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.trainer_callback import EarlyStoppingCallback
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer

from datasets import Dataset
from tqdm.auto import tqdm

# === Silence warnings/logging ===
# Ignore any warning whose message matches the pin_memory pattern, and raise
# the "transformers" logger threshold so only errors are emitted.
warnings.filterwarnings(action="ignore", message=".*pin_memory.*")
logging.getLogger("transformers").setLevel(logging.ERROR)

# === Model list ===
# Hub identifiers of the checkpoints that get benchmarked, in run order.
MODELS = [
    "distilroberta-base",
    "distilbert-base-uncased",
    "nreimers/MiniLM-L6-H384-uncased",
    "google/electra-small-discriminator",
    "huawei-noah/TinyBERT_General_4L_312D",
]

# === Dataset info ===
# Maps each source name to the SQLite file holding its labeled rows; every
# source lives at data/<name>.db.
DATASETS = {
    source: f"data/{source}.db"
    for source in ("news", "reddit", "twitter")
}

# === Metric computation ===
def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``. The
            predicted class for each row is the arg-max of ``logits`` along
            axis 1.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall`` for the
        predicted classes against ``labels``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    # zero_division=0 is passed to every ratio-based metric (recall was
    # previously the odd one out) so that a fold without positive labels or
    # positive predictions yields 0 instead of an UndefinedMetricWarning.
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, zero_division=0),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
    }

# === Benchmark loop over datasets ===
# Number of stratified cross-validation folds (also used in progress output).
N_SPLITS = 5

# Weight files whose presence in a fold directory means that fold was already
# trained and saved. Both names are checked because the file written by
# `save_model` depends on the installed transformers version/settings.
_SAVED_WEIGHT_FILES = ("model.safetensors", "pytorch_model.bin")


def _load_labeled_data(db_path):
    """Return the labeled rows of the SQLite file at *db_path* as a DataFrame.

    Rows with a missing ``text`` or ``label`` are dropped and ``label`` is cast
    to ``int``. The connection is closed even if the query fails.
    """
    con = sqlite3.connect(db_path)
    try:
        frame = pd.read_sql("SELECT summary AS text, label FROM labeled_data", con)
    finally:
        con.close()
    frame.dropna(subset=["text", "label"], inplace=True)
    frame["label"] = frame["label"].astype(int)
    return frame


def _tokenize_frame(frame, tokenizer):
    """Turn a text/label DataFrame into a torch-formatted, tokenized Dataset."""
    ds = Dataset.from_pandas(frame).map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=512),
        batched=True,
    )
    ds = ds.rename_column("label", "labels")
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return ds


def _has_saved_model(model_dir):
    """Return True if *model_dir* contains a saved weight file."""
    return any(
        os.path.exists(os.path.join(model_dir, weight_file))
        for weight_file in _SAVED_WEIGHT_FILES
    )


def _save_roc_plot(fpr, tpr, roc_auc, title, out_path, curve_color, diag_color):
    """Draw one ROC curve with its chance diagonal and save it to *out_path*."""
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.figure()
    plt.plot(fpr, tpr, color=curve_color, lw=2, label=f"AUC={roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle='--', color=diag_color)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig(out_path)
    plt.close()


all_results = []

for source_name, db_path in tqdm(DATASETS.items(), desc="🔬 Benchmarking Datasets"):
    print(f"\n📂 Benchmarking on dataset: {source_name}")
    df = _load_labeled_data(db_path)

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    for model_name in tqdm(MODELS, desc=f"🔬 Benchmarking Models on {source_name}"):
        print(f"\n🔍 Model: {model_name}")
        fold_scores, all_probs, all_labels = [], [], []

        # Both depend only on the model, so build them once instead of per fold.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        safe_model_name = model_name.replace("/", "_").replace("-", "_")

        for fold, (train_idx, val_idx) in enumerate(skf.split(df["text"], df["label"])):
            print(f" ↪ Fold {fold+1}/{N_SPLITS}")
            train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]
            model_dir = f"./results/{source_name}_{safe_model_name}_fold{fold+1}"

            train_ds = _tokenize_frame(train_df, tokenizer)
            val_ds = _tokenize_frame(val_df, tokenizer)

            args = TrainingArguments(
                output_dir=model_dir,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=16,
                num_train_epochs=5,
                eval_strategy="epoch",
                save_strategy="epoch",
                save_total_limit=1,
                load_best_model_at_end=True,
                logging_steps=50,
                report_to="none",
                disable_tqdm=True,
                fp16=False,
                learning_rate=2e-5,
                # `use_cpu` is the current spelling of the deprecated `no_cuda`.
                use_cpu=True,
            )

            # Reuse a fold only when its final weights were actually saved.
            # Checking for the bare directory is not enough: it also exists
            # after an interrupted run (checkpoints only), and reloading
            # fine-tuned weights just to train them again would skew the
            # benchmark.
            already_trained = _has_saved_model(model_dir)
            if already_trained:
                model = AutoModelForSequenceClassification.from_pretrained(model_dir)
            else:
                model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

            # One Trainer serves both training and the final evaluation.
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                compute_metrics=compute_metrics,
                data_collator=DataCollatorWithPadding(tokenizer),
                callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
            )

            if already_trained:
                # No training happened in this run; NaN keeps the cached fold
                # out of the averaged training time instead of counting as 0 s.
                train_time = np.nan
            else:
                start = time.time()
                trainer.train()
                trainer.save_model(model_dir)
                train_time = round(time.time() - start, 2)

            # A single pass over the validation fold provides both the metrics
            # (under the "eval_" prefix) and the raw logits for the ROC curve.
            preds = trainer.predict(val_ds, metric_key_prefix="eval")
            metrics = dict(preds.metrics)
            metrics["training_time"] = train_time

            probs = torch.nn.functional.softmax(torch.tensor(preds.predictions), dim=1).numpy()
            true_labels = preds.label_ids

            all_probs.append(probs[:, 1])
            all_labels.append(true_labels)

            fpr, tpr, _ = roc_curve(true_labels, probs[:, 1])
            roc_auc = auc(fpr, tpr)
            metrics["auc"] = roc_auc

            _save_roc_plot(
                fpr,
                tpr,
                roc_auc,
                title=f"{model_name} Fold {fold+1} ROC ({source_name})",
                out_path=f"roc_curves/{source_name}_{safe_model_name}_fold{fold+1}.png",
                curve_color='darkorange',
                diag_color='navy',
            )

            fold_scores.append({k.replace("eval_", ""): v for k, v in metrics.items()})

        # Overall ROC curve, pooled over the validation predictions of all folds
        if fold_scores:
            all_probs_flat = np.concatenate(all_probs)
            all_labels_flat = np.concatenate(all_labels)
            fpr, tpr, _ = roc_curve(all_labels_flat, all_probs_flat)
            overall_auc = auc(fpr, tpr)

            _save_roc_plot(
                fpr,
                tpr,
                overall_auc,
                title=f"{model_name} Overall ROC ({source_name})",
                out_path=f"roc_overall/{source_name}_{safe_model_name}_overall.png",
                curve_color='green',
                diag_color='gray',
            )

            # Average the per-fold metrics; the pooled AUC replaces the mean
            # of the per-fold AUCs.
            avg = pd.DataFrame(fold_scores).mean().to_dict()
            avg.update({"model": model_name, "auc": overall_auc, "source": source_name})
            all_results.append(avg)

# === Save Final Results ===
results_df = pd.DataFrame(all_results)
required_cols = ["source", "model", "accuracy", "f1", "precision", "recall", "auc", "training_time"]
results_df = results_df[[col for col in required_cols if col in results_df.columns]]
results_df.to_csv("multimodal_results.csv", index=False)
print("\n✅ Multimodal Benchmarking complete. Results saved to multimodal_results.csv")


🔬 Benchmarking Datasets:   0%|          | 0/3 [00:00<?, ?it/s]


📂 Benchmarking on dataset: news


🔬 Benchmarking Models on news:   0%|          | 0/5 [00:00<?, ?it/s]


🔍 Model: distilroberta-base
 ↪ Fold 1/5


Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]



{'eval_loss': 0.2725760042667389, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.871, 'eval_samples_per_second': 98.739, 'eval_steps_per_second': 6.889, 'epoch': 1.0}
{'loss': 0.2832, 'grad_norm': 12.548629760742188, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.22960707545280457, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.8788, 'eval_samples_per_second': 97.864, 'eval_steps_per_second': 6.828, 'epoch': 2.0}
{'loss': 0.1884, 'grad_norm': 1.0273587703704834, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.20294125378131866, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.9213, 'eval_samples_per_second': 93.346, 'eval_steps_pe

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.21402080357074738, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.7932, 'eval_samples_per_second': 107.154, 'eval_steps_per_second': 7.564, 'epoch': 1.0}
{'loss': 0.3428, 'grad_norm': 4.338933944702148, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.20018112659454346, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.726, 'eval_samples_per_second': 117.084, 'eval_steps_per_second': 8.265, 'epoch': 2.0}
{'loss': 0.1715, 'grad_norm': 0.09864344447851181, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.19849254190921783, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.7325, 'eval_samples_per_second': 116.045, 'eval_s

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.2319040149450302, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.674, 'eval_samples_per_second': 126.108, 'eval_steps_per_second': 8.902, 'epoch': 1.0}
{'loss': 0.2996, 'grad_norm': 5.123295783996582, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.17682217061519623, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6626, 'eval_samples_per_second': 128.281, 'eval_steps_per_second': 9.055, 'epoch': 2.0}
{'loss': 0.2115, 'grad_norm': 0.18781600892543793, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2185056060552597, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6855, 'eval_samples_per_second': 123.989, 'eval_ste

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.2921079397201538, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6251, 'eval_samples_per_second': 135.974, 'eval_steps_per_second': 9.598, 'epoch': 1.0}
{'loss': 0.2888, 'grad_norm': 0.3015683591365814, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.28225430846214294, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.644, 'eval_samples_per_second': 131.995, 'eval_steps_per_second': 9.317, 'epoch': 2.0}
{'loss': 0.1929, 'grad_norm': 0.12719431519508362, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.3295104503631592, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6563, 'eval_samples_per_second': 129.504, 'eval_st

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.32508549094200134, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.7125, 'eval_samples_per_second': 119.294, 'eval_steps_per_second': 8.421, 'epoch': 1.0}
{'loss': 0.2703, 'grad_norm': 0.35294127464294434, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2012069672346115, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.8359, 'eval_samples_per_second': 101.689, 'eval_steps_per_second': 7.178, 'epoch': 2.0}
{'loss': 0.2108, 'grad_norm': 0.41594642400741577, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.25969991087913513, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.8622, 'eval_samples_per_second': 98.581, 'eval_

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]



{'eval_loss': 0.21767926216125488, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.8178, 'eval_samples_per_second': 105.166, 'eval_steps_per_second': 7.337, 'epoch': 1.0}
{'loss': 0.2735, 'grad_norm': 3.0594356060028076, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.1895730048418045, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.8342, 'eval_samples_per_second': 103.096, 'eval_steps_per_second': 7.193, 'epoch': 2.0}
{'loss': 0.1702, 'grad_norm': 0.6296712160110474, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.19582048058509827, 'eval_accuracy': 0.9418604651162791, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.9151, 'eval_samples_per_second': 93.98, 'eval_steps

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.20217949151992798, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.7417, 'eval_samples_per_second': 114.599, 'eval_steps_per_second': 8.089, 'epoch': 1.0}
{'loss': 0.3071, 'grad_norm': 4.020382881164551, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.1824098825454712, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.7198, 'eval_samples_per_second': 118.088, 'eval_steps_per_second': 8.336, 'epoch': 2.0}
{'loss': 0.1494, 'grad_norm': 0.26365983486175537, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.18364161252975464, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6733, 'eval_samples_per_second': 126.246, 'eval_s

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.21337060630321503, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6807, 'eval_samples_per_second': 124.868, 'eval_steps_per_second': 8.814, 'epoch': 1.0}
{'loss': 0.2856, 'grad_norm': 2.4613687992095947, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.1760420799255371, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6877, 'eval_samples_per_second': 123.606, 'eval_steps_per_second': 8.725, 'epoch': 2.0}
{'loss': 0.1991, 'grad_norm': 0.3810742497444153, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.1793520301580429, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.694, 'eval_samples_per_second': 122.473, 'eval_ste

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.24578231573104858, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6556, 'eval_samples_per_second': 129.652, 'eval_steps_per_second': 9.152, 'epoch': 1.0}
{'loss': 0.2525, 'grad_norm': 0.3866673409938812, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.25035738945007324, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.6429, 'eval_samples_per_second': 132.22, 'eval_steps_per_second': 9.333, 'epoch': 2.0}
{'train_runtime': 27.9914, 'train_samples_per_second': 60.912, 'train_steps_per_second': 7.681, 'train_loss': 0.2328180989553762, 'epoch': 2.0}
{'eval_loss': 0.24578231573104858, 'eval_model_preparation_time': 0.0003, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eva

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.2362852692604065, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.7606, 'eval_samples_per_second': 111.751, 'eval_steps_per_second': 7.888, 'epoch': 1.0}
{'loss': 0.2591, 'grad_norm': 0.4268350601196289, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.18759892880916595, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.672, 'eval_samples_per_second': 126.486, 'eval_steps_per_second': 8.928, 'epoch': 2.0}
{'loss': 0.1897, 'grad_norm': 0.57805997133255, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.19867631793022156, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.6572, 'eval_samples_per_second': 129.332, 'eval_step

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]



{'eval_loss': 0.2593626081943512, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.4055, 'eval_samples_per_second': 212.075, 'eval_steps_per_second': 14.796, 'epoch': 1.0}
{'loss': 0.4091, 'grad_norm': 1.5069103240966797, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2395489513874054, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.3673, 'eval_samples_per_second': 234.124, 'eval_steps_per_second': 16.334, 'epoch': 2.0}
{'loss': 0.2273, 'grad_norm': 0.6472819447517395, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.21115824580192566, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.3582, 'eval_samples_per_second': 240.074, 'eval_ste

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.24687251448631287, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.3651, 'eval_samples_per_second': 232.803, 'eval_steps_per_second': 16.433, 'epoch': 1.0}
{'loss': 0.426, 'grad_norm': 1.3612679243087769, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.21830421686172485, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.2979, 'eval_samples_per_second': 285.377, 'eval_steps_per_second': 20.144, 'epoch': 2.0}
{'loss': 0.2097, 'grad_norm': 0.6814373731613159, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2002369612455368, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.2782, 'eval_samples_per_second': 305.532, 'eval_

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.2439829260110855, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.2594, 'eval_samples_per_second': 327.726, 'eval_steps_per_second': 23.134, 'epoch': 1.0}
{'loss': 0.4111, 'grad_norm': 0.5611834526062012, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.21563546359539032, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.3562, 'eval_samples_per_second': 238.605, 'eval_steps_per_second': 16.843, 'epoch': 2.0}
{'loss': 0.2332, 'grad_norm': 0.7247955203056335, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.20561476051807404, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.3181, 'eval_samples_per_second': 267.217, 'eval

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.24973726272583008, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.2352, 'eval_samples_per_second': 361.411, 'eval_steps_per_second': 25.511, 'epoch': 1.0}
{'loss': 0.4006, 'grad_norm': 1.082229733467102, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.22386223077774048, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.277, 'eval_samples_per_second': 306.853, 'eval_steps_per_second': 21.66, 'epoch': 2.0}
{'loss': 0.2316, 'grad_norm': 0.6823395490646362, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2230616807937622, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.2294, 'eval_samples_per_second': 370.53, 'eval_ste

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.265223890542984, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.2667, 'eval_samples_per_second': 318.74, 'eval_steps_per_second': 22.499, 'epoch': 1.0}
{'loss': 0.3908, 'grad_norm': 0.9739252924919128, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2403392791748047, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.3121, 'eval_samples_per_second': 272.349, 'eval_steps_per_second': 19.225, 'epoch': 2.0}
{'loss': 0.2302, 'grad_norm': 0.8416628837585449, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.23198087513446808, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.2562, 'eval_samples_per_second': 331.71, 'eval_ste

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]



{'eval_loss': 0.37330707907676697, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.5268, 'eval_samples_per_second': 163.247, 'eval_steps_per_second': 11.389, 'epoch': 1.0}
{'loss': 0.4805, 'grad_norm': 0.9072086215019226, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2952280342578888, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.594, 'eval_samples_per_second': 144.775, 'eval_steps_per_second': 10.101, 'epoch': 2.0}
{'loss': 0.319, 'grad_norm': 1.4833482503890991, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2666594982147217, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.6392, 'eval_samples_per_second': 134.55, 'eval_steps_

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.3688727021217346, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.5925, 'eval_samples_per_second': 143.45, 'eval_steps_per_second': 10.126, 'epoch': 1.0}
{'loss': 0.4881, 'grad_norm': 0.7501160502433777, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2850476801395416, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.5789, 'eval_samples_per_second': 146.839, 'eval_steps_per_second': 10.365, 'epoch': 2.0}
{'loss': 0.3079, 'grad_norm': 1.4949367046356201, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.252668559551239, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.4988, 'eval_samples_per_second': 170.411, 'eval_ste

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.3654288947582245, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.4381, 'eval_samples_per_second': 194.025, 'eval_steps_per_second': 13.696, 'epoch': 1.0}
{'loss': 0.4791, 'grad_norm': 1.335382342338562, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2850651144981384, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.4521, 'eval_samples_per_second': 188.003, 'eval_steps_per_second': 13.271, 'epoch': 2.0}
{'loss': 0.3212, 'grad_norm': 1.4823768138885498, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2520396411418915, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.5071, 'eval_samples_per_second': 167.604, 'eval_st

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.37009748816490173, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.3973, 'eval_samples_per_second': 213.934, 'eval_steps_per_second': 15.101, 'epoch': 1.0}
{'loss': 0.4782, 'grad_norm': 1.8107556104660034, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2912852466106415, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.5583, 'eval_samples_per_second': 152.236, 'eval_steps_per_second': 10.746, 'epoch': 2.0}
{'loss': 0.3248, 'grad_norm': 1.5662672519683838, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2566853165626526, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.5677, 'eval_samples_per_second': 149.735, 'eval_

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.3742031455039978, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.4505, 'eval_samples_per_second': 188.699, 'eval_steps_per_second': 13.32, 'epoch': 1.0}
{'loss': 0.4737, 'grad_norm': 1.7718229293823242, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.29566532373428345, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.62, 'eval_samples_per_second': 137.102, 'eval_steps_per_second': 9.678, 'epoch': 2.0}
{'loss': 0.3178, 'grad_norm': 1.452549934387207, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.26830077171325684, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.583, 'eval_samples_per_second': 145.801, 'eval_steps

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]



{'eval_loss': 0.38383209705352783, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.2264, 'eval_samples_per_second': 379.79, 'eval_steps_per_second': 26.497, 'epoch': 1.0}
{'loss': 0.5046, 'grad_norm': 0.9243854880332947, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.27599743008613586, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.211, 'eval_samples_per_second': 407.566, 'eval_steps_per_second': 28.435, 'epoch': 2.0}
{'loss': 0.2938, 'grad_norm': 1.2556771039962769, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.24260519444942474, 'eval_accuracy': 0.9302325581395349, 'eval_f1': 0.963855421686747, 'eval_precision': 0.9302325581395349, 'eval_recall': 1.0, 'eval_runtime': 0.2097, 'eval_samples_per_second': 410.108, 'eval_ste

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.38053053617477417, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1924, 'eval_samples_per_second': 441.899, 'eval_steps_per_second': 31.193, 'epoch': 1.0}
{'loss': 0.516, 'grad_norm': 0.6319003105163574, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2671181559562683, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1846, 'eval_samples_per_second': 460.433, 'eval_steps_per_second': 32.501, 'epoch': 2.0}
{'loss': 0.2884, 'grad_norm': 1.1343848705291748, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.22556044161319733, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1791, 'eval_samples_per_second': 474.627, 'eval_

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.3779316544532776, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1694, 'eval_samples_per_second': 501.652, 'eval_steps_per_second': 35.411, 'epoch': 1.0}
{'loss': 0.5052, 'grad_norm': 1.2076281309127808, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2677046060562134, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1622, 'eval_samples_per_second': 523.967, 'eval_steps_per_second': 36.986, 'epoch': 2.0}
{'loss': 0.3033, 'grad_norm': 1.1320030689239502, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.23113685846328735, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1785, 'eval_samples_per_second': 476.164, 'eval_

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.38094890117645264, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1704, 'eval_samples_per_second': 498.855, 'eval_steps_per_second': 35.213, 'epoch': 1.0}
{'loss': 0.5026, 'grad_norm': 1.5926722288131714, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.27602362632751465, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.165, 'eval_samples_per_second': 515.03, 'eval_steps_per_second': 36.355, 'epoch': 2.0}
{'loss': 0.2982, 'grad_norm': 1.1580004692077637, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.24376508593559265, 'eval_accuracy': 0.9411764705882353, 'eval_f1': 0.9696969696969697, 'eval_precision': 0.9411764705882353, 'eval_recall': 1.0, 'eval_runtime': 0.1775, 'eval_samples_per_second': 478.825, 'eval_

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]



{'eval_loss': 0.39015036821365356, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.1787, 'eval_samples_per_second': 475.631, 'eval_steps_per_second': 33.574, 'epoch': 1.0}
{'loss': 0.5004, 'grad_norm': 1.5738344192504883, 'learning_rate': 1.544186046511628e-05, 'epoch': 1.1627906976744187}
{'eval_loss': 0.2899605333805084, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.1775, 'eval_samples_per_second': 478.976, 'eval_steps_per_second': 33.81, 'epoch': 2.0}
{'loss': 0.2976, 'grad_norm': 1.1747928857803345, 'learning_rate': 1.0790697674418607e-05, 'epoch': 2.3255813953488373}
{'eval_loss': 0.2554120421409607, 'eval_accuracy': 0.9294117647058824, 'eval_f1': 0.9634146341463414, 'eval_precision': 0.9294117647058824, 'eval_recall': 1.0, 'eval_runtime': 0.1766, 'eval_samples_per_second': 481.306, 'eval_s

🔬 Benchmarking Models on reddit:   0%|          | 0/5 [00:00<?, ?it/s]


🔍 Model: distilroberta-base
 ↪ Fold 1/5


Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6745, 'grad_norm': 2.9708504676818848, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6709, 'grad_norm': 1.8787527084350586, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6487, 'grad_norm': 4.9142656326293945, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6651, 'grad_norm': 6.091533660888672, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6831, 'grad_norm': 1.7346537113189697, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.666, 'grad_norm': 3.4265847206115723, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6571, 'grad_norm': 6.038039207458496, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6899, 'grad_norm': 5.084569454193115, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6581104397773743, 'eval_accurac

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6983, 'grad_norm': 3.657151222229004, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6651, 'grad_norm': 2.081792116165161, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6709, 'grad_norm': 2.8065805435180664, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6805, 'grad_norm': 4.685303688049316, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6605, 'grad_norm': 3.697326898574829, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6577, 'grad_norm': 2.744352102279663, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6451, 'grad_norm': 2.554262161254883, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6891, 'grad_norm': 3.3556716442108154, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6707407832145691, 'eval_accuracy'

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6793, 'grad_norm': 3.2168924808502197, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6662, 'grad_norm': 2.7267110347747803, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6737, 'grad_norm': 3.923222064971924, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6733, 'grad_norm': 2.3912441730499268, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6762, 'grad_norm': 4.4173808097839355, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6772, 'grad_norm': 6.328276634216309, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6773, 'grad_norm': 5.382719039916992, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6258, 'grad_norm': 5.758304595947266, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.654363751411438, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.678, 'grad_norm': 2.635056734085083, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.677, 'grad_norm': 3.855855703353882, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6682, 'grad_norm': 6.116100788116455, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6749, 'grad_norm': 2.3981661796569824, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6622, 'grad_norm': 3.2995734214782715, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6827, 'grad_norm': 2.5380067825317383, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6462, 'grad_norm': 3.8369555473327637, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6814, 'grad_norm': 2.7776267528533936, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6673684120178223, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6948, 'grad_norm': 1.1078535318374634, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6848, 'grad_norm': 1.8007371425628662, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6594, 'grad_norm': 1.9658942222595215, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6707, 'grad_norm': 1.4361441135406494, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6482, 'grad_norm': 6.2500762939453125, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6559, 'grad_norm': 4.22108793258667, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6706, 'grad_norm': 2.242241621017456, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6618, 'grad_norm': 1.8365297317504883, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6657025814056396, 'eval_accura

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6733, 'grad_norm': 1.295670986175537, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6781, 'grad_norm': 1.5639458894729614, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6599, 'grad_norm': 3.498812198638916, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6626, 'grad_norm': 2.164628267288208, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6762, 'grad_norm': 1.5917214155197144, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6609, 'grad_norm': 2.404388904571533, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6548, 'grad_norm': 3.2315263748168945, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6873, 'grad_norm': 1.9128912687301636, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6550033688545227, 'eval_accurac

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6973, 'grad_norm': 1.845839023590088, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6652, 'grad_norm': 2.525959014892578, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6669, 'grad_norm': 1.9181749820709229, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6882, 'grad_norm': 2.131230592727661, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6609, 'grad_norm': 1.2268824577331543, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6358, 'grad_norm': 1.5557129383087158, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6527, 'grad_norm': 1.7211086750030518, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6832, 'grad_norm': 1.3721065521240234, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6737378835678101, 'eval_accura

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6801, 'grad_norm': 1.975373387336731, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.669, 'grad_norm': 2.2988648414611816, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6768, 'grad_norm': 1.848924994468689, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6615, 'grad_norm': 2.3941893577575684, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6826, 'grad_norm': 2.2811355590820312, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6733, 'grad_norm': 2.935598373413086, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6749, 'grad_norm': 2.741961717605591, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6301, 'grad_norm': 2.7568302154541016, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6510468125343323, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6738, 'grad_norm': 2.319082260131836, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6731, 'grad_norm': 2.1939897537231445, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.663, 'grad_norm': 3.657231092453003, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6839, 'grad_norm': 2.091259717941284, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6626, 'grad_norm': 2.1200337409973145, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6857, 'grad_norm': 1.8825801610946655, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.653, 'grad_norm': 2.546527862548828, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6702, 'grad_norm': 1.744624137878418, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6646100282669067, 'eval_accuracy':

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6852, 'grad_norm': 2.4447576999664307, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6793, 'grad_norm': 2.774160623550415, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6584, 'grad_norm': 2.7552034854888916, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6595, 'grad_norm': 2.1207587718963623, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.647, 'grad_norm': 4.531023025512695, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6601, 'grad_norm': 3.4155335426330566, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6796, 'grad_norm': 2.1192476749420166, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.65, 'grad_norm': 2.1380960941314697, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6707428097724915, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6718, 'grad_norm': 0.5033336281776428, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6824, 'grad_norm': 0.5920584201812744, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.658, 'grad_norm': 2.545560121536255, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6693, 'grad_norm': 1.4483602046966553, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6731, 'grad_norm': 1.1080918312072754, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6588, 'grad_norm': 1.308760166168213, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6509, 'grad_norm': 2.101017951965332, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6863, 'grad_norm': 1.2553479671478271, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6538683176040649, 'eval_accurac

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6873, 'grad_norm': 0.5636524558067322, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6674, 'grad_norm': 1.2740596532821655, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6725, 'grad_norm': 1.7859610319137573, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6747, 'grad_norm': 1.542829155921936, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6615, 'grad_norm': 0.6293372511863708, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6363, 'grad_norm': 0.9906664490699768, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6481, 'grad_norm': 1.0303068161010742, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6823, 'grad_norm': 0.8706465363502502, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6694290041923523, 'eval_accu

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6805, 'grad_norm': 0.846293032169342, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6759, 'grad_norm': 1.4590367078781128, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.667, 'grad_norm': 1.396805763244629, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6569, 'grad_norm': 1.4225397109985352, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6747, 'grad_norm': 1.9254332780838013, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6722, 'grad_norm': 2.477825403213501, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.666, 'grad_norm': 2.4317662715911865, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6315, 'grad_norm': 3.4660778045654297, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6544129252433777, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.683, 'grad_norm': 1.1822267770767212, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6779, 'grad_norm': 1.0896239280700684, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6641, 'grad_norm': 1.8032580614089966, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6792, 'grad_norm': 1.1003581285476685, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.66, 'grad_norm': 1.6279809474945068, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6784, 'grad_norm': 1.249212384223938, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6517, 'grad_norm': 1.7954941987991333, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6673, 'grad_norm': 3.192115306854248, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6693369150161743, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6871, 'grad_norm': 0.533864438533783, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6818, 'grad_norm': 1.249891996383667, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6577, 'grad_norm': 1.4609014987945557, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6578, 'grad_norm': 0.8848105669021606, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6453, 'grad_norm': 2.5950074195861816, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6522, 'grad_norm': 1.6156045198440552, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6759, 'grad_norm': 1.2230926752090454, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.652, 'grad_norm': 1.185301423072815, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6673189997673035, 'eval_accurac

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6741, 'grad_norm': 0.8472967147827148, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6814, 'grad_norm': 0.762062668800354, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6605, 'grad_norm': 0.9294658303260803, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.667, 'grad_norm': 1.5287344455718994, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.675, 'grad_norm': 1.0813629627227783, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.664, 'grad_norm': 1.6549783945083618, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6507, 'grad_norm': 1.9692636728286743, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6981, 'grad_norm': 1.2719240188598633, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6561354994773865, 'eval_accurac

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6867, 'grad_norm': 0.718248188495636, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6728, 'grad_norm': 1.416353702545166, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6723, 'grad_norm': 1.1361829042434692, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6756, 'grad_norm': 1.619657039642334, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6578, 'grad_norm': 1.4658055305480957, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6341, 'grad_norm': 1.5304169654846191, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6489, 'grad_norm': 1.288179874420166, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6834, 'grad_norm': 0.9467114806175232, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6700488924980164, 'eval_accurac

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6812, 'grad_norm': 0.7530817985534668, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6778, 'grad_norm': 1.2537493705749512, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6685, 'grad_norm': 1.3012458086013794, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6612, 'grad_norm': 1.4816577434539795, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6746, 'grad_norm': 2.1221909523010254, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6712, 'grad_norm': 1.9221105575561523, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.668, 'grad_norm': 1.5674018859863281, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6344, 'grad_norm': 2.165336847305298, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6540288925170898, 'eval_accur

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6794, 'grad_norm': 1.1732608079910278, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6753, 'grad_norm': 1.1058162450790405, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6663, 'grad_norm': 1.6510801315307617, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6756, 'grad_norm': 1.6392871141433716, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6555, 'grad_norm': 1.8383417129516602, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6789, 'grad_norm': 1.1833206415176392, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6554, 'grad_norm': 1.2769498825073242, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.659, 'grad_norm': 3.3120148181915283, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6879466772079468, 'eval_accu

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6847, 'grad_norm': 0.9692885875701904, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.683, 'grad_norm': 1.0462443828582764, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6686, 'grad_norm': 1.1250609159469604, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.661, 'grad_norm': 0.856107771396637, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6424, 'grad_norm': 2.250523567199707, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6647, 'grad_norm': 1.5469032526016235, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6784, 'grad_norm': 1.493057370185852, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6648, 'grad_norm': 1.0595818758010864, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6674872040748596, 'eval_accuracy

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6751, 'grad_norm': 0.48485007882118225, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6873, 'grad_norm': 0.5461505651473999, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6677, 'grad_norm': 0.8400477170944214, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.678, 'grad_norm': 1.1763590574264526, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6864, 'grad_norm': 0.47168195247650146, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6735, 'grad_norm': 0.7000719308853149, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6661, 'grad_norm': 2.24179744720459, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6871, 'grad_norm': 1.162272572517395, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6688855290412903, 'eval_accur

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6888, 'grad_norm': 0.45539891719818115, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6741, 'grad_norm': 1.166305661201477, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.676, 'grad_norm': 1.2740782499313354, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6765, 'grad_norm': 1.1406620740890503, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6641, 'grad_norm': 1.685905933380127, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6458, 'grad_norm': 1.1200714111328125, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6486, 'grad_norm': 1.086976408958435, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6926, 'grad_norm': 0.7507683634757996, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6757062673568726, 'eval_accura

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.684, 'grad_norm': 0.6278679966926575, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6813, 'grad_norm': 1.2043119668960571, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6736, 'grad_norm': 1.150592565536499, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6827, 'grad_norm': 0.5214333534240723, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6751, 'grad_norm': 1.4287575483322144, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6806, 'grad_norm': 1.641924262046814, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6822, 'grad_norm': 1.6838706731796265, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6493, 'grad_norm': 1.3668779134750366, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6665232181549072, 'eval_accura

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6818, 'grad_norm': 0.9259114265441895, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6776, 'grad_norm': 1.046395182609558, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6711, 'grad_norm': 0.836593747138977, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6815, 'grad_norm': 0.8883461952209473, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6609, 'grad_norm': 0.9560843706130981, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6863, 'grad_norm': 0.836189329624176, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6648, 'grad_norm': 0.8168344497680664, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6856, 'grad_norm': 0.9211280345916748, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.6712929606437683, 'eval_accura

Map:   0%|          | 0/3556 [00:00<?, ? examples/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]



{'loss': 0.6894, 'grad_norm': 0.5060149431228638, 'learning_rate': 1.9559550561797755e-05, 'epoch': 0.11235955056179775}
{'loss': 0.6859, 'grad_norm': 0.5610818862915039, 'learning_rate': 1.9110112359550563e-05, 'epoch': 0.2247191011235955}
{'loss': 0.6721, 'grad_norm': 0.7837947010993958, 'learning_rate': 1.8660674157303374e-05, 'epoch': 0.33707865168539325}
{'loss': 0.6646, 'grad_norm': 1.10281240940094, 'learning_rate': 1.8211235955056182e-05, 'epoch': 0.449438202247191}
{'loss': 0.6464, 'grad_norm': 2.8971712589263916, 'learning_rate': 1.776179775280899e-05, 'epoch': 0.5617977528089888}
{'loss': 0.6616, 'grad_norm': 1.0890828371047974, 'learning_rate': 1.73123595505618e-05, 'epoch': 0.6741573033707865}
{'loss': 0.6824, 'grad_norm': 0.5355200171470642, 'learning_rate': 1.6862921348314607e-05, 'epoch': 0.7865168539325843}
{'loss': 0.6708, 'grad_norm': 0.8323929309844971, 'learning_rate': 1.6413483146067418e-05, 'epoch': 0.898876404494382}
{'eval_loss': 0.670386791229248, 'eval_accura

🔬 Benchmarking Models on twitter:   0%|          | 0/5 [00:00<?, ?it/s]


🔍 Model: distilroberta-base
 ↪ Fold 1/5


Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6468, 'grad_norm': 11.715965270996094, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6078, 'grad_norm': 14.816520690917969, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5258, 'grad_norm': 8.223982810974121, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5222, 'grad_norm': 6.556027889251709, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.4962, 'grad_norm': 11.437966346740723, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.4971732497215271, 'eval_accuracy': 0.756007393715342, 'eval_f1': 0.8070175438596491, 'eval_precision': 0.7379679144385026, 'eval_recall': 0.8903225806451613, 'eval_runtime': 2.3695, 'eval_samples_per_second': 228.32, 'eval_steps_per_second': 14.349, 'epoch': 1.0}
{'loss': 0.4604, 'grad_norm': 10.520174980163574, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6194, 'grad_norm': 6.554109573364258, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5525, 'grad_norm': 8.655595779418945, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5502, 'grad_norm': 17.41946792602539, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.4663, 'grad_norm': 8.553668022155762, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5065, 'grad_norm': 5.216148376464844, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.46433863043785095, 'eval_accuracy': 0.7744916820702403, 'eval_f1': 0.7966666666666666, 'eval_precision': 0.8241379310344827, 'eval_recall': 0.7709677419354839, 'eval_runtime': 2.2999, 'eval_samples_per_second': 235.233, 'eval_steps_per_second': 14.784, 'epoch': 1.0}
{'loss': 0.4361, 'grad_norm': 14.566835403442383, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6587, 'grad_norm': 5.306907653808594, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5801, 'grad_norm': 5.408236026763916, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5285, 'grad_norm': 5.46597957611084, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.4743, 'grad_norm': 11.616714477539062, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5229, 'grad_norm': 5.526549339294434, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.4658206105232239, 'eval_accuracy': 0.7578558225508318, 'eval_f1': 0.7943485086342229, 'eval_precision': 0.7737003058103975, 'eval_recall': 0.8161290322580645, 'eval_runtime': 2.3893, 'eval_samples_per_second': 226.425, 'eval_steps_per_second': 14.23, 'epoch': 1.0}
{'loss': 0.4522, 'grad_norm': 12.454157829284668, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'l

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6599, 'grad_norm': 6.705679416656494, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5572, 'grad_norm': 15.317065238952637, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5145, 'grad_norm': 9.515280723571777, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5155, 'grad_norm': 11.663983345031738, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5078, 'grad_norm': 8.443514823913574, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5038653016090393, 'eval_accuracy': 0.7634011090573013, 'eval_f1': 0.8078078078078078, 'eval_precision': 0.7535014005602241, 'eval_recall': 0.8705501618122977, 'eval_runtime': 2.3189, 'eval_samples_per_second': 233.295, 'eval_steps_per_second': 14.662, 'epoch': 1.0}
{'loss': 0.42, 'grad_norm': 13.553136825561523, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6808, 'grad_norm': 5.392425060272217, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5639, 'grad_norm': 3.2917251586914062, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5694, 'grad_norm': 20.245378494262695, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.4581, 'grad_norm': 13.10313892364502, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5008, 'grad_norm': 9.649072647094727, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.46618127822875977, 'eval_accuracy': 0.7781885397412199, 'eval_f1': 0.8165137614678899, 'eval_precision': 0.7739130434782608, 'eval_recall': 0.8640776699029126, 'eval_runtime': 1.9067, 'eval_samples_per_second': 283.73, 'eval_steps_per_second': 17.831, 'epoch': 1.0}
{'loss': 0.4171, 'grad_norm': 12.271933555603027, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}


Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6415, 'grad_norm': 3.644869089126587, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.608, 'grad_norm': 5.545026779174805, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5582, 'grad_norm': 3.8678479194641113, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5188, 'grad_norm': 5.029913902282715, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.4984, 'grad_norm': 6.950965881347656, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.48806047439575195, 'eval_accuracy': 0.7652495378927912, 'eval_f1': 0.8072837632776935, 'eval_precision': 0.7621776504297995, 'eval_recall': 0.8580645161290322, 'eval_runtime': 1.8894, 'eval_samples_per_second': 286.339, 'eval_steps_per_second': 17.995, 'epoch': 1.0}
{'loss': 0.4413, 'grad_norm': 6.30697774887085, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'l

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6368, 'grad_norm': 4.8972978591918945, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5947, 'grad_norm': 3.3506314754486084, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.577, 'grad_norm': 5.035531520843506, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.4878, 'grad_norm': 5.284509658813477, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.4764, 'grad_norm': 3.660808801651001, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.4741344153881073, 'eval_accuracy': 0.7855822550831792, 'eval_f1': 0.8047138047138047, 'eval_precision': 0.8415492957746479, 'eval_recall': 0.7709677419354839, 'eval_runtime': 2.0506, 'eval_samples_per_second': 263.821, 'eval_steps_per_second': 16.58, 'epoch': 1.0}
{'loss': 0.4328, 'grad_norm': 5.526778697967529, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'l

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6539, 'grad_norm': 3.385134220123291, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5754, 'grad_norm': 4.173018455505371, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5217, 'grad_norm': 4.932323932647705, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.4743, 'grad_norm': 3.8181936740875244, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5235, 'grad_norm': 4.641900539398193, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.4916287064552307, 'eval_accuracy': 0.7689463955637708, 'eval_f1': 0.8175182481751825, 'eval_precision': 0.7466666666666667, 'eval_recall': 0.9032258064516129, 'eval_runtime': 2.1422, 'eval_samples_per_second': 252.549, 'eval_steps_per_second': 15.872, 'epoch': 1.0}
{'loss': 0.451, 'grad_norm': 9.55419921875, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'loss'

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6612, 'grad_norm': 4.072965145111084, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5792, 'grad_norm': 6.911856174468994, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5362, 'grad_norm': 3.8841660022735596, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5277, 'grad_norm': 4.7496209144592285, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.4664, 'grad_norm': 4.707864284515381, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.4889822006225586, 'eval_accuracy': 0.767097966728281, 'eval_f1': 0.8037383177570093, 'eval_precision': 0.7747747747747747, 'eval_recall': 0.8349514563106796, 'eval_runtime': 1.9804, 'eval_samples_per_second': 273.177, 'eval_steps_per_second': 17.168, 'epoch': 1.0}
{'loss': 0.4302, 'grad_norm': 5.273223876953125, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.67, 'grad_norm': 2.9662485122680664, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.5852, 'grad_norm': 2.262160539627075, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.582, 'grad_norm': 7.582458972930908, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.4599, 'grad_norm': 10.345844268798828, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.481, 'grad_norm': 7.385904788970947, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.481075257062912, 'eval_accuracy': 0.7837338262476895, 'eval_f1': 0.8085106382978723, 'eval_precision': 0.8178807947019867, 'eval_recall': 0.7993527508090615, 'eval_runtime': 1.9907, 'eval_samples_per_second': 271.77, 'eval_steps_per_second': 17.08, 'epoch': 1.0}
{'loss': 0.4475, 'grad_norm': 6.542410850524902, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'loss':

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6758, 'grad_norm': 0.891433835029602, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.667, 'grad_norm': 3.1647918224334717, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6186, 'grad_norm': 5.853017330169678, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5942, 'grad_norm': 2.5941524505615234, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5473, 'grad_norm': 3.740128517150879, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5449878573417664, 'eval_accuracy': 0.7338262476894639, 'eval_f1': 0.7894736842105263, 'eval_precision': 0.7219251336898396, 'eval_recall': 0.8709677419354839, 'eval_runtime': 0.9468, 'eval_samples_per_second': 571.379, 'eval_steps_per_second': 35.909, 'epoch': 1.0}
{'loss': 0.5174, 'grad_norm': 2.6852879524230957, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6814, 'grad_norm': 1.6446183919906616, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.622, 'grad_norm': 3.040900707244873, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6371, 'grad_norm': 2.978882312774658, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5605, 'grad_norm': 3.0470709800720215, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5375, 'grad_norm': 4.322759628295898, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5265315175056458, 'eval_accuracy': 0.7523105360443623, 'eval_f1': 0.7721088435374149, 'eval_precision': 0.8165467625899281, 'eval_recall': 0.7322580645161291, 'eval_runtime': 1.0973, 'eval_samples_per_second': 493.018, 'eval_steps_per_second': 30.984, 'epoch': 1.0}
{'loss': 0.5036, 'grad_norm': 5.643391132354736, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.688, 'grad_norm': 2.111790418624878, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6601, 'grad_norm': 2.28493070602417, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6057, 'grad_norm': 6.266758918762207, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5587, 'grad_norm': 5.391426086425781, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5609, 'grad_norm': 3.717111825942993, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5372664332389832, 'eval_accuracy': 0.7282809611829945, 'eval_f1': 0.7815750371471025, 'eval_precision': 0.7245179063360881, 'eval_recall': 0.8483870967741935, 'eval_runtime': 0.9335, 'eval_samples_per_second': 579.521, 'eval_steps_per_second': 36.421, 'epoch': 1.0}
{'loss': 0.543, 'grad_norm': 6.851960182189941, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'loss

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6868, 'grad_norm': 0.7931824922561646, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6465, 'grad_norm': 2.131150007247925, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.5975, 'grad_norm': 4.614065647125244, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5927, 'grad_norm': 10.01741886138916, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5477, 'grad_norm': 5.9681715965271, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5223366022109985, 'eval_accuracy': 0.7523105360443623, 'eval_f1': 0.7879746835443038, 'eval_precision': 0.7708978328173375, 'eval_recall': 0.8058252427184466, 'eval_runtime': 1.085, 'eval_samples_per_second': 498.6, 'eval_steps_per_second': 31.335, 'epoch': 1.0}
{'loss': 0.5151, 'grad_norm': 9.820189476013184, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'loss'

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6853, 'grad_norm': 1.328438401222229, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6441, 'grad_norm': 1.7439939975738525, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6094, 'grad_norm': 3.0900068283081055, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5529, 'grad_norm': 5.844950199127197, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5311, 'grad_norm': 6.936277866363525, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.49935680627822876, 'eval_accuracy': 0.7744916820702403, 'eval_f1': 0.8157099697885196, 'eval_precision': 0.7648725212464589, 'eval_recall': 0.8737864077669902, 'eval_runtime': 1.2472, 'eval_samples_per_second': 433.771, 'eval_steps_per_second': 27.261, 'epoch': 1.0}
{'loss': 0.4592, 'grad_norm': 11.82394027709961, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}


Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6706, 'grad_norm': 1.0456337928771973, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6666, 'grad_norm': 2.429389715194702, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6475, 'grad_norm': 4.233270168304443, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.623, 'grad_norm': 2.390976905822754, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5805, 'grad_norm': 4.893255233764648, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5680114030838013, 'eval_accuracy': 0.7171903881700554, 'eval_f1': 0.777292576419214, 'eval_precision': 0.7082228116710876, 'eval_recall': 0.8612903225806452, 'eval_runtime': 1.7506, 'eval_samples_per_second': 309.045, 'eval_steps_per_second': 19.422, 'epoch': 1.0}
{'loss': 0.5609, 'grad_norm': 4.455507278442383, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'lo

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6754, 'grad_norm': 1.4552444219589233, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6459, 'grad_norm': 1.7439923286437988, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6535, 'grad_norm': 2.3988430500030518, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.6124, 'grad_norm': 2.664525032043457, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5879, 'grad_norm': 3.1489944458007812, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5614460706710815, 'eval_accuracy': 0.7245841035120147, 'eval_f1': 0.7569331158238173, 'eval_precision': 0.7656765676567657, 'eval_recall': 0.7483870967741936, 'eval_runtime': 1.5394, 'eval_samples_per_second': 351.442, 'eval_steps_per_second': 22.087, 'epoch': 1.0}
{'loss': 0.5647, 'grad_norm': 5.5065226554870605, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6833, 'grad_norm': 1.755516529083252, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.66, 'grad_norm': 1.5060088634490967, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6453, 'grad_norm': 2.8222436904907227, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.6148, 'grad_norm': 2.955977439880371, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.6125, 'grad_norm': 2.8826375007629395, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5843824744224548, 'eval_accuracy': 0.7153419593345656, 'eval_f1': 0.7830985915492957, 'eval_precision': 0.695, 'eval_recall': 0.896774193548387, 'eval_runtime': 1.8465, 'eval_samples_per_second': 292.993, 'eval_steps_per_second': 18.414, 'epoch': 1.0}
{'loss': 0.592, 'grad_norm': 4.876107692718506, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'loss': 0.5454, 

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.68, 'grad_norm': 1.1564017534255981, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6518, 'grad_norm': 3.4106643199920654, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6369, 'grad_norm': 3.584158182144165, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.6263, 'grad_norm': 4.425158977508545, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.6107, 'grad_norm': 2.801501512527466, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5884409546852112, 'eval_accuracy': 0.711645101663586, 'eval_f1': 0.7732558139534884, 'eval_precision': 0.7018469656992085, 'eval_recall': 0.86084142394822, 'eval_runtime': 1.5038, 'eval_samples_per_second': 359.746, 'eval_steps_per_second': 22.609, 'epoch': 1.0}
{'loss': 0.5767, 'grad_norm': 3.0217182636260986, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'los

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6819, 'grad_norm': 1.6870958805084229, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6576, 'grad_norm': 1.299269199371338, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6513, 'grad_norm': 2.0378713607788086, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5909, 'grad_norm': 3.805234432220459, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5841, 'grad_norm': 2.729715585708618, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5385247468948364, 'eval_accuracy': 0.756007393715342, 'eval_f1': 0.7962962962962963, 'eval_precision': 0.7610619469026548, 'eval_recall': 0.8349514563106796, 'eval_runtime': 1.5164, 'eval_samples_per_second': 356.765, 'eval_steps_per_second': 22.421, 'epoch': 1.0}
{'loss': 0.5264, 'grad_norm': 6.826160907745361, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6734, 'grad_norm': 1.0330833196640015, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6439, 'grad_norm': 3.7261335849761963, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6214, 'grad_norm': 9.091059684753418, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.6023, 'grad_norm': 4.052025318145752, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5547, 'grad_norm': 3.947922945022583, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5752870440483093, 'eval_accuracy': 0.6857670979667283, 'eval_f1': 0.7557471264367817, 'eval_precision': 0.6813471502590673, 'eval_recall': 0.8483870967741935, 'eval_runtime': 0.6139, 'eval_samples_per_second': 881.231, 'eval_steps_per_second': 55.382, 'epoch': 1.0}
{'loss': 0.5609, 'grad_norm': 4.7591657638549805, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}


Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6713, 'grad_norm': 2.384836196899414, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6267, 'grad_norm': 2.832188129425049, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6403, 'grad_norm': 3.377864360809326, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5902, 'grad_norm': 1.4343371391296387, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5835, 'grad_norm': 2.209688901901245, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5735596418380737, 'eval_accuracy': 0.7005545286506469, 'eval_f1': 0.7530487804878049, 'eval_precision': 0.7138728323699421, 'eval_recall': 0.7967741935483871, 'eval_runtime': 0.7259, 'eval_samples_per_second': 745.321, 'eval_steps_per_second': 46.841, 'epoch': 1.0}
{'loss': 0.5547, 'grad_norm': 6.43011474609375, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'l

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6837, 'grad_norm': 1.5802969932556152, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6482, 'grad_norm': 1.8523855209350586, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6257, 'grad_norm': 6.302910804748535, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.5868, 'grad_norm': 4.535139560699463, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5805, 'grad_norm': 3.7358829975128174, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5549963116645813, 'eval_accuracy': 0.7134935304990758, 'eval_f1': 0.7633587786259542, 'eval_precision': 0.7246376811594203, 'eval_recall': 0.8064516129032258, 'eval_runtime': 0.7059, 'eval_samples_per_second': 766.395, 'eval_steps_per_second': 48.165, 'epoch': 1.0}
{'loss': 0.5735, 'grad_norm': 10.139005661010742, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6824, 'grad_norm': 1.026371955871582, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6349, 'grad_norm': 3.3838675022125244, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6161, 'grad_norm': 4.0729546546936035, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.6133, 'grad_norm': 4.425028324127197, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5743, 'grad_norm': 4.055292129516602, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.566743791103363, 'eval_accuracy': 0.7134935304990758, 'eval_f1': 0.7832167832167832, 'eval_precision': 0.6896551724137931, 'eval_recall': 0.9061488673139159, 'eval_runtime': 0.6972, 'eval_samples_per_second': 776.0, 'eval_steps_per_second': 48.769, 'epoch': 1.0}
{'loss': 0.5567, 'grad_norm': 8.141851425170898, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'lo

Map:   0%|          | 0/2164 [00:00<?, ? examples/s]

Map:   0%|          | 0/541 [00:00<?, ? examples/s]



{'loss': 0.6858, 'grad_norm': 1.826180338859558, 'learning_rate': 1.9276752767527677e-05, 'epoch': 0.18450184501845018}
{'loss': 0.6424, 'grad_norm': 2.0221266746520996, 'learning_rate': 1.8538745387453875e-05, 'epoch': 0.36900369003690037}
{'loss': 0.6435, 'grad_norm': 5.970825672149658, 'learning_rate': 1.7800738007380077e-05, 'epoch': 0.5535055350553506}
{'loss': 0.578, 'grad_norm': 6.533298969268799, 'learning_rate': 1.7062730627306276e-05, 'epoch': 0.7380073800738007}
{'loss': 0.5674, 'grad_norm': 6.677089214324951, 'learning_rate': 1.6324723247232474e-05, 'epoch': 0.922509225092251}
{'eval_loss': 0.5403273105621338, 'eval_accuracy': 0.7486136783733827, 'eval_f1': 0.7951807228915663, 'eval_precision': 0.7436619718309859, 'eval_recall': 0.8543689320388349, 'eval_runtime': 0.6106, 'eval_samples_per_second': 885.966, 'eval_steps_per_second': 55.68, 'epoch': 1.0}
{'loss': 0.5184, 'grad_norm': 10.174674034118652, 'learning_rate': 1.5586715867158672e-05, 'epoch': 1.1070110701107012}
{'l