# Importing libraries

In [16]:
import os
import torch
import wandb
import evaluate
import numpy as np
from PIL import Image
import io
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    default_data_collator
)
from torchvision import transforms
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_auc_score


# Initializing wandb

In [17]:
wandb.login(key='90c6ded1d5897b142f28dcf8658ff0ead673bf4d')

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Nitro\_netrc


True

# Load dataset

In [18]:
dataset = load_dataset("derek-thomas/ScienceQA")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 12726
    })
    validation: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 4241
    })
    test: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 4241
    })
})


# Initializing tokenizer and the model

In [20]:
model_name = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Setting up the image transformations

In [21]:
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Creating prompt format

In [22]:

def create_prompt(question, choices, hint=None, lecture=None):
    """Create a structured prompt with clear instructions."""
    choices_text = '\n'.join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])
    hint_text = f"Hint: {hint}\n" if hint else ""
    lecture_text = f"Lecture Context: {lecture}\n" if lecture else ""

    return (
        "You are a specialized science tutor assisting a student. Your goal is to answer "
        "the question below based on scientific principles, leveraging both the lecture context "
        "and any provided hints.\n\n"
        f"{lecture_text}{hint_text}Question: {question}\n\n"
        f"Choices:\n{choices_text}\n\n"
        "Provide the most accurate answer based on logical reasoning and scientific knowledge:"
    )



# Processing examples

def process_example(example):
    """Process a single example from the dataset"""
    prompt = create_prompt(example["question"], example["choices"])

    encoding = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    pixel_values = None
    if example["image"] is not None:
        try:
            if isinstance(example["image"], dict) and "bytes" in example["image"]:
                image = Image.open(io.BytesIO(example["image"]["bytes"]))
            else:
                image = example["image"]
            pixel_values = image_transform(image)
        except Exception as e:
            print(f"Error processing image: {e}")
            pixel_values = torch.zeros((3, 224, 224))
    else:
        pixel_values = torch.zeros((3, 224, 224))

    return {
        "input_ids": encoding["input_ids"][0],
        "attention_mask": encoding["attention_mask"][0],
        "pixel_values": pixel_values,
        "labels": example["answer"]
    }


# Processing dataset

In [23]:
processed_dataset = dataset.map(
    process_example,
    remove_columns=dataset["train"].column_names
)

Map: 100%|██████████| 12726/12726 [01:21<00:00, 156.04 examples/s]
Map: 100%|██████████| 4241/4241 [00:33<00:00, 125.98 examples/s]
Map: 100%|██████████| 4241/4241 [00:34<00:00, 124.15 examples/s]


In [24]:
def process_dataset(dataset):
    """Process an entire dataset."""
    processed_data = [process_example(example) for example in dataset]
    return processed_data


# Evaluation function

In [25]:
metrics = {
    "accuracy": evaluate.load("accuracy"),
    "precision": evaluate.load("precision"),
    "recall": evaluate.load("recall"),
    "f1": evaluate.load("f1")
}

def compute_metrics(eval_pred):
    """Compute evaluation metrics"""
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)

    results = {
        "accuracy": metrics["accuracy"].compute(predictions=predictions, references=labels)["accuracy"],
        "precision": metrics["precision"].compute(predictions=predictions, references=labels, average='weighted')["precision"],
        "recall": metrics["recall"].compute(predictions=predictions, references=labels, average='weighted')["recall"],
        "f1": metrics["f1"].compute(predictions=predictions, references=labels, average='weighted')["f1"]
    }

    results["confusion_matrix"] = confusion_matrix(labels, predictions).tolist()

    if len(np.unique(labels)) == 2:
        results["roc_auc"] = roc_auc_score(labels, predictions)

    return results



# Setting up training arguments

In [39]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  
    save_total_limit=2,     
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb"
)

# Initializing the model

In [40]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(set(processed_dataset["train"]["labels"]))
)


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Initializing the trainer

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


# Trainning the model

In [42]:
trainer.train()

  1%|▏         | 500/39800 [1:06:06<85:37:53,  7.84s/it]

{'loss': 1.0711, 'grad_norm': 3.2666122913360596, 'learning_rate': 1.9748743718592968e-05, 'epoch': 0.63}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  2%|▏         | 796/39800 [1:55:01<70:48:06,  6.53s/it]

{'eval_loss': 0.9749144911766052, 'eval_accuracy': 0.41664701721292147, 'eval_precision': 0.3348470063674198, 'eval_recall': 0.41664701721292147, 'eval_f1': 0.3711568784800813, 'eval_confusion_matrix': [[627, 982, 0, 0, 0], [620, 1140, 0, 0, 0], [593, 7, 0, 0, 0], [265, 0, 0, 0, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 586.5901, 'eval_samples_per_second': 7.23, 'eval_steps_per_second': 0.227, 'epoch': 1.0}


  3%|▎         | 1000/39800 [2:22:10<84:35:58,  7.85s/it]  

{'loss': 0.9805, 'grad_norm': 2.9857382774353027, 'learning_rate': 1.949748743718593e-05, 'epoch': 1.26}


  4%|▍         | 1500/39800 [3:28:31<83:57:22,  7.89s/it]

{'loss': 0.978, 'grad_norm': 3.8058903217315674, 'learning_rate': 1.9246231155778897e-05, 'epoch': 1.88}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  4%|▍         | 1592/39800 [3:50:31<70:39:22,  6.66s/it]

{'eval_loss': 0.9782114624977112, 'eval_accuracy': 0.4046215515208677, 'eval_precision': 0.34046592163187894, 'eval_recall': 0.4046215515208677, 'eval_f1': 0.35507637254035934, 'eval_confusion_matrix': [[1095, 404, 0, 110, 0], [1190, 451, 0, 119, 0], [112, 358, 0, 130, 0], [89, 6, 0, 170, 0], [0, 0, 0, 7, 0]], 'eval_runtime': 591.6206, 'eval_samples_per_second': 7.168, 'eval_steps_per_second': 0.225, 'epoch': 2.0}


  5%|▌         | 2000/39800 [4:45:09<82:28:25,  7.85s/it]   

{'loss': 0.9721, 'grad_norm': 4.794418811798096, 'learning_rate': 1.899497487437186e-05, 'epoch': 2.51}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  6%|▌         | 2388/39800 [5:46:36<68:24:23,  6.58s/it]

{'eval_loss': 0.9215185642242432, 'eval_accuracy': 0.45791087007781184, 'eval_precision': 0.49491337702281213, 'eval_recall': 0.45791087007781184, 'eval_f1': 0.4237508608112336, 'eval_confusion_matrix': [[241, 872, 476, 20, 0], [193, 1060, 477, 30, 0], [30, 32, 500, 38, 0], [18, 22, 84, 141, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 589.7085, 'eval_samples_per_second': 7.192, 'eval_steps_per_second': 0.226, 'epoch': 3.0}


  6%|▋         | 2500/39800 [6:01:32<81:48:14,  7.90s/it]   

{'loss': 0.9312, 'grad_norm': 4.44630765914917, 'learning_rate': 1.8743718592964826e-05, 'epoch': 3.14}


  8%|▊         | 3000/39800 [7:07:56<80:45:47,  7.90s/it]

{'loss': 0.8778, 'grad_norm': 6.739133834838867, 'learning_rate': 1.8492462311557792e-05, 'epoch': 3.77}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  8%|▊         | 3184/39800 [7:41:39<66:13:50,  6.51s/it]

{'eval_loss': 0.7798731327056885, 'eval_accuracy': 0.5944352746993633, 'eval_precision': 0.6316739870030751, 'eval_recall': 0.5944352746993633, 'eval_f1': 0.5877998826237648, 'eval_confusion_matrix': [[650, 636, 258, 65, 0], [232, 1225, 238, 65, 0], [10, 57, 435, 98, 0], [1, 16, 37, 211, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 584.1018, 'eval_samples_per_second': 7.261, 'eval_steps_per_second': 0.228, 'epoch': 4.0}


  9%|▉         | 3500/39800 [8:23:15<78:41:07,  7.80s/it]   

{'loss': 0.7421, 'grad_norm': 24.22088050842285, 'learning_rate': 1.8241206030150755e-05, 'epoch': 4.4}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 10%|█         | 3980/39800 [9:35:50<64:23:20,  6.47s/it]

{'eval_loss': 0.646457314491272, 'eval_accuracy': 0.6892242395661401, 'eval_precision': 0.6902495345874452, 'eval_recall': 0.6892242395661401, 'eval_f1': 0.6863802864916, 'eval_confusion_matrix': [[980, 485, 130, 14, 0], [234, 1402, 110, 14, 0], [98, 107, 379, 16, 0], [76, 15, 12, 162, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 583.2847, 'eval_samples_per_second': 7.271, 'eval_steps_per_second': 0.228, 'epoch': 5.0}


 10%|█         | 4000/39800 [9:38:32<80:11:07,  8.06s/it]   

{'loss': 0.64, 'grad_norm': 3.369701862335205, 'learning_rate': 1.798994974874372e-05, 'epoch': 5.03}


 11%|█▏        | 4500/39800 [10:44:05<76:26:17,  7.80s/it]

{'loss': 0.5279, 'grad_norm': 12.082267761230469, 'learning_rate': 1.7738693467336684e-05, 'epoch': 5.65}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 12%|█▏        | 4776/39800 [11:29:57<63:40:10,  6.54s/it]

{'eval_loss': 0.6570603847503662, 'eval_accuracy': 0.7236500825277057, 'eval_precision': 0.7225044484927128, 'eval_recall': 0.7236500825277057, 'eval_f1': 0.722255266750461, 'eval_confusion_matrix': [[1094, 407, 87, 21, 0], [253, 1384, 97, 26, 0], [88, 78, 404, 30, 0], [47, 22, 9, 187, 0], [6, 1, 0, 0, 0]], 'eval_runtime': 584.7346, 'eval_samples_per_second': 7.253, 'eval_steps_per_second': 0.227, 'epoch': 6.0}


 13%|█▎        | 5000/39800 [11:59:25<76:51:57,  7.95s/it]   

{'loss': 0.4629, 'grad_norm': 35.02655029296875, 'learning_rate': 1.748743718592965e-05, 'epoch': 6.28}


 14%|█▍        | 5500/39800 [13:05:04<74:28:06,  7.82s/it]

{'loss': 0.4175, 'grad_norm': 21.169736862182617, 'learning_rate': 1.7236180904522616e-05, 'epoch': 6.91}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 14%|█▍        | 5572/39800 [13:24:13<62:34:57,  6.58s/it]

{'eval_loss': 0.6055203080177307, 'eval_accuracy': 0.7533600565904268, 'eval_precision': 0.7607354827048078, 'eval_recall': 0.7533600565904268, 'eval_f1': 0.7549257950166796, 'eval_confusion_matrix': [[1180, 257, 127, 45, 0], [228, 1353, 138, 41, 0], [79, 26, 455, 40, 0], [39, 5, 14, 207, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 584.2823, 'eval_samples_per_second': 7.258, 'eval_steps_per_second': 0.228, 'epoch': 7.0}


 15%|█▌        | 6000/39800 [14:20:36<75:15:55,  8.02s/it]   

{'loss': 0.3717, 'grad_norm': 14.516680717468262, 'learning_rate': 1.698492462311558e-05, 'epoch': 7.54}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 16%|█▌        | 6368/39800 [15:18:53<60:49:30,  6.55s/it]

{'eval_loss': 0.6501244306564331, 'eval_accuracy': 0.7722235321858052, 'eval_precision': 0.7778504334872591, 'eval_recall': 0.7722235321858052, 'eval_f1': 0.7726296119123225, 'eval_confusion_matrix': [[1307, 204, 57, 41, 0], [296, 1342, 74, 48, 0], [104, 30, 413, 53, 0], [28, 15, 9, 213, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 588.3945, 'eval_samples_per_second': 7.208, 'eval_steps_per_second': 0.226, 'epoch': 8.0}


 16%|█▋        | 6500/39800 [15:36:22<72:56:06,  7.88s/it]   

{'loss': 0.3553, 'grad_norm': 13.282550811767578, 'learning_rate': 1.6733668341708545e-05, 'epoch': 8.17}


 18%|█▊        | 7000/39800 [16:42:00<71:17:20,  7.82s/it]

{'loss': 0.3251, 'grad_norm': 9.985687255859375, 'learning_rate': 1.6482412060301508e-05, 'epoch': 8.79}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 18%|█▊        | 7164/39800 [17:13:36<67:53:59,  7.49s/it]

{'eval_loss': 0.6890677809715271, 'eval_accuracy': 0.7663286960622495, 'eval_precision': 0.7663799102818849, 'eval_recall': 0.7663286960622495, 'eval_f1': 0.763510484411993, 'eval_confusion_matrix': [[1210, 344, 48, 7, 0], [216, 1490, 42, 12, 0], [119, 98, 364, 19, 0], [37, 29, 13, 186, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 597.5233, 'eval_samples_per_second': 7.098, 'eval_steps_per_second': 0.223, 'epoch': 9.0}


 19%|█▉        | 7500/39800 [17:58:12<71:29:06,  7.97s/it]   

{'loss': 0.3093, 'grad_norm': 8.797348976135254, 'learning_rate': 1.6231155778894474e-05, 'epoch': 9.42}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 20%|██        | 7960/39800 [19:09:27<65:19:54,  7.39s/it]

{'eval_loss': 0.7007471323013306, 'eval_accuracy': 0.7717519452959207, 'eval_precision': 0.7806562358060891, 'eval_recall': 0.7717519452959207, 'eval_f1': 0.7698410449701326, 'eval_confusion_matrix': [[1405, 155, 37, 12, 0], [388, 1312, 39, 21, 0], [153, 61, 368, 18, 0], [34, 22, 21, 188, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 591.5353, 'eval_samples_per_second': 7.169, 'eval_steps_per_second': 0.225, 'epoch': 10.0}


 20%|██        | 8000/39800 [19:14:55<75:49:24,  8.58s/it]   

{'loss': 0.3047, 'grad_norm': 7.572292804718018, 'learning_rate': 1.5979899497487437e-05, 'epoch': 10.05}


 21%|██▏       | 8500/39800 [20:21:37<69:28:11,  7.99s/it]

{'loss': 0.2782, 'grad_norm': 3.704681634902954, 'learning_rate': 1.5728643216080403e-05, 'epoch': 10.68}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 22%|██▏       | 8756/39800 [21:05:38<58:04:26,  6.73s/it]

{'eval_loss': 0.6647679209709167, 'eval_accuracy': 0.7785899551992455, 'eval_precision': 0.7813082727097979, 'eval_recall': 0.7785899551992455, 'eval_f1': 0.778827909355352, 'eval_confusion_matrix': [[1229, 265, 100, 15, 0], [199, 1424, 122, 15, 0], [59, 63, 461, 17, 0], [25, 26, 26, 188, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 594.7015, 'eval_samples_per_second': 7.131, 'eval_steps_per_second': 0.224, 'epoch': 11.0}


 23%|██▎       | 9000/39800 [21:38:27<69:18:26,  8.10s/it]   

{'loss': 0.2743, 'grad_norm': 1.9509462118148804, 'learning_rate': 1.547738693467337e-05, 'epoch': 11.31}


 24%|██▍       | 9500/39800 [22:45:29<67:10:40,  7.98s/it]

{'loss': 0.2639, 'grad_norm': 21.795888900756836, 'learning_rate': 1.5226130653266332e-05, 'epoch': 11.93}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 24%|██▍       | 9552/39800 [23:02:21<56:22:50,  6.71s/it]

{'eval_loss': 0.8506534099578857, 'eval_accuracy': 0.7877858995519924, 'eval_precision': 0.7889974847871749, 'eval_recall': 0.7877858995519924, 'eval_f1': 0.7870596620676812, 'eval_confusion_matrix': [[1342, 183, 63, 21, 0], [323, 1360, 54, 23, 0], [64, 69, 429, 38, 0], [23, 27, 5, 210, 0], [0, 0, 0, 7, 0]], 'eval_runtime': 593.5775, 'eval_samples_per_second': 7.145, 'eval_steps_per_second': 0.224, 'epoch': 12.0}


 25%|██▌       | 10000/39800 [24:02:06<66:01:21,  7.98s/it]  

{'loss': 0.2411, 'grad_norm': 17.64837074279785, 'learning_rate': 1.4974874371859299e-05, 'epoch': 12.56}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 26%|██▌       | 10348/39800 [24:58:43<53:44:23,  6.57s/it]

{'eval_loss': 0.8239515423774719, 'eval_accuracy': 0.7925017684508371, 'eval_precision': 0.7914036051934361, 'eval_recall': 0.7925017684508371, 'eval_f1': 0.7916076312148025, 'eval_confusion_matrix': [[1292, 251, 54, 12, 0], [247, 1439, 65, 9, 0], [87, 70, 431, 12, 0], [13, 31, 22, 199, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 615.7723, 'eval_samples_per_second': 6.887, 'eval_steps_per_second': 0.216, 'epoch': 13.0}


 26%|██▋       | 10500/39800 [25:20:28<64:53:38,  7.97s/it]   

{'loss': 0.2444, 'grad_norm': 126.51744079589844, 'learning_rate': 1.4723618090452263e-05, 'epoch': 13.19}


 28%|██▊       | 11000/39800 [26:27:02<64:06:49,  8.01s/it]

{'loss': 0.2295, 'grad_norm': 18.019622802734375, 'learning_rate': 1.4472361809045228e-05, 'epoch': 13.82}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 28%|██▊       | 11144/39800 [26:55:53<52:30:24,  6.60s/it]

{'eval_loss': 0.8167638778686523, 'eval_accuracy': 0.7967460504597972, 'eval_precision': 0.7988647154599973, 'eval_recall': 0.7967460504597972, 'eval_f1': 0.7968438876800693, 'eval_confusion_matrix': [[1307, 192, 87, 23, 0], [261, 1385, 89, 25, 0], [60, 41, 470, 29, 0], [20, 11, 17, 217, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 593.561, 'eval_samples_per_second': 7.145, 'eval_steps_per_second': 0.224, 'epoch': 14.0}


 29%|██▉       | 11500/39800 [27:42:56<62:47:40,  7.99s/it]   

{'loss': 0.2235, 'grad_norm': 6.134210109710693, 'learning_rate': 1.4221105527638192e-05, 'epoch': 14.45}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 30%|███       | 11940/39800 [28:51:17<51:14:49,  6.62s/it]

{'eval_loss': 0.8873016834259033, 'eval_accuracy': 0.7922659750058948, 'eval_precision': 0.7910291268211772, 'eval_recall': 0.7922659750058948, 'eval_f1': 0.7914357522193185, 'eval_confusion_matrix': [[1283, 247, 71, 8, 0], [253, 1441, 52, 14, 0], [62, 88, 435, 15, 0], [24, 17, 23, 201, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 585.5176, 'eval_samples_per_second': 7.243, 'eval_steps_per_second': 0.227, 'epoch': 15.0}


 30%|███       | 12000/39800 [28:59:20<60:54:32,  7.89s/it]   

{'loss': 0.2131, 'grad_norm': 2.1968119144439697, 'learning_rate': 1.3969849246231157e-05, 'epoch': 15.08}


 31%|███▏      | 12500/39800 [30:05:24<60:18:16,  7.95s/it]

{'loss': 0.213, 'grad_norm': 5.041721820831299, 'learning_rate': 1.3718592964824123e-05, 'epoch': 15.7}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 32%|███▏      | 12736/39800 [30:46:07<49:16:01,  6.55s/it]

{'eval_loss': 0.9507129788398743, 'eval_accuracy': 0.7899080405564726, 'eval_precision': 0.7885224772476256, 'eval_recall': 0.7899080405564726, 'eval_f1': 0.7890853124441576, 'eval_confusion_matrix': [[1274, 259, 66, 10, 0], [237, 1441, 68, 14, 0], [92, 72, 424, 12, 0], [20, 14, 20, 211, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 585.6758, 'eval_samples_per_second': 7.241, 'eval_steps_per_second': 0.227, 'epoch': 16.0}


 33%|███▎      | 13000/39800 [31:20:52<58:21:20,  7.84s/it]   

{'loss': 0.1913, 'grad_norm': 14.324834823608398, 'learning_rate': 1.3467336683417087e-05, 'epoch': 16.33}


 34%|███▍      | 13500/39800 [32:26:31<58:31:55,  8.01s/it]

{'loss': 0.1965, 'grad_norm': 12.473600387573242, 'learning_rate': 1.3216080402010052e-05, 'epoch': 16.96}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 34%|███▍      | 13532/39800 [32:40:24<47:30:05,  6.51s/it]

{'eval_loss': 0.9781504273414612, 'eval_accuracy': 0.7854279651025702, 'eval_precision': 0.7851547750240246, 'eval_recall': 0.7854279651025702, 'eval_f1': 0.7846862906848839, 'eval_confusion_matrix': [[1290, 231, 60, 28, 0], [257, 1406, 68, 29, 0], [85, 64, 415, 36, 0], [20, 11, 14, 220, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 583.1717, 'eval_samples_per_second': 7.272, 'eval_steps_per_second': 0.228, 'epoch': 17.0}


 35%|███▌      | 14000/39800 [33:41:53<55:54:52,  7.80s/it]   

{'loss': 0.1823, 'grad_norm': 1.4533655643463135, 'learning_rate': 1.2964824120603017e-05, 'epoch': 17.59}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 36%|███▌      | 14328/39800 [34:34:41<46:05:59,  6.52s/it]

{'eval_loss': 1.0152989625930786, 'eval_accuracy': 0.7849563782126857, 'eval_precision': 0.7842631989633511, 'eval_recall': 0.7849563782126857, 'eval_f1': 0.7834990395416612, 'eval_confusion_matrix': [[1277, 274, 50, 8, 0], [260, 1447, 43, 10, 0], [99, 88, 396, 17, 0], [25, 18, 13, 209, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 584.1429, 'eval_samples_per_second': 7.26, 'eval_steps_per_second': 0.228, 'epoch': 18.0}


 36%|███▋      | 14500/39800 [34:57:17<54:52:14,  7.81s/it]   

{'loss': 0.1782, 'grad_norm': 102.63861846923828, 'learning_rate': 1.2713567839195981e-05, 'epoch': 18.22}


 38%|███▊      | 15000/39800 [36:03:03<54:01:06,  7.84s/it]

{'loss': 0.1814, 'grad_norm': 39.0299072265625, 'learning_rate': 1.2462311557788947e-05, 'epoch': 18.84}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 38%|███▊      | 15124/39800 [36:29:06<45:00:41,  6.57s/it]

{'eval_loss': 1.0000382661819458, 'eval_accuracy': 0.7983966045743929, 'eval_precision': 0.7982879607252109, 'eval_recall': 0.7983966045743929, 'eval_f1': 0.7981032135759663, 'eval_confusion_matrix': [[1276, 236, 84, 13, 0], [225, 1434, 87, 14, 0], [57, 57, 465, 21, 0], [10, 28, 16, 211, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 583.2121, 'eval_samples_per_second': 7.272, 'eval_steps_per_second': 0.228, 'epoch': 19.0}


 39%|███▉      | 15500/39800 [37:18:43<54:26:54,  8.07s/it]   

{'loss': 0.1654, 'grad_norm': 10.261225700378418, 'learning_rate': 1.2211055276381912e-05, 'epoch': 19.47}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 40%|████      | 15920/39800 [38:23:43<43:51:58,  6.61s/it]

{'eval_loss': 1.0795623064041138, 'eval_accuracy': 0.7988681914642773, 'eval_precision': 0.798105950446219, 'eval_recall': 0.7988681914642773, 'eval_f1': 0.7978560411422579, 'eval_confusion_matrix': [[1278, 272, 52, 7, 0], [223, 1472, 54, 11, 0], [70, 82, 435, 13, 0], [18, 29, 15, 203, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 586.4637, 'eval_samples_per_second': 7.231, 'eval_steps_per_second': 0.227, 'epoch': 20.0}


 40%|████      | 16000/39800 [38:34:19<53:59:24,  8.17s/it]   

{'loss': 0.1759, 'grad_norm': 0.8818494081497192, 'learning_rate': 1.1959798994974876e-05, 'epoch': 20.1}


 41%|████▏     | 16500/39800 [39:40:21<50:34:03,  7.81s/it]

{'loss': 0.1606, 'grad_norm': 2.749199151992798, 'learning_rate': 1.170854271356784e-05, 'epoch': 20.73}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 42%|████▏     | 16716/39800 [40:18:26<42:00:31,  6.55s/it]

{'eval_loss': 1.1517727375030518, 'eval_accuracy': 0.7941523225654327, 'eval_precision': 0.7932203671159012, 'eval_recall': 0.7941523225654327, 'eval_f1': 0.7928473866885914, 'eval_confusion_matrix': [[1337, 215, 44, 13, 0], [275, 1413, 56, 16, 0], [97, 75, 409, 19, 0], [15, 24, 17, 209, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 582.1381, 'eval_samples_per_second': 7.285, 'eval_steps_per_second': 0.228, 'epoch': 21.0}


 43%|████▎     | 17000/39800 [40:55:59<50:05:28,  7.91s/it]   

{'loss': 0.1619, 'grad_norm': 1.8568977117538452, 'learning_rate': 1.1457286432160805e-05, 'epoch': 21.36}


 44%|████▍     | 17500/39800 [42:02:00<46:35:57,  7.52s/it]

{'loss': 0.1526, 'grad_norm': 0.9540131092071533, 'learning_rate': 1.120603015075377e-05, 'epoch': 21.98}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 44%|████▍     | 17512/39800 [42:12:51<38:49:00,  6.27s/it]

{'eval_loss': 1.0293322801589966, 'eval_accuracy': 0.8113652440462155, 'eval_precision': 0.8107350763387092, 'eval_recall': 0.8113652440462155, 'eval_f1': 0.8108900829731378, 'eval_confusion_matrix': [[1333, 198, 68, 10, 0], [239, 1434, 75, 12, 0], [62, 58, 462, 18, 0], [13, 22, 18, 212, 0], [0, 7, 0, 0, 0]], 'eval_runtime': 563.9361, 'eval_samples_per_second': 7.52, 'eval_steps_per_second': 0.236, 'epoch': 22.0}


 45%|████▌     | 18000/39800 [43:14:43<45:31:13,  7.52s/it]   

{'loss': 0.146, 'grad_norm': 0.837472140789032, 'learning_rate': 1.0954773869346736e-05, 'epoch': 22.61}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 46%|████▌     | 18308/39800 [44:03:18<38:46:21,  6.49s/it]

{'eval_loss': 1.0811172723770142, 'eval_accuracy': 0.7917943881160103, 'eval_precision': 0.7918172204555383, 'eval_recall': 0.7917943881160103, 'eval_f1': 0.7911751519177981, 'eval_confusion_matrix': [[1315, 227, 57, 10, 0], [278, 1405, 64, 13, 0], [105, 49, 428, 18, 0], [24, 13, 18, 210, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 567.629, 'eval_samples_per_second': 7.471, 'eval_steps_per_second': 0.234, 'epoch': 23.0}


 46%|████▋     | 18500/39800 [44:28:04<45:24:26,  7.67s/it]   

{'loss': 0.147, 'grad_norm': 0.006968795321881771, 'learning_rate': 1.07035175879397e-05, 'epoch': 23.24}


 48%|████▊     | 19000/39800 [45:34:56<46:28:09,  8.04s/it]

{'loss': 0.1468, 'grad_norm': 0.7928552031517029, 'learning_rate': 1.0452261306532665e-05, 'epoch': 23.87}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 48%|████▊     | 19104/39800 [45:59:06<39:32:32,  6.88s/it]

{'eval_loss': 1.1382555961608887, 'eval_accuracy': 0.7967460504597972, 'eval_precision': 0.7969025477546436, 'eval_recall': 0.7967460504597972, 'eval_f1': 0.7952973976863562, 'eval_confusion_matrix': [[1343, 214, 39, 13, 0], [275, 1429, 44, 12, 0], [114, 68, 397, 21, 0], [21, 15, 19, 210, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 606.4382, 'eval_samples_per_second': 6.993, 'eval_steps_per_second': 0.219, 'epoch': 24.0}


 49%|████▉     | 19500/39800 [46:52:11<44:57:54,  7.97s/it]   

{'loss': 0.1399, 'grad_norm': 2.085848331451416, 'learning_rate': 1.020100502512563e-05, 'epoch': 24.5}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 50%|█████     | 19900/39800 [47:56:42<36:30:17,  6.60s/it]

{'eval_loss': 1.1525033712387085, 'eval_accuracy': 0.7925017684508371, 'eval_precision': 0.7921580293738582, 'eval_recall': 0.7925017684508371, 'eval_f1': 0.7921403696164526, 'eval_confusion_matrix': [[1288, 234, 74, 13, 0], [263, 1400, 83, 14, 0], [59, 62, 460, 19, 0], [23, 12, 17, 213, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 600.3463, 'eval_samples_per_second': 7.064, 'eval_steps_per_second': 0.222, 'epoch': 25.0}


 50%|█████     | 20000/39800 [48:10:00<42:40:18,  7.76s/it]   

{'loss': 0.1416, 'grad_norm': 0.862608015537262, 'learning_rate': 9.949748743718594e-06, 'epoch': 25.13}


 52%|█████▏    | 20500/39800 [49:30:35<56:02:05, 10.45s/it]

{'loss': 0.1278, 'grad_norm': 1.3461982011795044, 'learning_rate': 9.698492462311559e-06, 'epoch': 25.75}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 52%|█████▏    | 20696/39800 [50:10:05<37:04:15,  6.99s/it]

{'eval_loss': 1.2413654327392578, 'eval_accuracy': 0.7953312897901438, 'eval_precision': 0.7937746307629584, 'eval_recall': 0.7953312897901438, 'eval_f1': 0.7944728342673933, 'eval_confusion_matrix': [[1277, 256, 66, 10, 0], [240, 1448, 60, 12, 0], [74, 71, 434, 21, 0], [22, 13, 16, 214, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 616.4008, 'eval_samples_per_second': 6.88, 'eval_steps_per_second': 0.216, 'epoch': 26.0}


 53%|█████▎    | 21000/39800 [50:50:34<41:21:03,  7.92s/it]   

{'loss': 0.147, 'grad_norm': 21.958770751953125, 'learning_rate': 9.447236180904523e-06, 'epoch': 26.38}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 54%|█████▍    | 21492/39800 [52:04:43<35:04:48,  6.90s/it]

{'eval_loss': 1.2282932996749878, 'eval_accuracy': 0.7917943881160103, 'eval_precision': 0.7906777284097909, 'eval_recall': 0.7917943881160103, 'eval_f1': 0.7905143500862193, 'eval_confusion_matrix': [[1285, 266, 47, 11, 0], [244, 1452, 53, 11, 0], [120, 58, 404, 18, 0], [24, 10, 14, 217, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 581.9468, 'eval_samples_per_second': 7.288, 'eval_steps_per_second': 0.229, 'epoch': 27.0}


 54%|█████▍    | 21500/39800 [52:05:50<112:47:39, 22.19s/it] 

{'loss': 0.1365, 'grad_norm': 0.01799265295267105, 'learning_rate': 9.195979899497488e-06, 'epoch': 27.01}


 55%|█████▌    | 22000/39800 [53:14:52<43:03:17,  8.71s/it] 

{'loss': 0.1281, 'grad_norm': 2.576378107070923, 'learning_rate': 8.944723618090452e-06, 'epoch': 27.64}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 56%|█████▌    | 22288/39800 [54:09:01<37:47:28,  7.77s/it]

{'eval_loss': 1.2009509801864624, 'eval_accuracy': 0.7941523225654327, 'eval_precision': 0.7938707162322557, 'eval_recall': 0.7941523225654327, 'eval_f1': 0.7935855112162391, 'eval_confusion_matrix': [[1314, 223, 65, 7, 0], [271, 1409, 69, 11, 0], [94, 61, 429, 16, 0], [21, 9, 19, 216, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 608.0337, 'eval_samples_per_second': 6.975, 'eval_steps_per_second': 0.219, 'epoch': 28.0}


 57%|█████▋    | 22500/39800 [54:38:27<38:39:43,  8.05s/it]  

{'loss': 0.1269, 'grad_norm': 10.840523719787598, 'learning_rate': 8.693467336683418e-06, 'epoch': 28.27}


 58%|█████▊    | 23000/39800 [55:45:51<37:22:59,  8.01s/it]

{'loss': 0.1298, 'grad_norm': 0.6617726683616638, 'learning_rate': 8.442211055276383e-06, 'epoch': 28.89}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 58%|█████▊    | 23084/39800 [56:07:04<31:04:43,  6.69s/it]

{'eval_loss': 1.1584004163742065, 'eval_accuracy': 0.7941523225654327, 'eval_precision': 0.7931799158609996, 'eval_recall': 0.7941523225654327, 'eval_f1': 0.7931832005317673, 'eval_confusion_matrix': [[1263, 261, 61, 24, 0], [220, 1454, 61, 25, 0], [77, 61, 425, 37, 0], [18, 8, 13, 226, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 600.6749, 'eval_samples_per_second': 7.06, 'eval_steps_per_second': 0.221, 'epoch': 29.0}


 59%|█████▉    | 23500/39800 [57:02:59<36:01:59,  7.96s/it]  

{'loss': 0.129, 'grad_norm': 1.182868480682373, 'learning_rate': 8.190954773869347e-06, 'epoch': 29.52}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 60%|██████    | 23880/39800 [58:03:55<32:09:08,  7.27s/it]

{'eval_loss': 1.2160433530807495, 'eval_accuracy': 0.7976892242395661, 'eval_precision': 0.7963964282026318, 'eval_recall': 0.7976892242395661, 'eval_f1': 0.7966351633856811, 'eval_confusion_matrix': [[1303, 242, 51, 13, 0], [244, 1447, 52, 17, 0], [100, 62, 420, 18, 0], [22, 12, 18, 213, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 603.8317, 'eval_samples_per_second': 7.023, 'eval_steps_per_second': 0.22, 'epoch': 30.0}


 60%|██████    | 24000/39800 [58:20:03<35:03:47,  7.99s/it]  

{'loss': 0.1221, 'grad_norm': 1.2236837148666382, 'learning_rate': 7.939698492462312e-06, 'epoch': 30.15}


 62%|██████▏   | 24500/39800 [59:27:15<34:25:03,  8.10s/it]

{'loss': 0.1222, 'grad_norm': 0.4154203534126282, 'learning_rate': 7.688442211055276e-06, 'epoch': 30.78}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 62%|██████▏   | 24676/39800 [60:00:42<27:44:35,  6.60s/it]

{'eval_loss': 1.314376950263977, 'eval_accuracy': 0.8014619193586419, 'eval_precision': 0.8008398719003855, 'eval_recall': 0.8014619193586419, 'eval_f1': 0.8010232416158285, 'eval_confusion_matrix': [[1296, 229, 72, 12, 0], [239, 1425, 77, 19, 0], [63, 54, 457, 26, 0], [19, 8, 17, 221, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 596.5624, 'eval_samples_per_second': 7.109, 'eval_steps_per_second': 0.223, 'epoch': 31.0}


 63%|██████▎   | 25000/39800 [60:44:13<32:45:41,  7.97s/it]  

{'loss': 0.1185, 'grad_norm': 1.0674813985824585, 'learning_rate': 7.437185929648242e-06, 'epoch': 31.41}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 64%|██████▍   | 25472/39800 [61:57:15<26:33:54,  6.67s/it]

{'eval_loss': 1.2672585248947144, 'eval_accuracy': 0.798632398019335, 'eval_precision': 0.7987784914280219, 'eval_recall': 0.798632398019335, 'eval_f1': 0.7983852599135177, 'eval_confusion_matrix': [[1270, 240, 81, 18, 0], [218, 1433, 92, 17, 0], [49, 56, 465, 30, 0], [20, 11, 15, 219, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 598.6046, 'eval_samples_per_second': 7.085, 'eval_steps_per_second': 0.222, 'epoch': 32.0}


 64%|██████▍   | 25500/39800 [62:01:06<31:21:53,  7.90s/it]  

{'loss': 0.1169, 'grad_norm': 0.4861578941345215, 'learning_rate': 7.185929648241206e-06, 'epoch': 32.04}


 65%|██████▌   | 26000/39800 [63:08:02<30:33:33,  7.97s/it]

{'loss': 0.1141, 'grad_norm': 1.5366400480270386, 'learning_rate': 6.934673366834172e-06, 'epoch': 32.66}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 66%|██████▌   | 26268/39800 [63:54:13<25:27:01,  6.77s/it]

{'eval_loss': 1.3399474620819092, 'eval_accuracy': 0.796510257014855, 'eval_precision': 0.7957467334056355, 'eval_recall': 0.796510257014855, 'eval_f1': 0.7957295631595226, 'eval_confusion_matrix': [[1309, 224, 60, 16, 0], [274, 1425, 48, 13, 0], [85, 63, 430, 22, 0], [22, 10, 19, 214, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 609.365, 'eval_samples_per_second': 6.96, 'eval_steps_per_second': 0.218, 'epoch': 33.0}


 67%|██████▋   | 26500/39800 [64:26:00<30:13:37,  8.18s/it]  

{'loss': 0.1157, 'grad_norm': 0.8603045344352722, 'learning_rate': 6.683417085427136e-06, 'epoch': 33.29}


 68%|██████▊   | 27000/39800 [65:34:18<29:02:01,  8.17s/it]

{'loss': 0.1143, 'grad_norm': 2.1132845878601074, 'learning_rate': 6.4321608040201015e-06, 'epoch': 33.92}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 68%|██████▊   | 27064/39800 [65:53:08<24:19:56,  6.88s/it]

{'eval_loss': 1.3002879619598389, 'eval_accuracy': 0.7979250176845084, 'eval_precision': 0.7965244153403084, 'eval_recall': 0.7979250176845084, 'eval_f1': 0.797206149218108, 'eval_confusion_matrix': [[1277, 246, 71, 15, 0], [233, 1453, 62, 12, 0], [64, 70, 444, 22, 0], [22, 11, 22, 210, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 611.1419, 'eval_samples_per_second': 6.939, 'eval_steps_per_second': 0.218, 'epoch': 34.0}


 69%|██████▉   | 27500/39800 [66:52:59<27:53:43,  8.16s/it]  

{'loss': 0.1116, 'grad_norm': 1.6098898649215698, 'learning_rate': 6.180904522613066e-06, 'epoch': 34.55}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 70%|███████   | 27860/39800 [67:52:23<22:34:05,  6.80s/it]

{'eval_loss': 1.260278344154358, 'eval_accuracy': 0.798632398019335, 'eval_precision': 0.7975192104627009, 'eval_recall': 0.798632398019335, 'eval_f1': 0.7980377503176994, 'eval_confusion_matrix': [[1283, 239, 72, 15, 0], [230, 1445, 72, 13, 0], [60, 71, 451, 18, 0], [24, 15, 18, 208, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 614.719, 'eval_samples_per_second': 6.899, 'eval_steps_per_second': 0.216, 'epoch': 35.0}


 70%|███████   | 28000/39800 [68:11:36<26:43:58,  8.16s/it]  

{'loss': 0.1201, 'grad_norm': 1.0159612894058228, 'learning_rate': 5.9296482412060305e-06, 'epoch': 35.18}


 72%|███████▏  | 28500/39800 [69:21:01<26:25:11,  8.42s/it]

{'loss': 0.1068, 'grad_norm': 0.0016670286422595382, 'learning_rate': 5.678391959798996e-06, 'epoch': 35.8}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 72%|███████▏  | 28656/39800 [69:51:40<19:47:04,  6.39s/it]

{'eval_loss': 1.3601526021957397, 'eval_accuracy': 0.7988681914642773, 'eval_precision': 0.7977050908217553, 'eval_recall': 0.7988681914642773, 'eval_f1': 0.7982587161428979, 'eval_confusion_matrix': [[1278, 244, 73, 14, 0], [222, 1451, 76, 11, 0], [70, 65, 444, 21, 0], [16, 16, 18, 215, 0], [6, 1, 0, 0, 0]], 'eval_runtime': 567.3935, 'eval_samples_per_second': 7.475, 'eval_steps_per_second': 0.234, 'epoch': 36.0}


 73%|███████▎  | 29000/39800 [70:35:18<22:31:21,  7.51s/it]  

{'loss': 0.1057, 'grad_norm': 0.014761482365429401, 'learning_rate': 5.42713567839196e-06, 'epoch': 36.43}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 74%|███████▍  | 29452/39800 [71:42:36<18:19:12,  6.37s/it]

{'eval_loss': 1.3126016855239868, 'eval_accuracy': 0.7979250176845084, 'eval_precision': 0.7971922124174975, 'eval_recall': 0.7979250176845084, 'eval_f1': 0.7973705189688949, 'eval_confusion_matrix': [[1294, 230, 67, 18, 0], [231, 1431, 75, 23, 0], [70, 60, 439, 31, 0], [16, 13, 16, 220, 0], [1, 0, 0, 6, 0]], 'eval_runtime': 575.9802, 'eval_samples_per_second': 7.363, 'eval_steps_per_second': 0.231, 'epoch': 37.0}


 74%|███████▍  | 29500/39800 [71:48:49<21:53:19,  7.65s/it]  

{'loss': 0.1139, 'grad_norm': 0.8411106467247009, 'learning_rate': 5.175879396984925e-06, 'epoch': 37.06}


 75%|███████▌  | 30000/39800 [72:55:00<21:40:31,  7.96s/it]

{'loss': 0.1087, 'grad_norm': 1.797417402267456, 'learning_rate': 4.92462311557789e-06, 'epoch': 37.69}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 76%|███████▌  | 30248/39800 [73:40:31<18:53:49,  7.12s/it]

{'eval_loss': 1.3691126108169556, 'eval_accuracy': 0.7974534307946239, 'eval_precision': 0.7976136753670989, 'eval_recall': 0.7974534307946239, 'eval_f1': 0.7970384144205089, 'eval_confusion_matrix': [[1294, 222, 68, 25, 0], [239, 1423, 71, 27, 0], [70, 56, 439, 35, 0], [10, 12, 17, 226, 0], [0, 0, 0, 7, 0]], 'eval_runtime': 610.479, 'eval_samples_per_second': 6.947, 'eval_steps_per_second': 0.218, 'epoch': 38.0}


 77%|███████▋  | 30500/39800 [74:16:26<22:15:04,  8.61s/it]  

{'loss': 0.1027, 'grad_norm': 1.7076877355575562, 'learning_rate': 4.673366834170855e-06, 'epoch': 38.32}


 78%|███████▊  | 31000/39800 [75:31:35<20:13:48,  8.28s/it]

{'loss': 0.1088, 'grad_norm': 1.6430957317352295, 'learning_rate': 4.42211055276382e-06, 'epoch': 38.94}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 78%|███████▊  | 31044/39800 [75:48:17<18:18:32,  7.53s/it]

{'eval_loss': 1.3848220109939575, 'eval_accuracy': 0.799575571799104, 'eval_precision': 0.7989704347399486, 'eval_recall': 0.799575571799104, 'eval_f1': 0.7991833081045505, 'eval_confusion_matrix': [[1293, 224, 75, 17, 0], [233, 1431, 80, 16, 0], [75, 56, 450, 19, 0], [19, 11, 18, 217, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 595.517, 'eval_samples_per_second': 7.122, 'eval_steps_per_second': 0.223, 'epoch': 39.0}


 79%|███████▉  | 31500/39800 [76:45:39<17:11:57,  7.46s/it]  

{'loss': 0.1035, 'grad_norm': 2.196545124053955, 'learning_rate': 4.170854271356784e-06, 'epoch': 39.57}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 80%|████████  | 31840/39800 [77:40:17<14:09:51,  6.41s/it]

{'eval_loss': 1.323240041732788, 'eval_accuracy': 0.804763027587833, 'eval_precision': 0.8036765955747731, 'eval_recall': 0.804763027587833, 'eval_f1': 0.8041858151525171, 'eval_confusion_matrix': [[1300, 225, 69, 15, 0], [228, 1444, 70, 18, 0], [71, 57, 451, 21, 0], [18, 13, 16, 218, 0], [5, 0, 0, 2, 0]], 'eval_runtime': 581.1479, 'eval_samples_per_second': 7.298, 'eval_steps_per_second': 0.229, 'epoch': 40.0}


 80%|████████  | 32000/39800 [78:02:37<16:10:22,  7.46s/it]  

{'loss': 0.1008, 'grad_norm': 1.4505665302276611, 'learning_rate': 3.919597989949749e-06, 'epoch': 40.2}


 82%|████████▏ | 32500/39800 [79:06:25<15:40:54,  7.73s/it]

{'loss': 0.1017, 'grad_norm': 1.0812832117080688, 'learning_rate': 3.6683417085427137e-06, 'epoch': 40.83}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 82%|████████▏ | 32636/39800 [79:36:30<15:37:23,  7.85s/it]

{'eval_loss': 1.3625993728637695, 'eval_accuracy': 0.8054704079226598, 'eval_precision': 0.8058073932087204, 'eval_recall': 0.8054704079226598, 'eval_f1': 0.8050915752405592, 'eval_confusion_matrix': [[1326, 196, 68, 19, 0], [243, 1415, 74, 28, 0], [73, 50, 449, 28, 0], [11, 11, 17, 226, 0], [2, 0, 0, 5, 0]], 'eval_runtime': 669.7436, 'eval_samples_per_second': 6.332, 'eval_steps_per_second': 0.199, 'epoch': 41.0}


 83%|████████▎ | 33000/39800 [80:27:20<16:04:18,  8.51s/it]  

{'loss': 0.1081, 'grad_norm': 1.5863286256790161, 'learning_rate': 3.4170854271356786e-06, 'epoch': 41.46}


 84%|████████▍ | 33432/39800 [81:28:43<13:44:08,  7.77s/it]
 84%|████████▍ | 33432/39800 [81:39:23<13:44:08,  7.77s/it]

{'eval_loss': 1.3346153497695923, 'eval_accuracy': 0.8024050931384108, 'eval_precision': 0.8011759234624877, 'eval_recall': 0.8024050931384108, 'eval_f1': 0.801685245087311, 'eval_confusion_matrix': [[1304, 229, 60, 15, 1], [233, 1449, 65, 13, 0], [87, 58, 434, 21, 0], [21, 13, 15, 216, 0], [6, 0, 0, 1, 0]], 'eval_runtime': 639.7503, 'eval_samples_per_second': 6.629, 'eval_steps_per_second': 0.208, 'epoch': 42.0}


 84%|████████▍ | 33500/39800 [81:49:27<17:17:26,  9.88s/it]  

{'loss': 0.0968, 'grad_norm': 0.0020379135385155678, 'learning_rate': 3.165829145728643e-06, 'epoch': 42.09}


 85%|████████▌ | 34000/39800 [82:57:29<12:31:23,  7.77s/it]

{'loss': 0.1048, 'grad_norm': 0.003478741506114602, 'learning_rate': 2.914572864321608e-06, 'epoch': 42.71}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 86%|████████▌ | 34228/39800 [83:37:02<10:07:07,  6.54s/it]

{'eval_loss': 1.3343160152435303, 'eval_accuracy': 0.8019335062485263, 'eval_precision': 0.8012778902723263, 'eval_recall': 0.8019335062485263, 'eval_f1': 0.8014873453015078, 'eval_confusion_matrix': [[1301, 218, 74, 16, 0], [237, 1433, 73, 17, 0], [77, 49, 450, 24, 0], [20, 13, 15, 217, 0], [7, 0, 0, 0, 0]], 'eval_runtime': 583.5609, 'eval_samples_per_second': 7.267, 'eval_steps_per_second': 0.228, 'epoch': 43.0}


 87%|████████▋ | 34500/39800 [84:12:55<11:46:28,  8.00s/it]  

{'loss': 0.098, 'grad_norm': 2.876899003982544, 'learning_rate': 2.663316582914573e-06, 'epoch': 43.34}


 88%|████████▊ | 35000/39800 [85:18:28<10:25:33,  7.82s/it]

{'loss': 0.1028, 'grad_norm': 0.9905723333358765, 'learning_rate': 2.412060301507538e-06, 'epoch': 43.97}


 88%|████████▊ | 35024/39800 [85:21:37<8:55:06,  6.72s/it] 
 88%|████████▊ | 35024/39800 [85:31:25<8:55:06,  6.72s/it]

{'eval_loss': 1.3644397258758545, 'eval_accuracy': 0.802640886583353, 'eval_precision': 0.803832415462323, 'eval_recall': 0.802640886583353, 'eval_f1': 0.803063443721072, 'eval_confusion_matrix': [[1292, 223, 73, 19, 2], [217, 1442, 76, 22, 3], [65, 53, 451, 30, 1], [16, 14, 15, 218, 2], [2, 0, 0, 4, 1]], 'eval_runtime': 587.3559, 'eval_samples_per_second': 7.22, 'eval_steps_per_second': 0.226, 'epoch': 44.0}


 89%|████████▉ | 35500/39800 [86:34:00<9:37:25,  8.06s/it]   

{'loss': 0.0957, 'grad_norm': 1.3867244720458984, 'learning_rate': 2.1608040201005025e-06, 'epoch': 44.6}


 90%|█████████ | 35820/39800 [87:16:07<7:12:51,  6.53s/it] 
 90%|█████████ | 35820/39800 [87:25:50<7:12:51,  6.53s/it]

{'eval_loss': 1.3755415678024292, 'eval_accuracy': 0.802640886583353, 'eval_precision': 0.8026170153977293, 'eval_recall': 0.802640886583353, 'eval_f1': 0.8024732746025558, 'eval_confusion_matrix': [[1285, 231, 74, 17, 2], [213, 1447, 75, 24, 1], [65, 55, 453, 27, 0], [14, 15, 17, 219, 0], [2, 2, 0, 3, 0]], 'eval_runtime': 583.1189, 'eval_samples_per_second': 7.273, 'eval_steps_per_second': 0.228, 'epoch': 45.0}


 90%|█████████ | 36000/39800 [87:49:37<8:15:52,  7.83s/it]   

{'loss': 0.1025, 'grad_norm': 0.0008905731374397874, 'learning_rate': 1.9095477386934674e-06, 'epoch': 45.23}


 92%|█████████▏| 36500/39800 [88:55:34<7:12:09,  7.86s/it]

{'loss': 0.096, 'grad_norm': 2.4420628547668457, 'learning_rate': 1.6582914572864323e-06, 'epoch': 45.85}


 92%|█████████▏| 36616/39800 [89:12:10<6:44:26,  7.62s/it]
 92%|█████████▏| 36616/39800 [89:23:57<6:44:26,  7.62s/it]

{'eval_loss': 1.4035056829452515, 'eval_accuracy': 0.801697712803584, 'eval_precision': 0.8008653672728058, 'eval_recall': 0.801697712803584, 'eval_f1': 0.8012057645586385, 'eval_confusion_matrix': [[1290, 230, 71, 18, 0], [218, 1450, 70, 22, 0], [76, 54, 443, 26, 1], [17, 14, 17, 217, 0], [3, 1, 0, 3, 0]], 'eval_runtime': 706.8325, 'eval_samples_per_second': 6.0, 'eval_steps_per_second': 0.188, 'epoch': 46.0}


 93%|█████████▎| 37000/39800 [90:15:07<6:32:54,  8.42s/it]   

{'loss': 0.0963, 'grad_norm': 0.8375272154808044, 'learning_rate': 1.407035175879397e-06, 'epoch': 46.48}


 94%|█████████▍| 37412/39800 [91:14:43<4:30:38,  6.80s/it]
 94%|█████████▍| 37412/39800 [91:26:14<4:30:38,  6.80s/it]

{'eval_loss': 1.4052531719207764, 'eval_accuracy': 0.8033482669181796, 'eval_precision': 0.8029353031477453, 'eval_recall': 0.8033482669181796, 'eval_f1': 0.8030612181282011, 'eval_confusion_matrix': [[1289, 228, 75, 16, 1], [215, 1450, 73, 21, 1], [69, 54, 451, 26, 0], [17, 13, 18, 217, 0], [4, 1, 0, 2, 0]], 'eval_runtime': 690.8687, 'eval_samples_per_second': 6.139, 'eval_steps_per_second': 0.193, 'epoch': 47.0}


 94%|█████████▍| 37500/39800 [91:39:33<5:14:38,  8.21s/it]   

{'loss': 0.0923, 'grad_norm': 1.40641450881958, 'learning_rate': 1.155778894472362e-06, 'epoch': 47.11}


 95%|█████████▌| 38000/39800 [92:50:39<3:46:30,  7.55s/it]

{'loss': 0.0948, 'grad_norm': 2.0500423908233643, 'learning_rate': 9.045226130653267e-07, 'epoch': 47.74}


 96%|█████████▌| 38208/39800 [93:17:30<2:49:56,  6.40s/it]
 96%|█████████▌| 38208/39800 [93:27:08<2:49:56,  6.40s/it]

{'eval_loss': 1.4233304262161255, 'eval_accuracy': 0.8005187455788729, 'eval_precision': 0.7995837971084439, 'eval_recall': 0.8005187455788729, 'eval_f1': 0.8000023014980355, 'eval_confusion_matrix': [[1296, 226, 71, 15, 1], [231, 1442, 68, 19, 0], [77, 55, 442, 26, 0], [20, 14, 16, 215, 0], [4, 1, 0, 2, 0]], 'eval_runtime': 577.8941, 'eval_samples_per_second': 7.339, 'eval_steps_per_second': 0.23, 'epoch': 48.0}


 97%|█████████▋| 38500/39800 [94:06:26<2:51:34,  7.92s/it]  

{'loss': 0.0929, 'grad_norm': 1.1098741292953491, 'learning_rate': 6.532663316582916e-07, 'epoch': 48.37}


 98%|█████████▊| 39000/39800 [95:36:44<2:59:34, 13.47s/it]

{'loss': 0.0947, 'grad_norm': 2.001002788543701, 'learning_rate': 4.0201005025125634e-07, 'epoch': 48.99}


 98%|█████████▊| 39004/39800 [95:37:36<2:42:29, 12.25s/it]
 98%|█████████▊| 39004/39800 [95:51:54<2:42:29, 12.25s/it]

{'eval_loss': 1.4218014478683472, 'eval_accuracy': 0.8009903324687574, 'eval_precision': 0.8002567944985194, 'eval_recall': 0.8009903324687574, 'eval_f1': 0.8005954972712875, 'eval_confusion_matrix': [[1296, 225, 71, 16, 1], [225, 1444, 72, 18, 1], [76, 56, 443, 25, 0], [18, 16, 17, 214, 0], [3, 2, 0, 2, 0]], 'eval_runtime': 858.2817, 'eval_samples_per_second': 4.941, 'eval_steps_per_second': 0.155, 'epoch': 49.0}


 99%|█████████▉| 39500/39800 [97:09:21<48:20,  9.67s/it]    

{'loss': 0.0914, 'grad_norm': 5.636263847351074, 'learning_rate': 1.5075376884422112e-07, 'epoch': 49.62}


100%|██████████| 39800/39800 [98:01:40<00:00,  6.93s/it]  
100%|██████████| 39800/39800 [98:12:11<00:00,  6.93s/it]

{'eval_loss': 1.418050765991211, 'eval_accuracy': 0.801697712803584, 'eval_precision': 0.8008813862004194, 'eval_recall': 0.801697712803584, 'eval_f1': 0.80126711689309, 'eval_confusion_matrix': [[1296, 225, 71, 16, 1], [225, 1447, 69, 18, 1], [75, 57, 443, 25, 0], [17, 17, 17, 214, 0], [3, 2, 0, 2, 0]], 'eval_runtime': 622.639, 'eval_samples_per_second': 6.811, 'eval_steps_per_second': 0.214, 'epoch': 50.0}


100%|██████████| 39800/39800 [98:12:16<00:00,  8.88s/it]

{'train_runtime': 353536.8485, 'train_samples_per_second': 1.8, 'train_steps_per_second': 0.113, 'train_loss': 0.2421450955664093, 'epoch': 50.0}





TrainOutput(global_step=39800, training_loss=0.2421450955664093, metrics={'train_runtime': 353536.8485, 'train_samples_per_second': 1.8, 'train_steps_per_second': 0.113, 'total_flos': 4.87731404788992e+16, 'train_loss': 0.2421450955664093, 'epoch': 50.0})

# Evaluating on the test set

In [43]:
test_results = trainer.evaluate(processed_dataset["test"])
print("Test Results:", test_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 133/133 [10:21<00:00,  4.67s/it]

Test Results: {'eval_loss': 1.1231170892715454, 'eval_accuracy': 0.794388116010375, 'eval_precision': 0.7933377948889511, 'eval_recall': 0.794388116010375, 'eval_f1': 0.7937734178572083, 'eval_confusion_matrix': [[1383, 235, 64, 18, 0], [248, 1338, 78, 25, 0], [65, 49, 450, 19, 0], [14, 30, 19, 198, 0], [0, 7, 1, 0, 0]], 'eval_runtime': 626.1541, 'eval_samples_per_second': 6.773, 'eval_steps_per_second': 0.212, 'epoch': 50.0}



