In [None]:
!pip install torch transformers accelerate bitsandbytes datasets torch-tb-profiler peft evaluate



In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from evaluate import load
from torch.profiler import profile, record_function, ProfilerActivity
import time

# TinyBERT checkpoint that is profiled and evaluated below.
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach a 3-way head (MNLI: "entailment", "neutral", "contradiction") and put
# the model in inference mode right away.
# NOTE: the loader reports that the classifier weights are newly initialized,
# so accuracy computed with this untrained head checks the pipeline, not the
# model's quality.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=3, ignore_mismatched_sizes=True
).eval()

# GLUE/MNLI metric plus a 100-example slice of the matched validation split;
# the slice is kept small because it is only used for profiling.
metric = load("glue", "mnli")
dataset = load_dataset("glue", "mnli", split="validation_matched[:100]")
# Preprocess the input data for MNLI
def preprocess(example):
    """Tokenize one MNLI example into tensors the model can consume.

    Args:
        example: A mapping with "premise", "hypothesis" and "label" entries.

    Returns:
        The tokenizer output (PyTorch tensors, padded/truncated to 512 tokens)
        with an extra "labels" tensor that carries a leading batch axis.
    """
    encoded = tokenizer(
        example["premise"],
        example["hypothesis"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Indexing with None adds the leading batch axis to the label tensor.
    encoded["labels"] = torch.tensor(example["label"])[None]
    return encoded

# Tokenize every example once, before any of the timed/profiled loops run.
inputs = [preprocess(example) for example in dataset]

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Step 1: Baseline Inference Latency Profiling
# Measures the wall-clock time of one pass over `inputs` (batch size 1,
# host-to-device copies included).
print("Running inference latency profiling...")
with torch.no_grad():
    # CUDA kernels are launched asynchronously: drain any pending GPU work
    # before starting the clock so it is not billed to this loop.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    for input_data in inputs:
        input_data = {k: v.to(device) for k, v in input_data.items()}
        outputs = model(**input_data)
    # Wait for the last forward passes to actually finish on the GPU; without
    # this the timer can stop while kernels are still queued.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end_time = time.perf_counter()
    print(f"Inference Latency: {end_time - start_time:.4f} seconds")

# Step 2: Detailed Profiling with PyTorch Profiler
print("\nRunning detailed profiling...")

# The notebook falls back to CPU when no GPU is present, so only request CUDA
# tracing (and sort the report by CUDA time) when CUDA is actually available;
# otherwise trace and rank by CPU time.
profiler_activities = [ProfilerActivity.CPU]
profiler_sort_key = "cpu_time_total"
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)
    profiler_sort_key = "cuda_time_total"

with profile(activities=profiler_activities,
             record_shapes=True, with_stack=True) as prof:
    with torch.no_grad():
        for input_data in inputs:
            # Label each forward pass so it shows up as one named row/region.
            with record_function("model_inference"):
                outputs = model(**{k: v.to(device) for k, v in input_data.items()})

# Print the ten most expensive entries and export a Chrome-trace file
print(prof.key_averages().table(sort_by=profiler_sort_key, row_limit=10))
prof.export_chrome_trace("glue_mnli_profiler_trace.json")
print("Profiler data saved to glue_mnli_profiler_trace.json")

# Step 3: Compute Accuracy
print("\nComputing accuracy...")
predictions, labels = [], []

with torch.no_grad():
    for input_data in inputs:
        input_data = {name: tensor.to(device) for name, tensor in input_data.items()}
        outputs = model(**input_data)
        logits = outputs.logits
        # Each input holds a single example, so both the predicted class and
        # the reference label reduce to one Python int.
        predictions.append(logits.argmax(dim=-1).item())
        labels.append(input_data["labels"].item())

accuracy = metric.compute(
    predictions=predictions,
    references=labels,
)["accuracy"]
print(f"Accuracy on GLUE (MNLI) dataset: {accuracy:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running inference latency profiling...
Inference Latency: 0.6742 seconds

Running detailed profiling...
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us     628.758ms        73.54%     628.758ms       6.288ms           100  
                                        model_inference        39.82%     260.339ms       100.00%     6

In [None]:
!pip install wandb -Uq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import wandb
wandb.login()

# Sweep definition: hyperparameter combinations are drawn at random.
sweep_config = {
    'method': 'random'
}

# Objective the sweep reports and ranks runs by. The name must match a key the
# runs actually log: the training runs log the validation loss as "eval/loss"
# (their summaries contain "eval/loss" and "train/loss" but no bare "loss").
metric = {
    'name': 'eval/loss',
    'goal': 'minimize'
}

sweep_config['metric'] = metric

# Search space; every key below is exposed to a run through `wandb.config`.
parameters_dict = {
    # Optimizer Selection
    'optimizer': {
        'values': ['adamw_torch', 'adafactor', 'adamw_hf', 'adamw_8bit']
    },

    # Learning rate, sampled log-uniformly between the two bounds
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
    },

    # Learning Rate Scheduler
    'lr_scheduler': {
        'values': [
            'linear',
            'cosine',
            'cosine_with_restarts',
            'polynomial',
            'constant',
            'constant_with_warmup'
        ]
    },

    # Weight Decay
    'weight_decay': {
        'values': [0.0, 0.01, 0.001, 0.1]
    },

    # Warm-up Steps (as a fraction of total training steps)
    'warmup_ratio': {
        'values': [0.0, 0.05, 0.1, 0.15]
    },

    # Per-device batch size
    'train_batch_size': {
        'values': [4, 8, 16, 32]
    },

    # Micro-batches accumulated before each optimizer step
    'gradient_accumulation_steps': {
        'values': [2, 4, 8, 16, 32]
    },

    # LoRA Hyperparameters
    'lora_r': {
        'values': [8, 16, 32]
    },
    'lora_alpha': {
        'values': [8, 16, 32]
    },
    'lora_dropout': {
        'values': [0.05, 0.1, 0.15]
    }
}

sweep_config['parameters'] = parameters_dict

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Register the sweep definition with W&B under the given project; the returned
# ID is what the agent is later pointed at to fetch configurations.
sweep_id = wandb.sweep(sweep_config, project="TinyBert with GLUE on Kaggle")

Create sweep with ID: giymvsf9
Sweep URL: https://wandb.ai/garima440-new-york-university/TinyBert%20with%20GLUE%20on%20Kaggle/sweeps/giymvsf9


In [None]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
from evaluate import load
from peft import LoraConfig, get_peft_model
import os
import wandb
from transformers.integrations import WandbCallback
from transformers.trainer_callback import TrainerCallback

# Process-wide record of the best run so far and where its checkpoint lives;
# both are read (and the accuracy updated) by SaveBestAcrossSweepsCallback.
global_best_model_checkpoint = "./global_best_model"
global_best_accuracy = 0

# Checkpoint used for fine-tuning, with its tokenizer.
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence classifier with a 3-way MNLI head, kept in full precision.
# train() declares `model` global and rebinds it to the LoRA-wrapped model.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    ignore_mismatched_sizes=True,
    num_labels=3,
)

class SaveBestAcrossSweepsCallback(TrainerCallback):
    """Keep a single checkpoint of the best-scoring run seen in this process.

    The callback remembers the metrics of the most recent evaluation and, when
    training ends, compares the tracked metric against the module-level
    ``global_best_accuracy``. If the run is better, the model and the
    module-level ``tokenizer`` are written to ``global_best_model_checkpoint``.

    The global best is only raised after the checkpoint has actually been
    written, so a failed save can never leave the tracker ahead of what is on
    disk (which would block later, better-than-disk runs from being saved).

    Args:
        metric_name: Key looked up in the evaluation metrics dict.
    """

    def __init__(self, metric_name="eval_accuracy"):
        self.metric_name = metric_name
        # Metrics dict from the latest evaluation; None until one has happened.
        self.last_eval_metrics = None

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Remember (and print) the metrics of the evaluation that just ran."""
        self.last_eval_metrics = metrics
        print(f"Evaluation Metrics: {metrics}")

    def on_train_end(self, args, state, control, **kwargs):
        """Save the model if this run's last evaluation beats the global best."""
        global global_best_accuracy, global_best_model_checkpoint

        if not self.last_eval_metrics:
            print("No evaluation metrics found for this sweep.")
            return

        # The score is the one from the final evaluation, i.e. it describes
        # the weights the model holds right now.
        current_accuracy = self.last_eval_metrics.get(self.metric_name, 0)
        print(f"Sweep Evaluation Accuracy: {current_accuracy:.4f}")

        if current_accuracy <= global_best_accuracy:
            return

        print(f"New global best {self.metric_name} found: {current_accuracy:.4f}")

        model = kwargs.get('model')
        if model is None:
            # Nothing to write; leave the tracker untouched so a later run
            # with this score can still be saved.
            print("Error saving global best model: no model was passed to the callback")
            return

        # Save the best model and tokenizer. Saving stays best-effort (a
        # failure must not abort the sweep) but it is reported and the global
        # best is not advanced.
        try:
            model.save_pretrained(global_best_model_checkpoint)
            tokenizer.save_pretrained(global_best_model_checkpoint)
        except Exception as e:
            print(f"Error saving global best model: {e}")
            return

        # The checkpoint is on disk: only now commit the new best score.
        global_best_accuracy = current_accuracy

        # Log the best model to wandb. An upload problem is reported on its
        # own and does not invalidate the checkpoint that was just written.
        if wandb.run:
            try:
                wandb.save(os.path.join(global_best_model_checkpoint, "*"))
            except Exception as e:
                print(f"Error uploading global best model to wandb: {e}")

        print(f"Global best model saved to {global_best_model_checkpoint}")

def train(config=None):
    """Run one sweep trial: LoRA fine-tuning of TinyBERT on an MNLI subset.

    Opens a W&B run, reads the sampled hyperparameters from ``wandb.config``,
    wraps a freshly loaded base model with a LoRA adapter and trains it for
    five epochs with the Hugging Face ``Trainer``, evaluating after each epoch.

    Every trial starts from a new copy of the pretrained checkpoint. Wrapping
    the shared global model again on each call would stack trials on top of a
    model that earlier trials had already adapted and trained, so results
    would depend on trial order.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``;
            inside a sweep the agent supplies the values.

    Side effects:
        Rebinds the module-level ``model`` to this trial's LoRA-wrapped model,
        logs to W&B, and (through SaveBestAcrossSweepsCallback) may write the
        global best checkpoint.
    """
    global model
    with wandb.init(config=config):
        config = wandb.config

        lora_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            target_modules=["query", "value"],
            lora_dropout=config.lora_dropout,
            bias="none",
            task_type="SEQ_CLS"
        )

        # Fresh base model per trial so no adapter or head weights carry over
        # from a previous trial.
        base_model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3,  # MNLI has 3 classes
            ignore_mismatched_sizes=True,
            torch_dtype=torch.float32
        )
        # No explicit .to("cuda"): the Trainer places the model on its device.
        model = get_peft_model(base_model, lora_config)

        train_dataset = load_dataset("glue", "mnli", split="train[:20000]")
        validation_dataset = load_dataset("glue", "mnli", split="validation_matched[:5000]")

        def preprocess_function(examples):
            """Tokenize a batch of premise/hypothesis pairs and attach labels."""
            inputs = tokenizer(
                examples["premise"],
                examples["hypothesis"],
                padding="max_length",
                truncation=True,
                max_length=512
            )
            inputs["labels"] = examples["label"]
            return inputs

        # Drop the raw text columns so only model inputs and labels remain.
        train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
        validation_dataset = validation_dataset.map(preprocess_function, batched=True, remove_columns=validation_dataset.column_names)

        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=5,  # Each sweep is 5 epochs
            per_device_train_batch_size=config.train_batch_size,
            per_device_eval_batch_size=32,
            warmup_ratio=config.warmup_ratio,
            weight_decay=config.weight_decay,
            logging_dir="./logs",
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="no",  # Disable default saving
            load_best_model_at_end=False,
            metric_for_best_model="accuracy",
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            learning_rate=config.learning_rate,
            lr_scheduler_type=config.lr_scheduler,
            # The sweep samples an optimizer; without this the value was
            # ignored and every trial silently used the default optimizer.
            optim=config.optimizer,
            # Mixed precision needs a GPU; fall back to fp32 without one.
            fp16=torch.cuda.is_available(),
            bf16=False
        )

        metric = load("glue", "mnli")

        def compute_metrics(eval_pred):
            """Turn (logits, labels) from the Trainer into the GLUE/MNLI metric."""
            logits, labels = eval_pred
            predictions = torch.argmax(torch.from_numpy(logits), dim=-1)
            return metric.compute(predictions=predictions, references=labels)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            compute_metrics=compute_metrics,
            # The Trainer already installs its own W&B callback (it warned
            # "there is already one"); adding a second logged every point twice.
            callbacks=[SaveBestAcrossSweepsCallback()]
        )

        model.print_trainable_parameters()

        trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start a sweep agent that calls train() once per configuration it receives
# from the sweep, stopping after at most 20 runs.
wandb.agent(sweep_id, train, count=20)

[34m[1mwandb[0m: Agent Starting Run: at3hhe3k with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 16
[34m[1mwandb[0m: 	learning_rate: 0.0004486513370893436
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_hf
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.001


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
0,0.8058,0.774015,0.7386
1,0.6758,0.658774,0.7518
2,0.618,0.624771,0.7642
4,0.6016,0.607433,0.7648


Evaluation Metrics: {'eval_loss': 0.7740150094032288, 'eval_accuracy': 0.7386, 'eval_runtime': 11.9832, 'eval_samples_per_second': 417.25, 'eval_steps_per_second': 13.102, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 0.65877366065979, 'eval_accuracy': 0.7518, 'eval_runtime': 11.9728, 'eval_samples_per_second': 417.612, 'eval_steps_per_second': 13.113, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 0.6247705817222595, 'eval_accuracy': 0.7642, 'eval_runtime': 12.0067, 'eval_samples_per_second': 416.434, 'eval_steps_per_second': 13.076, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 0.611847460269928, 'eval_accuracy': 0.7618, 'eval_runtime': 12.0064, 'eval_samples_per_second': 416.443, 'eval_steps_per_second': 13.076, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.6074326038360596, 'eval_accuracy': 0.7648, 'eval_runtime': 12.0649, 'eval_samples_per_second': 414.427, 'eval_steps_per_second': 13.013, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.7648
New global best eval_acc



Global best model saved to ./global_best_model


VBox(children=(Label(value='1.245 MB of 1.245 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁▅▅██▇▇██
eval/loss,██▃▃▂▂▁▁▁▁
eval/runtime,▂▂▁▁▄▄▄▄██
eval/samples_per_second,▇▇██▅▅▅▅▁▁
eval/steps_per_second,▇▇██▅▅▅▅▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇█
train/grad_norm,▁▁▁▂▂▂▂▂▂▂▂▃▃▄▃▃▃▃▃▃█▆▆▄▄▄▃▃▃▃▅▃▄▄▃▆▄▅▅▇
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▇▆▆▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▂▁▁▁▁▂▂▁▁▁▁▁▁▁

0,1
eval/accuracy,0.7648
eval/loss,0.60743
eval/runtime,12.0649
eval/samples_per_second,414.427
eval/steps_per_second,13.013
total_flos,1456486801735680.0
train/epoch,4.992
train/global_step,780.0
train/grad_norm,13.93696
train/learning_rate,0.00045


[34m[1mwandb[0m: Agent Starting Run: np0swwew with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 16
[34m[1mwandb[0m: 	learning_rate: 5.02978480871983e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler: constant_with_warmup
[34m[1mwandb[0m: 	optimizer: adamw_hf
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0.15
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 40,875 || all params: 14,392,062 || trainable%: 0.2840




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0966,1.097332,0.3684
1,1.0808,1.081196,0.4736
2,1.0578,1.056182,0.4838
4,1.0103,1.0061,0.528


Evaluation Metrics: {'eval_loss': 1.0973318815231323, 'eval_accuracy': 0.3684, 'eval_runtime': 10.7726, 'eval_samples_per_second': 464.139, 'eval_steps_per_second': 14.574, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.0811959505081177, 'eval_accuracy': 0.4736, 'eval_runtime': 10.8074, 'eval_samples_per_second': 462.644, 'eval_steps_per_second': 14.527, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.0561820268630981, 'eval_accuracy': 0.4838, 'eval_runtime': 10.8367, 'eval_samples_per_second': 461.397, 'eval_steps_per_second': 14.488, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 1.0323957204818726, 'eval_accuracy': 0.5056, 'eval_runtime': 10.7353, 'eval_samples_per_second': 465.754, 'eval_steps_per_second': 14.625, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 1.0060999393463135, 'eval_accuracy': 0.528, 'eval_runtime': 10.7253, 'eval_samples_per_second': 466.186, 'eval_steps_per_second': 14.638, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.5280


0,1
eval/accuracy,▁▁▆▆▆▆▇▇██
eval/loss,██▇▇▅▅▃▃▁▁
eval/runtime,▄▄▆▆██▂▂▁▁
eval/samples_per_second,▅▅▃▃▁▁▇▇██
eval/steps_per_second,▅▅▃▃▁▁▇▇██
train/epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
train/grad_norm,▃▁▂▃▄▃▃▂▂▂▂▂▂▂▂▂▄▄▃▄▄▅▃▄▃▅▆▆▄▆▄▄▄▅▄▆▇█▅▅
train/learning_rate,▁▂▂▄▆███████████████████████████████████
train/loss,███████████▇▇▇▇▇▇▆▆▆▅▅▅▄▄▄▃▄▄▃▃▃▃▂▂▁▁▁▁▁

0,1
eval/accuracy,0.528
eval/loss,1.0061
eval/runtime,10.7253
eval/samples_per_second,466.186
eval/steps_per_second,14.638
total_flos,1444238091878400.0
train/epoch,4.992
train/global_step,780.0
train/grad_norm,3.45456
train/learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: wssi7orj with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.0003996381106320194
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: cosine
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	warmup_ratio: 0.15
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
1,0.9198,0.868355,0.614
2,0.8217,0.745674,0.6818
3,0.712,0.705342,0.7022
4,0.7101,0.689738,0.7124
5,0.7559,0.689632,0.7116


Evaluation Metrics: {'eval_loss': 0.8683546781539917, 'eval_accuracy': 0.614, 'eval_runtime': 10.8234, 'eval_samples_per_second': 461.962, 'eval_steps_per_second': 14.506, 'epoch': 1.0}
Evaluation Metrics: {'eval_loss': 0.7456740140914917, 'eval_accuracy': 0.6818, 'eval_runtime': 10.8562, 'eval_samples_per_second': 460.566, 'eval_steps_per_second': 14.462, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 0.7053418755531311, 'eval_accuracy': 0.7022, 'eval_runtime': 10.8681, 'eval_samples_per_second': 460.064, 'eval_steps_per_second': 14.446, 'epoch': 3.0}
Evaluation Metrics: {'eval_loss': 0.6897376179695129, 'eval_accuracy': 0.7124, 'eval_runtime': 10.7948, 'eval_samples_per_second': 463.185, 'eval_steps_per_second': 14.544, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.6896318197250366, 'eval_accuracy': 0.7116, 'eval_runtime': 10.809, 'eval_samples_per_second': 462.576, 'eval_steps_per_second': 14.525, 'epoch': 5.0}
Sweep Evaluation Accuracy: 0.7116


0,1
eval/accuracy,▁▁▆▆▇▇████
eval/loss,██▃▃▂▂▁▁▁▁
eval/runtime,▄▄▇▇██▁▁▂▂
eval/samples_per_second,▅▅▂▂▁▁██▇▇
eval/steps_per_second,▅▅▂▂▁▁██▇▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█
train/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇██
train/grad_norm,▁▁▁▂▃▃▄▃▅▇▅▇▇▇▅▅▅▅▅▇▅▆▅█▆▅▆█▇█▅▅█▇▆▅▇█▇▆
train/learning_rate,▁▃▄▄▅▆▇███▇▇▇▇▇▆▆▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
train/loss,███▇▇▆▅▄▅▄▄▄▃▄▃▃▂▃▃▃▃▂▃▁▂▃▂▂▂▂▃▁▂▁▂▂▂▃▁▃

0,1
eval/accuracy,0.7116
eval/loss,0.68963
eval/runtime,10.809
eval/samples_per_second,462.576
eval/steps_per_second,14.525
total_flos,1458820915200000.0
train/epoch,5.0
train/global_step,3125.0
train/grad_norm,2.02696
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: 290mojgh with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 16
[34m[1mwandb[0m: 	learning_rate: 4.799885161056019e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_torch
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 40,875 || all params: 14,392,062 || trainable%: 0.2840




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0971,1.097554,0.3506
1,1.0948,1.095301,0.3944
2,1.0825,1.083522,0.4768
3,1.0707,1.069302,0.4784
4,1.0593,1.056158,0.4872


Evaluation Metrics: {'eval_loss': 1.0975538492202759, 'eval_accuracy': 0.3506, 'eval_runtime': 10.807, 'eval_samples_per_second': 462.663, 'eval_steps_per_second': 14.528, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.095300555229187, 'eval_accuracy': 0.3944, 'eval_runtime': 10.765, 'eval_samples_per_second': 464.468, 'eval_steps_per_second': 14.584, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.0835217237472534, 'eval_accuracy': 0.4768, 'eval_runtime': 10.8086, 'eval_samples_per_second': 462.593, 'eval_steps_per_second': 14.525, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 1.0693022012710571, 'eval_accuracy': 0.4784, 'eval_runtime': 10.8279, 'eval_samples_per_second': 461.769, 'eval_steps_per_second': 14.5, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 1.0561578273773193, 'eval_accuracy': 0.4872, 'eval_runtime': 10.788, 'eval_samples_per_second': 463.478, 'eval_steps_per_second': 14.553, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.4872


0,1
eval/accuracy,▁▁▃▃▇▇████
eval/loss,████▆▆▃▃▁▁
eval/runtime,▆▆▁▁▆▆██▄▄
eval/samples_per_second,▃▃██▃▃▁▁▅▅
eval/steps_per_second,▃▃██▃▃▁▁▅▅
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇████
train/grad_norm,▄▄▁▅▃▄▆▄▁▁▃▃▆▆▂▁▁▄▂▃▃▄▇▅▅▅▇▇▅▅▇▆▅▅█▇▇▇▇▅
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,████████████▇▇▇▇▇▇▇▇▆▆▆▅▅▅▄▄▃▃▃▃▃▂▂▁▂▂▁▁

0,1
eval/accuracy,0.4872
eval/loss,1.05616
eval/runtime,10.788
eval/samples_per_second,463.478
eval/steps_per_second,14.553
total_flos,1444238091878400.0
train/epoch,4.992
train/global_step,390.0
train/grad_norm,1.62062
train/learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: 05k0437p with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0006176083216140577
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.15
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	warmup_ratio: 0.15
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 160,683 || all params: 14,511,870 || trainable%: 1.1073




Epoch,Training Loss,Validation Loss,Accuracy
0,0.9066,0.871383,0.6298
1,0.7678,0.745293,0.6898
2,0.7318,0.68559,0.7174
4,0.6579,0.66224,0.7306


Evaluation Metrics: {'eval_loss': 0.871383011341095, 'eval_accuracy': 0.6298, 'eval_runtime': 10.7935, 'eval_samples_per_second': 463.243, 'eval_steps_per_second': 14.546, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 0.7452929615974426, 'eval_accuracy': 0.6898, 'eval_runtime': 10.9706, 'eval_samples_per_second': 455.763, 'eval_steps_per_second': 14.311, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 0.6855900287628174, 'eval_accuracy': 0.7174, 'eval_runtime': 10.9338, 'eval_samples_per_second': 457.297, 'eval_steps_per_second': 14.359, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 0.657213032245636, 'eval_accuracy': 0.7322, 'eval_runtime': 10.8976, 'eval_samples_per_second': 458.817, 'eval_steps_per_second': 14.407, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.6622398495674133, 'eval_accuracy': 0.7306, 'eval_runtime': 10.9192, 'eval_samples_per_second': 457.909, 'eval_steps_per_second': 14.378, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.7306


0,1
eval/accuracy,▁▁▅▅▇▇████
eval/loss,██▄▄▂▂▁▁▁▁
eval/runtime,▁▁██▇▇▅▅▆▆
eval/samples_per_second,██▁▁▂▂▄▄▃▃
eval/steps_per_second,██▁▁▂▂▄▄▃▃
train/epoch,▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇█
train/grad_norm,▁▁▁▁▂▃▂▃▃▃▄▄▄▄▅▄▆▅▇▇▅▆▅▆▆▆▄▅▅▅▆▅▅▄▆▆▇▄▅█
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,████▇▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▁▂▁▂▃▃▁▁▁▁▁▂▁▁▂▁

0,1
eval/accuracy,0.7306
eval/loss,0.66224
eval/runtime,10.9192
eval/samples_per_second,457.909
eval/steps_per_second,14.378
total_flos,1480984221450240.0
train/epoch,4.992
train/global_step,780.0
train/grad_norm,8.98488
train/learning_rate,0.00062


[34m[1mwandb[0m: Agent Starting Run: xs95t7fx with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 1.1861392877441224e-05
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.15
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 160,683 || all params: 14,511,870 || trainable%: 1.1073




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0968,1.097727,0.3676
2,1.0884,1.086602,0.4764
4,1.0648,1.062975,0.4844


Evaluation Metrics: {'eval_loss': 1.0977267026901245, 'eval_accuracy': 0.3676, 'eval_runtime': 10.9709, 'eval_samples_per_second': 455.75, 'eval_steps_per_second': 14.311, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.095811128616333, 'eval_accuracy': 0.4202, 'eval_runtime': 10.9864, 'eval_samples_per_second': 455.108, 'eval_steps_per_second': 14.29, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 1.0866016149520874, 'eval_accuracy': 0.4764, 'eval_runtime': 10.9233, 'eval_samples_per_second': 457.739, 'eval_steps_per_second': 14.373, 'epoch': 2.9984}
Evaluation Metrics: {'eval_loss': 1.074446678161621, 'eval_accuracy': 0.4786, 'eval_runtime': 10.8945, 'eval_samples_per_second': 458.946, 'eval_steps_per_second': 14.411, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 1.0629745721817017, 'eval_accuracy': 0.4844, 'eval_runtime': 10.825, 'eval_samples_per_second': 461.894, 'eval_steps_per_second': 14.503, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.4844


0,1
eval/accuracy,▁▁▄▄██████
eval/loss,████▆▆▃▃▁▁
eval/runtime,▇▇██▅▅▄▄▁▁
eval/samples_per_second,▂▂▁▁▄▄▅▅██
eval/steps_per_second,▂▂▁▁▄▄▅▅██
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇████
train/grad_norm,▁▅▃▃▄▂▅▂▃▄▅▃▃▂▄▄▂▇▇▆▃▅▅▂▂▄▃▃▇█▆▅▄▅▆▄▅▄▃▅
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,███████████▇█▇█▇▇▇▇▇▆▆▆▆▆▆▆▆▆▅▄▄▅▄▃▂▃▂▂▁

0,1
eval/accuracy,0.4844
eval/loss,1.06297
eval/runtime,10.825
eval/samples_per_second,461.894
eval/steps_per_second,14.503
total_flos,1480984221450240.0
train/epoch,4.992
train/global_step,1560.0
train/grad_norm,1.24522
train/learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: eir17vih with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 7.61698757159985e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.15
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: polynomial
[34m[1mwandb[0m: 	optimizer: adamw_hf
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.1


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0965,1.097304,0.341
1,1.0928,1.092622,0.4626
2,1.0797,1.081043,0.4786
3,1.0728,1.073344,0.4794
4,1.0712,1.071255,0.4818


Evaluation Metrics: {'eval_loss': 1.097303867340088, 'eval_accuracy': 0.341, 'eval_runtime': 10.881, 'eval_samples_per_second': 459.516, 'eval_steps_per_second': 14.429, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.0926220417022705, 'eval_accuracy': 0.4626, 'eval_runtime': 10.8436, 'eval_samples_per_second': 461.101, 'eval_steps_per_second': 14.479, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.0810425281524658, 'eval_accuracy': 0.4786, 'eval_runtime': 10.8939, 'eval_samples_per_second': 458.972, 'eval_steps_per_second': 14.412, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 1.0733444690704346, 'eval_accuracy': 0.4794, 'eval_runtime': 10.8053, 'eval_samples_per_second': 462.735, 'eval_steps_per_second': 14.53, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 1.071255087852478, 'eval_accuracy': 0.4818, 'eval_runtime': 10.829, 'eval_samples_per_second': 461.722, 'eval_steps_per_second': 14.498, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.4818


0,1
eval/accuracy,▁▁▇▇██████
eval/loss,██▇▇▄▄▂▂▁▁
eval/runtime,▇▇▄▄██▁▁▃▃
eval/samples_per_second,▂▂▅▅▁▁██▆▆
eval/steps_per_second,▂▂▅▅▁▁██▆▆
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▅▅▁▆▆▄▄▂▂▄▂▃▃▄█▆▆▃▃▄▃▄▄▄▄▇▇▅▇▄▅▆▆▆▆▄▆▆▅▅
train/learning_rate,▄▄███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█████████▇▇▇▇▇▇▇▅▅▆▅▃▃▃▃▃▂▃▃▃▂▂▁▂▂▁▁▁▁▂▁

0,1
eval/accuracy,0.4818
eval/loss,1.07126
eval/runtime,10.829
eval/samples_per_second,461.722
eval/steps_per_second,14.498
total_flos,1456486801735680.0
train/epoch,4.992
train/global_step,390.0
train/grad_norm,0.82589
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: df94yyyx with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 32
[34m[1mwandb[0m: 	learning_rate: 1.7090420019453033e-05
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: constant_with_warmup
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0976,1.097991,0.3734
1,1.0957,1.096845,0.415
2,1.0909,1.092468,0.474
4,1.074,1.073797,0.4808


Evaluation Metrics: {'eval_loss': 1.0979909896850586, 'eval_accuracy': 0.3734, 'eval_runtime': 10.8671, 'eval_samples_per_second': 460.102, 'eval_steps_per_second': 14.447, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.0968449115753174, 'eval_accuracy': 0.415, 'eval_runtime': 10.8202, 'eval_samples_per_second': 462.097, 'eval_steps_per_second': 14.51, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.0924677848815918, 'eval_accuracy': 0.474, 'eval_runtime': 10.8927, 'eval_samples_per_second': 459.023, 'eval_steps_per_second': 14.413, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 1.0834447145462036, 'eval_accuracy': 0.478, 'eval_runtime': 10.8174, 'eval_samples_per_second': 462.219, 'eval_steps_per_second': 14.514, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 1.0737972259521484, 'eval_accuracy': 0.4808, 'eval_runtime': 10.9231, 'eval_samples_per_second': 457.746, 'eval_steps_per_second': 14.373, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.4808


0,1
eval/accuracy,▁▁▄▄██████
eval/loss,████▆▆▄▄▁▁
eval/runtime,▄▄▁▁▆▆▁▁██
eval/samples_per_second,▅▅██▃▃██▁▁
eval/steps_per_second,▅▅██▃▃██▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▁▅▅▄▄▃▃▃▂▂▄▁▁▇▃▄▃▃▁▁▅▃▃▄▄█▆▄▄▃▃▅▃▅▃▅▅▄▆▇
train/learning_rate,▁▃██████████████████████████████████████
train/loss,██████████▇▇███▇▇▇▇▇▇▇▇▆▇▆▅▅▅▅▅▅▄▃▂▁▂▂▂▁

0,1
eval/accuracy,0.4808
eval/loss,1.0738
eval/runtime,10.9231
eval/samples_per_second,457.746
eval/steps_per_second,14.373
total_flos,1456486801735680.0
train/epoch,4.992
train/global_step,780.0
train/grad_norm,3.82007
train/learning_rate,2e-05


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gfc7295v with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 32
[34m[1mwandb[0m: 	learning_rate: 1.625710306286862e-05
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.05
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler: cosine
[34m[1mwandb[0m: 	optimizer: adamw_hf
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.001


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 40,875 || all params: 14,392,062 || trainable%: 0.2840




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0981,1.0982,0.3664
1,1.0974,1.097843,0.3614
2,1.0961,1.097595,0.3584
3,1.0956,1.097466,0.3588
4,1.0964,1.097448,0.3588


Evaluation Metrics: {'eval_loss': 1.0981999635696411, 'eval_accuracy': 0.3664, 'eval_runtime': 10.9267, 'eval_samples_per_second': 457.595, 'eval_steps_per_second': 14.368, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.0978425741195679, 'eval_accuracy': 0.3614, 'eval_runtime': 10.8571, 'eval_samples_per_second': 460.528, 'eval_steps_per_second': 14.461, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.097594976425171, 'eval_accuracy': 0.3584, 'eval_runtime': 10.7958, 'eval_samples_per_second': 463.144, 'eval_steps_per_second': 14.543, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 1.097465991973877, 'eval_accuracy': 0.3588, 'eval_runtime': 10.875, 'eval_samples_per_second': 459.769, 'eval_steps_per_second': 14.437, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 1.0974476337432861, 'eval_accuracy': 0.3588, 'eval_runtime': 10.8945, 'eval_samples_per_second': 458.947, 'eval_steps_per_second': 14.411, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.3588


0,1
eval/accuracy,██▄▄▁▁▁▁▁▁
eval/loss,██▅▅▂▂▁▁▁▁
eval/runtime,██▄▄▁▁▅▅▆▆
eval/samples_per_second,▁▁▅▅██▄▄▃▃
eval/steps_per_second,▁▁▅▅██▄▄▃▃
train/epoch,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▄▄▁▁▂▅▅▃▃▄▅▅▅▂▂▂▂▃█▃▄▄▂▂▃▃▆▆▃▃▁▁▄▃▁▆▆▂▄▂
train/learning_rate,▅████████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█████▇▇▇▆▇▅▅▅▆▄▅▅▅▅▄▆▄▃▄▂▂▂▂▆▅▁▃▃▂▂▄▄▄▄▃

0,1
eval/accuracy,0.3588
eval/loss,1.09745
eval/runtime,10.8945
eval/samples_per_second,458.947
eval/steps_per_second,14.411
total_flos,1444238091878400.0
train/epoch,4.992
train/global_step,390.0
train/grad_norm,1.26833
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: dp600hbr with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 16
[34m[1mwandb[0m: 	learning_rate: 2.7911795257224173e-05
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.15
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_hf
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599


Epoch,Training Loss,Validation Loss,Accuracy
0,1.0943,1.093883,0.4664
2,1.0525,1.04291,0.5054
4,1.0061,0.995829,0.534


Evaluation Metrics: {'eval_loss': 1.0938830375671387, 'eval_accuracy': 0.4664, 'eval_runtime': 10.9541, 'eval_samples_per_second': 456.45, 'eval_steps_per_second': 14.333, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.0680770874023438, 'eval_accuracy': 0.484, 'eval_runtime': 10.9367, 'eval_samples_per_second': 457.178, 'eval_steps_per_second': 14.355, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 1.0429096221923828, 'eval_accuracy': 0.5054, 'eval_runtime': 10.8473, 'eval_samples_per_second': 460.942, 'eval_steps_per_second': 14.474, 'epoch': 2.9984}
Evaluation Metrics: {'eval_loss': 1.0184144973754883, 'eval_accuracy': 0.5228, 'eval_runtime': 10.85, 'eval_samples_per_second': 460.83, 'eval_steps_per_second': 14.47, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.9958286881446838, 'eval_accuracy': 0.534, 'eval_runtime': 10.9559, 'eval_samples_per_second': 456.377, 'eval_steps_per_second': 14.33, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.5340


0,1
eval/accuracy,▁▁▃▃▅▅▇▇██
eval/loss,██▆▆▄▄▃▃▁▁
eval/runtime,██▇▇▁▁▁▁██
eval/samples_per_second,▁▁▂▂████▁▁
eval/steps_per_second,▁▁▂▂████▁▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇████
train/grad_norm,▂▃▁▁▅▅▄▂▃▄▅▃▃▃▃▃▄▃▄▄▃▄▄▆▅▅▇▆▆▅▄▅▆▇▆▇▇▆▇█
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█████████████▇▇▆▆▆▆▅▅▄▅▄▄▄▃▄▄▃▃▂▃▃▂▂▁▂▂▁

0,1
eval/accuracy,0.534
eval/loss,0.99583
eval/runtime,10.9559
eval/samples_per_second,456.377
eval/steps_per_second,14.33
total_flos,1456486801735680.0
train/epoch,4.992
train/global_step,1560.0
train/grad_norm,5.50354
train/learning_rate,3e-05


[34m[1mwandb[0m: Agent Starting Run: 0xa0h7p2 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.2629802841396012e-05
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	lora_dropout: 0.15
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler: constant_with_warmup
[34m[1mwandb[0m: 	optimizer: adamw_hf
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.1


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 40,875 || all params: 14,392,062 || trainable%: 0.2840




Epoch,Training Loss,Validation Loss,Accuracy
1,1.0974,1.097273,0.3612
2,1.0904,1.088341,0.4796
3,1.0629,1.063406,0.4788
4,1.0511,1.042836,0.486
5,1.0497,1.024445,0.4974


Evaluation Metrics: {'eval_loss': 1.0972734689712524, 'eval_accuracy': 0.3612, 'eval_runtime': 10.9218, 'eval_samples_per_second': 457.8, 'eval_steps_per_second': 14.375, 'epoch': 1.0}
Evaluation Metrics: {'eval_loss': 1.088341236114502, 'eval_accuracy': 0.4796, 'eval_runtime': 10.8741, 'eval_samples_per_second': 459.808, 'eval_steps_per_second': 14.438, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 1.0634055137634277, 'eval_accuracy': 0.4788, 'eval_runtime': 10.8301, 'eval_samples_per_second': 461.677, 'eval_steps_per_second': 14.497, 'epoch': 3.0}
Evaluation Metrics: {'eval_loss': 1.0428359508514404, 'eval_accuracy': 0.486, 'eval_runtime': 10.8728, 'eval_samples_per_second': 459.865, 'eval_steps_per_second': 14.44, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 1.0244454145431519, 'eval_accuracy': 0.4974, 'eval_runtime': 10.8316, 'eval_samples_per_second': 461.61, 'eval_steps_per_second': 14.495, 'epoch': 5.0}
Sweep Evaluation Accuracy: 0.4974


0,1
eval/accuracy,▁▁▇▇▇▇▇▇██
eval/loss,██▇▇▅▅▃▃▁▁
eval/runtime,██▄▄▁▁▄▄▁▁
eval/samples_per_second,▁▁▅▅██▅▅██
eval/steps_per_second,▁▁▅▅██▅▅██
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
train/grad_norm,▂▅▅▁▃▃▆▅▃▁▆▁▇▂▂▄█▆▅▃▃▄▂▃▄▂▄▄▆▅▇▅▄▇▆▆▆▆▇▅
train/learning_rate,▁███████████████████████████████████████
train/loss,██████████▇▇███▇▇▇▇▇▆▆▆▆▇▆▅▅▅▆▄▄▃▅▄▁▂▅▃▄

0,1
eval/accuracy,0.4974
eval/loss,1.02445
eval/runtime,10.8316
eval/samples_per_second,461.61
eval/steps_per_second,14.495
total_flos,1446552576000000.0
train/epoch,5.0
train/global_step,6250.0
train/grad_norm,2.2361
train/learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: y0n83ib3 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0002781118528691012
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	lr_scheduler: cosine_with_restarts
[34m[1mwandb[0m: 	optimizer: adafactor
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	warmup_ratio: 0
[34m[1mwandb[0m: 	weight_decay: 0


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 160,683 || all params: 14,511,870 || trainable%: 1.1073




Epoch,Training Loss,Validation Loss,Accuracy
0,1.0743,1.062466,0.4792
1,1.0049,0.991924,0.5358
2,0.9523,0.948297,0.5666
3,0.9346,0.931988,0.5862
4,0.9501,0.932678,0.5822


Evaluation Metrics: {'eval_loss': 1.0624656677246094, 'eval_accuracy': 0.4792, 'eval_runtime': 10.9491, 'eval_samples_per_second': 456.66, 'eval_steps_per_second': 14.339, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 0.9919240474700928, 'eval_accuracy': 0.5358, 'eval_runtime': 10.9503, 'eval_samples_per_second': 456.609, 'eval_steps_per_second': 14.338, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 0.9482971429824829, 'eval_accuracy': 0.5666, 'eval_runtime': 10.9509, 'eval_samples_per_second': 456.584, 'eval_steps_per_second': 14.337, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 0.9319877028465271, 'eval_accuracy': 0.5862, 'eval_runtime': 10.9039, 'eval_samples_per_second': 458.55, 'eval_steps_per_second': 14.398, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 0.9326784014701843, 'eval_accuracy': 0.5822, 'eval_runtime': 10.9174, 'eval_samples_per_second': 457.987, 'eval_steps_per_second': 14.381, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.5822


0,1
eval/accuracy,▁▁▅▅▇▇████
eval/loss,██▄▄▂▂▁▁▁▁
eval/runtime,██████▁▁▃▃
eval/samples_per_second,▁▁▁▁▁▁██▆▆
eval/steps_per_second,▁▁▁▁▁▁██▆▆
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▃▃▁▂▃▃▄▄▄▄▅▄▄▄▅▆▆▄▄▄▆▆▅▅▅▅▅▅██▇▇▆▆▇▆▇▆█▆
train/learning_rate,██████▇▇▇▇▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train/loss,███████▇▇▆▆▅▅▅▄▄▄▄▄▃▂▂▃▃▂▂▂▁▁▂▁▁▁▁▁▂▂▂▂▂

0,1
eval/accuracy,0.5822
eval/loss,0.93268
eval/runtime,10.9174
eval/samples_per_second,457.987
eval/steps_per_second,14.381
total_flos,1480984221450240.0
train/epoch,4.992
train/global_step,390.0
train/grad_norm,1.44192
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: sow0axxu with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.0004438363192903519
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: linear
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0
[34m[1mwandb[0m: 	weight_decay: 0.1


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
1,0.7964,0.736703,0.6958
2,0.7418,0.668333,0.729
3,0.6582,0.656741,0.733
4,0.5953,0.63828,0.7412
5,0.6221,0.639986,0.7408


Evaluation Metrics: {'eval_loss': 0.736702561378479, 'eval_accuracy': 0.6958, 'eval_runtime': 10.9726, 'eval_samples_per_second': 455.68, 'eval_steps_per_second': 14.308, 'epoch': 1.0}
Evaluation Metrics: {'eval_loss': 0.6683334112167358, 'eval_accuracy': 0.729, 'eval_runtime': 10.9243, 'eval_samples_per_second': 457.696, 'eval_steps_per_second': 14.372, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 0.6567407250404358, 'eval_accuracy': 0.733, 'eval_runtime': 10.878, 'eval_samples_per_second': 459.643, 'eval_steps_per_second': 14.433, 'epoch': 3.0}
Evaluation Metrics: {'eval_loss': 0.6382800936698914, 'eval_accuracy': 0.7412, 'eval_runtime': 10.891, 'eval_samples_per_second': 459.095, 'eval_steps_per_second': 14.416, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.6399861574172974, 'eval_accuracy': 0.7408, 'eval_runtime': 10.855, 'eval_samples_per_second': 460.617, 'eval_steps_per_second': 14.463, 'epoch': 5.0}
Sweep Evaluation Accuracy: 0.7408


0,1
eval/accuracy,▁▁▆▆▇▇████
eval/loss,██▃▃▂▂▁▁▁▁
eval/runtime,██▅▅▂▂▃▃▁▁
eval/samples_per_second,▁▁▄▄▇▇▆▆██
eval/steps_per_second,▁▁▄▄▇▇▆▆██
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇█
train/global_step,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▁▃▂▂▂▄▅▃▃▂▃▃▄▆▄▃▅▅▅▄▃▅▅▅▆▅▆▄▆▇▄▄▆█▅▄▅▆▄▆
train/learning_rate,██████▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁
train/loss,███▆▆▆▆▄▅▃▄▄▄▃▅▄▄▅▃▃▂▂▂▃▃▂▂▄▂▂▂▃▂▄▂▁▁▃▃▂

0,1
eval/accuracy,0.7408
eval/loss,0.63999
eval/runtime,10.855
eval/samples_per_second,460.617
eval/steps_per_second,14.463
total_flos,1458820915200000.0
train/epoch,5.0
train/global_step,6250.0
train/grad_norm,8.47301
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: uhifpseh with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 5.525194448670556e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 160,683 || all params: 14,511,870 || trainable%: 1.1073




Epoch,Training Loss,Validation Loss,Accuracy
1,1.057,1.04812,0.492
2,0.9985,0.973045,0.5476
3,0.9469,0.939146,0.5594
4,0.9219,0.889335,0.5972
5,0.8714,0.85323,0.6282


Evaluation Metrics: {'eval_loss': 1.0481200218200684, 'eval_accuracy': 0.492, 'eval_runtime': 10.9071, 'eval_samples_per_second': 458.418, 'eval_steps_per_second': 14.394, 'epoch': 1.0}
Evaluation Metrics: {'eval_loss': 0.9730445146560669, 'eval_accuracy': 0.5476, 'eval_runtime': 10.957, 'eval_samples_per_second': 456.329, 'eval_steps_per_second': 14.329, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 0.9391456842422485, 'eval_accuracy': 0.5594, 'eval_runtime': 11.0108, 'eval_samples_per_second': 454.099, 'eval_steps_per_second': 14.259, 'epoch': 3.0}
Evaluation Metrics: {'eval_loss': 0.8893353343009949, 'eval_accuracy': 0.5972, 'eval_runtime': 10.8923, 'eval_samples_per_second': 459.041, 'eval_steps_per_second': 14.414, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.8532300591468811, 'eval_accuracy': 0.6282, 'eval_runtime': 10.8921, 'eval_samples_per_second': 459.049, 'eval_steps_per_second': 14.414, 'epoch': 5.0}
Sweep Evaluation Accuracy: 0.6282


0,1
eval/accuracy,▁▁▄▄▄▄▆▆██
eval/loss,██▅▅▄▄▂▂▁▁
eval/runtime,▂▂▅▅██▁▁▁▁
eval/samples_per_second,▇▇▄▄▁▁████
eval/steps_per_second,▇▇▄▄▁▁████
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█
train/grad_norm,▁▂▁▁▁▂▂▂▃▂▂▂▃▂▃▂▄▅▅▃▅▃▄▃▄▇▅▄▃▇▆▅▅▆▇█▆▅▅▅
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█████▇▇▆▆▆▆▆▅▆▆▆▄▅▄▄▃▄▄▃▃▃▃▃▃▄▃▄▃▂▁▂▂▂▂▃

0,1
eval/accuracy,0.6282
eval/loss,0.85323
eval/runtime,10.8921
eval/samples_per_second,459.049
eval/steps_per_second,14.414
total_flos,1483357593600000.0
train/epoch,5.0
train/global_step,3125.0
train/grad_norm,5.11056
train/learning_rate,6e-05


[34m[1mwandb[0m: Agent Starting Run: 3resivq7 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 32
[34m[1mwandb[0m: 	learning_rate: 0.0002708390624659358
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: cosine
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	warmup_ratio: 0.15
[34m[1mwandb[0m: 	weight_decay: 0


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599


Epoch,Training Loss,Validation Loss,Accuracy
0,1.0983,1.097671,0.3378
1,1.0957,1.094693,0.3988
2,1.0905,1.086113,0.4812
3,1.0821,1.080993,0.481
4,1.0787,1.0803,0.4806


Evaluation Metrics: {'eval_loss': 1.097670555114746, 'eval_accuracy': 0.3378, 'eval_runtime': 10.8839, 'eval_samples_per_second': 459.395, 'eval_steps_per_second': 14.425, 'epoch': 0.9728}
Evaluation Metrics: {'eval_loss': 1.0946928262710571, 'eval_accuracy': 0.3988, 'eval_runtime': 10.9147, 'eval_samples_per_second': 458.099, 'eval_steps_per_second': 14.384, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.08611261844635, 'eval_accuracy': 0.4812, 'eval_runtime': 10.9183, 'eval_samples_per_second': 457.947, 'eval_steps_per_second': 14.38, 'epoch': 2.9696}
Evaluation Metrics: {'eval_loss': 1.0809928178787231, 'eval_accuracy': 0.481, 'eval_runtime': 10.8943, 'eval_samples_per_second': 458.956, 'eval_steps_per_second': 14.411, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 1.0802998542785645, 'eval_accuracy': 0.4806, 'eval_runtime': 10.9724, 'eval_samples_per_second': 455.689, 'eval_steps_per_second': 14.309, 'epoch': 4.864}
Sweep Evaluation Accuracy: 0.4806


0,1
eval/accuracy,▁▁▄▄██████
eval/loss,██▇▇▃▃▁▁▁▁
eval/runtime,▁▁▃▃▄▄▂▂██
eval/samples_per_second,██▆▆▅▅▇▇▁▁
eval/steps_per_second,██▆▆▅▅▇▇▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▆▆▇▇▇▇██████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▆▆▇▇▇▇██████
train/grad_norm,▂▂▄▄▁▁▂▂▇▇▆▆▆▆██▆▆
train/learning_rate,▆▆██▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/loss,████▇▇▆▆▅▅▃▃▂▂▂▂▁▁

0,1
eval/accuracy,0.4806
eval/loss,1.0803
eval/runtime,10.9724
eval/samples_per_second,455.689
eval/steps_per_second,14.309
total_flos,1419140986306560.0
train/epoch,4.864
train/global_step,95.0
train/grad_norm,2.43516
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: joix463a with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 16
[34m[1mwandb[0m: 	learning_rate: 0.0007158746505068307
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	lr_scheduler: linear
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.001


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 160,683 || all params: 14,511,870 || trainable%: 1.1073




Epoch,Training Loss,Validation Loss,Accuracy
0,0.9411,0.938707,0.561
1,0.7938,0.778376,0.673
2,0.7616,0.721162,0.704
4,0.7092,0.702326,0.7108


Evaluation Metrics: {'eval_loss': 0.9387072324752808, 'eval_accuracy': 0.561, 'eval_runtime': 10.9379, 'eval_samples_per_second': 457.127, 'eval_steps_per_second': 14.354, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 0.7783761620521545, 'eval_accuracy': 0.673, 'eval_runtime': 10.9549, 'eval_samples_per_second': 456.416, 'eval_steps_per_second': 14.331, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 0.7211621999740601, 'eval_accuracy': 0.704, 'eval_runtime': 10.9536, 'eval_samples_per_second': 456.47, 'eval_steps_per_second': 14.333, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 0.714576005935669, 'eval_accuracy': 0.7046, 'eval_runtime': 11.0193, 'eval_samples_per_second': 453.749, 'eval_steps_per_second': 14.248, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.7023257613182068, 'eval_accuracy': 0.7108, 'eval_runtime': 10.9496, 'eval_samples_per_second': 456.638, 'eval_steps_per_second': 14.338, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.7108


0,1
eval/accuracy,▁▁▆▆██████
eval/loss,██▃▃▂▂▁▁▁▁
eval/runtime,▁▁▂▂▂▂██▂▂
eval/samples_per_second,██▇▇▇▇▁▁▇▇
eval/steps_per_second,██▆▆▇▇▁▁▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▂▂▂▁▂▂▅▃▃▂▃▄▄▄▄▄▃▄▆▆▅▅▅▅▇▆▄▄▆▆█▅▆▄▆▇▅▆▄▆
train/learning_rate,▂▂▃▄▄██▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▁
train/loss,███▇▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▂▁

0,1
eval/accuracy,0.7108
eval/loss,0.70233
eval/runtime,10.9496
eval/samples_per_second,456.638
eval/steps_per_second,14.338
total_flos,1480984221450240.0
train/epoch,4.992
train/global_step,780.0
train/grad_norm,5.85772
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: jfyl2cm0 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 32
[34m[1mwandb[0m: 	learning_rate: 5.466854411236187e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler: constant
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	warmup_ratio: 0
[34m[1mwandb[0m: 	weight_decay: 0.001


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 40,875 || all params: 14,392,062 || trainable%: 0.2840


Epoch,Training Loss,Validation Loss,Accuracy
0,1.0983,1.098203,0.3584
1,1.0976,1.097858,0.3484
2,1.0969,1.09754,0.338
3,1.0958,1.097069,0.3396
4,1.0948,1.096376,0.361


Evaluation Metrics: {'eval_loss': 1.0982027053833008, 'eval_accuracy': 0.3584, 'eval_runtime': 10.8763, 'eval_samples_per_second': 459.713, 'eval_steps_per_second': 14.435, 'epoch': 0.9728}
Evaluation Metrics: {'eval_loss': 1.097858190536499, 'eval_accuracy': 0.3484, 'eval_runtime': 10.8825, 'eval_samples_per_second': 459.452, 'eval_steps_per_second': 14.427, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.097540020942688, 'eval_accuracy': 0.338, 'eval_runtime': 10.9135, 'eval_samples_per_second': 458.148, 'eval_steps_per_second': 14.386, 'epoch': 2.9696}
Evaluation Metrics: {'eval_loss': 1.0970693826675415, 'eval_accuracy': 0.3396, 'eval_runtime': 10.9214, 'eval_samples_per_second': 457.818, 'eval_steps_per_second': 14.375, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 1.096375823020935, 'eval_accuracy': 0.361, 'eval_runtime': 11.0482, 'eval_samples_per_second': 452.561, 'eval_steps_per_second': 14.21, 'epoch': 4.864}
Sweep Evaluation Accuracy: 0.3610


0,1
eval/accuracy,▇▇▄▄▁▁▁▁██
eval/loss,██▇▇▅▅▄▄▁▁
eval/runtime,▁▁▁▁▃▃▃▃██
eval/samples_per_second,████▆▆▆▆▁▁
eval/steps_per_second,████▆▆▆▆▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▆▆▇▇▇▇██████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▆▆▇▇▇▇██████
train/grad_norm,▄▄▆▆▂▂▃▃▆▆▆▆▁▁██▆▆
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,██▇▇▇▇▆▆▅▅▃▃▃▃▃▃▁▁

0,1
eval/accuracy,0.361
eval/loss,1.09638
eval/runtime,11.0482
eval/samples_per_second,452.561
eval/steps_per_second,14.21
total_flos,1407206345932800.0
train/epoch,4.864
train/global_step,95.0
train/grad_norm,1.67935
train/learning_rate,5e-05


[34m[1mwandb[0m: Agent Starting Run: 417hib91 with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0003657371211368039
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler: constant_with_warmup
[34m[1mwandb[0m: 	optimizer: adamw_torch
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0.15
[34m[1mwandb[0m: 	weight_decay: 0.01


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 40,875 || all params: 14,392,062 || trainable%: 0.2840


Epoch,Training Loss,Validation Loss,Accuracy
0,0.9801,0.969488,0.534
2,0.7947,0.807192,0.6538
4,0.734,0.706591,0.706


Evaluation Metrics: {'eval_loss': 0.9694877862930298, 'eval_accuracy': 0.534, 'eval_runtime': 10.8956, 'eval_samples_per_second': 458.902, 'eval_steps_per_second': 14.41, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 0.8495268821716309, 'eval_accuracy': 0.6406, 'eval_runtime': 10.9037, 'eval_samples_per_second': 458.559, 'eval_steps_per_second': 14.399, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 0.8071922659873962, 'eval_accuracy': 0.6538, 'eval_runtime': 10.8801, 'eval_samples_per_second': 459.554, 'eval_steps_per_second': 14.43, 'epoch': 2.9984}
Evaluation Metrics: {'eval_loss': 0.7372356653213501, 'eval_accuracy': 0.6868, 'eval_runtime': 10.8731, 'eval_samples_per_second': 459.849, 'eval_steps_per_second': 14.439, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.7065905332565308, 'eval_accuracy': 0.706, 'eval_runtime': 10.9332, 'eval_samples_per_second': 457.322, 'eval_steps_per_second': 14.36, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.7060


0,1
eval/accuracy,▁▁▅▅▆▆▇▇██
eval/loss,██▅▅▄▄▂▂▁▁
eval/runtime,▄▄▅▅▂▂▁▁██
eval/samples_per_second,▅▅▄▄▇▇██▁▁
eval/steps_per_second,▅▅▄▄▇▇██▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/grad_norm,▁▁▁▁▁▂▂▂▂▅▃▅▄▃▄▄▃▄▄▄▅▅▅█▆▄▄▄▄▅▅▅▅▆█▅▅▆▆▆
train/learning_rate,▁▄▄▅▅▆▆▆████████████████████████████████
train/loss,█████▇▇▇▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▃▂▂▃▂▁▂▁▁▁

0,1
eval/accuracy,0.706
eval/loss,0.70659
eval/runtime,10.9332
eval/samples_per_second,457.322
eval/steps_per_second,14.36
total_flos,1444238091878400.0
train/epoch,4.992
train/global_step,1560.0
train/grad_norm,15.0216
train/learning_rate,0.00037


[34m[1mwandb[0m: Agent Starting Run: p9ysimmd with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 7.22001078772142e-05
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: polynomial
[34m[1mwandb[0m: 	optimizer: adafactor
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	warmup_ratio: 0
[34m[1mwandb[0m: 	weight_decay: 0


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
1,1.0376,1.022918,0.5158
2,0.9801,0.957147,0.5578
3,0.9421,0.929627,0.5658
4,0.9459,0.913083,0.5794
5,0.914,0.90654,0.5858


Evaluation Metrics: {'eval_loss': 1.0229183435440063, 'eval_accuracy': 0.5158, 'eval_runtime': 11.0677, 'eval_samples_per_second': 451.765, 'eval_steps_per_second': 14.185, 'epoch': 1.0}
Evaluation Metrics: {'eval_loss': 0.9571465849876404, 'eval_accuracy': 0.5578, 'eval_runtime': 10.9006, 'eval_samples_per_second': 458.69, 'eval_steps_per_second': 14.403, 'epoch': 2.0}
Evaluation Metrics: {'eval_loss': 0.9296265840530396, 'eval_accuracy': 0.5658, 'eval_runtime': 10.9172, 'eval_samples_per_second': 457.991, 'eval_steps_per_second': 14.381, 'epoch': 3.0}
Evaluation Metrics: {'eval_loss': 0.9130834937095642, 'eval_accuracy': 0.5794, 'eval_runtime': 11.0065, 'eval_samples_per_second': 454.279, 'eval_steps_per_second': 14.264, 'epoch': 4.0}
Evaluation Metrics: {'eval_loss': 0.9065399169921875, 'eval_accuracy': 0.5858, 'eval_runtime': 11.0159, 'eval_samples_per_second': 453.889, 'eval_steps_per_second': 14.252, 'epoch': 5.0}
Sweep Evaluation Accuracy: 0.5858


0,1
eval/accuracy,▁▁▅▅▆▆▇▇██
eval/loss,██▄▄▂▂▁▁▁▁
eval/runtime,██▁▁▂▂▅▅▆▆
eval/samples_per_second,▁▁██▇▇▄▄▃▃
eval/steps_per_second,▁▁██▇▇▄▄▃▃
train/epoch,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/global_step,▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█
train/grad_norm,▁▁▁▁▁▂▃▂▃▄▃▂▃▄▄▄▅▅▃▅▆▄▅▆▃█▅▅▅▇▅▆▆▇▄▆▆▅▅▄
train/learning_rate,███████▇▇▇▇▇▇▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁
train/loss,███████▇▇▅▅▄▅▄▄▄▄▃▅▃▄▂▃▂▂▂▂▂▂▂▃▂▂▂▃▁▂▃▁▂

0,1
eval/accuracy,0.5858
eval/loss,0.90654
eval/runtime,11.0159
eval/samples_per_second,453.889
eval/steps_per_second,14.252
total_flos,1458820915200000.0
train/epoch,5.0
train/global_step,3125.0
train/grad_norm,4.11891
train/learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: fri91lbm with config:
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 3.111564544178538e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	lr_scheduler: cosine
[34m[1mwandb[0m: 	optimizer: adamw_8bit
[34m[1mwandb[0m: 	train_batch_size: 32
[34m[1mwandb[0m: 	warmup_ratio: 0.15
[34m[1mwandb[0m: 	weight_decay: 0.1


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
WandbCallback
SaveBestAcrossSweepsCallback


trainable params: 80,811 || all params: 14,431,998 || trainable%: 0.5599




Epoch,Training Loss,Validation Loss,Accuracy
0,1.098,1.098127,0.3668
1,1.0969,1.097542,0.3492
2,1.0947,1.09694,0.3532
3,1.0934,1.096509,0.3604
4,1.0947,1.096423,0.3636


Evaluation Metrics: {'eval_loss': 1.098126769065857, 'eval_accuracy': 0.3668, 'eval_runtime': 10.9573, 'eval_samples_per_second': 456.318, 'eval_steps_per_second': 14.328, 'epoch': 0.9984}
Evaluation Metrics: {'eval_loss': 1.0975420475006104, 'eval_accuracy': 0.3492, 'eval_runtime': 10.9793, 'eval_samples_per_second': 455.404, 'eval_steps_per_second': 14.3, 'epoch': 1.9968}
Evaluation Metrics: {'eval_loss': 1.096940040588379, 'eval_accuracy': 0.3532, 'eval_runtime': 10.9083, 'eval_samples_per_second': 458.366, 'eval_steps_per_second': 14.393, 'epoch': 2.9952}
Evaluation Metrics: {'eval_loss': 1.096509337425232, 'eval_accuracy': 0.3604, 'eval_runtime': 10.8964, 'eval_samples_per_second': 458.865, 'eval_steps_per_second': 14.408, 'epoch': 3.9936}
Evaluation Metrics: {'eval_loss': 1.0964232683181763, 'eval_accuracy': 0.3636, 'eval_runtime': 10.9564, 'eval_samples_per_second': 456.353, 'eval_steps_per_second': 14.329, 'epoch': 4.992}
Sweep Evaluation Accuracy: 0.3636


0,1
eval/accuracy,██▁▁▃▃▅▅▇▇
eval/loss,██▆▆▃▃▁▁▁▁
eval/runtime,▆▆██▂▂▁▁▆▆
eval/samples_per_second,▃▃▁▁▇▇██▃▃
eval/steps_per_second,▃▃▁▁▇▇██▃▃
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇██████
train/grad_norm,▄▄▂▅▅▄▄▅▆▂▂▂▄▄█▃▁▁▄▁▂▂▂▅▅▆▆▂▂▃▂▄▄▄▆▃▄▃▂▂
train/learning_rate,▂▃▅▆▆████▇▇▇▇▇▇▆▆▆▆▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
train/loss,█████▇▇▇▇▇▆▆▆▇▇▆▅▅▅▅▄▆▅▃▄▂▂▂▂▃▂▂▆▆▅▃▃▁▃▂

0,1
eval/accuracy,0.3636
eval/loss,1.09642
eval/runtime,10.9564
eval/samples_per_second,456.353
eval/steps_per_second,14.329
total_flos,1456486801735680.0
train/epoch,4.992
train/global_step,390.0
train/grad_norm,0.33368
train/learning_rate,0.0


In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import torch

In [None]:
# Output directory used by the save cell below for both the model and the tokenizer.
# It is a relative path, so it resolves against the kernel's current working directory.
global_best_model_checkpoint = "./global_best_model"

In [None]:
# Save the model into the checkpoint directory.
# NOTE(review): `model` is whatever object is bound in the kernel when this cell runs.
# After the sweep above that may be the most recent run's model rather than the
# best-scoring one -- confirm which model this is before treating the directory as
# the "global best". TODO: also verify whether SaveBestAcrossSweepsCallback writes to
# the same directory, in which case this call could overwrite its result.
# NOTE(review): the sweep logs suggest LoRA adapters are in use; if `model` is a PEFT
# wrapper, presumably only adapter weights are written here -- confirm.
model.save_pretrained(global_best_model_checkpoint)

# Save the tokenizer into the same directory as the model.
# This is the last expression in the cell, so the returned tuple of written file
# paths is what the notebook displays as the cell output.
tokenizer.save_pretrained(global_best_model_checkpoint)

('./global_best_model/tokenizer_config.json',
 './global_best_model/special_tokens_map.json',
 './global_best_model/vocab.txt',
 './global_best_model/added_tokens.json',
 './global_best_model/tokenizer.json')