<a href="https://colab.research.google.com/github/danjshaw/ece57000-finalProject/blob/main/lora-bert/source/lora-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive are available
# to the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Directory on the mounted Drive that receives requirements.txt and the result CSVs.
output_dir = '/content/drive/MyDrive/lora-bert/'
# exist_ok=True makes the cell safe to re-run and avoids a separate existence check.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Install the `datasets` and `evaluate` packages imported further down.
!pip install datasets
!pip install evaluate
# Record the installed package versions next to the results on Drive.
!pip freeze > /content/drive/MyDrive/lora-bert/requirements.txt



In [None]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


In [None]:
# Fix the random seed to 0 using the helper provided by transformers.
from transformers import set_seed
set_seed(0)

In [None]:
def get_trainable_parameters(model):
  """Count the parameter elements of ``model``.

  Args:
    model: object exposing ``parameters()``; each yielded parameter must
      provide ``numel()`` and a ``requires_grad`` flag.

  Returns:
    dict with ``'total_parameters'`` (elements over all parameters) and
    ``'trainable_parameters'`` (elements of parameters whose
    ``requires_grad`` is truthy).
  """
  # Materialise once so the iterator returned by parameters() is walked a single time.
  sizes_and_flags = [(p.numel(), p.requires_grad) for p in model.parameters()]
  total = sum(size for size, _ in sizes_and_flags)
  trainable = sum(size for size, needs_grad in sizes_and_flags if needs_grad)
  return {'total_parameters': total, 'trainable_parameters': trainable}

In [None]:
import csv
def write_results_to_csv(file_name, results):
  """Write a list of result dicts to ``file_name`` as CSV.

  The header is the union of the keys of all rows, in first-seen order, so a
  row carrying a key that the first row lacks no longer makes ``DictWriter``
  raise. Rows missing a column get an empty cell.

  Args:
    file_name: path of the CSV file; it is created or overwritten.
    results: list of dicts, one per row. An empty list produces an empty file
      instead of raising ``IndexError``.
  """
  # dict.fromkeys de-duplicates while preserving insertion order.
  fieldnames = list(dict.fromkeys(key for result in results for key in result))
  with open(file_name, 'w', newline='') as csvfile:
    if not fieldnames:
      # Nothing to write: leave the (truncated) file empty.
      return
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

In [None]:
def get_model_name(name_and_path):
  """Return the last path component of a checkpoint identifier.

  ``"prajjwal1/bert-tiny"`` gives ``"bert-tiny"``. Identifiers without a slash
  are returned unchanged, and identifiers with several slashes or a trailing
  slash (for example a local directory) yield their final non-empty component
  rather than whatever happens to sit at index 1.

  Args:
    name_and_path: checkpoint identifier such as ``"owner/model"``.

  Returns:
    The model name as a string.
  """
  return name_and_path.rstrip('/').split('/')[-1]

# Full Fine-Tuning Implementation



This implementation follows the examples in the [Hugging Face NLP course](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt#fine-tuning-a-model-with-the-trainer-api), which show how to use the Trainer API for fine-tuning.

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, set_seed
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import time

def ft_hyperparameter_tune(checkpoint, epochs, batch_sizes, learning_rates):
  """Grid-search full fine-tuning of ``checkpoint`` on GLUE MRPC.

  For every (batch size, learning rate) pair a fresh sequence-classification
  model is loaded from ``checkpoint``, trained for ``epochs`` epochs with the
  Trainer API and evaluated on the validation split. The accumulated results
  are written to ``<output_dir><model name>-ft-results.csv`` after every run,
  so an interrupted session keeps what has finished so far.

  Args:
    checkpoint: model identifier passed to ``from_pretrained``.
    epochs: number of training epochs per run.
    batch_sizes: batch sizes to try (used for both training and evaluation).
    learning_rates: learning rates to try.

  Returns:
    List with one dict per run, merging the hyperparameters, the training
    metrics, the final evaluation metrics and the parameter counts. The list
    is empty when either grid is empty.
  """
  iterations = len(batch_sizes)*len(learning_rates)
  if iterations == 0:
    # Bail out before loading anything instead of failing later with IndexError.
    print("No hyperparameter combinations to evaluate.")
    return []

  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)
  results_file = output_dir+model_name+'-ft-results.csv'

  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Load the metric once; compute_metrics runs after every epoch of every run.
  metric = evaluate.load("glue", "mrpc")

  def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  ft_results = []
  progress = 0

  # Hyperparameter tuning
  start_time = time.time()
  for size in batch_sizes:
    for rate in learning_rates:
      # Track progress
      progress += 1
      print(f"Progress: {progress}/{iterations}")

      # Initialize result
      result = {"batch_size": size, "learning_rate": rate}

      # Every run starts from the pre-trained checkpoint.
      model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

      # Setup the trainer
      training_args = TrainingArguments(
          "fine-tuning-trainer",
          eval_strategy="epoch",
          per_device_eval_batch_size=size,
          per_device_train_batch_size=size,
          num_train_epochs=epochs,
          learning_rate=rate,
          disable_tqdm=True,
          report_to="none"
      )
      trainer = Trainer(
          model,
          training_args,
          train_dataset=tokenized_datasets["train"],
          eval_dataset=tokenized_datasets["validation"],
          data_collator=data_collator,
          processing_class=tokenizer,
          compute_metrics=compute_metrics
      )

      # Train first, then evaluate the trained model.
      train_metrics = trainer.train().metrics
      eval_metrics = trainer.evaluate()
      ft_results.append(result | train_metrics | eval_metrics | get_trainable_parameters(model))

      # Persist after every run so partial sweeps survive a disconnect.
      write_results_to_csv(results_file, ft_results)

  end_time = time.time()

  runtime_seconds = end_time - start_time
  runtime_minutes = runtime_seconds / 60

  # Output the best result (the first one wins on ties).
  max_ft_result = max(ft_results, key=lambda run: run['eval_f1'])
  print(f'\n================ \
          \nTotal Runtime: {runtime_minutes} minutes \
          \nBest Result: \
          \n\tF1={max_ft_result["eval_f1"]} \
          \n\tBatch Size={max_ft_result["batch_size"]} \
          \n\tLearning Rate={max_ft_result["learning_rate"]}'
  )

  return ft_results

# Low-Rank Adaptation (LoRA) Implementation

In [None]:
class LoraModule(nn.Module):
  """Low-rank update computing ``scale * (x @ A @ B)`` with ``scale = alpha / rank``.

  ``A`` has shape ``(in_features, rank)`` and is drawn from a standard normal
  distribution; ``B`` has shape ``(rank, out_features)`` and starts at zero,
  so the module outputs zeros until ``B`` is updated.
  """

  def __init__(self, in_features, out_features, rank, alpha):
    super().__init__()
    self.scale = alpha / rank
    down_init = torch.randn(in_features, rank)
    up_init = torch.zeros(rank, out_features)
    self.A = nn.Parameter(down_init)
    self.B = nn.Parameter(up_init)

  def forward(self, x):
    """Project ``x`` through ``A`` then ``B`` and apply the scaling factor."""
    projected = torch.matmul(torch.matmul(x, self.A), self.B)
    return self.scale * projected

class LoraLinear(nn.Module):
  """A linear layer whose output is augmented by a ``LoraModule``.

  ``linear`` may be a plain linear layer or an existing ``LoraLinear``. In the
  latter case its underlying linear layer is reused and a fresh ``LoraModule``
  is attached, so wrapping repeatedly never stacks adapters.
  """

  def __init__(self, linear, rank, alpha):
    super().__init__()
    # Unwrap a previously wrapped layer so only the original linear is kept.
    base = linear.linear if isinstance(linear, LoraLinear) else linear
    self.linear = base
    self.lora = LoraModule(base.in_features, base.out_features, rank, alpha)

  def forward(self, x):
    """Return the wrapped layer's output plus the low-rank update."""
    base_out = self.linear(x)
    return base_out + self.lora(x)

In [None]:
def configure_lora_model(model, rank, alpha):
  """Attach fresh LoRA adapters to ``model`` and freeze everything else.

  The query and value projections of every layer in ``model.bert.encoder.layer``
  are replaced, in place, by ``LoraLinear`` wrappers. Afterwards only the
  adapter matrices (parameters whose qualified name ends in ``.lora.A`` or
  ``.lora.B``) have ``requires_grad`` set to True; every other parameter of
  ``model`` is frozen.

  NOTE(review): this freezes any parameter outside the adapters, which looks
  like it includes a task-specific head if the model has one; confirm that is
  intended.

  Args:
    model: model exposing ``bert.encoder.layer`` and ``named_parameters()``.
    rank: rank of the LoRA matrices.
    alpha: LoRA scaling numerator (the adapters scale by ``alpha / rank``).
  """
  # Replace the query and value linear layers with LoRA layers
  for layer in model.bert.encoder.layer:
    attention = layer.attention.self
    attention.query = LoraLinear(attention.query, rank, alpha)
    attention.value = LoraLinear(attention.value, rank, alpha)

  # Freeze the pre-trained weights. Match the exact adapter suffix rather than
  # any name containing "A" or "B", which could unfreeze unrelated parameters.
  lora_suffixes = ('.lora.A', '.lora.B')
  for name, param in model.named_parameters():
    param.requires_grad = name.endswith(lora_suffixes)

In [None]:
from transformers import Trainer, TrainingArguments, set_seed
import time

def lora_hyperparameter_tune(checkpoint, ranks, alphas, epochs, batch_sizes, learning_rates):
  """Grid-search LoRA fine-tuning of ``checkpoint`` on GLUE MRPC.

  One model is loaded from ``checkpoint`` and reused for every run. Before
  each run ``configure_lora_model`` attaches fresh LoRA adapters for the
  current rank/alpha and freezes all other parameters. Each run trains for
  ``epochs`` epochs with the Trainer API and is evaluated on the validation
  split. The accumulated results are written to
  ``<output_dir><model name>-lora-results.csv`` after every run, so an
  interrupted session keeps what has finished so far.

  Args:
    checkpoint: model identifier passed to ``from_pretrained``.
    ranks: LoRA ranks to try.
    alphas: LoRA alpha values to try.
    epochs: number of training epochs per run.
    batch_sizes: batch sizes to try (used for both training and evaluation).
    learning_rates: learning rates to try.

  Returns:
    List with one dict per run, merging the hyperparameters, the training
    metrics, the final evaluation metrics and the parameter counts. The list
    is empty when any of the grids is empty.
  """
  iterations = len(batch_sizes)*len(learning_rates)*len(ranks)*len(alphas)
  if iterations == 0:
    # Bail out before loading anything instead of failing later with IndexError.
    print("No hyperparameter combinations to evaluate.")
    return []

  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)
  results_file = output_dir+model_name+'-lora-results.csv'

  # Setup model and dataset
  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Load the metric once; compute_metrics runs after every epoch of every run.
  metric = evaluate.load("glue", "mrpc")

  def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

  lora_results = []
  progress = 0

  # Hyperparameter tuning
  start_time = time.time()
  for rank in ranks:
    for alpha in alphas:
      for size in batch_sizes:
        for rate in learning_rates:
          # Track progress
          progress += 1
          print(f"Progress: {progress}/{iterations}")

          # Initialize result
          result = {"rank": rank, "alpha": alpha, "batch_size": size, "learning_rate": rate}

          # Configure an existing model with new LoRA layers
          configure_lora_model(model, rank, alpha)

          # Setup the trainer
          training_args = TrainingArguments(
              "lora-trainer",
              eval_strategy="epoch",
              per_device_eval_batch_size=size,
              per_device_train_batch_size=size,
              num_train_epochs=epochs,
              learning_rate=rate,
              disable_tqdm=True,
              report_to="none"
          )
          trainer = Trainer(
              model,
              training_args,
              train_dataset=tokenized_datasets["train"],
              eval_dataset=tokenized_datasets["validation"],
              data_collator=data_collator,
              processing_class=tokenizer,
              compute_metrics=compute_metrics
          )

          # Train first, then evaluate the trained model.
          train_metrics = trainer.train().metrics
          eval_metrics = trainer.evaluate()
          lora_results.append(result | train_metrics | eval_metrics | get_trainable_parameters(model))

          # Persist after every run so partial sweeps survive a disconnect.
          write_results_to_csv(results_file, lora_results)

  end_time = time.time()

  runtime_seconds = end_time - start_time
  runtime_hours = runtime_seconds / 3600

  # Output the best result (the first one wins on ties).
  max_lora_result = max(lora_results, key=lambda run: run['eval_f1'])
  print(f'\n================ \
          \nTotal Runtime: {runtime_hours} hours \
          \nBest Result: \
          \n\tF1={max_lora_result["eval_f1"]} \
          \n\tRank={max_lora_result["rank"]} \
          \n\tAlpha={max_lora_result["alpha"]} \
          \n\tBatch Size={max_lora_result["batch_size"]} \
          \n\tLearning Rate={max_lora_result["learning_rate"]}'
  )

  return lora_results

# Run

Hyperparameters from [google-research/bert](https://github.com/google-research/bert):



> For each task, we selected the best fine-tuning hyperparameters from the lists below, and trained for 4 epochs:
> * batch sizes: 8, 16, 32, 64, 128
> * learning rates: 3e-4, 1e-4, 5e-5, 3e-5



In [None]:
# Training length and search grid shared by the tuning runs; the values mirror
# the lists quoted from google-research/bert above.
epochs = 4
batch_sizes = [8, 16, 32, 64, 128]
learning_rates = [3e-4, 1e-4, 5e-5, 3e-5]

In [None]:
# LoRA-specific search grid: adapter ranks and alpha values. LoraModule scales
# its output by alpha / rank, so every (rank, alpha) pair is a distinct setting.
ranks = [1, 2, 4, 8, 16]
alphas = [1, 2, 4, 8, 16]

## BERT Tiny

### Full Fine-Tuning

In [None]:
# Run the full fine-tuning grid search for BERT-Tiny. The returned list is bound
# to `_` so the notebook does not echo it; results are also saved to CSV.
_ = ft_hyperparameter_tune("prajjwal1/bert-tiny", epochs, batch_sizes, learning_rates)

Progress: 1/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5735582113265991, 'eval_accuracy': 0.7083333333333334, 'eval_f1': 0.8155038759689922, 'eval_runtime': 0.7462, 'eval_samples_per_second': 546.755, 'eval_steps_per_second': 68.344, 'epoch': 1.0}
{'loss': 0.5946, 'grad_norm': 4.331884384155273, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.59549880027771, 'eval_accuracy': 0.7107843137254902, 'eval_f1': 0.8138801261829653, 'eval_runtime': 0.7551, 'eval_samples_per_second': 540.351, 'eval_steps_per_second': 67.544, 'epoch': 2.0}
{'loss': 0.4682, 'grad_norm': 38.01247024536133, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.7907573580741882, 'eval_accuracy': 0.7132352941176471, 'eval_f1': 0.8020304568527918, 'eval_runtime': 0.7908, 'eval_samples_per_second': 515.959, 'eval_steps_per_second': 64.495, 'epoch': 3.0}
{'loss': 0.2993, 'grad_norm': 5.858227252960205, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'eval_loss': 1.063

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.571905791759491, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8169440242057489, 'eval_runtime': 0.742, 'eval_samples_per_second': 549.881, 'eval_steps_per_second': 68.735, 'epoch': 1.0}
{'loss': 0.5884, 'grad_norm': 4.293759822845459, 'learning_rate': 7.276688453159042e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5654860734939575, 'eval_accuracy': 0.7352941176470589, 'eval_f1': 0.8285714285714286, 'eval_runtime': 0.7268, 'eval_samples_per_second': 561.396, 'eval_steps_per_second': 70.175, 'epoch': 2.0}
{'loss': 0.4993, 'grad_norm': 8.675013542175293, 'learning_rate': 4.5533769063180834e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.6270501613616943, 'eval_accuracy': 0.7352941176470589, 'eval_f1': 0.8296529968454258, 'eval_runtime': 0.7328, 'eval_samples_per_second': 556.751, 'eval_steps_per_second': 69.594, 'epoch': 3.0}
{'loss': 0.4045, 'grad_norm': 26.807472229003906, 'learning_rate': 1.8300653594771242e-05, 'epoch': 3.2679738562091503}
{'eval_loss': 0.665

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5733863711357117, 'eval_accuracy': 0.7009803921568627, 'eval_f1': 0.8134556574923547, 'eval_runtime': 0.7314, 'eval_samples_per_second': 557.82, 'eval_steps_per_second': 69.728, 'epoch': 1.0}
{'loss': 0.6013, 'grad_norm': 4.323869228363037, 'learning_rate': 3.638344226579521e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5774785876274109, 'eval_accuracy': 0.7083333333333334, 'eval_f1': 0.8194233687405159, 'eval_runtime': 0.8325, 'eval_samples_per_second': 490.097, 'eval_steps_per_second': 61.262, 'epoch': 2.0}
{'loss': 0.5438, 'grad_norm': 5.283394813537598, 'learning_rate': 2.2766884531590417e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.5611313581466675, 'eval_accuracy': 0.7279411764705882, 'eval_f1': 0.8246445497630331, 'eval_runtime': 0.8057, 'eval_samples_per_second': 506.371, 'eval_steps_per_second': 63.296, 'epoch': 3.0}
{'loss': 0.4849, 'grad_norm': 14.096569061279297, 'learning_rate': 9.150326797385621e-06, 'epoch': 3.2679738562091503}
{'eval_loss': 0.587

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5838139653205872, 'eval_accuracy': 0.7009803921568627, 'eval_f1': 0.8184523809523809, 'eval_runtime': 0.7331, 'eval_samples_per_second': 556.566, 'eval_steps_per_second': 69.571, 'epoch': 1.0}
{'loss': 0.6134, 'grad_norm': 2.5178942680358887, 'learning_rate': 2.1830065359477124e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5830901861190796, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8180451127819549, 'eval_runtime': 0.7499, 'eval_samples_per_second': 544.068, 'eval_steps_per_second': 68.008, 'epoch': 2.0}
{'loss': 0.5689, 'grad_norm': 4.079008102416992, 'learning_rate': 1.366013071895425e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.568699836730957, 'eval_accuracy': 0.7132352941176471, 'eval_f1': 0.8197226502311248, 'eval_runtime': 0.7535, 'eval_samples_per_second': 541.464, 'eval_steps_per_second': 67.683, 'epoch': 3.0}
{'loss': 0.5302, 'grad_norm': 6.041450023651123, 'learning_rate': 5.490196078431373e-06, 'epoch': 3.2679738562091503}
{'eval_loss': 0.577

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5624045133590698, 'eval_accuracy': 0.7230392156862745, 'eval_f1': 0.8231611893583725, 'eval_runtime': 0.6508, 'eval_samples_per_second': 626.926, 'eval_steps_per_second': 39.951, 'epoch': 1.0}
{'eval_loss': 0.5531349778175354, 'eval_accuracy': 0.7328431372549019, 'eval_f1': 0.8256, 'eval_runtime': 0.6683, 'eval_samples_per_second': 610.525, 'eval_steps_per_second': 38.906, 'epoch': 2.0}
{'loss': 0.5239, 'grad_norm': 15.012598991394043, 'learning_rate': 0.00013695652173913042, 'epoch': 2.1739130434782608}
{'eval_loss': 0.590812087059021, 'eval_accuracy': 0.7401960784313726, 'eval_f1': 0.8221476510067114, 'eval_runtime': 0.6627, 'eval_samples_per_second': 615.64, 'eval_steps_per_second': 39.232, 'epoch': 3.0}
{'eval_loss': 0.7519146800041199, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.8243021346469622, 'eval_runtime': 0.6452, 'eval_samples_per_second': 632.36, 'eval_steps_per_second': 40.297, 'epoch': 4.0}
{'train_runtime': 13.4429, 'train_samples_per_second': 1091

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.564354419708252, 'eval_accuracy': 0.7132352941176471, 'eval_f1': 0.8180404354587869, 'eval_runtime': 0.6696, 'eval_samples_per_second': 609.328, 'eval_steps_per_second': 38.83, 'epoch': 1.0}
{'eval_loss': 0.5474852919578552, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.831496062992126, 'eval_runtime': 0.6459, 'eval_samples_per_second': 631.712, 'eval_steps_per_second': 40.256, 'epoch': 2.0}
{'loss': 0.5577, 'grad_norm': 6.763742923736572, 'learning_rate': 4.565217391304348e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.5569795966148376, 'eval_accuracy': 0.7426470588235294, 'eval_f1': 0.8330683624801272, 'eval_runtime': 0.8569, 'eval_samples_per_second': 476.119, 'eval_steps_per_second': 30.341, 'epoch': 3.0}
{'eval_loss': 0.5896832346916199, 'eval_accuracy': 0.7328431372549019, 'eval_f1': 0.8299531981279251, 'eval_runtime': 0.6641, 'eval_samples_per_second': 614.408, 'eval_steps_per_second': 39.153, 'epoch': 4.0}
{'train_runtime': 13.3805, 'train_samples_per_sec

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5816411972045898, 'eval_accuracy': 0.6887254901960784, 'eval_f1': 0.8095952023988006, 'eval_runtime': 0.7013, 'eval_samples_per_second': 581.766, 'eval_steps_per_second': 37.073, 'epoch': 1.0}
{'eval_loss': 0.5781240463256836, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8185907046476761, 'eval_runtime': 0.6461, 'eval_samples_per_second': 631.494, 'eval_steps_per_second': 40.242, 'epoch': 2.0}
{'loss': 0.5845, 'grad_norm': 4.170251846313477, 'learning_rate': 2.282608695652174e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.568314790725708, 'eval_accuracy': 0.7083333333333334, 'eval_f1': 0.8183206106870229, 'eval_runtime': 0.6479, 'eval_samples_per_second': 629.689, 'eval_steps_per_second': 40.127, 'epoch': 3.0}
{'eval_loss': 0.568650484085083, 'eval_accuracy': 0.7205882352941176, 'eval_f1': 0.8246153846153846, 'eval_runtime': 0.6685, 'eval_samples_per_second': 610.317, 'eval_steps_per_second': 38.893, 'epoch': 4.0}
{'train_runtime': 13.4732, 'train_samples_per_se

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5934128761291504, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8193832599118943, 'eval_runtime': 0.6768, 'eval_samples_per_second': 602.868, 'eval_steps_per_second': 38.418, 'epoch': 1.0}
{'eval_loss': 0.5834196209907532, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8161434977578476, 'eval_runtime': 0.6651, 'eval_samples_per_second': 613.443, 'eval_steps_per_second': 39.092, 'epoch': 2.0}
{'loss': 0.6021, 'grad_norm': 2.227717161178589, 'learning_rate': 1.3695652173913042e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.574123740196228, 'eval_accuracy': 0.6936274509803921, 'eval_f1': 0.8114630467571644, 'eval_runtime': 0.6869, 'eval_samples_per_second': 594.0, 'eval_steps_per_second': 37.853, 'epoch': 3.0}
{'eval_loss': 0.5735155940055847, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.8126888217522659, 'eval_runtime': 0.6627, 'eval_samples_per_second': 615.687, 'eval_steps_per_second': 39.235, 'epoch': 4.0}
{'train_runtime': 13.3606, 'train_samples_per_sec

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5496113896369934, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.831496062992126, 'eval_runtime': 0.6992, 'eval_samples_per_second': 583.53, 'eval_steps_per_second': 18.593, 'epoch': 1.0}
{'eval_loss': 0.552695095539093, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.8309636650868878, 'eval_runtime': 0.621, 'eval_samples_per_second': 657.042, 'eval_steps_per_second': 20.935, 'epoch': 2.0}
{'eval_loss': 0.6253054141998291, 'eval_accuracy': 0.7450980392156863, 'eval_f1': 0.8354430379746836, 'eval_runtime': 0.6584, 'eval_samples_per_second': 619.642, 'eval_steps_per_second': 19.744, 'epoch': 3.0}
{'eval_loss': 0.6537364721298218, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.8304278922345484, 'eval_runtime': 0.6613, 'eval_samples_per_second': 616.997, 'eval_steps_per_second': 19.659, 'epoch': 4.0}
{'train_runtime': 9.3716, 'train_samples_per_second': 1565.587, 'train_steps_per_second': 49.085, 'train_loss': 0.43501543791397757, 'epoch': 4.0}
{'eval_loss': 0.65373

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.585704505443573, 'eval_accuracy': 0.6936274509803921, 'eval_f1': 0.81089258698941, 'eval_runtime': 0.6009, 'eval_samples_per_second': 678.983, 'eval_steps_per_second': 21.634, 'epoch': 1.0}
{'eval_loss': 0.5598652958869934, 'eval_accuracy': 0.7181372549019608, 'eval_f1': 0.8205928237129485, 'eval_runtime': 0.6198, 'eval_samples_per_second': 658.23, 'eval_steps_per_second': 20.973, 'epoch': 2.0}
{'eval_loss': 0.5700249075889587, 'eval_accuracy': 0.7132352941176471, 'eval_f1': 0.8186046511627907, 'eval_runtime': 0.6085, 'eval_samples_per_second': 670.486, 'eval_steps_per_second': 21.364, 'epoch': 3.0}
{'eval_loss': 0.5717409253120422, 'eval_accuracy': 0.7254901960784313, 'eval_f1': 0.8255451713395638, 'eval_runtime': 0.6602, 'eval_samples_per_second': 618.006, 'eval_steps_per_second': 19.691, 'epoch': 4.0}
{'train_runtime': 9.0286, 'train_samples_per_second': 1625.055, 'train_steps_per_second': 50.949, 'train_loss': 0.5322120666503907, 'epoch': 4.0}
{'eval_loss': 0.571740

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5961236953735352, 'eval_accuracy': 0.6936274509803921, 'eval_f1': 0.8169838945827232, 'eval_runtime': 0.6434, 'eval_samples_per_second': 634.166, 'eval_steps_per_second': 20.206, 'epoch': 1.0}
{'eval_loss': 0.5710234045982361, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.817629179331307, 'eval_runtime': 0.6076, 'eval_samples_per_second': 671.465, 'eval_steps_per_second': 21.395, 'epoch': 2.0}
{'eval_loss': 0.5729517340660095, 'eval_accuracy': 0.7083333333333334, 'eval_f1': 0.8199697428139183, 'eval_runtime': 0.653, 'eval_samples_per_second': 624.835, 'eval_steps_per_second': 19.909, 'epoch': 3.0}
{'eval_loss': 0.56805819272995, 'eval_accuracy': 0.7107843137254902, 'eval_f1': 0.8201219512195121, 'eval_runtime': 0.6589, 'eval_samples_per_second': 619.184, 'eval_steps_per_second': 19.729, 'epoch': 4.0}
{'train_runtime': 9.0593, 'train_samples_per_second': 1619.558, 'train_steps_per_second': 50.777, 'train_loss': 0.5740605893342391, 'epoch': 4.0}
{'eval_loss': 0.568058

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6053450703620911, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.6206, 'eval_samples_per_second': 657.469, 'eval_steps_per_second': 20.949, 'epoch': 1.0}
{'eval_loss': 0.5885691046714783, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8212703101920237, 'eval_runtime': 0.6184, 'eval_samples_per_second': 659.808, 'eval_steps_per_second': 21.023, 'epoch': 2.0}
{'eval_loss': 0.5840325355529785, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8166915052160953, 'eval_runtime': 0.5974, 'eval_samples_per_second': 682.958, 'eval_steps_per_second': 21.761, 'epoch': 3.0}
{'eval_loss': 0.5790587663650513, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.8138138138138138, 'eval_runtime': 0.6518, 'eval_samples_per_second': 625.915, 'eval_steps_per_second': 19.943, 'epoch': 4.0}
{'train_runtime': 9.0076, 'train_samples_per_second': 1628.848, 'train_steps_per_second': 51.068, 'train_loss': 0.5975138125212296, 'epoch': 4.0}
{'eval_loss': 0.579

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5753757357597351, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.8132530120481928, 'eval_runtime': 0.6001, 'eval_samples_per_second': 679.894, 'eval_steps_per_second': 11.665, 'epoch': 1.0}
{'eval_loss': 0.550995409488678, 'eval_accuracy': 0.7328431372549019, 'eval_f1': 0.8304821150855366, 'eval_runtime': 0.6081, 'eval_samples_per_second': 670.906, 'eval_steps_per_second': 11.511, 'epoch': 2.0}
{'eval_loss': 0.5720232129096985, 'eval_accuracy': 0.7598039215686274, 'eval_f1': 0.8434504792332268, 'eval_runtime': 0.6393, 'eval_samples_per_second': 638.161, 'eval_steps_per_second': 10.949, 'epoch': 3.0}
{'eval_loss': 0.5971444249153137, 'eval_accuracy': 0.75, 'eval_f1': 0.8386075949367089, 'eval_runtime': 0.6254, 'eval_samples_per_second': 652.365, 'eval_steps_per_second': 11.193, 'epoch': 4.0}
{'train_runtime': 7.3854, 'train_samples_per_second': 1986.622, 'train_steps_per_second': 31.413, 'train_loss': 0.4707464020827721, 'epoch': 4.0}
{'eval_loss': 0.5971444249153137, 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.594853937625885, 'eval_accuracy': 0.6936274509803921, 'eval_f1': 0.8169838945827232, 'eval_runtime': 0.5893, 'eval_samples_per_second': 692.355, 'eval_steps_per_second': 11.879, 'epoch': 1.0}
{'eval_loss': 0.5708560943603516, 'eval_accuracy': 0.7009803921568627, 'eval_f1': 0.8117283950617284, 'eval_runtime': 0.6116, 'eval_samples_per_second': 667.098, 'eval_steps_per_second': 11.445, 'epoch': 2.0}
{'eval_loss': 0.5739648938179016, 'eval_accuracy': 0.7230392156862745, 'eval_f1': 0.8269525267993875, 'eval_runtime': 0.6001, 'eval_samples_per_second': 679.847, 'eval_steps_per_second': 11.664, 'epoch': 3.0}
{'eval_loss': 0.5612186193466187, 'eval_accuracy': 0.7303921568627451, 'eval_f1': 0.8297213622291022, 'eval_runtime': 0.5953, 'eval_samples_per_second': 685.381, 'eval_steps_per_second': 11.759, 'epoch': 4.0}
{'train_runtime': 7.1947, 'train_samples_per_second': 2039.272, 'train_steps_per_second': 32.246, 'train_loss': 0.5603954052102977, 'epoch': 4.0}
{'eval_loss': 0.561

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6062899231910706, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.7113, 'eval_samples_per_second': 573.609, 'eval_steps_per_second': 9.841, 'epoch': 1.0}
{'eval_loss': 0.590184211730957, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8212703101920237, 'eval_runtime': 0.573, 'eval_samples_per_second': 712.029, 'eval_steps_per_second': 12.216, 'epoch': 2.0}
{'eval_loss': 0.5828689932823181, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.8143712574850299, 'eval_runtime': 0.591, 'eval_samples_per_second': 690.352, 'eval_steps_per_second': 11.844, 'epoch': 3.0}
{'eval_loss': 0.5777505040168762, 'eval_accuracy': 0.6887254901960784, 'eval_f1': 0.8078668683812406, 'eval_runtime': 0.631, 'eval_samples_per_second': 646.567, 'eval_steps_per_second': 11.093, 'epoch': 4.0}
{'train_runtime': 7.2679, 'train_samples_per_second': 2018.73, 'train_steps_per_second': 31.921, 'train_loss': 0.5961887096536571, 'epoch': 4.0}
{'eval_loss': 0.577750504

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6178915500640869, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.5999, 'eval_samples_per_second': 680.148, 'eval_steps_per_second': 11.669, 'epoch': 1.0}
{'eval_loss': 0.6015655398368835, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.6429, 'eval_samples_per_second': 634.607, 'eval_steps_per_second': 10.888, 'epoch': 2.0}
{'eval_loss': 0.5944027900695801, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.8181818181818182, 'eval_runtime': 0.6129, 'eval_samples_per_second': 665.714, 'eval_steps_per_second': 11.422, 'epoch': 3.0}
{'eval_loss': 0.5916686058044434, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8217967599410898, 'eval_runtime': 0.6808, 'eval_samples_per_second': 599.275, 'eval_steps_per_second': 10.282, 'epoch': 4.0}
{'train_runtime': 7.3609, 'train_samples_per_second': 1993.248, 'train_steps_per_second': 31.518, 'train_loss': 0.6165741229879445, 'epoch': 4.0}
{'eval_loss': 0.591

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6026797890663147, 'eval_accuracy': 0.6887254901960784, 'eval_f1': 0.808446455505279, 'eval_runtime': 0.5978, 'eval_samples_per_second': 682.476, 'eval_steps_per_second': 6.691, 'epoch': 1.0}
{'eval_loss': 0.5666956901550293, 'eval_accuracy': 0.7205882352941176, 'eval_f1': 0.821875, 'eval_runtime': 0.6073, 'eval_samples_per_second': 671.869, 'eval_steps_per_second': 6.587, 'epoch': 2.0}
{'eval_loss': 0.5767462253570557, 'eval_accuracy': 0.7279411764705882, 'eval_f1': 0.8262910798122066, 'eval_runtime': 0.5894, 'eval_samples_per_second': 692.18, 'eval_steps_per_second': 6.786, 'epoch': 3.0}
{'eval_loss': 0.5759289264678955, 'eval_accuracy': 0.7352941176470589, 'eval_f1': 0.8291139240506329, 'eval_runtime': 0.5917, 'eval_samples_per_second': 689.576, 'eval_steps_per_second': 6.761, 'epoch': 4.0}
{'train_runtime': 6.6345, 'train_samples_per_second': 2211.471, 'train_steps_per_second': 17.484, 'train_loss': 0.5189324411852606, 'epoch': 4.0}
{'eval_loss': 0.5759289264678955, 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6038169264793396, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.6087, 'eval_samples_per_second': 670.269, 'eval_steps_per_second': 6.571, 'epoch': 1.0}
{'eval_loss': 0.5951903462409973, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8224852071005917, 'eval_runtime': 0.5752, 'eval_samples_per_second': 709.318, 'eval_steps_per_second': 6.954, 'epoch': 2.0}
{'eval_loss': 0.579287052154541, 'eval_accuracy': 0.6911764705882353, 'eval_f1': 0.8090909090909091, 'eval_runtime': 0.591, 'eval_samples_per_second': 690.353, 'eval_steps_per_second': 6.768, 'epoch': 3.0}
{'eval_loss': 0.5748432874679565, 'eval_accuracy': 0.6911764705882353, 'eval_f1': 0.8073394495412844, 'eval_runtime': 0.5887, 'eval_samples_per_second': 693.002, 'eval_steps_per_second': 6.794, 'epoch': 4.0}
{'train_runtime': 6.5158, 'train_samples_per_second': 2251.74, 'train_steps_per_second': 17.803, 'train_loss': 0.5897226004764952, 'epoch': 4.0}
{'eval_loss': 0.574843287

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6207587718963623, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.5758, 'eval_samples_per_second': 708.542, 'eval_steps_per_second': 6.946, 'epoch': 1.0}
{'eval_loss': 0.6050003170967102, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.5963, 'eval_samples_per_second': 684.19, 'eval_steps_per_second': 6.708, 'epoch': 2.0}
{'eval_loss': 0.5970742106437683, 'eval_accuracy': 0.6887254901960784, 'eval_f1': 0.8145985401459854, 'eval_runtime': 0.5873, 'eval_samples_per_second': 694.747, 'eval_steps_per_second': 6.811, 'epoch': 3.0}
{'eval_loss': 0.5952083468437195, 'eval_accuracy': 0.6936274509803921, 'eval_f1': 0.8169838945827232, 'eval_runtime': 0.5915, 'eval_samples_per_second': 689.78, 'eval_steps_per_second': 6.763, 'epoch': 4.0}
{'train_runtime': 6.5492, 'train_samples_per_second': 2240.291, 'train_steps_per_second': 17.712, 'train_loss': 0.6204661007585197, 'epoch': 4.0}
{'eval_loss': 0.59520834

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6439599990844727, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.6205, 'eval_samples_per_second': 657.537, 'eval_steps_per_second': 6.446, 'epoch': 1.0}
{'eval_loss': 0.6185536980628967, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.5924, 'eval_samples_per_second': 688.709, 'eval_steps_per_second': 6.752, 'epoch': 2.0}
{'eval_loss': 0.6116704344749451, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.591, 'eval_samples_per_second': 690.338, 'eval_steps_per_second': 6.768, 'epoch': 3.0}
{'eval_loss': 0.609874427318573, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.6017, 'eval_samples_per_second': 678.114, 'eval_steps_per_second': 6.648, 'epoch': 4.0}
{'train_runtime': 6.6213, 'train_samples_per_second': 2215.88, 'train_steps_per_second': 17.519, 'train_loss': 0.6398460125101024, 'epoch': 4.0}
{'eval_loss': 0.609874427

### LoRA

In [None]:
# LoRA tuning run for the bert-tiny checkpoint. The rank/alpha/epoch/
# batch-size/learning-rate collections are names defined outside this cell.
# Binding the return value to `_` keeps it from being displayed as cell output.
# NOTE(review): the captured output reports "Progress: 1/500", so this
# presumably covers 500 configurations — confirm against the function body.
_ = lora_hyperparameter_tune(
    "prajjwal1/bert-tiny",
    ranks,
    alphas,
    epochs,
    batch_sizes,
    learning_rates,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 1/500
{'eval_loss': 0.6257390975952148, 'eval_accuracy': 0.6813725490196079, 'eval_f1': 0.8104956268221575, 'eval_runtime': 0.7665, 'eval_samples_per_second': 532.311, 'eval_steps_per_second': 66.539, 'epoch': 1.0}
{'loss': 0.663, 'grad_norm': 0.46347522735595703, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.6210986375808716, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.7382, 'eval_samples_per_second': 552.705, 'eval_steps_per_second': 69.088, 'epoch': 2.0}
{'loss': 0.6298, 'grad_norm': 0.40943172574043274, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.6197056770324707, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.749, 'eval_samples_per_second': 544.714, 'eval_steps_per_second': 68.089, 'epoch': 3.0}
{'loss': 0.6246, 'grad_norm': 0.42999500036239624, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.267973856209150

## BERT Mini

### Full Fine-Tuning

In [None]:
# Full fine-tuning run for the bert-mini checkpoint. The epoch/batch-size/
# learning-rate collections are names defined outside this cell.
# Binding the return value to `_` keeps it from being displayed as cell output.
# NOTE(review): the captured output reports "Progress: 1/20", so this
# presumably covers 20 configurations — confirm against the function body.
_ = ft_hyperparameter_tune(
    "prajjwal1/bert-mini",
    epochs,
    batch_sizes,
    learning_rates,
)

Progress: 1/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6088163256645203, 'eval_accuracy': 0.6936274509803921, 'eval_f1': 0.8169838945827232, 'eval_runtime': 0.8245, 'eval_samples_per_second': 494.861, 'eval_steps_per_second': 61.858, 'epoch': 1.0}
{'loss': 0.6267, 'grad_norm': 2.557328939437866, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5870793461799622, 'eval_accuracy': 0.7009803921568627, 'eval_f1': 0.8157099697885196, 'eval_runtime': 0.8712, 'eval_samples_per_second': 468.311, 'eval_steps_per_second': 58.539, 'epoch': 2.0}
{'loss': 0.5421, 'grad_norm': 13.000743865966797, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.7715953588485718, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.7615526802218114, 'eval_runtime': 1.0786, 'eval_samples_per_second': 378.28, 'eval_steps_per_second': 47.285, 'epoch': 3.0}
{'loss': 0.3946, 'grad_norm': 4.684414386749268, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'eval_loss': 1.0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.545035183429718, 'eval_accuracy': 0.7303921568627451, 'eval_f1': 0.819672131147541, 'eval_runtime': 0.8119, 'eval_samples_per_second': 502.497, 'eval_steps_per_second': 62.812, 'epoch': 1.0}
{'loss': 0.584, 'grad_norm': 5.325846195220947, 'learning_rate': 7.276688453159042e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5319600701332092, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8333333333333334, 'eval_runtime': 0.8258, 'eval_samples_per_second': 494.086, 'eval_steps_per_second': 61.761, 'epoch': 2.0}
{'loss': 0.4395, 'grad_norm': 41.37472915649414, 'learning_rate': 4.5533769063180834e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.769145131111145, 'eval_accuracy': 0.7328431372549019, 'eval_f1': 0.8161888701517707, 'eval_runtime': 0.8119, 'eval_samples_per_second': 502.495, 'eval_steps_per_second': 62.812, 'epoch': 3.0}
{'loss': 0.2954, 'grad_norm': 73.53695678710938, 'learning_rate': 1.8300653594771242e-05, 'epoch': 3.2679738562091503}
{'eval_loss': 0.955430

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5113222002983093, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8402555910543131, 'eval_runtime': 0.8379, 'eval_samples_per_second': 486.926, 'eval_steps_per_second': 60.866, 'epoch': 1.0}
{'loss': 0.5659, 'grad_norm': 9.544868469238281, 'learning_rate': 3.638344226579521e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.47284623980522156, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8412162162162162, 'eval_runtime': 0.7853, 'eval_samples_per_second': 519.524, 'eval_steps_per_second': 64.941, 'epoch': 2.0}
{'loss': 0.4328, 'grad_norm': 14.25734806060791, 'learning_rate': 2.2766884531590417e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.5246177315711975, 'eval_accuracy': 0.7941176470588235, 'eval_f1': 0.8515901060070671, 'eval_runtime': 0.8108, 'eval_samples_per_second': 503.197, 'eval_steps_per_second': 62.9, 'epoch': 3.0}
{'loss': 0.3148, 'grad_norm': 19.18421173095703, 'learning_rate': 9.150326797385621e-06, 'epoch': 3.2679738562091503}
{'eval_loss': 0.5896

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5119180679321289, 'eval_accuracy': 0.7647058823529411, 'eval_f1': 0.8446601941747572, 'eval_runtime': 0.7846, 'eval_samples_per_second': 519.997, 'eval_steps_per_second': 65.0, 'epoch': 1.0}
{'loss': 0.5747, 'grad_norm': 5.824169635772705, 'learning_rate': 2.1830065359477124e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.46384602785110474, 'eval_accuracy': 0.7794117647058824, 'eval_f1': 0.8484848484848485, 'eval_runtime': 0.8295, 'eval_samples_per_second': 491.877, 'eval_steps_per_second': 61.485, 'epoch': 2.0}
{'loss': 0.4709, 'grad_norm': 22.491836547851562, 'learning_rate': 1.366013071895425e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.4952385127544403, 'eval_accuracy': 0.7745098039215687, 'eval_f1': 0.8380281690140845, 'eval_runtime': 0.7864, 'eval_samples_per_second': 518.848, 'eval_steps_per_second': 64.856, 'epoch': 3.0}
{'loss': 0.389, 'grad_norm': 23.34096336364746, 'learning_rate': 5.490196078431373e-06, 'epoch': 3.2679738562091503}
{'eval_loss': 0.5073

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5968839526176453, 'eval_accuracy': 0.7132352941176471, 'eval_f1': 0.8245877061469266, 'eval_runtime': 0.6852, 'eval_samples_per_second': 595.431, 'eval_steps_per_second': 37.944, 'epoch': 1.0}
{'eval_loss': 0.5948936939239502, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.7816901408450704, 'eval_runtime': 0.8519, 'eval_samples_per_second': 478.932, 'eval_steps_per_second': 30.52, 'epoch': 2.0}
{'loss': 0.5319, 'grad_norm': 14.480657577514648, 'learning_rate': 0.00013695652173913042, 'epoch': 2.1739130434782608}
{'eval_loss': 0.7887159585952759, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.7816901408450704, 'eval_runtime': 0.727, 'eval_samples_per_second': 561.173, 'eval_steps_per_second': 35.761, 'epoch': 3.0}
{'eval_loss': 1.0651037693023682, 'eval_accuracy': 0.7230392156862745, 'eval_f1': 0.8021015761821366, 'eval_runtime': 0.6948, 'eval_samples_per_second': 587.18, 'eval_steps_per_second': 37.418, 'epoch': 4.0}
{'train_runtime': 21.5252, 'train_samples_per_sec

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5195183157920837, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8392282958199357, 'eval_runtime': 0.7163, 'eval_samples_per_second': 569.582, 'eval_steps_per_second': 36.297, 'epoch': 1.0}
{'eval_loss': 0.4973233640193939, 'eval_accuracy': 0.7622549019607843, 'eval_f1': 0.8427876823338736, 'eval_runtime': 0.698, 'eval_samples_per_second': 584.537, 'eval_steps_per_second': 37.25, 'epoch': 2.0}
{'loss': 0.5003, 'grad_norm': 7.2889227867126465, 'learning_rate': 4.565217391304348e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.5796966552734375, 'eval_accuracy': 0.7671568627450981, 'eval_f1': 0.8376068376068376, 'eval_runtime': 0.7421, 'eval_samples_per_second': 549.791, 'eval_steps_per_second': 35.036, 'epoch': 3.0}
{'eval_loss': 0.6599036455154419, 'eval_accuracy': 0.7671568627450981, 'eval_f1': 0.8359240069084629, 'eval_runtime': 0.6942, 'eval_samples_per_second': 587.723, 'eval_steps_per_second': 37.453, 'epoch': 4.0}
{'train_runtime': 21.3818, 'train_samples_per_s

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.4999988079071045, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.8352745424292846, 'eval_runtime': 0.7686, 'eval_samples_per_second': 530.814, 'eval_steps_per_second': 33.826, 'epoch': 1.0}
{'eval_loss': 0.4582745134830475, 'eval_accuracy': 0.7818627450980392, 'eval_f1': 0.8524046434494196, 'eval_runtime': 0.6894, 'eval_samples_per_second': 591.849, 'eval_steps_per_second': 37.716, 'epoch': 2.0}
{'loss': 0.5164, 'grad_norm': 12.530374526977539, 'learning_rate': 2.282608695652174e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.4496159255504608, 'eval_accuracy': 0.7965686274509803, 'eval_f1': 0.851520572450805, 'eval_runtime': 0.7, 'eval_samples_per_second': 582.855, 'eval_steps_per_second': 37.143, 'epoch': 3.0}
{'eval_loss': 0.46178555488586426, 'eval_accuracy': 0.8063725490196079, 'eval_f1': 0.8626086956521739, 'eval_runtime': 0.6873, 'eval_samples_per_second': 593.645, 'eval_steps_per_second': 37.83, 'epoch': 4.0}
{'train_runtime': 21.2484, 'train_samples_per_sec

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5118685960769653, 'eval_accuracy': 0.7671568627450981, 'eval_f1': 0.8434925864909391, 'eval_runtime': 0.6867, 'eval_samples_per_second': 594.144, 'eval_steps_per_second': 37.862, 'epoch': 1.0}
{'eval_loss': 0.47276729345321655, 'eval_accuracy': 0.7769607843137255, 'eval_f1': 0.8480801335559266, 'eval_runtime': 0.694, 'eval_samples_per_second': 587.933, 'eval_steps_per_second': 37.466, 'epoch': 2.0}
{'loss': 0.5392, 'grad_norm': 6.2725348472595215, 'learning_rate': 1.3695652173913042e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.4662048816680908, 'eval_accuracy': 0.7720588235294118, 'eval_f1': 0.8415672913117547, 'eval_runtime': 0.7077, 'eval_samples_per_second': 576.477, 'eval_steps_per_second': 36.736, 'epoch': 3.0}
{'eval_loss': 0.4712519943714142, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8384879725085911, 'eval_runtime': 0.7204, 'eval_samples_per_second': 566.383, 'eval_steps_per_second': 36.093, 'epoch': 4.0}
{'train_runtime': 21.2583, 'train_samples_pe

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5351694822311401, 'eval_accuracy': 0.7303921568627451, 'eval_f1': 0.8148148148148148, 'eval_runtime': 0.6891, 'eval_samples_per_second': 592.04, 'eval_steps_per_second': 18.864, 'epoch': 1.0}
{'eval_loss': 0.5405896306037903, 'eval_accuracy': 0.7598039215686274, 'eval_f1': 0.8409090909090909, 'eval_runtime': 0.6578, 'eval_samples_per_second': 620.294, 'eval_steps_per_second': 19.764, 'epoch': 2.0}
{'eval_loss': 0.600274384021759, 'eval_accuracy': 0.7524509803921569, 'eval_f1': 0.8160291438979964, 'eval_runtime': 0.6599, 'eval_samples_per_second': 618.278, 'eval_steps_per_second': 19.7, 'epoch': 3.0}
{'eval_loss': 0.8027786612510681, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.8330522765598651, 'eval_runtime': 0.8814, 'eval_samples_per_second': 462.876, 'eval_steps_per_second': 14.749, 'epoch': 4.0}
{'train_runtime': 16.5811, 'train_samples_per_second': 884.865, 'train_steps_per_second': 27.743, 'train_loss': 0.365704577902089, 'epoch': 4.0}
{'eval_loss': 0.8027786

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.49950075149536133, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.8276972624798712, 'eval_runtime': 0.656, 'eval_samples_per_second': 621.936, 'eval_steps_per_second': 19.817, 'epoch': 1.0}
{'eval_loss': 0.44355347752571106, 'eval_accuracy': 0.8063725490196079, 'eval_f1': 0.8621291448516579, 'eval_runtime': 0.6577, 'eval_samples_per_second': 620.366, 'eval_steps_per_second': 19.767, 'epoch': 2.0}
{'eval_loss': 0.5039736032485962, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8384879725085911, 'eval_runtime': 0.7084, 'eval_samples_per_second': 575.909, 'eval_steps_per_second': 18.35, 'epoch': 3.0}
{'eval_loss': 0.5225716829299927, 'eval_accuracy': 0.7745098039215687, 'eval_f1': 0.8408304498269896, 'eval_runtime': 0.68, 'eval_samples_per_second': 600.041, 'eval_steps_per_second': 19.119, 'epoch': 4.0}
{'train_runtime': 16.2877, 'train_samples_per_second': 900.802, 'train_steps_per_second': 28.242, 'train_loss': 0.4027526192043139, 'epoch': 4.0}
{'eval_loss': 0.5225

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5463365912437439, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.837138508371385, 'eval_runtime': 0.7034, 'eval_samples_per_second': 580.058, 'eval_steps_per_second': 18.482, 'epoch': 1.0}
{'eval_loss': 0.46782827377319336, 'eval_accuracy': 0.7647058823529411, 'eval_f1': 0.8383838383838383, 'eval_runtime': 0.6588, 'eval_samples_per_second': 619.326, 'eval_steps_per_second': 19.733, 'epoch': 2.0}
{'eval_loss': 0.4775208532810211, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8422818791946308, 'eval_runtime': 0.6649, 'eval_samples_per_second': 613.582, 'eval_steps_per_second': 19.55, 'epoch': 3.0}
{'eval_loss': 0.4690450131893158, 'eval_accuracy': 0.7769607843137255, 'eval_f1': 0.842832469775475, 'eval_runtime': 0.6527, 'eval_samples_per_second': 625.107, 'eval_steps_per_second': 19.918, 'epoch': 4.0}
{'train_runtime': 16.3524, 'train_samples_per_second': 897.237, 'train_steps_per_second': 28.13, 'train_loss': 0.4728426394255265, 'epoch': 4.0}
{'eval_loss': 0.46904

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5599589347839355, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8242424242424242, 'eval_runtime': 0.6557, 'eval_samples_per_second': 622.204, 'eval_steps_per_second': 19.825, 'epoch': 1.0}
{'eval_loss': 0.5012838244438171, 'eval_accuracy': 0.7598039215686274, 'eval_f1': 0.8414239482200647, 'eval_runtime': 0.6734, 'eval_samples_per_second': 605.868, 'eval_steps_per_second': 19.305, 'epoch': 2.0}
{'eval_loss': 0.49729257822036743, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.8379705400981997, 'eval_runtime': 0.7044, 'eval_samples_per_second': 579.206, 'eval_steps_per_second': 18.455, 'epoch': 3.0}
{'eval_loss': 0.48942670226097107, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8344370860927153, 'eval_runtime': 0.6645, 'eval_samples_per_second': 614.021, 'eval_steps_per_second': 19.564, 'epoch': 4.0}
{'train_runtime': 16.2174, 'train_samples_per_second': 904.705, 'train_steps_per_second': 28.365, 'train_loss': 0.5142016203507133, 'epoch': 4.0}
{'eval_loss': 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5231978297233582, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.841091492776886, 'eval_runtime': 0.6594, 'eval_samples_per_second': 618.723, 'eval_steps_per_second': 10.615, 'epoch': 1.0}
{'eval_loss': 0.5209354162216187, 'eval_accuracy': 0.75, 'eval_f1': 0.8104089219330854, 'eval_runtime': 0.675, 'eval_samples_per_second': 604.414, 'eval_steps_per_second': 10.37, 'epoch': 2.0}
{'eval_loss': 0.5699172616004944, 'eval_accuracy': 0.7647058823529411, 'eval_f1': 0.84, 'eval_runtime': 0.6433, 'eval_samples_per_second': 634.273, 'eval_steps_per_second': 10.882, 'epoch': 3.0}
{'eval_loss': 0.6639034152030945, 'eval_accuracy': 0.7598039215686274, 'eval_f1': 0.8280701754385965, 'eval_runtime': 0.735, 'eval_samples_per_second': 555.092, 'eval_steps_per_second': 9.524, 'epoch': 4.0}
{'train_runtime': 15.1168, 'train_samples_per_second': 970.579, 'train_steps_per_second': 15.347, 'train_loss': 0.36812943425671807, 'epoch': 4.0}
{'eval_loss': 0.6639034152030945, 'eval_accuracy':

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.558167576789856, 'eval_accuracy': 0.7279411764705882, 'eval_f1': 0.832579185520362, 'eval_runtime': 0.6647, 'eval_samples_per_second': 613.794, 'eval_steps_per_second': 10.531, 'epoch': 1.0}
{'eval_loss': 0.47555169463157654, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8362369337979094, 'eval_runtime': 0.6406, 'eval_samples_per_second': 636.902, 'eval_steps_per_second': 10.927, 'epoch': 2.0}
{'eval_loss': 0.48263099789619446, 'eval_accuracy': 0.7818627450980392, 'eval_f1': 0.8499156829679595, 'eval_runtime': 0.672, 'eval_samples_per_second': 607.164, 'eval_steps_per_second': 10.417, 'epoch': 3.0}
{'eval_loss': 0.47803330421447754, 'eval_accuracy': 0.7990196078431373, 'eval_f1': 0.8610169491525423, 'eval_runtime': 0.6626, 'eval_samples_per_second': 615.743, 'eval_steps_per_second': 10.564, 'epoch': 4.0}
{'train_runtime': 15.025, 'train_samples_per_second': 976.505, 'train_steps_per_second': 15.441, 'train_loss': 0.4474343266980401, 'epoch': 4.0}
{'eval_loss': 0.478

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.561064600944519, 'eval_accuracy': 0.7230392156862745, 'eval_f1': 0.8274809160305343, 'eval_runtime': 0.6586, 'eval_samples_per_second': 619.48, 'eval_steps_per_second': 10.628, 'epoch': 1.0}
{'eval_loss': 0.4964536726474762, 'eval_accuracy': 0.7671568627450981, 'eval_f1': 0.8455284552845529, 'eval_runtime': 0.6387, 'eval_samples_per_second': 638.839, 'eval_steps_per_second': 10.96, 'epoch': 2.0}
{'eval_loss': 0.49491190910339355, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8448844884488449, 'eval_runtime': 0.6443, 'eval_samples_per_second': 633.273, 'eval_steps_per_second': 10.865, 'epoch': 3.0}
{'eval_loss': 0.48988640308380127, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8433333333333334, 'eval_runtime': 0.6307, 'eval_samples_per_second': 646.851, 'eval_steps_per_second': 11.098, 'epoch': 4.0}
{'train_runtime': 14.9936, 'train_samples_per_second': 978.552, 'train_steps_per_second': 15.473, 'train_loss': 0.5032135535930765, 'epoch': 4.0}
{'eval_loss': 0.489

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5779255032539368, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8150375939849624, 'eval_runtime': 0.6496, 'eval_samples_per_second': 628.085, 'eval_steps_per_second': 10.776, 'epoch': 1.0}
{'eval_loss': 0.5288864970207214, 'eval_accuracy': 0.7450980392156863, 'eval_f1': 0.8333333333333334, 'eval_runtime': 0.6467, 'eval_samples_per_second': 630.938, 'eval_steps_per_second': 10.825, 'epoch': 2.0}
{'eval_loss': 0.5168865919113159, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8376623376623377, 'eval_runtime': 0.6656, 'eval_samples_per_second': 612.953, 'eval_steps_per_second': 10.516, 'epoch': 3.0}
{'eval_loss': 0.5123254656791687, 'eval_accuracy': 0.7524509803921569, 'eval_f1': 0.835236541598695, 'eval_runtime': 0.7396, 'eval_samples_per_second': 551.65, 'eval_steps_per_second': 9.465, 'epoch': 4.0}
{'train_runtime': 15.0373, 'train_samples_per_second': 975.708, 'train_steps_per_second': 15.428, 'train_loss': 0.5400381417110048, 'epoch': 4.0}
{'eval_loss': 0.51232

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5615370273590088, 'eval_accuracy': 0.7279411764705882, 'eval_f1': 0.8229665071770335, 'eval_runtime': 0.6347, 'eval_samples_per_second': 642.856, 'eval_steps_per_second': 6.303, 'epoch': 1.0}
{'eval_loss': 0.5163636803627014, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.832520325203252, 'eval_runtime': 0.646, 'eval_samples_per_second': 631.591, 'eval_steps_per_second': 6.192, 'epoch': 2.0}
{'eval_loss': 0.587872326374054, 'eval_accuracy': 0.7426470588235294, 'eval_f1': 0.8303715670436187, 'eval_runtime': 0.6585, 'eval_samples_per_second': 619.595, 'eval_steps_per_second': 6.074, 'epoch': 3.0}
{'eval_loss': 0.5940830111503601, 'eval_accuracy': 0.75, 'eval_f1': 0.8277027027027027, 'eval_runtime': 0.6505, 'eval_samples_per_second': 627.243, 'eval_steps_per_second': 6.149, 'epoch': 4.0}
{'train_runtime': 14.4393, 'train_samples_per_second': 1016.114, 'train_steps_per_second': 8.034, 'train_loss': 0.4550747378119107, 'epoch': 4.0}
{'eval_loss': 0.5940830111503601, 'eval

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.578144907951355, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8258258258258259, 'eval_runtime': 0.6484, 'eval_samples_per_second': 629.198, 'eval_steps_per_second': 6.169, 'epoch': 1.0}
{'eval_loss': 0.5186727643013, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.8341384863123994, 'eval_runtime': 0.6306, 'eval_samples_per_second': 647.042, 'eval_steps_per_second': 6.344, 'epoch': 2.0}
{'eval_loss': 0.49667444825172424, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8327759197324415, 'eval_runtime': 0.6584, 'eval_samples_per_second': 619.718, 'eval_steps_per_second': 6.076, 'epoch': 3.0}
{'eval_loss': 0.4915316700935364, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.8239316239316239, 'eval_runtime': 0.7105, 'eval_samples_per_second': 574.207, 'eval_steps_per_second': 5.629, 'epoch': 4.0}
{'train_runtime': 14.4571, 'train_samples_per_second': 1014.863, 'train_steps_per_second': 8.024, 'train_loss': 0.5085905009302599, 'epoch': 4.0}
{'eval_loss': 0.491531670

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5782576203346252, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8170731707317073, 'eval_runtime': 0.6538, 'eval_samples_per_second': 623.998, 'eval_steps_per_second': 6.118, 'epoch': 1.0}
{'eval_loss': 0.5304329991340637, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.8352, 'eval_runtime': 0.6321, 'eval_samples_per_second': 645.431, 'eval_steps_per_second': 6.328, 'epoch': 2.0}
{'eval_loss': 0.5076761245727539, 'eval_accuracy': 0.7598039215686274, 'eval_f1': 0.839344262295082, 'eval_runtime': 0.6595, 'eval_samples_per_second': 618.646, 'eval_steps_per_second': 6.065, 'epoch': 3.0}
{'eval_loss': 0.5055544376373291, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.8366013071895425, 'eval_runtime': 0.6424, 'eval_samples_per_second': 635.088, 'eval_steps_per_second': 6.226, 'epoch': 4.0}
{'train_runtime': 14.3729, 'train_samples_per_second': 1020.808, 'train_steps_per_second': 8.071, 'train_loss': 0.54274792506777, 'epoch': 4.0}
{'eval_loss': 0.5055544376373291, 'ev

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5957663059234619, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8183161004431314, 'eval_runtime': 0.6509, 'eval_samples_per_second': 626.864, 'eval_steps_per_second': 6.146, 'epoch': 1.0}
{'eval_loss': 0.5634233355522156, 'eval_accuracy': 0.7034313725490197, 'eval_f1': 0.8163884673748103, 'eval_runtime': 0.6358, 'eval_samples_per_second': 641.721, 'eval_steps_per_second': 6.291, 'epoch': 2.0}
{'eval_loss': 0.5451557040214539, 'eval_accuracy': 0.7352941176470589, 'eval_f1': 0.8301886792452831, 'eval_runtime': 0.6833, 'eval_samples_per_second': 597.089, 'eval_steps_per_second': 5.854, 'epoch': 3.0}
{'eval_loss': 0.5405035614967346, 'eval_accuracy': 0.7450980392156863, 'eval_f1': 0.8354430379746836, 'eval_runtime': 0.636, 'eval_samples_per_second': 641.479, 'eval_steps_per_second': 6.289, 'epoch': 4.0}
{'train_runtime': 14.4151, 'train_samples_per_second': 1017.825, 'train_steps_per_second': 8.047, 'train_loss': 0.5715737836114292, 'epoch': 4.0}
{'eval_loss': 0.5405035

### LoRA

In [None]:
# LoRA tuning run for the bert-mini checkpoint. The rank/alpha/epoch/
# batch-size/learning-rate collections are names defined outside this cell.
# Binding the return value to `_` keeps it from being displayed as cell output.
# NOTE(review): the captured output reports "Progress: 1/500", so this
# presumably covers 500 configurations — confirm against the function body.
_ = lora_hyperparameter_tune(
    "prajjwal1/bert-mini",
    ranks,
    alphas,
    epochs,
    batch_sizes,
    learning_rates,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 1/500
{'eval_loss': 0.6154337525367737, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.8525, 'eval_samples_per_second': 478.567, 'eval_steps_per_second': 59.821, 'epoch': 1.0}
{'loss': 0.634, 'grad_norm': 0.4864899814128876, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5992273092269897, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8219584569732937, 'eval_runtime': 0.8551, 'eval_samples_per_second': 477.12, 'eval_steps_per_second': 59.64, 'epoch': 2.0}
{'loss': 0.6152, 'grad_norm': 1.3363189697265625, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.5883147120475769, 'eval_accuracy': 0.7107843137254902, 'eval_f1': 0.8201219512195121, 'eval_runtime': 0.9861, 'eval_samples_per_second': 413.733, 'eval_steps_per_second': 51.717, 'epoch': 3.0}
{'loss': 0.5894, 'grad_norm': 2.013890027999878, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'

## BERT Small

### Full Fine-Tuning

In [None]:
# Full fine-tuning hyperparameter sweep on the prajjwal1/bert-small checkpoint.
# The search grids (epochs, batch_sizes, learning_rates) are defined in an
# earlier cell. The return value is assigned to `_`, so the cell shows only the
# training/evaluation logs instead of echoing the result.
_ = ft_hyperparameter_tune(
    "prajjwal1/bert-small",
    epochs,
    batch_sizes,
    learning_rates,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Progress: 1/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.6314049363136292, 'eval_accuracy': 0.678921568627451, 'eval_f1': 0.7937007874015748, 'eval_runtime': 1.0283, 'eval_samples_per_second': 396.766, 'eval_steps_per_second': 49.596, 'epoch': 1.0}
{'loss': 0.648, 'grad_norm': 2.3528854846954346, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.6235907077789307, 'eval_accuracy': 0.6911764705882353, 'eval_f1': 0.8157894736842105, 'eval_runtime': 0.9006, 'eval_samples_per_second': 453.019, 'eval_steps_per_second': 56.627, 'epoch': 2.0}
{'loss': 0.6408, 'grad_norm': 6.450732707977295, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.63517165184021, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.8396, 'eval_samples_per_second': 485.961, 'eval_steps_per_second': 60.745, 'epoch': 3.0}
{'loss': 0.6329, 'grad_norm': 6.159433364868164, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'eval_loss': 0.6240

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.562410831451416, 'eval_accuracy': 0.7377450980392157, 'eval_f1': 0.837138508371385, 'eval_runtime': 0.9167, 'eval_samples_per_second': 445.067, 'eval_steps_per_second': 55.633, 'epoch': 1.0}
{'loss': 0.5804, 'grad_norm': 7.497133255004883, 'learning_rate': 7.276688453159042e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.6133310198783875, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.8421052631578947, 'eval_runtime': 0.8591, 'eval_samples_per_second': 474.942, 'eval_steps_per_second': 59.368, 'epoch': 2.0}
{'loss': 0.4025, 'grad_norm': 7.244646072387695, 'learning_rate': 4.5533769063180834e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.9236742854118347, 'eval_accuracy': 0.7647058823529411, 'eval_f1': 0.8378378378378378, 'eval_runtime': 0.8636, 'eval_samples_per_second': 472.436, 'eval_steps_per_second': 59.054, 'epoch': 3.0}
{'loss': 0.209, 'grad_norm': 0.6266918778419495, 'learning_rate': 1.8300653594771242e-05, 'epoch': 3.2679738562091503}
{'eval_loss': 1.2381

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.46955713629722595, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8517350157728707, 'eval_runtime': 0.9038, 'eval_samples_per_second': 451.451, 'eval_steps_per_second': 56.431, 'epoch': 1.0}
{'loss': 0.5605, 'grad_norm': 11.963642120361328, 'learning_rate': 3.638344226579521e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.4801804721355438, 'eval_accuracy': 0.8308823529411765, 'eval_f1': 0.8851913477537438, 'eval_runtime': 1.0707, 'eval_samples_per_second': 381.067, 'eval_steps_per_second': 47.633, 'epoch': 2.0}
{'loss': 0.3475, 'grad_norm': 14.63426399230957, 'learning_rate': 2.2766884531590417e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.6653249263763428, 'eval_accuracy': 0.8357843137254902, 'eval_f1': 0.8830715532286213, 'eval_runtime': 0.8724, 'eval_samples_per_second': 467.666, 'eval_steps_per_second': 58.458, 'epoch': 3.0}
{'loss': 0.1949, 'grad_norm': 0.5042751431465149, 'learning_rate': 9.150326797385621e-06, 'epoch': 3.2679738562091503}
{'eval_loss': 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5211499333381653, 'eval_accuracy': 0.7450980392156863, 'eval_f1': 0.8395061728395061, 'eval_runtime': 0.8643, 'eval_samples_per_second': 472.061, 'eval_steps_per_second': 59.008, 'epoch': 1.0}
{'loss': 0.5729, 'grad_norm': 6.145843029022217, 'learning_rate': 2.1830065359477124e-05, 'epoch': 1.0893246187363834}
{'eval_loss': 0.4055912494659424, 'eval_accuracy': 0.8235294117647058, 'eval_f1': 0.8787878787878788, 'eval_runtime': 0.8834, 'eval_samples_per_second': 461.831, 'eval_steps_per_second': 57.729, 'epoch': 2.0}
{'loss': 0.3931, 'grad_norm': 22.28466033935547, 'learning_rate': 1.366013071895425e-05, 'epoch': 2.178649237472767}
{'eval_loss': 0.4193804860115051, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8829787234042553, 'eval_runtime': 0.8491, 'eval_samples_per_second': 480.524, 'eval_steps_per_second': 60.065, 'epoch': 3.0}
{'loss': 0.2516, 'grad_norm': 3.0146636962890625, 'learning_rate': 5.490196078431373e-06, 'epoch': 3.2679738562091503}
{'eval_loss': 0.59

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.579662561416626, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.8222, 'eval_samples_per_second': 496.205, 'eval_steps_per_second': 31.621, 'epoch': 1.0}
{'eval_loss': 0.564789354801178, 'eval_accuracy': 0.7352941176470589, 'eval_f1': 0.8328173374613003, 'eval_runtime': 0.8418, 'eval_samples_per_second': 484.702, 'eval_steps_per_second': 30.888, 'epoch': 2.0}
{'loss': 0.5701, 'grad_norm': 10.844886779785156, 'learning_rate': 0.00013695652173913042, 'epoch': 2.1739130434782608}
{'eval_loss': 0.8407540321350098, 'eval_accuracy': 0.6691176470588235, 'eval_f1': 0.7398843930635838, 'eval_runtime': 0.8422, 'eval_samples_per_second': 484.418, 'eval_steps_per_second': 30.87, 'epoch': 3.0}
{'eval_loss': 1.071877121925354, 'eval_accuracy': 0.7181372549019608, 'eval_f1': 0.7978910369068541, 'eval_runtime': 0.8208, 'eval_samples_per_second': 497.066, 'eval_steps_per_second': 31.676, 'epoch': 4.0}
{'train_runtime': 44.1305, 'train_samples_per_se

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5473975539207458, 'eval_accuracy': 0.7450980392156863, 'eval_f1': 0.8395061728395061, 'eval_runtime': 0.8513, 'eval_samples_per_second': 479.285, 'eval_steps_per_second': 30.543, 'epoch': 1.0}
{'eval_loss': 0.49668586254119873, 'eval_accuracy': 0.7696078431372549, 'eval_f1': 0.8433333333333334, 'eval_runtime': 0.8525, 'eval_samples_per_second': 478.568, 'eval_steps_per_second': 30.497, 'epoch': 2.0}
{'loss': 0.476, 'grad_norm': 0.7576995491981506, 'learning_rate': 4.565217391304348e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.6562468409538269, 'eval_accuracy': 0.7843137254901961, 'eval_f1': 0.8461538461538461, 'eval_runtime': 0.8255, 'eval_samples_per_second': 494.262, 'eval_steps_per_second': 31.497, 'epoch': 3.0}
{'eval_loss': 0.9898021817207336, 'eval_accuracy': 0.7671568627450981, 'eval_f1': 0.8330404217926186, 'eval_runtime': 1.0, 'eval_samples_per_second': 408.015, 'eval_steps_per_second': 26.001, 'epoch': 4.0}
{'train_runtime': 44.2443, 'train_samples_per_se

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5045546293258667, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.8460342146189735, 'eval_runtime': 0.8164, 'eval_samples_per_second': 499.778, 'eval_steps_per_second': 31.849, 'epoch': 1.0}
{'eval_loss': 0.4622404873371124, 'eval_accuracy': 0.7867647058823529, 'eval_f1': 0.85667215815486, 'eval_runtime': 0.8468, 'eval_samples_per_second': 481.805, 'eval_steps_per_second': 30.703, 'epoch': 2.0}
{'loss': 0.4786, 'grad_norm': 10.434516906738281, 'learning_rate': 2.282608695652174e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.47002848982810974, 'eval_accuracy': 0.8161764705882353, 'eval_f1': 0.8628884826325411, 'eval_runtime': 0.844, 'eval_samples_per_second': 483.433, 'eval_steps_per_second': 30.807, 'epoch': 3.0}
{'eval_loss': 0.5593783855438232, 'eval_accuracy': 0.8112745098039216, 'eval_f1': 0.8670120898100173, 'eval_runtime': 0.8341, 'eval_samples_per_second': 489.142, 'eval_steps_per_second': 31.171, 'epoch': 4.0}
{'train_runtime': 44.2229, 'train_samples_per_s

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5061951279640198, 'eval_accuracy': 0.7598039215686274, 'eval_f1': 0.8444444444444444, 'eval_runtime': 0.8069, 'eval_samples_per_second': 505.636, 'eval_steps_per_second': 32.222, 'epoch': 1.0}
{'eval_loss': 0.43950167298316956, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.8617886178861789, 'eval_runtime': 0.8514, 'eval_samples_per_second': 479.206, 'eval_steps_per_second': 30.538, 'epoch': 2.0}
{'loss': 0.5054, 'grad_norm': 10.137059211730957, 'learning_rate': 1.3695652173913042e-05, 'epoch': 2.1739130434782608}
{'eval_loss': 0.3917691707611084, 'eval_accuracy': 0.8137254901960784, 'eval_f1': 0.8666666666666667, 'eval_runtime': 0.8193, 'eval_samples_per_second': 497.993, 'eval_steps_per_second': 31.735, 'epoch': 3.0}
{'eval_loss': 0.42956361174583435, 'eval_accuracy': 0.8014705882352942, 'eval_f1': 0.8638655462184874, 'eval_runtime': 0.8182, 'eval_samples_per_second': 498.676, 'eval_steps_per_second': 31.778, 'epoch': 4.0}
{'train_runtime': 44.811, 'train_samples_p

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5746538639068604, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8144796380090498, 'eval_runtime': 0.7948, 'eval_samples_per_second': 513.343, 'eval_steps_per_second': 16.357, 'epoch': 1.0}
{'eval_loss': 0.5813997387886047, 'eval_accuracy': 0.7279411764705882, 'eval_f1': 0.8206785137318255, 'eval_runtime': 0.8, 'eval_samples_per_second': 510.025, 'eval_steps_per_second': 16.251, 'epoch': 2.0}
{'eval_loss': 0.7434532046318054, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.7533460803059273, 'eval_runtime': 0.8186, 'eval_samples_per_second': 498.417, 'eval_steps_per_second': 15.881, 'epoch': 3.0}
{'eval_loss': 1.0397051572799683, 'eval_accuracy': 0.7009803921568627, 'eval_f1': 0.7932203389830509, 'eval_runtime': 0.8098, 'eval_samples_per_second': 503.802, 'eval_steps_per_second': 16.053, 'epoch': 4.0}
{'train_runtime': 38.2949, 'train_samples_per_second': 383.132, 'train_steps_per_second': 12.012, 'train_loss': 0.3888619796089504, 'epoch': 4.0}
{'eval_loss': 1.03970

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.49568262696266174, 'eval_accuracy': 0.7426470588235294, 'eval_f1': 0.8198970840480274, 'eval_runtime': 0.8091, 'eval_samples_per_second': 504.282, 'eval_steps_per_second': 16.068, 'epoch': 1.0}
{'eval_loss': 0.40469300746917725, 'eval_accuracy': 0.8259803921568627, 'eval_f1': 0.8798646362098139, 'eval_runtime': 0.7974, 'eval_samples_per_second': 511.633, 'eval_steps_per_second': 16.302, 'epoch': 2.0}
{'eval_loss': 0.4541195034980774, 'eval_accuracy': 0.8186274509803921, 'eval_f1': 0.8719723183391004, 'eval_runtime': 0.818, 'eval_samples_per_second': 498.771, 'eval_steps_per_second': 15.892, 'epoch': 3.0}
{'eval_loss': 0.5668432712554932, 'eval_accuracy': 0.8235294117647058, 'eval_f1': 0.8775510204081632, 'eval_runtime': 0.8037, 'eval_samples_per_second': 507.664, 'eval_steps_per_second': 16.176, 'epoch': 4.0}
{'train_runtime': 38.3588, 'train_samples_per_second': 382.494, 'train_steps_per_second': 11.992, 'train_loss': 0.32324958469556725, 'epoch': 4.0}
{'eval_loss': 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.533130407333374, 'eval_accuracy': 0.7303921568627451, 'eval_f1': 0.8270440251572327, 'eval_runtime': 0.7993, 'eval_samples_per_second': 510.478, 'eval_steps_per_second': 16.265, 'epoch': 1.0}
{'eval_loss': 0.45403948426246643, 'eval_accuracy': 0.7892156862745098, 'eval_f1': 0.8612903225806452, 'eval_runtime': 0.8275, 'eval_samples_per_second': 493.046, 'eval_steps_per_second': 15.71, 'epoch': 2.0}
{'eval_loss': 0.42151057720184326, 'eval_accuracy': 0.8063725490196079, 'eval_f1': 0.8689883913764511, 'eval_runtime': 0.9841, 'eval_samples_per_second': 414.607, 'eval_steps_per_second': 13.211, 'epoch': 3.0}
{'eval_loss': 0.4625893533229828, 'eval_accuracy': 0.8014705882352942, 'eval_f1': 0.8661157024793389, 'eval_runtime': 0.8161, 'eval_samples_per_second': 499.968, 'eval_steps_per_second': 15.93, 'epoch': 4.0}
{'train_runtime': 38.4523, 'train_samples_per_second': 381.564, 'train_steps_per_second': 11.963, 'train_loss': 0.39331210592518684, 'epoch': 4.0}
{'eval_loss': 0.46

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5536954402923584, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8237082066869301, 'eval_runtime': 0.801, 'eval_samples_per_second': 509.337, 'eval_steps_per_second': 16.229, 'epoch': 1.0}
{'eval_loss': 0.48963889479637146, 'eval_accuracy': 0.7622549019607843, 'eval_f1': 0.8412438625204582, 'eval_runtime': 0.8571, 'eval_samples_per_second': 476.042, 'eval_steps_per_second': 15.168, 'epoch': 2.0}
{'eval_loss': 0.4891095757484436, 'eval_accuracy': 0.7720588235294118, 'eval_f1': 0.848780487804878, 'eval_runtime': 0.8172, 'eval_samples_per_second': 499.283, 'eval_steps_per_second': 15.909, 'epoch': 3.0}
{'eval_loss': 0.49433887004852295, 'eval_accuracy': 0.7622549019607843, 'eval_f1': 0.8412438625204582, 'eval_runtime': 0.7863, 'eval_samples_per_second': 518.868, 'eval_steps_per_second': 16.533, 'epoch': 4.0}
{'train_runtime': 38.3428, 'train_samples_per_second': 382.653, 'train_steps_per_second': 11.997, 'train_loss': 0.4645555579144022, 'epoch': 4.0}
{'eval_loss': 0.49

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5291810035705566, 'eval_accuracy': 0.7303921568627451, 'eval_f1': 0.8202614379084967, 'eval_runtime': 0.8243, 'eval_samples_per_second': 494.966, 'eval_steps_per_second': 8.492, 'epoch': 1.0}
{'eval_loss': 0.5309584140777588, 'eval_accuracy': 0.7720588235294118, 'eval_f1': 0.841025641025641, 'eval_runtime': 0.8081, 'eval_samples_per_second': 504.875, 'eval_steps_per_second': 8.662, 'epoch': 2.0}
{'eval_loss': 0.669710099697113, 'eval_accuracy': 0.7769607843137255, 'eval_f1': 0.8460236886632826, 'eval_runtime': 0.8173, 'eval_samples_per_second': 499.192, 'eval_steps_per_second': 8.565, 'epoch': 3.0}
{'eval_loss': 0.8030795454978943, 'eval_accuracy': 0.7720588235294118, 'eval_f1': 0.8382608695652174, 'eval_runtime': 1.1366, 'eval_samples_per_second': 358.956, 'eval_steps_per_second': 6.159, 'epoch': 4.0}
{'train_runtime': 36.814, 'train_samples_per_second': 398.545, 'train_steps_per_second': 6.302, 'train_loss': 0.3362513903913827, 'epoch': 4.0}
{'eval_loss': 0.8030795454

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.52508944272995, 'eval_accuracy': 0.7401960784313726, 'eval_f1': 0.8364197530864198, 'eval_runtime': 0.811, 'eval_samples_per_second': 503.071, 'eval_steps_per_second': 8.631, 'epoch': 1.0}
{'eval_loss': 0.4450721740722656, 'eval_accuracy': 0.7941176470588235, 'eval_f1': 0.8590604026845637, 'eval_runtime': 0.7997, 'eval_samples_per_second': 510.179, 'eval_steps_per_second': 8.753, 'epoch': 2.0}
{'eval_loss': 0.5217808485031128, 'eval_accuracy': 0.7867647058823529, 'eval_f1': 0.8603531300160514, 'eval_runtime': 0.7991, 'eval_samples_per_second': 510.591, 'eval_steps_per_second': 8.76, 'epoch': 3.0}
{'eval_loss': 0.5237591862678528, 'eval_accuracy': 0.8014705882352942, 'eval_f1': 0.8638655462184874, 'eval_runtime': 0.8189, 'eval_samples_per_second': 498.21, 'eval_steps_per_second': 8.548, 'epoch': 4.0}
{'train_runtime': 36.4777, 'train_samples_per_second': 402.219, 'train_steps_per_second': 6.36, 'train_loss': 0.3479101246800916, 'epoch': 4.0}
{'eval_loss': 0.5237591862678

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5457501411437988, 'eval_accuracy': 0.7279411764705882, 'eval_f1': 0.8320726172465961, 'eval_runtime': 0.8152, 'eval_samples_per_second': 500.513, 'eval_steps_per_second': 8.587, 'epoch': 1.0}
{'eval_loss': 0.4699881970882416, 'eval_accuracy': 0.7916666666666666, 'eval_f1': 0.8604269293924466, 'eval_runtime': 0.8055, 'eval_samples_per_second': 506.545, 'eval_steps_per_second': 8.691, 'epoch': 2.0}
{'eval_loss': 0.47402524948120117, 'eval_accuracy': 0.7867647058823529, 'eval_f1': 0.8594507269789984, 'eval_runtime': 0.8228, 'eval_samples_per_second': 495.849, 'eval_steps_per_second': 8.507, 'epoch': 3.0}
{'eval_loss': 0.46653056144714355, 'eval_accuracy': 0.7843137254901961, 'eval_f1': 0.8533333333333334, 'eval_runtime': 0.8175, 'eval_samples_per_second': 499.068, 'eval_steps_per_second': 8.562, 'epoch': 4.0}
{'train_runtime': 36.5424, 'train_samples_per_second': 401.506, 'train_steps_per_second': 6.349, 'train_loss': 0.43782322982261923, 'epoch': 4.0}
{'eval_loss': 0.4665

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5626303553581238, 'eval_accuracy': 0.7205882352941176, 'eval_f1': 0.8283132530120482, 'eval_runtime': 0.7974, 'eval_samples_per_second': 511.692, 'eval_steps_per_second': 8.779, 'epoch': 1.0}
{'eval_loss': 0.5081871151924133, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.841091492776886, 'eval_runtime': 0.8828, 'eval_samples_per_second': 462.159, 'eval_steps_per_second': 7.929, 'epoch': 2.0}
{'eval_loss': 0.5035465359687805, 'eval_accuracy': 0.7647058823529411, 'eval_f1': 0.8471337579617835, 'eval_runtime': 0.8231, 'eval_samples_per_second': 495.675, 'eval_steps_per_second': 8.504, 'epoch': 3.0}
{'eval_loss': 0.5000389814376831, 'eval_accuracy': 0.7573529411764706, 'eval_f1': 0.8405797101449275, 'eval_runtime': 0.818, 'eval_samples_per_second': 498.764, 'eval_steps_per_second': 8.557, 'epoch': 4.0}
{'train_runtime': 36.4948, 'train_samples_per_second': 402.03, 'train_steps_per_second': 6.357, 'train_loss': 0.503720316393622, 'epoch': 4.0}
{'eval_loss': 0.50003898143

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5885159969329834, 'eval_accuracy': 0.696078431372549, 'eval_f1': 0.8176470588235294, 'eval_runtime': 0.8301, 'eval_samples_per_second': 491.487, 'eval_steps_per_second': 4.818, 'epoch': 1.0}
{'eval_loss': 0.4501376450061798, 'eval_accuracy': 0.7671568627450981, 'eval_f1': 0.8342059336823735, 'eval_runtime': 0.8349, 'eval_samples_per_second': 488.674, 'eval_steps_per_second': 4.791, 'epoch': 2.0}
{'eval_loss': 0.5504807233810425, 'eval_accuracy': 0.7769607843137255, 'eval_f1': 0.8500823723228995, 'eval_runtime': 0.8138, 'eval_samples_per_second': 501.322, 'eval_steps_per_second': 4.915, 'epoch': 3.0}
{'eval_loss': 0.6577503681182861, 'eval_accuracy': 0.7818627450980392, 'eval_f1': 0.8494077834179357, 'eval_runtime': 0.8341, 'eval_samples_per_second': 489.157, 'eval_steps_per_second': 4.796, 'epoch': 4.0}
{'train_runtime': 36.1671, 'train_samples_per_second': 405.673, 'train_steps_per_second': 3.207, 'train_loss': 0.35784339904785156, 'epoch': 4.0}
{'eval_loss': 0.6577503

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5197935700416565, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.8367670364500792, 'eval_runtime': 0.8502, 'eval_samples_per_second': 479.879, 'eval_steps_per_second': 4.705, 'epoch': 1.0}
{'eval_loss': 0.466247022151947, 'eval_accuracy': 0.7867647058823529, 'eval_f1': 0.8589951377633711, 'eval_runtime': 0.8062, 'eval_samples_per_second': 506.059, 'eval_steps_per_second': 4.961, 'epoch': 2.0}
{'eval_loss': 0.4758876860141754, 'eval_accuracy': 0.7843137254901961, 'eval_f1': 0.8562091503267973, 'eval_runtime': 0.8122, 'eval_samples_per_second': 502.36, 'eval_steps_per_second': 4.925, 'epoch': 3.0}
{'eval_loss': 0.489401251077652, 'eval_accuracy': 0.7769607843137255, 'eval_f1': 0.851063829787234, 'eval_runtime': 0.855, 'eval_samples_per_second': 477.168, 'eval_steps_per_second': 4.678, 'epoch': 4.0}
{'train_runtime': 36.2139, 'train_samples_per_second': 405.148, 'train_steps_per_second': 3.203, 'train_loss': 0.41609747656460466, 'epoch': 4.0}
{'eval_loss': 0.48940125107

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5520063042640686, 'eval_accuracy': 0.7083333333333334, 'eval_f1': 0.8166409861325116, 'eval_runtime': 0.8566, 'eval_samples_per_second': 476.274, 'eval_steps_per_second': 4.669, 'epoch': 1.0}
{'eval_loss': 0.5172619819641113, 'eval_accuracy': 0.7524509803921569, 'eval_f1': 0.8404423380726699, 'eval_runtime': 0.9925, 'eval_samples_per_second': 411.071, 'eval_steps_per_second': 4.03, 'epoch': 2.0}
{'eval_loss': 0.5041115880012512, 'eval_accuracy': 0.7475490196078431, 'eval_f1': 0.835725677830941, 'eval_runtime': 0.8093, 'eval_samples_per_second': 504.115, 'eval_steps_per_second': 4.942, 'epoch': 3.0}
{'eval_loss': 0.5032727122306824, 'eval_accuracy': 0.7524509803921569, 'eval_f1': 0.8373590982286635, 'eval_runtime': 0.8324, 'eval_samples_per_second': 490.139, 'eval_steps_per_second': 4.805, 'epoch': 4.0}
{'train_runtime': 36.4974, 'train_samples_per_second': 402.001, 'train_steps_per_second': 3.178, 'train_loss': 0.5086786664765457, 'epoch': 4.0}
{'eval_loss': 0.503272712

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5684205889701843, 'eval_accuracy': 0.7156862745098039, 'eval_f1': 0.8220858895705522, 'eval_runtime': 0.8242, 'eval_samples_per_second': 495.006, 'eval_steps_per_second': 4.853, 'epoch': 1.0}
{'eval_loss': 0.5529979467391968, 'eval_accuracy': 0.7230392156862745, 'eval_f1': 0.8285280728376327, 'eval_runtime': 0.8791, 'eval_samples_per_second': 464.134, 'eval_steps_per_second': 4.55, 'epoch': 2.0}
{'eval_loss': 0.5299142003059387, 'eval_accuracy': 0.7303921568627451, 'eval_f1': 0.828125, 'eval_runtime': 0.8574, 'eval_samples_per_second': 475.844, 'eval_steps_per_second': 4.665, 'epoch': 3.0}
{'eval_loss': 0.5251005291938782, 'eval_accuracy': 0.7328431372549019, 'eval_f1': 0.8288854003139717, 'eval_runtime': 0.8339, 'eval_samples_per_second': 489.255, 'eval_steps_per_second': 4.797, 'epoch': 4.0}
{'train_runtime': 36.4385, 'train_samples_per_second': 402.651, 'train_steps_per_second': 3.183, 'train_loss': 0.547728209659971, 'epoch': 4.0}
{'eval_loss': 0.5251005291938782, '

### LoRA

In [None]:
# LoRA hyperparameter sweep on the prajjwal1/bert-small checkpoint.
# The search grids (ranks, alphas, epochs, batch_sizes, learning_rates) are
# defined in an earlier cell. The return value is assigned to `_`, so the cell
# shows only the training/evaluation logs instead of echoing the result.
_ = lora_hyperparameter_tune(
    "prajjwal1/bert-small",
    ranks,
    alphas,
    epochs,
    batch_sizes,
    learning_rates,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 1/500
{'eval_loss': 0.5969144105911255, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 0.8646, 'eval_samples_per_second': 471.912, 'eval_steps_per_second': 58.989, 'epoch': 1.0}
{'loss': 0.623, 'grad_norm': 1.793513298034668, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5749024152755737, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8177777777777778, 'eval_runtime': 0.9361, 'eval_samples_per_second': 435.838, 'eval_steps_per_second': 54.48, 'epoch': 2.0}
{'loss': 0.5948, 'grad_norm': 4.5539727210998535, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.5578769445419312, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8116385911179173, 'eval_runtime': 0.891, 'eval_samples_per_second': 457.912, 'eval_steps_per_second': 57.239, 'epoch': 3.0}
{'loss': 0.5675, 'grad_norm': 4.53668212890625, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'ev