<a href="https://colab.research.google.com/github/danjshaw/ece57000-finalProject/blob/main/lora-bert/source/lora-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [113]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [114]:
import os
if not os.path.exists('/content/drive/MyDrive/lora-bert/'):
    os.makedirs('/content/drive/MyDrive/lora-bert/')
output_dir = '/content/drive/MyDrive/lora-bert/'

In [115]:
!pip install datasets
!pip install evaluate
!pip freeze > /content/drive/MyDrive/lora-bert/requirements.txt



In [116]:
import torch
import torch.nn as nn
import numpy as np

In [117]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [118]:
from transformers import set_seed
set_seed(0)

In [119]:
def get_trainable_parameters(model):
  trainable_parameters = 0
  parameters = 0
  for param in model.parameters():
    parameters += param.numel()
    if param.requires_grad:
      trainable_parameters += param.numel()
  return {'total_parameters': parameters, 'trainable_parameters': trainable_parameters}

In [120]:
import csv
def write_results_to_csv(file_name, results):
  with open(file_name, 'w', newline='') as csvfile:
      writer = csv.DictWriter(csvfile, fieldnames=list(results[0].keys()))
      writer.writeheader()
      for result in results:
        writer.writerow(result)

In [121]:
def get_model_name(name_and_path):
  return name_and_path.split('/')[1]

# Full Fine-Tuning Functions



Followed examples from this [Hugging Face NLP course](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt#fine-tuning-a-model-with-the-trainer-api) on how to use the trainer API for fine-tuning.

In [122]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate

def fine_tune(checkpoint, epochs, batch_size, learning_rate):
  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)

  # Initialize result
  result = {"batch_size": batch_size, "learning_rate": learning_rate}

  # Setup model and dataset
  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
      return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  def compute_metrics(eval_preds):
      metric = evaluate.load("glue", "mrpc")
      logits, labels = eval_preds
      predictions = np.argmax(logits, axis=-1)
      return metric.compute(predictions=predictions, references=labels)

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

  # Setup the trainer
  training_args = TrainingArguments(
      "fine-tuning-trainer",
      eval_strategy="epoch",
      per_device_eval_batch_size=batch_size,
      per_device_train_batch_size=batch_size,
      num_train_epochs=epochs,
      learning_rate=batch_size,
      disable_tqdm=True,
      report_to="none"
  )
  trainer = Trainer(
      model,
      training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["validation"],
      data_collator=data_collator,
      processing_class=tokenizer,
      compute_metrics=compute_metrics
  )

  # Train the model
  return result | trainer.train().metrics | trainer.evaluate() | get_trainable_parameters(model)

In [123]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, set_seed
import time

def hyperparameter_tune(checkpoint, batch_sizes, learning_rates):
  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)

  ft_results = []
  progress = 0
  iterations = len(batch_sizes)*len(learning_rates)

  # Hyperparameter tuning
  start_time = time.time()
  for size in batch_sizes:
    for rate in learning_rates:
      # Track progress
      progress += 1
      print(f"Progress: {progress}/{iterations}")

      # Fine-tune the model and store the results
      ft_result = fine_tune(checkpoint, epochs, size, rate)
      ft_results.append(ft_result)

  write_results_to_csv(output_dir+model_name+'-ft-results.csv', ft_results)

  end_time = time.time()

  runtime_seconds = end_time - start_time
  runtime_minutes = runtime_seconds / 60

  # Output the best result
  max_ft_result = ft_results[0]
  for _, result in enumerate(ft_results):
    if result['eval_f1'] > max_ft_result['eval_f1']:
      max_ft_result = result
  print(f'\n================ \
          \nTotal Runtime: {runtime_minutes} minutes \
          \nBest Result: \
          \n\tF1={max_ft_result["eval_f1"]} \
          \n\tBatch Size={max_ft_result["batch_size"]} \
          \n\tLearning Rate={max_ft_result["learning_rate"]}'
  )

  return ft_results

# Low-Rank Adaptation (LoRA) Functions

In [124]:
class LoraModule(nn.Module):
  def __init__(self, in_features, out_features, rank, alpha):
    super().__init__()
    self.scale = alpha / rank
    self.A = nn.Parameter(torch.randn(in_features, rank))
    self.B = nn.Parameter(torch.zeros(rank, out_features))

  def forward(self, x):
    return (self.scale * (x @ self.A @ self.B))

class LoraLinear(nn.Module):
  def __init__(self, linear, rank, alpha):
    super().__init__()
    if (isinstance(linear, LoraLinear)):
      self.linear = linear.linear
      self.lora = LoraModule(self.linear.in_features, self.linear.out_features, rank, alpha)

    else:
      self.linear = linear
      self.lora = LoraModule(self.linear.in_features, self.linear.out_features, rank, alpha)

  def forward(self, x):
    return self.linear(x) + self.lora(x)

In [125]:
def configure_lora_model(model, rank, alpha):
  # Replace the query and value linear layers with LoRA layers
  for _, layer in enumerate(model.bert.encoder.layer):
    s = layer.attention.self
    s.query = LoraLinear(s.query, rank, alpha)
    s.value = LoraLinear(s.value, rank, alpha)

  # Freeze the pre-trained weights
  for name, param in model.named_parameters():
    if 'A' in name or 'B' in name:
      param.requires_grad = True
    else:
      param.requires_grad = False

In [126]:
from transformers import Trainer, TrainingArguments, set_seed
import time

def lora_hyperparameter_tune(checkpoint, ranks, alphas, epochs, batch_sizes, learning_rates):
  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)

  # Setup model and dataset
  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
      return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  def compute_metrics(eval_preds):
      metric = evaluate.load("glue", "mrpc")
      logits, labels = eval_preds
      predictions = np.argmax(logits, axis=-1)
      return metric.compute(predictions=predictions, references=labels)

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

  lora_results = []
  progress = 0
  iterations = len(batch_sizes)*len(learning_rates)*len(ranks)*len(alphas)

  # Hyperparameter tuning
  start_time = time.time()
  for rank in ranks:
    for alpha in alphas:
      for size in batch_sizes:
        for rate in learning_rates:
          # Track progress
          progress += 1
          print(f"Progress: {progress}/{iterations}")

          # Initialize result
          result = {"rank": rank, "alpha": alpha, "batch_size": size, "learning_rate": rate}

          # Configure an existing model with new LoRA layers
          configure_lora_model(model, rank, alpha)

          # Setup the trainer
          training_args = TrainingArguments(
              "lora-trainer",
              eval_strategy="epoch",
              per_device_eval_batch_size=size,
              per_device_train_batch_size=size,
              num_train_epochs=epochs,
              learning_rate=rate,
              disable_tqdm=True,
              report_to="none"
          )
          trainer = Trainer(
              model,
              training_args,
              train_dataset=tokenized_datasets["train"],
              eval_dataset=tokenized_datasets["validation"],
              data_collator=data_collator,
              processing_class=tokenizer,
              compute_metrics=compute_metrics
          )

          # Train the model
          lora_results.append(result | trainer.train().metrics | trainer.evaluate() | get_trainable_parameters(model))

  write_results_to_csv(output_dir+model_name+'-lora-results.csv', lora_results)

  end_time = time.time()

  runtime_seconds = end_time - start_time
  runtime_hours = runtime_seconds / 3600

  # Output the best result
  max_lora_result = lora_results[0]
  for _, result in enumerate(lora_results):
    if result['eval_f1'] > max_lora_result['eval_f1']:
      max_lora_result = result
  print(f'\n================ \
          \nTotal Runtime: {runtime_hours} hours \
          \nBest Result: \
          \n\tF1={max_lora_result["eval_f1"]} \
          \n\tRank={max_lora_result["rank"]} \
          \n\tAlpha={max_lora_result["alpha"]} \
          \n\tBatch Size={max_lora_result["batch_size"]} \
          \n\tLearning Rate={max_lora_result["learning_rate"]}'
  )

  return lora_results

# Run Program

Hyperparameters from [google-research/bert](https://github.com/google-research/bert):



> For each task, we selected the best fine-tuning hyperparameters from the lists below, and trained for 4 epochs:
> * batch sizes: 8, 16, 32, 64, 128
> * learning rates: 3e-4, 1e-4, 5e-5, 3e-5



In [127]:
epochs = 4
batch_sizes = [8]#, 16, 32, 64, 128]
learning_rates = [3e-4]#, 1e-4, 5e-5, 3e-5]

In [128]:
ranks = [1]#, 2, 4, 8, 16]
alphas = [1]#, 2, 4, 8, 16]

## BERT Small

### Full Fine-Tuning

In [111]:
_ = hyperparameter_tune("prajjwal1/bert-small", batch_sizes, learning_rates)

Progress: 1/1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.3702, 'eval_samples_per_second': 297.758, 'eval_steps_per_second': 37.22, 'epoch': 1.0}
{'loss': 65.011, 'grad_norm': nan, 'learning_rate': 5.821350762527233, 'epoch': 1.0893246187363834}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 2.1891, 'eval_samples_per_second': 186.379, 'eval_steps_per_second': 23.297, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.6427015250544663, 'epoch': 2.178649237472767}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.7076, 'eval_samples_per_second': 238.931, 'eval_steps_per_second': 29.866, 'epoch': 3.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.4640522875816993, 'epoch': 3.2679738562091503}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.4182, 'eval_samples_per_second': 287.688, 'eval_steps_per_second': 35.961, 'epo

### LoRA

In [112]:
_ = lora_hyperparameter_tune("prajjwal1/bert-small", ranks, alphas, epochs, batch_sizes, learning_rates)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 1/1
{'eval_loss': 0.5969144105911255, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 1.7753, 'eval_samples_per_second': 229.814, 'eval_steps_per_second': 28.727, 'epoch': 1.0}
{'loss': 0.623, 'grad_norm': 1.793513298034668, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5749024152755737, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8177777777777778, 'eval_runtime': 1.7383, 'eval_samples_per_second': 234.709, 'eval_steps_per_second': 29.339, 'epoch': 2.0}
{'loss': 0.5948, 'grad_norm': 4.5539727210998535, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.5578769445419312, 'eval_accuracy': 0.6985294117647058, 'eval_f1': 0.8116385911179173, 'eval_runtime': 2.0722, 'eval_samples_per_second': 196.896, 'eval_steps_per_second': 24.612, 'epoch': 3.0}
{'loss': 0.5675, 'grad_norm': 4.53668212890625, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'ev

## BERT Mini

### Full Fine-Tuning

In [109]:
_ = hyperparameter_tune("prajjwal1/bert-mini", batch_sizes, learning_rates)

Progress: 1/1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.3542, 'eval_samples_per_second': 301.294, 'eval_steps_per_second': 37.662, 'epoch': 1.0}
{'loss': 402.5992, 'grad_norm': nan, 'learning_rate': 5.821350762527233, 'epoch': 1.0893246187363834}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 2.0197, 'eval_samples_per_second': 202.008, 'eval_steps_per_second': 25.251, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.6427015250544663, 'epoch': 2.178649237472767}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.6969, 'eval_samples_per_second': 240.442, 'eval_steps_per_second': 30.055, 'epoch': 3.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.4640522875816993, 'epoch': 3.2679738562091503}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.7404, 'eval_samples_per_second': 234.425, 'eval_steps_per_second': 29.303, '

### LoRA

In [110]:
_ = lora_hyperparameter_tune("prajjwal1/bert-mini", ranks, alphas, epochs, batch_sizes, learning_rates)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 1/1
{'eval_loss': 0.6154337525367737, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 2.0688, 'eval_samples_per_second': 197.212, 'eval_steps_per_second': 24.652, 'epoch': 1.0}
{'loss': 0.634, 'grad_norm': 0.4864899814128876, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.5992273092269897, 'eval_accuracy': 0.7058823529411765, 'eval_f1': 0.8219584569732937, 'eval_runtime': 1.3379, 'eval_samples_per_second': 304.946, 'eval_steps_per_second': 38.118, 'epoch': 2.0}
{'loss': 0.6152, 'grad_norm': 1.3363189697265625, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.5883147120475769, 'eval_accuracy': 0.7107843137254902, 'eval_f1': 0.8201219512195121, 'eval_runtime': 2.0428, 'eval_samples_per_second': 199.729, 'eval_steps_per_second': 24.966, 'epoch': 3.0}
{'loss': 0.5894, 'grad_norm': 2.013890027999878, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}
{'

## BERT Tiny

### Full Fine-Tuning

In [129]:
_ = hyperparameter_tune("prajjwal1/bert-tiny", batch_sizes, learning_rates)

Progress: 1/1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.9281, 'eval_samples_per_second': 211.608, 'eval_steps_per_second': 26.451, 'epoch': 1.0}
{'loss': 117.5354, 'grad_norm': nan, 'learning_rate': 5.821350762527233, 'epoch': 1.0893246187363834}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.7205, 'eval_samples_per_second': 237.146, 'eval_steps_per_second': 29.643, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.6427015250544663, 'epoch': 2.178649237472767}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.7453, 'eval_samples_per_second': 233.77, 'eval_steps_per_second': 29.221, 'epoch': 3.0}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.4640522875816993, 'epoch': 3.2679738562091503}
{'eval_loss': nan, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 1.4086, 'eval_samples_per_second': 289.645, 'eval_steps_per_second': 36.206, 'e

### LoRA

In [108]:
_ = lora_hyperparameter_tune("prajjwal1/bert-tiny", ranks, alphas, epochs, batch_sizes, learning_rates)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 1/1
{'eval_loss': 0.6257390975952148, 'eval_accuracy': 0.6813725490196079, 'eval_f1': 0.8104956268221575, 'eval_runtime': 1.6848, 'eval_samples_per_second': 242.159, 'eval_steps_per_second': 30.27, 'epoch': 1.0}
{'loss': 0.663, 'grad_norm': 0.46347522735595703, 'learning_rate': 0.00021830065359477123, 'epoch': 1.0893246187363834}
{'eval_loss': 0.6210986375808716, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 1.6779, 'eval_samples_per_second': 243.162, 'eval_steps_per_second': 30.395, 'epoch': 2.0}
{'loss': 0.6298, 'grad_norm': 0.40943172574043274, 'learning_rate': 0.00013660130718954247, 'epoch': 2.178649237472767}
{'eval_loss': 0.6197056770324707, 'eval_accuracy': 0.6838235294117647, 'eval_f1': 0.8122270742358079, 'eval_runtime': 1.6573, 'eval_samples_per_second': 246.185, 'eval_steps_per_second': 30.773, 'epoch': 3.0}
{'loss': 0.6246, 'grad_norm': 0.42999500036239624, 'learning_rate': 5.4901960784313716e-05, 'epoch': 3.2679738562091503}