<a href="https://colab.research.google.com/github/danjshaw/ece57000-finalProject/blob/main/lora-bert/source/lora-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [49]:
!pip install datasets
!pip install evaluate



In [50]:
import torch
import torch.nn as nn
import numpy as np

In [51]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


In [52]:
from transformers import set_seed
set_seed(42)

In [53]:
def get_trainable_parameters(model):
  """Count the parameters of ``model``.

  Returns a dict with two entries:
    'total_parameters': element count summed over ``model.parameters()``.
    'trainable_parameters': the same sum restricted to tensors whose
      ``requires_grad`` flag is set.
  """
  # Materialise once so a one-shot parameter iterator is only consumed once.
  sizes = [(tensor.numel(), tensor.requires_grad) for tensor in model.parameters()]
  total = sum(count for count, _ in sizes)
  trainable = sum(count for count, needs_grad in sizes if needs_grad)
  return {'total_parameters': total, 'trainable_parameters': trainable}

In [54]:
import csv
def write_results_to_csv(file_name, results):
  """Write a list of result dictionaries to ``file_name`` as CSV.

  Args:
    file_name: Path of the CSV file to create (overwritten if it exists).
    results: Sequence of dicts, one per output row.

  The header is the union of the keys of all rows in first-seen order. Rows
  that lack a column get an empty cell, and rows that carry an extra key no
  longer raise ValueError. An empty ``results`` produces an empty file
  instead of raising IndexError.
  """
  # dict.fromkeys de-duplicates the column names while keeping their order.
  fieldnames = list(dict.fromkeys(key for row in results for key in row))
  with open(file_name, 'w', newline='') as csvfile:
    if not fieldnames:
      return  # nothing to write; leave the (truncated) file empty
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

In [55]:
def get_model_name(name_and_path):
  """Return the bare model name of a checkpoint identifier.

  Args:
    name_and_path: Checkpoint id such as 'prajjwal1/bert-tiny', a plain name
      such as 'bert-base-uncased', or a slash-separated local path.

  Returns:
    The text after the last '/', ignoring trailing slashes. An identifier
    without any '/' is returned unchanged instead of raising IndexError.
  """
  return name_and_path.rstrip('/').rsplit('/', 1)[-1]

# Full Fine-Tuning Implementation



The fine-tuning code below follows the examples in the [Hugging Face NLP course](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt#fine-tuning-a-model-with-the-trainer-api) on using the `Trainer` API.

In [56]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import time

def ft_hyperparameter_tune(checkpoint, epochs, batch_sizes, learning_rates, output_dir=None):
  """Grid-search full fine-tuning on GLUE MRPC and save the metrics to CSV.

  A fresh model is loaded from ``checkpoint`` for every (batch size, learning
  rate) pair, trained with the Hugging Face ``Trainer`` and evaluated on the
  MRPC validation split. The best configuration by ``eval_f1`` is printed.

  Args:
    checkpoint: Model identifier passed to ``from_pretrained``.
    epochs: Number of training epochs per configuration.
    batch_sizes: Batch sizes to try (used for training and evaluation).
    learning_rates: Learning rates to try.
    output_dir: String prefix (include the trailing '/') prepended to
      '<model>-ft-results.csv'. When omitted, the notebook-level
      ``output_dir`` variable is used if it exists; otherwise the file is
      written to the current working directory.

  Returns:
    A list with one dict per configuration holding the hyperparameters, the
    training metrics, the evaluation metrics and the parameter counts. The
    list is empty when either grid axis is empty.
  """
  # Resolve the destination before any training happens so that a missing
  # notebook-level ``output_dir`` cannot discard a finished sweep.
  if output_dir is None:
    output_dir = globals().get('output_dir', '')

  iterations = len(batch_sizes)*len(learning_rates)
  if iterations == 0:
    print("No hyperparameter combinations to evaluate.")
    return []

  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)

  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Load the metric once instead of on every evaluation pass.
  metric = evaluate.load("glue", "mrpc")

  def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  ft_results = []
  progress = 0

  # Hyperparameter tuning
  start_time = time.time()
  for size in batch_sizes:
    for rate in learning_rates:
      # Track progress
      progress += 1
      print(f"Progress: {progress}/{iterations}")

      # Initialize result
      result = {"batch_size": size, "learning_rate": rate}

      # Every configuration starts from the pre-trained checkpoint.
      model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

      # Setup the trainer
      training_args = TrainingArguments(
          "fine-tuning-trainer",
          eval_strategy="epoch",
          per_device_eval_batch_size=size,
          per_device_train_batch_size=size,
          num_train_epochs=epochs,
          learning_rate=rate,
          disable_tqdm=True,
          report_to="none"
      )
      trainer = Trainer(
          model,
          training_args,
          train_dataset=tokenized_datasets["train"],
          eval_dataset=tokenized_datasets["validation"],
          data_collator=data_collator,
          processing_class=tokenizer,
          compute_metrics=compute_metrics
      )

      # Train, then evaluate; later dicts win on duplicate keys.
      train_metrics = trainer.train().metrics
      eval_metrics = trainer.evaluate()
      ft_results.append(result | train_metrics | eval_metrics | get_trainable_parameters(model))

  write_results_to_csv(output_dir+model_name+'-ft-results.csv', ft_results)

  end_time = time.time()

  runtime_seconds = end_time - start_time
  runtime_minutes = runtime_seconds / 60

  # Output the best result (the first one wins on ties)
  max_ft_result = max(ft_results, key=lambda entry: entry['eval_f1'])
  print(f'\n================ \
          \nTotal Runtime: {runtime_minutes} minutes \
          \nBest Result: \
          \n\tF1={max_ft_result["eval_f1"]} \
          \n\tBatch Size={max_ft_result["batch_size"]} \
          \n\tLearning Rate={max_ft_result["learning_rate"]}'
  )

  return ft_results

In [57]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate

def fine_tune(checkpoint, epoch, batch_size, learning_rate):
  """Fully fine-tune ``checkpoint`` on GLUE MRPC and print the metrics.

  Args:
    checkpoint: Model identifier passed to ``from_pretrained``.
    epoch: Number of training epochs.
    batch_size: Per-device batch size for training and evaluation.
    learning_rate: Learning rate handed to ``TrainingArguments``.

  Returns:
    None. The training metrics, the final evaluation metrics and the
    parameter counts are printed, one ``key=value`` pair per line.
  """
  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Load the metric once instead of on every evaluation pass.
  metric = evaluate.load("glue", "mrpc")

  def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

  # Setup the trainer
  training_args = TrainingArguments(
      "fine-tuning-trainer",
      eval_strategy="epoch",
      per_device_eval_batch_size=batch_size,
      per_device_train_batch_size=batch_size,
      num_train_epochs=epoch,
      learning_rate=learning_rate,
      report_to="none"
  )
  trainer = Trainer(
      model,
      training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["validation"],
      data_collator=data_collator,
      processing_class=tokenizer,
      compute_metrics=compute_metrics
  )

  # Train, then evaluate; later dicts win on duplicate keys.
  train_metrics = trainer.train().metrics
  eval_metrics = trainer.evaluate()
  result = train_metrics | eval_metrics | get_trainable_parameters(model)

  print("=====================")
  print("Results:")
  for key, value in result.items():
    print(f'\t{key}={value}')

# Low-Rank Adaptation (LoRA) Implementation

In [58]:
class LoraModule(nn.Module):
  """Low-rank branch that maps ``x`` to ``scale * (x @ A @ B)``.

  ``A`` has shape (in_features, rank) and is drawn from a standard normal;
  ``B`` has shape (rank, out_features) and starts at zero, so a newly built
  module outputs zeros. ``scale`` is ``alpha / rank``.
  """
  def __init__(self, in_features, out_features, rank, alpha):
    super().__init__()
    self.scale = alpha / rank
    down_init = torch.randn(in_features, rank)
    up_init = torch.zeros(rank, out_features)
    self.A = nn.Parameter(down_init)
    self.B = nn.Parameter(up_init)

  def forward(self, x):
    # Project down to ``rank`` dimensions, then back up to out_features.
    projected = torch.matmul(x, self.A)
    update = torch.matmul(projected, self.B)
    return self.scale * update

class LoraLinear(nn.Module):
  """Linear layer plus a LoRA branch: ``linear(x) + lora(x)``.

  Passing an existing ``LoraLinear`` reuses its inner ``linear`` layer and
  attaches a new ``LoraModule``, so wrappers are never nested.
  """
  def __init__(self, linear, rank, alpha):
    super().__init__()
    # Unwrap a previous LoraLinear to reach the underlying linear layer.
    base = linear.linear if isinstance(linear, LoraLinear) else linear
    self.linear = base
    self.lora = LoraModule(base.in_features, base.out_features, rank, alpha)

  def forward(self, x):
    base_out = self.linear(x)
    lora_out = self.lora(x)
    return base_out + lora_out

In [59]:
def configure_lora_model(model, rank, alpha):
  """Attach LoRA to the attention query/value layers and freeze the rest.

  Args:
    model: Model exposing ``model.bert.encoder.layer``; modified in place.
    rank: Rank of each LoRA update.
    alpha: LoRA scaling numerator (the update is scaled by alpha / rank).

  Calling this again on the same model replaces the LoRA branches with new
  ones, because ``LoraLinear`` unwraps an existing wrapper.
  """
  # Replace the query and value linear layers with LoRA layers
  for layer in model.bert.encoder.layer:
    attention = layer.attention.self
    attention.query = LoraLinear(attention.query, rank, alpha)
    attention.value = LoraLinear(attention.value, rank, alpha)

  # Freeze everything except the LoRA matrices. Match the exact parameter
  # suffix: a plain substring test for 'A' or 'B' would also unfreeze any
  # parameter whose dotted name merely contains one of those letters.
  # NOTE(review): the classification head is frozen here as well (the
  # recorded LoRA runs report only the A/B matrices as trainable) - confirm
  # that this is intended.
  lora_suffixes = ('.lora.A', '.lora.B')
  for name, param in model.named_parameters():
    param.requires_grad = name.endswith(lora_suffixes)

In [60]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate
import time

def lora_hyperparameter_tune(checkpoint, ranks, alphas, epochs, batch_sizes, learning_rates, output_dir=None):
  """Grid-search LoRA fine-tuning on GLUE MRPC and save the metrics to CSV.

  The model is loaded once from ``checkpoint``. For every (rank, alpha,
  batch size, learning rate) combination it is re-configured with new LoRA
  layers, trained with the Hugging Face ``Trainer`` and evaluated on the
  MRPC validation split. The best configuration by ``eval_f1`` is printed.

  Args:
    checkpoint: Model identifier passed to ``from_pretrained``.
    ranks: LoRA ranks to try.
    alphas: LoRA alpha values to try.
    epochs: Number of training epochs per configuration.
    batch_sizes: Batch sizes to try (used for training and evaluation).
    learning_rates: Learning rates to try.
    output_dir: String prefix (include the trailing '/') prepended to
      '<model>-lora-results.csv'. When omitted, the notebook-level
      ``output_dir`` variable is used if it exists; otherwise the file is
      written to the current working directory.

  Returns:
    A list with one dict per configuration holding the hyperparameters, the
    training metrics, the evaluation metrics and the parameter counts. The
    list is empty when any grid axis is empty.
  """
  # Resolve the destination before any training happens so that a missing
  # notebook-level ``output_dir`` cannot discard a finished sweep.
  if output_dir is None:
    output_dir = globals().get('output_dir', '')

  iterations = len(batch_sizes)*len(learning_rates)*len(ranks)*len(alphas)
  if iterations == 0:
    print("No hyperparameter combinations to evaluate.")
    return []

  model_name = checkpoint
  if '/' in checkpoint:
    model_name = get_model_name(checkpoint)

  # Setup model and dataset
  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Load the metric once instead of on every evaluation pass.
  metric = evaluate.load("glue", "mrpc")

  def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

  lora_results = []
  progress = 0

  # Hyperparameter tuning
  start_time = time.time()
  for rank in ranks:
    for alpha in alphas:
      for size in batch_sizes:
        for rate in learning_rates:
          # Track progress
          progress += 1
          print(f"Progress: {progress}/{iterations}")

          # Initialize result
          result = {"rank": rank, "alpha": alpha, "batch_size": size, "learning_rate": rate}

          # Configure the existing model with new LoRA layers
          configure_lora_model(model, rank, alpha)

          # Setup the trainer
          training_args = TrainingArguments(
              "lora-trainer",
              eval_strategy="epoch",
              per_device_eval_batch_size=size,
              per_device_train_batch_size=size,
              num_train_epochs=epochs,
              learning_rate=rate,
              disable_tqdm=True,
              report_to="none"
          )
          trainer = Trainer(
              model,
              training_args,
              train_dataset=tokenized_datasets["train"],
              eval_dataset=tokenized_datasets["validation"],
              data_collator=data_collator,
              processing_class=tokenizer,
              compute_metrics=compute_metrics
          )

          # Train, then evaluate; later dicts win on duplicate keys.
          train_metrics = trainer.train().metrics
          eval_metrics = trainer.evaluate()
          lora_results.append(result | train_metrics | eval_metrics | get_trainable_parameters(model))

  write_results_to_csv(output_dir+model_name+'-lora-results.csv', lora_results)

  end_time = time.time()

  runtime_seconds = end_time - start_time
  runtime_hours = runtime_seconds / 3600

  # Output the best result (the first one wins on ties)
  max_lora_result = max(lora_results, key=lambda entry: entry['eval_f1'])
  print(f'\n================ \
          \nTotal Runtime: {runtime_hours} hours \
          \nBest Result: \
          \n\tF1={max_lora_result["eval_f1"]} \
          \n\tRank={max_lora_result["rank"]} \
          \n\tAlpha={max_lora_result["alpha"]} \
          \n\tBatch Size={max_lora_result["batch_size"]} \
          \n\tLearning Rate={max_lora_result["learning_rate"]}'
  )

  return lora_results

In [61]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate

def fine_tune_with_lora(checkpoint, rank, alpha, epoch, batch_size, learning_rate):
  """Fine-tune ``checkpoint`` on GLUE MRPC with LoRA and print the metrics.

  Args:
    checkpoint: Model identifier passed to ``from_pretrained``.
    rank: Rank of each LoRA update.
    alpha: LoRA scaling numerator.
    epoch: Number of training epochs.
    batch_size: Per-device batch size for training and evaluation.
    learning_rate: Learning rate handed to ``TrainingArguments``.

  Returns:
    None. The training metrics, the final evaluation metrics and the
    parameter counts are printed, one ``key=value`` pair per line.
  """
  # Setup model and dataset
  raw_datasets = load_dataset("glue", "mrpc")
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Load the metric once instead of on every evaluation pass.
  metric = evaluate.load("glue", "mrpc")

  def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

  # Configure the model with new LoRA layers
  configure_lora_model(model, rank, alpha)

  # Setup the trainer
  training_args = TrainingArguments(
      "lora-trainer",
      eval_strategy="epoch",
      per_device_eval_batch_size=batch_size,
      per_device_train_batch_size=batch_size,
      num_train_epochs=epoch,
      learning_rate=learning_rate,
      report_to="none"
  )
  trainer = Trainer(
      model,
      training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["validation"],
      data_collator=data_collator,
      processing_class=tokenizer,
      compute_metrics=compute_metrics
  )

  # Train, then evaluate; later dicts win on duplicate keys.
  train_metrics = trainer.train().metrics
  eval_metrics = trainer.evaluate()
  result = train_metrics | eval_metrics | get_trainable_parameters(model)

  print("=====================")
  print("Results:")
  for key, value in result.items():
    print(f'\t{key}={value}')

# Run

In [62]:
# Checkpoints iterated over by the (commented-out) hyperparameter-tuning loop below.
models = ["prajjwal1/bert-tiny", "prajjwal1/bert-mini", "prajjwal1/bert-small"]

## Hyperparameter Tuning

Hyperparameters from [google-research/bert](https://github.com/google-research/bert):



> For each task, we selected the best fine-tuning hyperparameters from the lists below, and trained for 4 epochs:
> * batch sizes: 8, 16, 32, 64, 128
> * learning rates: 3e-4, 1e-4, 5e-5, 3e-5



In [63]:
# Search grid passed to both tuning functions: every batch size is paired
# with every learning rate (values quoted from google-research/bert above).
epochs = 4
batch_sizes = [8, 16, 32, 64, 128]
learning_rates = [3e-4, 1e-4, 5e-5, 3e-5]

In [64]:
# LoRA-only grid: lora_hyperparameter_tune combines every rank with every
# alpha; LoraModule scales its update by alpha / rank.
ranks = [1, 2, 4, 8, 16]
alphas = [1, 2, 4, 8, 16]

Uncomment the three cells below to perform hyperparameter tuning. This is a long process, so it may be worth splitting the runs across multiple cells. Result metrics are saved as CSV files to Google Drive (under `output_dir`, which the tuning functions use as the destination prefix) for further analysis and interpretation.

In [65]:
# from google.colab import drive
# drive.mount('/content/drive')

In [66]:
# import os
# if not os.path.exists('/content/drive/MyDrive/lora-bert/'):
#     os.makedirs('/content/drive/MyDrive/lora-bert/')
# output_dir = '/content/drive/MyDrive/lora-bert/'

In [67]:
# for model in models:
#   _ = ft_hyperparameter_tune(model, epochs, batch_sizes, learning_rates)
#   _ = lora_hyperparameter_tune(model, ranks, alphas, epochs, batch_sizes, learning_rates)

## Best Hyperparameters

In [68]:
fine_tune("prajjwal1/bert-tiny", epoch=4, batch_size=64, learning_rate=3e-4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.578104,0.705882,0.820359
2,No log,0.557935,0.737745,0.834621
3,No log,0.565318,0.754902,0.84127
4,No log,0.580859,0.75,0.837061


Results:
	train_runtime=11.0653
	train_samples_per_second=1325.942
	train_steps_per_second=20.966
	total_flos=3035719599600.0
	train_loss=0.4791317643790409
	epoch=4.0
	eval_loss=0.5808593034744263
	eval_accuracy=0.75
	eval_f1=0.8370607028753994
	eval_runtime=1.2949
	eval_samples_per_second=315.078
	eval_steps_per_second=5.406
	total_parameters=4386178
	trainable_parameters=4386178


In [69]:
fine_tune_with_lora("prajjwal1/bert-tiny", rank=4, alpha=8, epoch=4, batch_size=8, learning_rate=3e-4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.614409,0.683824,0.812227
2,0.634000,0.606059,0.686275,0.813411
3,0.619700,0.602576,0.693627,0.816984
4,0.606500,0.602209,0.698529,0.818316


Results:
	train_runtime=24.5677
	train_samples_per_second=597.206
	train_steps_per_second=74.732
	total_flos=2635525921248.0
	train_loss=0.6190774788783786
	epoch=4.0
	eval_loss=0.6022091507911682
	eval_accuracy=0.6985294117647058
	eval_f1=0.8183161004431314
	eval_runtime=1.5603
	eval_samples_per_second=261.483
	eval_steps_per_second=32.685
	total_parameters=4390274
	trainable_parameters=4096


In [70]:
fine_tune("prajjwal1/bert-mini", epoch=4, batch_size=16, learning_rate=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.499999,0.757353,0.835275
2,No log,0.458275,0.781863,0.852405
3,0.516400,0.449616,0.796569,0.851521
4,0.516400,0.461786,0.806373,0.862609


Results:
	train_runtime=24.8562
	train_samples_per_second=590.275
	train_steps_per_second=37.013
	total_flos=21529427905728.0
	train_loss=0.43540860051694125
	epoch=4.0
	eval_loss=0.46178555488586426
	eval_accuracy=0.8063725490196079
	eval_f1=0.8626086956521739
	eval_runtime=1.5939
	eval_samples_per_second=255.976
	eval_steps_per_second=16.312
	total_parameters=11171074
	trainable_parameters=11171074


In [71]:
fine_tune_with_lora("prajjwal1/bert-mini", rank=16, alpha=4, epoch=4, batch_size=8, learning_rate=3e-4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.544532,0.740196,0.832278
2,0.596100,0.516101,0.762255,0.840198
3,0.547800,0.50272,0.772059,0.84317
4,0.511800,0.513373,0.762255,0.840722


Results:
	train_runtime=33.75
	train_samples_per_second=434.725
	train_steps_per_second=54.4
	total_flos=20769117438432.0
	train_loss=0.5442345978647536
	epoch=4.0
	eval_loss=0.5133726000785828
	eval_accuracy=0.7622549019607843
	eval_f1=0.8407224958949097
	eval_runtime=1.5326
	eval_samples_per_second=266.21
	eval_steps_per_second=33.276
	total_parameters=11236610
	trainable_parameters=65536


In [72]:
fine_tune("prajjwal1/bert-small", epoch=4, batch_size=8, learning_rate=3e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.52115,0.745098,0.839506
2,0.572900,0.405591,0.823529,0.878788
3,0.393100,0.41938,0.838235,0.882979
4,0.251600,0.597334,0.833333,0.885906


Results:
	train_runtime=60.5071
	train_samples_per_second=242.484
	train_steps_per_second=30.344
	total_flos=81238114922976.0
	train_loss=0.3667250365213631
	epoch=4.0
	eval_loss=0.5973342061042786
	eval_accuracy=0.8333333333333334
	eval_f1=0.8859060402684564
	eval_runtime=1.5152
	eval_samples_per_second=269.274
	eval_steps_per_second=33.659
	total_parameters=28764674
	trainable_parameters=28764674


In [73]:
fine_tune_with_lora("prajjwal1/bert-small", rank=16, alpha=16, epoch=4, batch_size=8, learning_rate=1e-4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.548451,0.715686,0.823708
2,0.604100,0.509534,0.747549,0.831974
3,0.544500,0.493069,0.762255,0.83247
4,0.490400,0.498056,0.764706,0.840532


Results:
	train_runtime=37.3606
	train_samples_per_second=392.713
	train_steps_per_second=49.143
	total_flos=82065196020192.0
	train_loss=0.5331009343298951
	epoch=4.0
	eval_loss=0.49805593490600586
	eval_accuracy=0.7647058823529411
	eval_f1=0.840531561461794
	eval_runtime=1.589
	eval_samples_per_second=256.76
	eval_steps_per_second=32.095
	total_parameters=28895746
	trainable_parameters=131072
