<a href="https://colab.research.google.com/github/danjshaw/ece57000-finalProject/blob/main/lora-bert-tiny/source/lora-bert-tiny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the output directory used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Directory on the mounted Drive that receives requirements.txt and the result CSVs.
output_dir = '/content/drive/MyDrive/lora-bert-tiny/'
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Install the third-party packages imported by the cells below.
!pip install datasets
!pip install evaluate
!pip install codecarbon
# Record the installed package versions alongside the results.
!pip freeze > /content/drive/MyDrive/lora-bert-tiny/requirements.txt

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [4]:
import torch
import torch.nn as nn
import numpy as np
from codecarbon import EmissionsTracker

KeyboardInterrupt: 

In [None]:
# Use the GPU when CUDA reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

The fine-tuning setup below follows the examples in the [Hugging Face NLP course](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt#fine-tuning-a-model-with-the-trainer-api) on using the `Trainer` API.

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import evaluate

# MRPC configuration of the GLUE dataset (sentence pairs, see tokenize_function).
raw_datasets = load_dataset("glue", "mrpc")
# Checkpoint name shared by the tokenizer here and by the models created later.
checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the two sentences of an example (or batch of examples) as a pair.

    Reads the "sentence1" and "sentence2" fields and passes them to the
    module-level tokenizer with truncation enabled.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# Run the tokenizer over the dataset, passing examples to tokenize_function in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding collator built from the same tokenizer; handed to every Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metrics for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax of the logits over the last axis.

    Returns:
        The value returned by the metric's ``compute`` for those predictions
        and labels.
    """
    # Load the metric once and reuse it: this function is called at every
    # evaluation of every run, and re-loading it each time is wasted work.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "mrpc")
        compute_metrics._metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
def get_trainable_parameters(model):
  """Count the model's parameters, in total and restricted to trainable ones.

  Returns a dict with 'total-parameters' (element count over all parameters)
  and 'trainable-parameters' (element count over parameters whose
  requires_grad flag is set).
  """
  sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
  total = sum(count for count, _ in sizes)
  trainable = sum(count for count, needs_grad in sizes if needs_grad)
  return {'total-parameters': total, 'trainable-parameters': trainable}

In [None]:
import csv

def write_results_to_csv(file_name, results):
  """Write a list of result dicts to a CSV file, one row per dict.

  Args:
    file_name: Path of the CSV file to create (overwritten if it exists).
    results: Sequence of dicts. The header is the union of all keys in
      first-seen order; a row missing a key gets an empty cell.

  An empty ``results`` produces an empty file instead of raising IndexError.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order, so rows with
  # keys absent from the first row no longer make DictWriter raise ValueError.
  fieldnames = list(dict.fromkeys(key for result in results for key in result))

  with open(file_name, 'w', newline='') as csvfile:
    if not fieldnames:
      return
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='')
    writer.writeheader()
    writer.writerows(results)

Number of times to repeat each hyperparameter sweep. The run index is also used as the random seed for that run.

In [None]:
# Runs are numbered 1..num_runs; each sweep calls set_seed(run) with that number.
num_runs = 1

# Full Fine-Tuning



In [None]:
def fine_tune(epochs, batch_size, learning_rate):
  """Fully fine-tune a freshly loaded copy of the checkpoint and collect metrics.

  Args:
    epochs: Number of training epochs.
    batch_size: Per-device batch size, used for both training and evaluation.
    learning_rate: Learning rate passed to TrainingArguments.

  Returns:
    A dict merging the training metrics, the final evaluation metrics, the
    parameter counts from get_trainable_parameters and the emissions data
    recorded by the tracker.
  """
  # Track emissions
  tracker = EmissionsTracker(save_to_file=False)
  tracker.start()
  try:
    # Create the model
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

    # Setup the trainer. The hyperparameters come from this function's own
    # arguments rather than from loop variables in the calling cell.
    training_args = TrainingArguments(
        "fine-tuning-trainer",
        eval_strategy="epoch",
        per_device_eval_batch_size=batch_size,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        disable_tqdm=True,
        report_to="none"
    )
    trainer = Trainer(
        model=model,
        args=training_args,  # Trainer's keyword for the TrainingArguments is `args`
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    result = trainer.train().metrics
  finally:
    # Stop tracking emissions, even if training raised
    tracker.stop()

  # Store metrics
  result |= trainer.evaluate() | get_trainable_parameters(model) | tracker.final_emissions_data.values

  return result

Hyperparameters from [google-research/bert](https://github.com/google-research/bert):



> For each task, we selected the best fine-tuning hyperparameters from the lists below, and trained for 4 epochs:
> * batch sizes: 8, 16, 32, 64, 128
> * learning rates: 3e-4, 1e-4, 5e-5, 3e-5



In [None]:
# Epoch count and search grid used by the sweeps below (values quoted above).
epochs = 4
batch_sizes = [8, 16, 32, 64, 128]
learning_rates = [3e-4, 1e-4, 5e-5, 3e-5]

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, set_seed
import time

# Sweep bookkeeping: one result row per configuration, plus a progress counter.
ft_results = []
iterations = num_runs * len(batch_sizes) * len(learning_rates)
progress = 0

# Hyperparameter tuning: every (batch size, learning rate) pair, once per run
start_time = time.time()
for run in range(1, num_runs+1):
  for size in batch_sizes:
    for rate in learning_rates:
      # Track progress
      progress += 1
      print(f"Progress: {progress}/{iterations}")

      # Re-seed before every configuration; the seed only changes between runs
      set_seed(run)

      # Fine-tune the model and store the configuration together with its metrics
      ft_result = {"run":run, "batch_size": size, "learning_rate": rate}
      ft_result.update(fine_tune(epochs, size, rate))
      ft_results.append(ft_result)
end_time = time.time()

runtime_seconds = end_time - start_time
runtime_minutes = runtime_seconds / 60

# Output the best result (highest eval_f1; the earliest row wins a tie)
max_ft_result = max(ft_results, key=lambda row: row['eval_f1'])
print(f'\n================ \
        \nRuntime: {runtime_minutes} minutes \
        \nBest Result: \
        \n\tbatch_size={max_ft_result["batch_size"]}, \
        \n\tlearning_rate={max_ft_result["learning_rate"]}, \
        \n\teval_f1={max_ft_result["eval_f1"]}')

# Save all the results
write_results_to_csv(output_dir + 'ft-results.csv', ft_results)

# Low-Rank Adaptation (LoRA)

In [None]:
from transformers import AutoModelForSequenceClassification

# Single model instance that is passed to every LoRA configuration in the sweep below.
lora_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

In [None]:
class LoraModule(nn.Module):
  """Low-rank update ``scale * (x @ A @ B)`` with ``scale = alpha / rank``.

  A has shape (in_features, rank) and is drawn from a standard normal; B has
  shape (rank, out_features) and starts at zero, so the module outputs zeros
  until B has been updated.

  Raises:
    ValueError: if ``rank`` is not positive.
  """
  def __init__(self, in_features, out_features, rank, alpha):
    super().__init__()
    if rank <= 0:
      # Fail with a clear message instead of a ZeroDivisionError / shape error.
      raise ValueError(f"rank must be a positive integer, got {rank}")
    self.scale = alpha / rank
    self.A = nn.Parameter(torch.randn(in_features, rank))
    self.B = nn.Parameter(torch.zeros(rank, out_features))

  def forward(self, x):
    """Return the scaled low-rank projection of ``x``."""
    return self.scale * (x @ self.A @ self.B)

class LoraLinear(nn.Module):
  """Wraps a linear layer and adds a LoraModule's output to the layer's output.

  Passing an existing LoraLinear re-uses its underlying linear layer and
  replaces the adapter with a new one, so adapters never stack.
  """
  def __init__(self, linear, rank, alpha):
    super().__init__()
    # Unwrap when re-wrapping so the original linear layer is always the base.
    self.linear = linear.linear if isinstance(linear, LoraLinear) else linear
    self.lora = LoraModule(self.linear.in_features, self.linear.out_features, rank, alpha)
    # The adapter is created on the default device; move it next to the wrapped
    # layer so forward() also works when the base layer already sits on a GPU.
    self.lora.to(self.linear.weight.device)

  def forward(self, x):
    """Return the wrapped layer's output plus the adapter's output."""
    return self.linear(x) + self.lora(x)

In [None]:
def configure_lora_model(model, rank, alpha):
  """Attach new LoRA adapters to the query/value layers and freeze everything else.

  Args:
    model: Model exposing ``model.bert.encoder.layer``, where each layer has
      ``attention.self.query`` and ``attention.self.value``.
    rank: Rank passed to each LoraLinear.
    alpha: Alpha passed to each LoraLinear.

  The model is modified in place; nothing is returned.
  """
  # Replace the query and value linear layers with LoRA layers
  for layer in model.bert.encoder.layer:
    self_attention = layer.attention.self
    self_attention.query = LoraLinear(self_attention.query, rank, alpha)
    self_attention.value = LoraLinear(self_attention.value, rank, alpha)

  # Freeze the pre-trained weights. Only the adapters' own A/B parameters stay
  # trainable; matching the full '.lora.A' / '.lora.B' suffix avoids unfreezing
  # an unrelated parameter whose name merely contains a capital 'A' or 'B'.
  # NOTE(review): this freezes every non-adapter parameter, which would include
  # a classification head if the model has one; confirm that is intended.
  for name, param in model.named_parameters():
    param.requires_grad = name.endswith(('.lora.A', '.lora.B'))

In [None]:
def fine_tune_with_lora_model(model, rank, alpha, epochs, batch_size, learning_rate):
  """Re-configure ``model`` with new LoRA adapters, train it and collect metrics.

  Args:
    model: Model to configure in place via configure_lora_model.
    rank: LoRA rank.
    alpha: LoRA alpha.
    epochs: Number of training epochs.
    batch_size: Per-device batch size, used for both training and evaluation.
    learning_rate: Learning rate passed to TrainingArguments.

  Returns:
    A dict holding the hyperparameters ("rank", "alpha", "epochs",
    "batch_size", "learning_rate") merged with the training metrics, the final
    evaluation metrics, the parameter counts from get_trainable_parameters and
    the emissions data recorded by the tracker.
  """
  # Initialize results
  result = {"rank": rank, "alpha": alpha, "epochs": epochs, "batch_size": batch_size, "learning_rate": learning_rate}

  # Track emissions
  tracker = EmissionsTracker(save_to_file=False)
  tracker.start()
  try:
    # Configure an existing model with new LoRA layers
    configure_lora_model(model, rank, alpha)

    # Setup the trainer. The hyperparameters come from this function's own
    # arguments rather than from loop variables in the calling cell.
    training_args = TrainingArguments(
        "lora-trainer",
        eval_strategy="epoch",
        per_device_eval_batch_size=batch_size,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        disable_tqdm=True,
        report_to="none"
    )
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model. Merge into `result` instead of rebinding it, so the
    # hyperparameters recorded above are kept in the returned row.
    result |= trainer.train().metrics
  finally:
    # Stop tracking emissions, even if configuration or training raised
    tracker.stop()

  # Store metrics
  result |= trainer.evaluate() | get_trainable_parameters(model) | tracker.final_emissions_data.values

  return result

In [None]:
# LoRA search grid: the sweep below pairs every rank with every alpha.
ranks = [1, 2, 4, 8, 16]
alphas = [1, 2, 4, 8, 16]

In [None]:
from transformers import Trainer, TrainingArguments, set_seed
import time

lora_results = []
progress = 0
iterations = len(batch_sizes)*len(learning_rates)*len(ranks)*len(alphas)*num_runs

# Hyperparameter tuning: every (rank, alpha, batch size, learning rate) combination, once per run
start_time = time.time()
for run in range(1, num_runs+1):
  for rank in ranks:
    for alpha in alphas:
      for size in batch_sizes:
        for rate in learning_rates:
          # Track progress
          progress += 1
          print(f"Progress: {progress}/{iterations}")

          # Set a new seed for each run
          set_seed(run)

          # Fine-tune the LoRA model and store the results. The configuration
          # is recorded in the row itself (as in the full fine-tuning sweep) so
          # the summary and the CSV can report it.
          lora_result = {"run": run, "rank": rank, "alpha": alpha, "batch_size": size, "learning_rate": rate}
          lora_result |= fine_tune_with_lora_model(lora_model, rank, alpha, epochs, size, rate)
          lora_results.append(lora_result)
end_time = time.time()

runtime_seconds = end_time - start_time
runtime_hours = runtime_seconds / 3600

# Output the best result (highest eval_f1; the earliest row wins a tie)
max_lora_result = max(lora_results, key=lambda row: row['eval_f1'])
# The parameter counts are stored under the keys produced by
# get_trainable_parameters: 'total-parameters' and 'trainable-parameters'.
print(f'\n================ \
        \nRuntime: {runtime_hours} hours \
        \nBest Result: \
        \nrank={max_lora_result["rank"]}, \
        \nalpha={max_lora_result["alpha"]}, \
        \nparameters={max_lora_result["total-parameters"]}, \
        \ntrainable_parameters={max_lora_result["trainable-parameters"]}, \
        \nbatch_size={max_lora_result["batch_size"]}, \
        \nlearning_rate={max_lora_result["learning_rate"]}, \
        \neval_f1={max_lora_result["eval_f1"]}')

# Save all the results
write_results_to_csv(output_dir + 'lora-results.csv', lora_results)