<a href="https://colab.research.google.com/github/danjshaw/ece57000_finalProject/blob/main/LoRA_NLU_Modules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Package installations

In [18]:
!pip install datasets
!pip install evaluate



In [19]:
import torch
import torch.nn as nn
import numpy as np

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


Define a LoRA class which has rank decomposition matrices based on a provided dense layer, rank, and alpha.

In [21]:
class LoraModule(nn.Module):
  def __init__(self, in_features, out_features, rank, alpha):
    super().__init__()
    self.scale = alpha / rank
    # Matrix A initialized with random gaussian distrubtion with shape (r, d)
    self.A = nn.Parameter(torch.randn(in_features, rank))
    # Matrix B initialized to zeros with shape (d, r)
    self.B = nn.Parameter(torch.zeros(rank, out_features))

  def forward(self, x):
    return (self.scale * (x @ self.A @ self.B))

class LoraLinear(nn.Module):
  def __init__(self, linear, rank, alpha):
    super().__init__()
    self.linear = linear
    self.lora = LoraModule(linear.in_features, linear.out_features, rank, alpha)

  def forward(self, x):
    return self.linear(x) + self.lora(x)

Helper function for applying lora layers to a given model. Supports RoBERTa and DeBERTa.

In [22]:
def init_lora_layers(rank, alpha, model):
  modules = []

  # Replace dense layers of the model with LoRA
  if "roberta" in model.__class__.__name__.lower():
    for i, layer in enumerate(model.roberta.encoder.layer):
      s = layer.attention.self

      # Replace query linear layer with a lora layer
      if (isinstance(s.query, LoraLinear)):
        s.query.lora = LoraModule(s.query.linear.in_features, s.query.linear.out_features, rank, alpha)
      else:
        s.query = LoraLinear(s.query, rank, alpha)

      # Replace value linear layer with a lora layer
      if (isinstance(s.value, LoraLinear)):
        s.value.lora = LoraModule(s.value.linear.in_features, s.query.linear.out_features, rank, alpha)
      else:
        s.value = LoraLinear(s.value, rank, alpha)

      # Store the lora layers for later
      modules.append({'query': s.query.lora, 'value': s.value.lora})

  elif "deberta" in model.__class__.__name__.lower():
    for i, layer in enumerate(model.deberta.encoder.layer):
      s = layer.attention.self

      # Replace query linear layer with a lora layer
      if (isinstance(s.query, LoraLinear)):
        s.query_proj.lora = LoraModule(s.query_proj.linear.in_features, s.query.linear.out_features, rank, alpha)
      else:
        s.query_proj = LoraLinear(s.query_proj, rank, alpha)

      # Replace value linear layer with a lora layer
      if (isinstance(s.value, LoraLinear)):
        s.value_proj.lora = LoraModule(s.value_proj.linear.in_features, s.query.linear.out_features, rank, alpha)
      else:
        s.value_proj = LoraLinear(s.value_proj, rank, alpha)

      # Store the lora layers for later
      modules.append({'query': s.query_proj.lora, 'value': s.value_proj.lora})

  # Set parameters in A and B as trainable
  for name, param in model.named_parameters():
    if 'A' in name or 'B' in name:
      param.requires_grad = True

  return modules

def update_lora_layers(modules, model):
  # Replace dense layers of the model with LoRA
  if "roberta" in model.__class__.__name__.lower():
    for i, layer in enumerate(model.roberta.encoder.layer):
      layer.attention.self.query = modules[i]['query']
      layer.attention.self.value = modules[i]['value']

  elif "deberta" in model.__class__.__name__.lower():
    for i, layer in enumerate(model.deberta.encoder.layer):
      layer.attention.self.query = modules[i]['query']
      layer.attention.self.value = modules[i]['value']

Helper function for checking the total number of parameter and the total number of trainable parameters for a given model.

In [23]:
def get_trainable_parameters(model):
  trainable_parameters = 0
  parameters = 0
  for param in model.parameters():
    # count the total number of parameters
    parameters += param.numel()
    if param.requires_grad:
      # count the total number of trainable parameters
      trainable_parameters += param.numel()
  return parameters, trainable_parameters

Define hyperparameters specified in the LoRA paper.

In [24]:
roberta_base_hyperparameters = {
    "mnli": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 16,
      "epochs": 30,
      "learning-rate": 5e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "sst2": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 16,
      "epochs": 60,
      "learning-rate": 5e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "mrpc": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 16,
      "epochs": 30,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512,
    },
    "cola": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 32,
      "epochs": 80,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "qnli": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 32,
      "epochs": 25,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "qqp": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 16,
      "epochs": 25,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "rte": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 32,
      "epochs": 80,
      "learning-rate": 5e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "stsb": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 16,
      "epochs": 40,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    }
}

roberta_large_hyperparameters = {
    "mnli": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 4,
      "epochs": 10,
      "learning-rate": 3e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 128
    },
    "sst2": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 4,
      "epochs": 10,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 128
    },
    "mrpc": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 4,
      "epochs":20,
      "learning-rate": 3e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 512
    },
    "cola": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 4,
      "epochs": 20,
      "learning-rate": 2e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 128
    },
    "qnli": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 4,
      "epochs": 10,
      "learning-rate": 2e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 128
    },
    "qqp": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 4,
      "epochs": 20,
      "learning-rate": 3e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 512
    },
    "rte": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 8,
      "epochs": 20,
      "learning-rate": 4e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 512
    },
    "stsb": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.06,
      "batch-size": 8,
      "epochs": 30,
      "learning-rate": 2e-04,
      "rank": 8,
      "alpha": 16,
      "max-seq-len": 512
    }
}

deberta_hyperparameters = {
    "mnli": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 8,
      "epochs": 5,
      "learning-rate": 1e-04,
      "weight-decay": 0,
      "cls-dropout": 0.15,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 256
    },
    "sst2": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 8,
      "epochs": 16,
      "learning-rate": 6e-05,
      "weight-decay": 0.01,
      "cls-dropout": 0,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 128
    },
    "mrpc": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 32,
      "epochs": 30,
      "learning-rate": 2e-04,
      "weight-decay": 0.01,
      "cls-dropout": 0,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 128
    },
    "cola": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 4,
      "epochs": 10,
      "learning-rate": 1e-04,
      "weight-decay": 0,
      "cls-dropout": 0.1,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 256
    },
    "qnli": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 6,
      "epochs": 8,
      "learning-rate": 1e-04,
      "weight-decay": 0.01,
      "cls-dropout": 0.1,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 512
    },
    "qqp": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 8,
      "epochs": 11,
      "learning-rate": 1e-04,
      "weight-decay": 0.01,
      "cls-dropout": 0.2,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 320
    },
    "rte": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 4,
      "epochs": 11,
      "learning-rate": 2e-04,
      "weight-decay": 0.01,
      "cls-dropout": 0.2,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 320
    },
    "stsb": {
      "lr-schedule": "linear",
      "warmup-ratio": 0.1,
      "batch-size": 4,
      "epochs": 10,
      "learning-rate": 2e-04,
      "weight-decay": 0.1,
      "cls-dropout": 0.2,
      "rank": 8,
      "alpha": 8,
      "max-seq-len": 128
    }
}

Instantiate models.

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Create roberta-base model
roberta_base = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)
roberta_base_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Create roberta-large model
roberta_large = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=2).to(device)
roberta_large_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Freeze parameters
for param in roberta_base.parameters():
  param.requires_grad = False
for param in roberta_large.parameters():
  param.requires_grad = False

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
  # Initialize LoRA layers in model
roberta_base_mrpc_modules = init_lora_layers(roberta_base_hyperparameters["mrpc"]["rank"], roberta_base_hyperparameters["mrpc"]["alpha"], roberta_base)
roberta_base_cola_modules = init_lora_layers(roberta_base_hyperparameters["cola"]["rank"], roberta_base_hyperparameters["cola"]["alpha"], roberta_base)

In [27]:
print(roberta_base)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): LoraLinear(
                (linear): Linear(in_features=768, out_features=768, bias=True)
                (lora): LoraModule()
              )
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): LoraLinear(
                (linear): Linear(in_features=768, out_features=768, bias=True)
                (lora): LoraModule()
              )
              (dr

In [28]:
from transformers import DataCollatorWithPadding
from datasets import load_dataset
import evaluate
from transformers import Trainer, TrainingArguments

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp" : ("question1", "question2"),
    "rte" : ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

def train_new(name, model, tokenizer, task, hyperparameters):
  h = hyperparameters[task]

  # Load dataset
  raw_datasets = load_dataset("glue", task)

  def tokenize_function(example):
    col1 = task_to_keys[task][0]
    col2 = task_to_keys[task][1]
    if col2 == None:
      return tokenizer(example[col1], max_length=h["max-seq-len"], truncation=True)
    else:
      return tokenizer(example[col1], example[col2], max_length=h["max-seq-len"], truncation=True)

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  def compute_metrics(eval_pred):
    metric = evaluate.load("glue", task)
    predictions, labels = eval_pred
    if task == "stsb":
      predictions = predictions[:,0]
    else:
      predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

  training_args = TrainingArguments(
      f"{name}-finetuned-{task}", # output_dir
      warmup_ratio=h["warmup-ratio"],
      lr_scheduler_type=h["lr-schedule"],
      per_device_train_batch_size=h["batch-size"],
      per_device_eval_batch_size=h["batch-size"],
      eval_strategy="epoch",
      save_strategy="epoch",
      num_train_epochs=h["epochs"],
      learning_rate=h["learning-rate"],
      load_best_model_at_end=True,
      metric_for_best_model=metric_name,
      report_to="none"
      )

  # Handle mnli-mm and mnli expected keys for eval_dataset
  validation_key = "validation_mismatched" if task=="mnli-mm" else \
                "validation_matched" if task=="mnli" else "validation"

  # Initialize LoRA layers in model
  modules = init_lora_layers(h["rank"], h["alpha"], model)

  num_params, num_trainable_params = get_trainable_parameters(model)
  print(f"Parameters={num_params}; Trainable Parameters={num_trainable_params}")

  trainer = Trainer(
      model,
      training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets[validation_key],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )
  trainer.train()

  return modules

def train_with_existing_module(name, model, tokenizer, task, hyperparameters, modules):
  h = hyperparameters[task]

  # Load dataset
  raw_datasets = load_dataset("glue", task)

  def tokenize_function(example):
    col1 = task_to_keys[task][0]
    col2 = task_to_keys[task][1]
    if col2 == None:
      return tokenizer(example[col1], max_length=h["max-seq-len"], truncation=True)
    else:
      return tokenizer(example[col1], example[col2], max_length=h["max-seq-len"], truncation=True)

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  def compute_metrics(eval_pred):
    metric = evaluate.load("glue", task)
    predictions, labels = eval_pred
    if task == "stsb":
      predictions = predictions[:,0]
    else:
      predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

  training_args = TrainingArguments(
      f"{name}-finetuned-{task}", # output_dir
      warmup_ratio=h["warmup-ratio"],
      lr_scheduler_type=h["lr-schedule"],
      per_device_train_batch_size=h["batch-size"],
      per_device_eval_batch_size=h["batch-size"],
      eval_strategy="epoch",
      save_strategy="epoch",
      num_train_epochs=h["epochs"],
      learning_rate=h["learning-rate"],
      load_best_model_at_end=True,
      metric_for_best_model=metric_name,
      report_to="none"
      )

  # Handle mnli-mm and mnli expected keys for eval_dataset
  validation_key = "validation_mismatched" if task=="mnli-mm" else \
                "validation_matched" if task=="mnli" else "validation"

  # Initialize LoRA layers in model
  update_lora_layers(modules, model)
  #init_lora_layers(h["rank"], h["alpha"], model)

  num_params, num_trainable_params = get_trainable_parameters(model)
  print(f"Parameters={num_params}; Trainable Parameters={num_trainable_params}")

  trainer = Trainer(
      model,
      training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets[validation_key],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )
  trainer.train()

  return modules

In [None]:
roberta_base_mrpc_modules = train_new("roberta-base", roberta_base, roberta_base_tokenizer, "mrpc", roberta_base_hyperparameters)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Parameters=124942082; Trainable Parameters=294912


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.668517,0.683824,0.812227
2,No log,0.634761,0.683824,0.812227
3,0.632700,0.627453,0.683824,0.812227
4,0.632700,0.628054,0.683824,0.812227
5,0.632900,0.624088,0.683824,0.812227
6,0.632900,0.623032,0.683824,0.812227


In [None]:
roberta_base_mrpc_modules = train_with_existing_modules("roberta-base", roberta_base, roberta_base_tokenizer, "mrpc", roberta_base_hyperparameters, roberta_base_mrpc_modules)

In [None]:
roberta_base_cola_modules = train("roberta-base", roberta_base, roberta_base_tokenizer, "cola", roberta_base_hyperparameters, roberta_base_cola_modules)