In [1]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

import pdb
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

from peft import PeftModel, PeftConfig, load_peft_weights, set_peft_model_state_dict
from transformers import AutoModelForCausalLM, AutoTokenizer
from itertools import combinations
import scipy
import sklearn

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tasks = [
    #'cola',
    'sst2',
    'mrpc',
    # 'mnli', # This doesn't have a validation set.... (don't have)
    # 'qnli', # We don't have enough memory to train this.. ((don't have))
    #'rte', ***
    #'wnli',
    #'stsb', # This data isn't formatted how the next code block expects it to be (don't have)
    #'qqp', # This takes 1hr per epoch... ***
]
# Note: the four tasks that we want: sst2, mrpc, rte, qqp

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [4]:
list(combinations(tasks,2))

[('sst2', 'mrpc')]

In [6]:
# Before modified

all_evals = []
all_metrics = []
all_model_ids = []
zero_shot = False

batch_size = 32
model_name_or_path = "roberta-large"
peft_type = PeftType.LORA
device = "cuda"


peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)


if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")

all_tasks = list(combinations(tasks,2))
print(all_tasks)
for tasks in all_tasks:
    for task in tasks:
        datasets = load_dataset("glue", task, cache_dir='/nethome/becsedi3/flash/datasets/GLUE')
        metric = evaluate.load("glue", task, cache_dir='/nethome/becsedi3/flash/datasets/GLUE')
        sentence1_key, sentence2_key = task_to_keys[task]
    
        def tokenize_function(examples):
            # max_length=None => use the model max length (it's actually the default)
            args = (
                (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
            )
            outputs = tokenizer(*args, truncation=True, max_length=None)
            return outputs
    
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx"] + list([t for t in task_to_keys[task] if t is not None]),
        )
        tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
        
        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
        eval_dataloader = DataLoader(
            tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
        )
        all_evals.append(eval_dataloader)
        all_metrics.append(metric)
        all_model_ids.append("gstoica3/roberta-large-peft-" + task)
    
    #Merge the models
    config = PeftConfig.from_pretrained(all_model_ids[0])
    merged_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
    if(zero_shot == False):
        merged_model = PeftModel.from_pretrained(merged_model, all_model_ids[0])
        adapters_weights = load_peft_weights(all_model_ids[0], device=device)
        
        for all_model_id in all_model_ids[1:]:
            new_adapter = load_peft_weights(all_model_id, device=device)
            for key in new_adapter:
                adapters_weights[key] += new_adapter[key]
            
            # after the operation, set B weights to all zeros
            # What's happening now: output = Wx + ABx 
            # What should happen in my implementation: output = (W + mergedAB)x
        
        load_result = set_peft_model_state_dict(merged_model, adapters_weights) 
    
    #Evaluate the merged model
    merged_model.to(device)
    merged_model.eval()
    for task_id, task in enumerate(tasks):
        for step, batch in enumerate(tqdm(all_evals[task_id])):
            batch.to(device)
            with torch.no_grad():
                outputs = merged_model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = predictions, batch["labels"]
            all_metrics[task_id].add_batch(
                predictions=predictions,
                references=references,
            )
        
        eval_metric = all_metrics[task_id].compute()
        print(task, eval_metric)


[('sst2', 'mrpc')]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading adapter_model.bin: 100%|█████████████████████████████████████████████████████████████| 7.39M/7.39M [00:00<00:00, 20.5MB/s]
  0%|                                                                                                          | 0/28 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:02<00:00, 10.06it/s]


sst2 {'accuracy': 0.9059633027522935}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  6.39it/s]

mrpc {'accuracy': 0.5931372549019608, 'f1': 0.6541666666666667}





In [65]:
# Maxout method (AmBm = max(A1B1, A2B2))

all_evals = []
all_metrics = []
all_model_ids = []
zero_shot = False

batch_size = 32
model_name_or_path = "roberta-large"
peft_type = PeftType.LORA
device = "cuda"


peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)


if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")

all_tasks = list(combinations(tasks,2))
print(all_tasks)
for tasks in all_tasks:
    for task in tasks:
        datasets = load_dataset("glue", task, cache_dir='/nethome/becsedi3/flash/datasets/GLUE')
        metric = evaluate.load("glue", task, cache_dir='/nethome/becsedi3/flash/datasets/GLUE')
        sentence1_key, sentence2_key = task_to_keys[task]
    
        def tokenize_function(examples):
            # max_length=None => use the model max length (it's actually the default)
            args = (
                (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
            )
            outputs = tokenizer(*args, truncation=True, max_length=None)
            return outputs
    
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx"] + list([t for t in task_to_keys[task] if t is not None]),
        )
        tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
        
        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
        eval_dataloader = DataLoader(
            tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
        )
        all_evals.append(eval_dataloader)
        all_metrics.append(metric)
        all_model_ids.append("gstoica3/roberta-large-peft-" + task)

    print(all_model_ids)
    
    #Merge the models
    config = PeftConfig.from_pretrained(all_model_ids[0])
    merged_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
    if(zero_shot == False):
        merged_model = PeftModel.from_pretrained(merged_model, all_model_ids[0])
        merged_adapter_weights = {}
        adapters_weights = load_peft_weights(all_model_ids[0], device=device) # adapter 1
        
        for all_model_id in all_model_ids[1:]:
            new_adapter = load_peft_weights(all_model_id, device=device) # adapter 2
            
            keys = list(new_adapter.keys())
            
            for i in range(0, len(keys) - 4, 2): #TODO: the 4 shall be a 1, but: 
                # what to do with last two layers which has the bias? Do we merge that too?? If so, how?
                
                key_A, key_B = keys[i], keys[i + 1]

                # TODO: remove prints
                #print("keys: ", key_A, key_B)
                #print(adapters_weights[key_A].size())
                #print(adapters_weights[key_B].size())
                #print(new_adapter[key_A].size())
                #print(new_adapter[key_B].size())
                
                # Calculate element-wise max of A1_B1 and A2_B2
                A1_B1 = torch.matmul(adapters_weights[key_A], adapters_weights[key_B]) # full-rank adapter 1
                A2_B2 = torch.matmul(new_adapter[key_A], new_adapter[key_B]) # full-rank adapter 2
                
                merged_adapter_weights[i] = torch.max(A1_B1, A2_B2) # take element-wise max of A1_B1 and A2_B2
                
                # Set B weights to zeros
                adapters_weights[key_B].zero_()
                new_adapter[key_B].zero_()
                
                #merged_model[i] += merged_adapter_weights[i]

            #### DEBUG ####
            # Sum the merged adapter weights
            #merged_model += torch.sum(merged_adapter_weights, dim=0)
            #print(merged_model.print_trainable_parameters())

            
            #TODO: merge the merged_adapter_weights back into the model weights
            
            for name, param in merged_model.named_parameters():
            #for param in merged_model.parameters():
                #print(name, param)
                print(name)
                print(merged_model.state_dict()[name].size())
                
            merged_model += merged_adapter_weights # this does not work, we need to access the merged_model's weights and edit them...
            
            ##### END OF SECTION TO DEBUG #####
        
        load_result = set_peft_model_state_dict(merged_model, adapters_weights)
    
    #Evaluate the merged model
    merged_model.to(device)
    merged_model.eval()
    for task_id, task in enumerate(tasks):
        for step, batch in enumerate(tqdm(all_evals[task_id])):
            batch.to(device)
            with torch.no_grad():
                outputs = merged_model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = predictions, batch["labels"]
            all_metrics[task_id].add_batch(
                predictions=predictions,
                references=references,
            )
        
        eval_metric = all_metrics[task_id].compute()
        print(task, eval_metric)


[('sst2', 'mrpc')]
['gstoica3/roberta-large-peft-sst2', 'gstoica3/roberta-large-peft-mrpc']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


base_model.model.roberta.embeddings.word_embeddings.weight
torch.Size([50265, 1024])
base_model.model.roberta.embeddings.position_embeddings.weight
torch.Size([514, 1024])
base_model.model.roberta.embeddings.token_type_embeddings.weight
torch.Size([1, 1024])
base_model.model.roberta.embeddings.LayerNorm.weight
torch.Size([1024])
base_model.model.roberta.embeddings.LayerNorm.bias
torch.Size([1024])
base_model.model.roberta.encoder.layer.0.attention.self.query.weight
torch.Size([1024, 1024])
base_model.model.roberta.encoder.layer.0.attention.self.query.bias
torch.Size([1024])
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
torch.Size([8, 1024])
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
torch.Size([1024, 8])
base_model.model.roberta.encoder.layer.0.attention.self.key.weight
torch.Size([1024, 1024])
base_model.model.roberta.encoder.layer.0.attention.self.key.bias
torch.Size([1024])
base_model.model.roberta.encode

TypeError: unsupported operand type(s) for +=: 'PeftModelForSequenceClassification' and 'dict'

In [64]:
# Self-regularized Fine-Tuning method (from C-LoRA) (AmBm = (alpha / A2B2) ⊙ A1B1 + (beta / A1B1) ⊙ A2B2)

all_evals = []
all_metrics = []
all_model_ids = []
zero_shot = False

batch_size = 32
model_name_or_path = "roberta-large"
peft_type = PeftType.LORA
device = "cuda"


peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)


if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")

all_tasks = list(combinations(tasks,2))
print(all_tasks)
for tasks in all_tasks:
    for task in tasks:
        datasets = load_dataset("glue", task, cache_dir='/nethome/becsedi3/flash/datasets/GLUE')
        metric = evaluate.load("glue", task, cache_dir='/nethome/becsedi3/flash/datasets/GLUE')
        sentence1_key, sentence2_key = task_to_keys[task]
    
        def tokenize_function(examples):
            # max_length=None => use the model max length (it's actually the default)
            args = (
                (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
            )
            outputs = tokenizer(*args, truncation=True, max_length=None)
            return outputs
    
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx"] + list([t for t in task_to_keys[task] if t is not None]),
        )
        tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
        
        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
        eval_dataloader = DataLoader(
            tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
        )
        all_evals.append(eval_dataloader)
        all_metrics.append(metric)
        all_model_ids.append("gstoica3/roberta-large-peft-" + task)

    print(all_model_ids)
    
    #Merge the models
    config = PeftConfig.from_pretrained(all_model_ids[0])
    merged_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
    if(zero_shot == False):
        merged_model = PeftModel.from_pretrained(merged_model, all_model_ids[0])
        merged_adapter_weights = {}
        adapters_weights = load_peft_weights(all_model_ids[0], device=device) # adapter 1
        
        # Merging Parameters
        alpha = 1.0 #TODO: might want to modify this such that we can optimize these parameters layerwise
        beta = 1.0
        
        for all_model_id in all_model_ids[1:]:
            new_adapter = load_peft_weights(all_model_id, device=device) # adapter 2
            
            keys = list(new_adapter.keys())
            
            for i in range(0, len(keys) - 4, 2): #TODO: the 4 shall be a 1, but:
                # what to do with last two layers which has the bias? Do we merge that too?? If so, how?
                
                key_A, key_B = keys[i], keys[i + 1]
                
                # Calculate element-wise max of A1_B1 and A2_B2
                A1_B1 = torch.matmul(adapters_weights[key_A], adapters_weights[key_B]) # full-rank adapter 1
                A2_B2 = torch.matmul(new_adapter[key_A], new_adapter[key_B]) # full-rank adapter 2
                print("A1_B1: ", A1_B1.size())

                merged_adapter_weights[i] = ((alpha / A2_B2) * A1_B1) + ((beta / A1_B1) * A2_B2)
                
                # Set B weights to zeros
                adapters_weights[key_B].zero_()
                new_adapter[key_B].zero_()
                
                #merged_model[i] += merged_adapter_weights[i]

            #### DEBUG ####
            # Sum the merged adapter weights
            #merged_model += torch.sum(merged_adapter_weights, dim=0)
            #print(merged_model.print_trainable_parameters())

            
            #TODO: merge the merged_adapter_weights back into the model weights
            
            for name, param in merged_model.named_parameters():
            #for param in merged_model.parameters():
                #print(name, param)
                print(name)
                print(merged_model.state_dict()[name].size())
                
            merged_model += merged_adapter_weights # this does not work, we need to access the merged_model's weights and edit them...
            
            ##### END OF SECTION TO DEBUG #####
        
        load_result = set_peft_model_state_dict(merged_model, adapters_weights)
    
    #Evaluate the merged model
    merged_model.to(device)
    merged_model.eval()
    for task_id, task in enumerate(tasks):
        for step, batch in enumerate(tqdm(all_evals[task_id])):
            batch.to(device)
            with torch.no_grad():
                outputs = merged_model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = predictions, batch["labels"]
            all_metrics[task_id].add_batch(
                predictions=predictions,
                references=references,
            )
        
        eval_metric = all_metrics[task_id].compute()
        print(task, eval_metric)


[('sst2', 'mrpc')]
['gstoica3/roberta-large-peft-sst2', 'gstoica3/roberta-large-peft-mrpc']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


alpha 1.0
beta 1.0
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  torch.Size([8, 8])
A1_B1:  t

TypeError: unsupported operand type(s) for +=: 'PeftModelForSequenceClassification' and 'dict'