In [321]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

import pdb
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

from peft import PeftModel, PeftConfig, load_peft_weights, set_peft_model_state_dict
from transformers import AutoModelForCausalLM, AutoTokenizer
from itertools import combinations

In [325]:
list(combinations(tasks,4))

[]

In [326]:
tasks = [
    #'cola',
    'sst2',
    'mrpc',
    # 'mnli', # This doesn't have a validation set.... (don't have)
    # 'qnli', # We don't have enough memory to train this.. ((don't have))
    'rte',
    #'wnli',
    #'stsb', # This data isn't formatted how the next code block expects it to be (don't have)
    'qqp', # This takes 1hr per epoch...
]

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [328]:
all_evals = []
all_metrics = []
all_model_ids = []
zero_shot = True

batch_size = 32
model_name_or_path = "roberta-large"
peft_type = PeftType.LORA
device = "cuda"


peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)


if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")
    
all_tasks = list(combinations(tasks,4))
for tasks in all_tasks:
    print(tasks)
    for task in tasks:
        datasets = load_dataset("glue", task, cache_dir='/srv/hoffman-lab/flash9/pramesh39/datasets/GLUE')
        metric = evaluate.load("glue", task, cache_dir='/srv/hoffman-lab/flash9/pramesh39/datasets/GLUE')
        sentence1_key, sentence2_key = task_to_keys[task]
    
        def tokenize_function(examples):
            # max_length=None => use the model max length (it's actually the default)
            args = (
                (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
            )
            outputs = tokenizer(*args, truncation=True, max_length=None)
            return outputs
    
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx"] + list([t for t in task_to_keys[task] if t is not None]),
        )
        tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
        
        train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
        eval_dataloader = DataLoader(
            tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
        )
        all_evals.append(eval_dataloader)
        all_metrics.append(metric)
        all_model_ids.append("gstoica3/roberta-large-peft-" + task)
    
    #Merged the models
    merged_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
    if(zero_shot == False):
        merged_model = PeftModel.from_pretrained(merged_model, all_model_ids[0])
        adapters_weights = load_peft_weights(all_model_ids[0], device=device)
        
        for all_model_id in all_model_ids[1:]:
            new_adapter = load_peft_weights(all_model_id, device=device)
            for key in new_adapter:
                adapters_weights[key] += new_adapter[key]
        
        load_result = set_peft_model_state_dict(merged_model, adapters_weights) 
    
    #Evaluate the merged model
    merged_model.to(device)
    merged_model.eval()
    for task_id, task in enumerate(tasks):
        for step, batch in enumerate(tqdm(all_evals[task_id])):
            batch.to(device)
            with torch.no_grad():
                outputs = merged_model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = predictions, batch["labels"]
            all_metrics[task_id].add_batch(
                predictions=predictions,
                references=references,
            )
        
        eval_metric = all_metrics[task_id].compute()
        print(task, eval_metric)


('sst2', 'mrpc', 'rte', 'qqp')


Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                                                                                                                                                                                                                                                                                                                       | 0/28 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█

sst2 {'accuracy': 0.5091743119266054}


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  9.45it/s]


mrpc {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  4.24it/s]


rte {'accuracy': 0.4729241877256318}


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [01:59<00:00, 10.57it/s]

qqp {'accuracy': 0.3681919366806827, 'f1': 0.53820009400875}





In [308]:
# for task_id, task in enumerate(tasks):
#     config = PeftConfig.from_pretrained(all_model_ids[task_id])
#     inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
#     #tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    
#     # Load the Lora model
#     inference_model = PeftModel.from_pretrained(inference_model, all_model_ids[task_id])
    
#     inference_model.to(device)
#     inference_model.eval()
#     for step, batch in enumerate(tqdm(all_evals[task_id])):
#         batch.to(device)
#         with torch.no_grad():
#             outputs = inference_model(**batch)
#         predictions = outputs.logits.argmax(dim=-1)
#         predictions, references = predictions, batch["labels"]
#         all_metrics[task_id].add_batch(
#             predictions=predictions,
#             references=references,
#         )
    
#     eval_metric = all_metrics[task_id].compute()
#     print(task, eval_metric)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                                                                                                                                                                                                                                                                                                                       | 0/28 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█

sst2 {'accuracy': 0.9575688073394495}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [02:06<00:00,  9.97it/s]


qqp {'accuracy': 0.9147167944595598, 'f1': 0.8871136720796228}


In [309]:
# merged_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
# # # tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# merged_model = PeftModel.from_pretrained(merged_model, all_model_ids[0])
# adapters_weights = load_peft_weights(all_model_ids[0], device=device)

# for all_model_id in all_model_ids[1:]:
#     new_adapter = load_peft_weights(all_model_id, device=device)
#     for key in new_adapter:
#         adapters_weights[key] += new_adapter[key]

# load_result = set_peft_model_state_dict(merged_model, adapters_weights) 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [310]:
# merged_model.to(device)
# merged_model.eval()
# for task_id, task in enumerate(tasks):
#     for step, batch in enumerate(tqdm(all_evals[task_id])):
#         batch.to(device)
#         with torch.no_grad():
#             outputs = merged_model(**batch)
#         predictions = outputs.logits.argmax(dim=-1)
#         predictions, references = predictions, batch["labels"]
#         all_metrics[task_id].add_batch(
#             predictions=predictions,
#             references=references,
#         )
    
#     eval_metric = all_metrics[task_id].compute()
#     print(task, eval_metric)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:02<00:00, 13.41it/s]


sst2 {'accuracy': 0.6376146788990825}


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1264/1264 [02:07<00:00,  9.93it/s]


qqp {'accuracy': 0.7787039327232254, 'f1': 0.7173590270099511}


In [244]:
merged_model.state_dict().keys()

odict_keys(['base_model.model.base_model.model.roberta.embeddings.word_embeddings.weight', 'base_model.model.base_model.model.roberta.embeddings.position_embeddings.weight', 'base_model.model.base_model.model.roberta.embeddings.token_type_embeddings.weight', 'base_model.model.base_model.model.roberta.embeddings.LayerNorm.weight', 'base_model.model.base_model.model.roberta.embeddings.LayerNorm.bias', 'base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.weight', 'base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.bias', 'base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight', 'base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight', 'base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.key.weight', 'base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.key.bias', 'base_model.model.base_model.model.robert

In [245]:
inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [253]:
adapters_weights['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.weight']

tensor([[ 0.0324,  0.0208,  0.0292,  ..., -0.0181,  0.0363, -0.0287],
        [-0.0386, -0.0438, -0.0166,  ...,  0.0963,  0.0596, -0.0434],
        [ 0.0043,  0.0122, -0.0232,  ...,  0.0030,  0.0238,  0.0308],
        ...,
        [ 0.0127, -0.0332, -0.0553,  ...,  0.0156, -0.0294, -0.0243],
        [-0.0258, -0.0459,  0.0265,  ..., -0.0770,  0.0154,  0.0336],
        [-0.0325,  0.0043, -0.0517,  ..., -0.0407,  0.0424,  0.0241]],
       device='cuda:0')

In [254]:
b = merged_model.state_dict()['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight'].to(device)
b

tensor([[ 0.0324,  0.0208,  0.0292,  ..., -0.0181,  0.0363, -0.0287],
        [-0.0386, -0.0438, -0.0166,  ...,  0.0963,  0.0596, -0.0434],
        [ 0.0043,  0.0122, -0.0232,  ...,  0.0030,  0.0238,  0.0308],
        ...,
        [ 0.0127, -0.0332, -0.0553,  ...,  0.0156, -0.0294, -0.0243],
        [-0.0258, -0.0459,  0.0265,  ..., -0.0770,  0.0154,  0.0336],
        [-0.0325,  0.0043, -0.0517,  ..., -0.0407,  0.0424,  0.0241]],
       device='cuda:0')

In [255]:
adapters_weights0 = load_peft_weights(all_model_ids[0], device=device)
adapters_weights1 = load_peft_weights(all_model_ids[1], device=device)

In [256]:
adapters_weights0['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.weight']

tensor([[ 0.0136,  0.0236, -0.0172,  ..., -0.0050,  0.0016,  0.0076],
        [-0.0288, -0.0110, -0.0097,  ...,  0.0592,  0.0269, -0.0405],
        [ 0.0067, -0.0096, -0.0102,  ...,  0.0206, -0.0029,  0.0235],
        ...,
        [ 0.0413, -0.0334, -0.0304,  ..., -0.0022, -0.0205, -0.0108],
        [-0.0164, -0.0117,  0.0170,  ..., -0.0249, -0.0279,  0.0334],
        [-0.0144,  0.0010, -0.0440,  ..., -0.0247,  0.0348,  0.0029]],
       device='cuda:0')

In [257]:
adapters_weights1['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.weight']

tensor([[ 0.0188, -0.0028,  0.0464,  ..., -0.0131,  0.0347, -0.0363],
        [-0.0098, -0.0328, -0.0069,  ...,  0.0370,  0.0326, -0.0029],
        [-0.0024,  0.0218, -0.0131,  ..., -0.0176,  0.0267,  0.0073],
        ...,
        [-0.0286,  0.0002, -0.0248,  ...,  0.0179, -0.0089, -0.0135],
        [-0.0093, -0.0342,  0.0095,  ..., -0.0521,  0.0433,  0.0001],
        [-0.0180,  0.0032, -0.0077,  ..., -0.0160,  0.0077,  0.0212]],
       device='cuda:0')

In [258]:
a = adapters_weights0['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.weight'] + adapters_weights1['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.weight']
a

tensor([[ 0.0324,  0.0208,  0.0292,  ..., -0.0181,  0.0363, -0.0287],
        [-0.0386, -0.0438, -0.0166,  ...,  0.0963,  0.0596, -0.0434],
        [ 0.0043,  0.0122, -0.0232,  ...,  0.0030,  0.0238,  0.0308],
        ...,
        [ 0.0127, -0.0332, -0.0553,  ...,  0.0156, -0.0294, -0.0243],
        [-0.0258, -0.0459,  0.0265,  ..., -0.0770,  0.0154,  0.0336],
        [-0.0325,  0.0043, -0.0517,  ..., -0.0407,  0.0424,  0.0241]],
       device='cuda:0')

In [259]:
a == b

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]], device='cuda:0')

In [118]:
load_result = set_peft_model_state_dict(inference_model)

AttributeError: 'Tensor' object has no attribute 'items'

In [78]:
len(adapters_weights.keys())

100

tensor([[ 0.0136,  0.0236, -0.0172,  ..., -0.0050,  0.0016,  0.0076],
        [-0.0288, -0.0110, -0.0097,  ...,  0.0592,  0.0269, -0.0405],
        [ 0.0067, -0.0096, -0.0102,  ...,  0.0206, -0.0029,  0.0235],
        ...,
        [ 0.0413, -0.0334, -0.0304,  ..., -0.0022, -0.0205, -0.0108],
        [-0.0164, -0.0117,  0.0170,  ..., -0.0249, -0.0279,  0.0334],
        [-0.0144,  0.0010, -0.0440,  ..., -0.0247,  0.0348,  0.0029]],
       device='cuda:0')

In [103]:
adapters_weights0['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.weight']

tensor([[ 0.0136,  0.0236, -0.0172,  ..., -0.0050,  0.0016,  0.0076],
        [-0.0288, -0.0110, -0.0097,  ...,  0.0592,  0.0269, -0.0405],
        [ 0.0067, -0.0096, -0.0102,  ...,  0.0206, -0.0029,  0.0235],
        ...,
        [ 0.0413, -0.0334, -0.0304,  ..., -0.0022, -0.0205, -0.0108],
        [-0.0164, -0.0117,  0.0170,  ..., -0.0249, -0.0279,  0.0334],
        [-0.0144,  0.0010, -0.0440,  ..., -0.0247,  0.0348,  0.0029]],
       device='cuda:0')

In [73]:
inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)

In [74]:
inference_model.peft_config

{'default': LoraConfig(peft_type='LORA', auto_mapping=None, base_model_name_or_path='roberta-large', revision=None, task_type='SEQ_CLS', inference_mode=True, r=8, target_modules=['value', 'query'], lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)}

In [75]:
load_result = set_peft_model_state_dict(inference_model, adapters_weights)

In [76]:
inference_model.state_dict()['base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight']

tensor([[ 0.0136,  0.0236, -0.0172,  ..., -0.0050,  0.0016,  0.0076],
        [-0.0288, -0.0110, -0.0097,  ...,  0.0592,  0.0269, -0.0405],
        [ 0.0067, -0.0096, -0.0102,  ...,  0.0206, -0.0029,  0.0235],
        ...,
        [ 0.0413, -0.0334, -0.0304,  ..., -0.0022, -0.0205, -0.0108],
        [-0.0164, -0.0117,  0.0170,  ..., -0.0249, -0.0279,  0.0334],
        [-0.0144,  0.0010, -0.0440,  ..., -0.0247,  0.0348,  0.0029]])