In [1]:
import os 
import re

import numpy as np
import torch
import transformers
from datasets import load_dataset, load_metric
from torch import nn
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding

In [2]:
transformers.__version__, torch.__version__

('4.20.1', '1.13.1+cu116')

## Train individual models

In [3]:
def get_metrics_func(task_name):
    metric = load_metric("glue", task_name)
    def compute_metrics(p):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        result = metric.compute(predictions=preds, references=p.label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
    return compute_metrics

In [4]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [5]:
def train_glue_model(task_name, checkpoint_name):
    model = AutoModelForSequenceClassification.from_pretrained('roberta-base').cuda()
    ds = load_dataset('glue', task_name)
    metric = get_metrics_func(task_name)
    enc_ds = ds.map(lambda examples: tokenizer(examples["sentence"], max_length=128, truncation=True), batched=True)
    training_args = TrainingArguments(
        output_dir=f'./results/{task_name}',  # output directory
        num_train_epochs=3,                   # total # of training epochs
        per_device_train_batch_size=16,       # batch size per device during training
        per_device_eval_batch_size=16,        # batch size for evaluation
        warmup_steps=500,                     # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                    # strength of weight decay
    )
    trainer = Trainer(
        model=model,                        # the instantiated 🤗 Transformers model to be trained
        args=training_args,                 # training arguments, defined above
        train_dataset=enc_ds['train'],      # training dataset
        eval_dataset=enc_ds['validation'],  # evaluation dataset
        compute_metrics=metric,
        tokenizer=tokenizer,
    )

    checkpoint_path = os.path.join(training_args.output_dir, checkpoint_name)
    if os.path.exists(checkpoint_path):
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path).cuda()
    else:
        trainer.train()

    return trainer, model

In [6]:
trainer1, model1 = train_glue_model('cola', 'checkpoint-1500')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8551
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1605


Step,Training Loss
500,0.5468
1000,0.3954
1500,0.2403


Saving model checkpoint to ./results/cola/checkpoint-500
Configuration saved in ./results/cola/checkpoint-500/config.json
Model weights saved in ./results/cola/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/cola/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/cola/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/cola/checkpoint-1000
Configuration saved in ./results/cola/checkpoint-1000/config.json
Model weights saved in ./results/cola/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/cola/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/cola/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/cola/checkpoint-1500
Configuration saved in ./results/cola/checkpoint-1500/config.json
Model weights saved in ./results/cola/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/cola/checkpoint-1500/tokenizer_config.js

In [7]:
trainer2, model2 = train_glue_model('sst2', 'checkpoint-12500')

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/r

  0%|          | 0/3 [00:00<?, ?it/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 67349
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12630


Step,Training Loss
500,0.4509
1000,0.3667
1500,0.3183
2000,0.3041
2500,0.2901
3000,0.2607
3500,0.2743
4000,0.256
4500,0.2191
5000,0.2154


Saving model checkpoint to ./results/sst2/checkpoint-500
Configuration saved in ./results/sst2/checkpoint-500/config.json
Model weights saved in ./results/sst2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/sst2/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/sst2/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/sst2/checkpoint-1000
Configuration saved in ./results/sst2/checkpoint-1000/config.json
Model weights saved in ./results/sst2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/sst2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/sst2/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/sst2/checkpoint-1500
Configuration saved in ./results/sst2/checkpoint-1500/config.json
Model weights saved in ./results/sst2/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/sst2/checkpoint-1500/tokenizer_config.js

## Compute inner-product matrices

In [8]:
def filter_params_to_merge(param_names, exclude_param_regex):
    params_to_merge = []
    for name in param_names:
        valid = not any([re.match(patt, name) for patt in exclude_param_regex])
        if valid:
            params_to_merge.append(name)
    return params_to_merge


def filter_modules_by_regex(base_module, include_patterns, include_type):
    modules = {}
    for name, module in base_module.named_modules():
        valid_name = not include_patterns or any([re.match(patt, name) for patt in include_patterns])
        valid_type = not include_type or any([isinstance(module, md_cls) for md_cls in include_type])
        if valid_type and valid_name:
            modules[name] = module
    return modules

In [9]:
def compute_gram(model, trainer):
    train_dataloader = trainer.get_train_dataloader()
    grams = {} # gram matrices for each linear layer inputs
    xn = {} # number of examples used for computing gram

    def get_gram(name):
        def hook(module, input, output):
            x = input[0].detach() # $[b,t,h]
            x = x.view(-1, x.size(-1))
            xtx = torch.matmul(x.transpose(0,1), x) # [h,h]
            if name not in grams:
                grams[name] = xtx / x.size(0)
                xn[name] = x.size(0)
            else:
                grams[name] = (grams[name] * xn[name] + xtx) / (x.size(0) + xn[name])
                xn[name] += x.size(0)
        return hook

    linear_modules = filter_modules_by_regex(model, None, [nn.Linear])
    handles = []
    for name, module in linear_modules.items():
        handle = module.register_forward_hook(get_gram(name))
        handles.append(handle)

    n_step = 1000
    total = n_step if n_step > 0 else len(train_dataloader)
    for step, inputs in tqdm(enumerate(train_dataloader), total=total, desc='Computing gram matrix'):
        if n_step > 0 and step == n_step:
            break

        inputs = trainer._prepare_inputs(inputs)
        outputs = model(**inputs)

    for handle in handles:
        handle.remove()

    return grams

In [10]:
with torch.no_grad():
    grams1 = compute_gram(model1, trainer1)

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
Computing gram matrix:  54%|█████▎    | 535/1000 [00:13<00:11, 40.73it/s]


In [11]:
with torch.no_grad():
    grams2 = compute_gram(model2, trainer2)

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
Computing gram matrix: 100%|██████████| 1000/1000 [00:26<00:00, 38.00it/s]


In [12]:
def avg_merge(local_models, global_model, regmean_grams=None, **kwargs):
    params = {}
    for local_model in local_models:
        n2p = {k: v for k,v in local_model.named_parameters()}
        merge_param_names = filter_params_to_merge([n for n in n2p], ['.*classifier.*']) # for glue label spaces are different
        for n in merge_param_names:
            if n not in params:
                params[n] = []
            params[n].append(n2p[n])

    if regmean_grams: # regmean average
        avg_params = regmean_merge(params, regmean_grams)

    else: # simple average
        avg_params = {k: torch.stack(v,0).mean(0) for k, v in params.items()}

    return avg_params

def copy_params_to_model(avg_params, model):
    for n, p in model.named_parameters():
        if n in avg_params:
            p.data.copy_(avg_params[n])

def reduce_non_diag(cov_mat, a):
    diag_weight = torch.diag(torch.ones(cov_mat.size(0)) - a).to(cov_mat.device)
    non_diag_weight = torch.zeros_like(diag_weight).fill_(a)
    weight = diag_weight + non_diag_weight
    ret = cov_mat * weight
    return ret

def regmean_merge(all_params, all_grams):
    avg_params = {}
    n_model = len(all_grams)
    for name in all_params:
        h_avged = False
        if name.endswith('.weight'):
            print(f'Regmean: {name}')
            module_name = name[:-len('.weight')]
            if module_name in all_grams[0]:
                gram_m_ws, grams = [], []

                for model_id, model_grams in enumerate(all_grams):
                    param_grams = model_grams[module_name]

                    # for roberta we dont need this; but it is important for deberta and t5
                    #param_grams = reduce_non_diag(param_grams, a=0.9)

                    param = all_params[name][model_id]
                    gram_m_ws.append(torch.matmul(param_grams, param.transpose(0,1)))
                    grams.append(param_grams)
                sum_gram = sum(grams)
                sum_gram_m_ws = sum(gram_m_ws)
                sum_gram_inv = torch.inverse(sum_gram)
                wt = torch.matmul(sum_gram_inv, sum_gram_m_ws)
                w = wt.transpose(0,1)
                avg_params[name] = w
                h_avged = True
        if not h_avged: # if not averaged with regmean, then do simple avg
            avg_params[name] = torch.stack(all_params[name],0).mean(0)
           
    return avg_params

## Performance before merging

In [13]:
trainer1.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 16


{'eval_loss': 0.531173825263977,
 'eval_matthews_correlation': 0.6011053198493792,
 'eval_runtime': 1.0405,
 'eval_samples_per_second': 1002.385,
 'eval_steps_per_second': 63.43,
 'epoch': 3.0}

In [14]:
trainer2.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.290436327457428,
 'eval_accuracy': 0.9277522935779816,
 'eval_runtime': 0.9222,
 'eval_samples_per_second': 945.572,
 'eval_steps_per_second': 59.64,
 'epoch': 3.0}

## Merging with RegMean

In [15]:
merged_model = AutoModelForSequenceClassification.from_pretrained('roberta-base').cuda()

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/r

In [16]:
regmean_avg_params = avg_merge([model1, model2], merged_model, regmean_grams=[grams1, grams2])

Regmean: roberta.embeddings.word_embeddings.weight
Regmean: roberta.embeddings.position_embeddings.weight
Regmean: roberta.embeddings.token_type_embeddings.weight
Regmean: roberta.embeddings.LayerNorm.weight
Regmean: roberta.encoder.layer.0.attention.self.query.weight
Regmean: roberta.encoder.layer.0.attention.self.key.weight
Regmean: roberta.encoder.layer.0.attention.self.value.weight
Regmean: roberta.encoder.layer.0.attention.output.dense.weight
Regmean: roberta.encoder.layer.0.attention.output.LayerNorm.weight
Regmean: roberta.encoder.layer.0.intermediate.dense.weight
Regmean: roberta.encoder.layer.0.output.dense.weight
Regmean: roberta.encoder.layer.0.output.LayerNorm.weight
Regmean: roberta.encoder.layer.1.attention.self.query.weight
Regmean: roberta.encoder.layer.1.attention.self.key.weight
Regmean: roberta.encoder.layer.1.attention.self.value.weight
Regmean: roberta.encoder.layer.1.attention.output.dense.weight
Regmean: roberta.encoder.layer.1.attention.output.LayerNorm.weight
R

In [17]:
copy_params_to_model(regmean_avg_params, merged_model)
merged_model.classifier = model1.classifier  # we didn't merge classification heads

In [18]:
evaluator_cola = Trainer(
    model=merged_model,                    # the instantiated 🤗 Transformers model to be trained
    args=trainer1.args,                    # training arguments, defined above
    train_dataset=trainer1.train_dataset,  # training dataset
    eval_dataset=trainer1.eval_dataset,    # evaluation dataset
    compute_metrics=get_metrics_func('cola'),
    tokenizer=tokenizer,
)

In [19]:
evaluator_cola.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 16


{'eval_loss': 0.66486656665802,
 'eval_matthews_correlation': 0.08987761934408492,
 'eval_runtime': 1.0497,
 'eval_samples_per_second': 993.636,
 'eval_steps_per_second': 62.876}

In [20]:
copy_params_to_model(regmean_avg_params, merged_model)
merged_model.classifier = model2.classifier  # we didn't merge classification heads

In [21]:
evaluator_sst = Trainer(
    model=merged_model,                    # the instantiated 🤗 Transformers model to be trained
    args=trainer2.args,                    # training arguments, defined above
    train_dataset=trainer2.train_dataset,  # training dataset
    eval_dataset=trainer2.eval_dataset,    # evaluation dataset
    compute_metrics=get_metrics_func('sst2'),
    tokenizer=tokenizer,
)

In [22]:
evaluator_sst.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.23991066217422485,
 'eval_accuracy': 0.9357798165137615,
 'eval_runtime': 0.9036,
 'eval_samples_per_second': 964.997,
 'eval_steps_per_second': 60.866}

## Merging with Simple Avg

In [23]:
simple_avg_params = avg_merge([model1, model2], merged_model)
copy_params_to_model(simple_avg_params, merged_model)
merged_model.classifier = model1.classifier  # we didn't merge classification heads
evaluator_cola.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 16


{'eval_loss': 0.6459254026412964,
 'eval_matthews_correlation': 0.020369871666239114,
 'eval_runtime': 1.1313,
 'eval_samples_per_second': 921.966,
 'eval_steps_per_second': 58.341}

In [24]:
simple_avg_params = avg_merge([model1, model2], merged_model)
copy_params_to_model(simple_avg_params, merged_model)
merged_model.classifier = model2.classifier  # we didn't merge classification heads
evaluator_sst.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 16


{'eval_loss': 0.1879292130470276,
 'eval_accuracy': 0.9392201834862385,
 'eval_runtime': 1.1327,
 'eval_samples_per_second': 769.839,
 'eval_steps_per_second': 48.556}