In [1]:
import gc
import torch
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollatorForLanguageModeling
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
import transformers
import os

gc.collect()
torch.cuda.empty_cache()

In [2]:

# List of glue tasks
#GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2",  "wnli"]

#List of glue keys
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}


In [3]:
###  Tokenizing Section  ####

#Load model
model_checkpoint = "microsoft/deberta-v3-small"

# Create tokenizer for respective model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, truncation=True, model_max_length=512)

def tokenizer_func(examples):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True,)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True,)




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Load Deberta

In [8]:
collector=[]

for task in GLUE_TASKS:

    #Select task
    #task = "rte"  #cola, mrpc
    batch_size = 10 #10 normally, 8 for qnli

    # Load dataset based on task variable
    actual_task = "mnli" if task == "mnli-mm" else task
    dataset = load_dataset("glue", actual_task)
    metric = load_metric('glue', actual_task)



    #Collect sentence keys and labels
    sentence1_key, sentence2_key = task_to_keys[task]

    # tokenize sentence(s)
    encoded_dataset = dataset.map(tokenizer_func, batched=True)


    # Number of logits to output
    num_labels = 3 if task.startswith("mnli") else 1 if task=="stsb" else 2

    #model_checkpoint = "microsoft/deberta-v3-small"
    #model_checkpoint = "deberta-v3-small_baseline_cola/"
    #model_checkpoint = "deberta-v3-small_baseline_"+actual_task+"/"
    model_checkpoint = "deberta-v3-small_tuned_"+actual_task+"/"
    
    ###  Model Section  ####

    # Create model and attach ForSequenceClassification head
    model_deberta = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    # Type of metric for given task
    metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

    args = TrainingArguments(
        f"{model_checkpoint}-finetuned-Testing-{task}",
        evaluation_strategy = "epoch",
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        metric_for_best_model=metric_name,
        eval_accumulation_steps=5
    )

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        if task != "stsb":
            predictions = np.argmax(predictions, axis=1)
        else:
            predictions = predictions[:, 0]
        return metric.compute(predictions=predictions, references=labels)

    validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
    trainer = Trainer(
        model_deberta,
        args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset[validation_key],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    score = trainer.evaluate()
    print(f'{actual_task}: {score}\n\n')
    collector.append([actual_task, metric_name, score])

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3e7ffd81fbad8c71.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-d89bfb1f7f7e2262.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-4a0ea9ba453e921a.arrow
loading configuration file deberta-v3-small_tuned_cola/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


cola: {'eval_loss': 0.3876675069332123, 'eval_matthews_correlation': 0.6208288813242873, 'eval_runtime': 2.8646, 'eval_samples_per_second': 364.095, 'eval_steps_per_second': 36.654}




Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9b729c3a4fc16494.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-86823df2eedba3cb.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c5a95376e929e9ec.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6bad094da374b3d8.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-78b716a4791d6ab4.arrow
loading configuration file deberta-v3-small_tuned_mnli/config.jso

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


mnli: {'eval_loss': 0.3581653833389282, 'eval_accuracy': 0.8745797249108508, 'eval_runtime': 47.5046, 'eval_samples_per_second': 206.611, 'eval_steps_per_second': 20.672}




Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9b729c3a4fc16494.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-86823df2eedba3cb.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c5a95376e929e9ec.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6bad094da374b3d8.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-78b716a4791d6ab4.arrow
loading configuration file deberta-v3-small_tuned_mnli/config.jso

mnli: {'eval_loss': 0.35744670033454895, 'eval_accuracy': 0.8723555736371034, 'eval_runtime': 47.9737, 'eval_samples_per_second': 204.946, 'eval_steps_per_second': 20.511}




Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-939ce9b7f04cf7ad.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1c3dae61c64f051e.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1c6f26ba165be7f0.arrow
loading configuration file deberta-v3-small_tuned_mrpc/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


mrpc: {'eval_loss': 0.7517918348312378, 'eval_accuracy': 0.3161764705882353, 'eval_f1': 0.0, 'eval_runtime': 2.1665, 'eval_samples_per_second': 188.321, 'eval_steps_per_second': 18.924}




  0%|          | 0/105 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

loading configuration file deberta-v3-small_tuned_qnli/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file deb

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


qnli: {'eval_loss': 0.7032103538513184, 'eval_accuracy': 0.5053999633900788, 'eval_runtime': 29.9654, 'eval_samples_per_second': 182.311, 'eval_steps_per_second': 18.254}




  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

loading configuration file deberta-v3-small_tuned_qqp/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file debe

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


qqp: {'eval_loss': 0.2822546660900116, 'eval_accuracy': 0.9040563937670048, 'eval_f1': 0.8723467272188765, 'eval_runtime': 154.1959, 'eval_samples_per_second': 262.199, 'eval_steps_per_second': 26.22}




  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

loading configuration file deberta-v3-small_tuned_rte/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file debe

OSError: Unable to load weights from pytorch checkpoint file for 'deberta-v3-small_tuned_rte/' at 'deberta-v3-small_tuned_rte/pytorch_model.bin'If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. 

In [9]:
collector

[['cola',
  'matthews_correlation',
  {'eval_loss': 0.3876675069332123,
   'eval_matthews_correlation': 0.6208288813242873,
   'eval_runtime': 2.8646,
   'eval_samples_per_second': 364.095,
   'eval_steps_per_second': 36.654}],
 ['mnli',
  'accuracy',
  {'eval_loss': 0.3581653833389282,
   'eval_accuracy': 0.8745797249108508,
   'eval_runtime': 47.5046,
   'eval_samples_per_second': 206.611,
   'eval_steps_per_second': 20.672}],
 ['mnli',
  'accuracy',
  {'eval_loss': 0.35744670033454895,
   'eval_accuracy': 0.8723555736371034,
   'eval_runtime': 47.9737,
   'eval_samples_per_second': 204.946,
   'eval_steps_per_second': 20.511}],
 ['mrpc',
  'accuracy',
  {'eval_loss': 0.7517918348312378,
   'eval_accuracy': 0.3161764705882353,
   'eval_f1': 0.0,
   'eval_runtime': 2.1665,
   'eval_samples_per_second': 188.321,
   'eval_steps_per_second': 18.924}],
 ['qnli',
  'accuracy',
  {'eval_loss': 0.7032103538513184,
   'eval_accuracy': 0.5053999633900788,
   'eval_runtime': 29.9654,
   'eval_s

In [7]:
#model_deberta = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

loading configuration file deberta-v3-small_tuned_mrpc/config.json
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file deb