# DeBERTa
### Microsoft's Decoding-enhanced BERT with Disentangled Attention

Hugging Face: https://huggingface.co/microsoft/deberta-base  
Models: https://huggingface.co/models?sort=downloads&search=debert  
Paper : https://arxiv.org/abs/2006.03654  

In [9]:
import pandas as pd
from datasets import load_dataset
#!pip install transformers[sentencepiece]

In [11]:
#
# This section is the initial downloads of all the GLUE Datasets
# This only needs to be run once, but if you run it again
# it will check the directory in EC2 before it downloads again.
#

from datasets import load_dataset
"""
Reference for the GLUE datasets:
https://huggingface.co/datasets/glue
"""

glue_datasets = ['cola', 'mnli', 'mrpc','qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli']

"""
#Download all the datasets for initial set-up
for challenge in glue_datasets:
    print('\n\n',challenge)
    dataset_train = load_dataset('glue', challenge, split='train')
    try:
        dataset_test = load_dataset('glue', challenge, split='test')
    except:
        dataset_test = load_dataset('glue', challenge, split='test_matched')
        dataset_test = load_dataset('glue', challenge, split='test_mismatched')
"""
print('Uncomment if download needed')

Uncomment if download needed


In [31]:
feature_options = []

for dset in glue_datasets:
    print(f'=====\/  {dset}  \/====\n')
    raw_datasets = load_dataset("glue", dset)
    feature_options.append(list(raw_datasets['train'].features.keys()))
    print(raw_datasets['train'])
    print('\n\n')

Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


=====\/  cola  \/====

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})



=====\/  mnli  \/====



Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 392702
})



=====\/  mrpc  \/====

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})



=====\/  qnli  \/====

Dataset({
    features: ['question', 'sentence', 'label', 'idx'],
    num_rows: 104743
})



=====\/  qqp  \/====



Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


Dataset({
    features: ['question1', 'question2', 'label', 'idx'],
    num_rows: 363846
})



=====\/  rte  \/====

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 2490
})



=====\/  sst2  \/====



Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})



=====\/  stsb  \/====

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 5749
})



=====\/  wnli  \/====

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 635
})





In [34]:
feature_options

[['sentence', 'label', 'idx'],
 ['premise', 'hypothesis', 'label', 'idx'],
 ['sentence1', 'sentence2', 'label', 'idx'],
 ['question', 'sentence', 'label', 'idx'],
 ['question1', 'question2', 'label', 'idx'],
 ['sentence1', 'sentence2', 'label', 'idx'],
 ['sentence', 'label', 'idx'],
 ['sentence1', 'sentence2', 'label', 'idx'],
 ['sentence1', 'sentence2', 'label', 'idx']]

In [48]:
task = "stsb"
load_dataset('glue', task, split='train').to_pandas().label.unique()


Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


array([5.       , 3.8      , 2.6      , 4.25     , 0.5      , 1.6      ,
       2.2      , 4.2      , 4.6      , 3.867    , 4.667    , 1.667    ,
       3.75     , 3.2      , 2.8      , 3.       , 4.8      , 4.       ,
       4.909    , 2.4      , 3.4      , 2.75     , 3.6      , 1.75     ,
       1.       , 2.375    , 4.4      , 4.75     , 1.556    , 3.938    ,
       3.5      , 1.4      , 3.833    , 0.6      , 2.917    , 2.       ,
       0.8      , 1.643    , 2.25     , 4.857    , 2.533    , 0.143    ,
       2.5      , 0.       , 0.4      , 0.667    , 4.133    , 1.2      ,
       3.765    , 3.941    , 0.25     , 3.25     , 0.75     , 1.5      ,
       0.2      , 3.111    , 1.286    , 1.8      , 0.85     , 3.923    ,
       1.25     , 0.833    , 0.333    , 3.333    , 4.333    , 2.667    ,
       0.417    , 2.818    , 3.533    , 0.643    , 1.583    , 1.778    ,
       3.667    , 2.333    , 1.7      , 4.5      , 0.727    , 1.333    ,
       0.067    , 4.875    , 3.615    , 2.875    , 

In [12]:
#Start with Cola

challenge = 'cola'

train_data = load_dataset('glue', challenge, split='train')
test_data = load_dataset('glue', challenge, split='test')
valid_data = load_dataset('glue', challenge, split='validation')


Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [49]:
###  Imports  ###

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
from datasets import load_metric


###  Set-up  ###
num_epochs = 5
checkpoint = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
task = "wnli"

###  Choose Task Dataset  ###
raw_datasets = load_dataset("glue", task)


###  Tokenize  ###
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names


###  Dataloader  ###
train_dataloader = DataLoader(tokenized_datasets["train"],
                              shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["validation"],
                             batch_size=8, collate_fn=data_collator)

for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}


###  Model Set-up  ###
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)
optimizer = AdamW(model.parameters(), lr=5e-5)

num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler( "linear", optimizer=optimizer,
                              num_warmup_steps=0, num_training_steps=num_training_steps)
print(num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

#Tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

###  Evaluations  ###
metric = load_metric("glue", task)
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Reusing dataset glue (/home/ubuntu/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7631f19924bf8e90.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-f818b1239ef0a01e.arrow
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifer.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or w

tensor(0.6947, grad_fn=<NllLossBackward>) torch.Size([8, 2])
400


  0%|          | 0/400 [00:00<?, ?it/s]

{'accuracy': 0.36619718309859156}

In [19]:
'sentence1' in raw_datasets['train'].features

True