## 1. Load Dataset

In [1]:
# Send HTTP and HTTPS traffic through the AIT proxy.
# Comment this cell out if you are not using the AIT proxy.
import os

os.environ.update(
    {
        'http_proxy': 'http://192.41.170.23:3128',
        'https_proxy': 'http://192.41.170.23:3128',
    }
)

In [2]:
from datasets import load_dataset, load_metric

# Text column name(s) for each GLUE task, as (first_column, second_column).
# A second entry of None marks a task that has only one text column.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
# Task used throughout the notebook; it is looked up in task_to_keys later on.
task_name = "stsb"
# Load the chosen GLUE task.
datasets = load_dataset("glue", task_name)
# Show one training example.
datasets["train"][3]

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset glue (/home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:00<00:00, 1614.85it/s]


{'sentence1': 'Three men are playing chess.',
 'sentence2': 'Two men are playing chess.',
 'label': 2.5999999046325684,
 'idx': 3}

In [3]:
# List the column names of the training split.
datasets["train"].column_names

['sentence1', 'sentence2', 'label', 'idx']

In [4]:
# Look up which text column(s) the selected task uses.
task_to_keys[task_name]

('sentence1', 'sentence2')

In [5]:
# Inspect the feature schema of the training split.
datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': Value(dtype='float32', id=None),
 'idx': Value(dtype='int32', id=None)}

## 2. Preprocessing

In [6]:
# Labels: decide between regression and classification, and size the model output.
if task_name is None:
    # No named task: infer the problem type from the label column's dtype.
    is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
    if not is_regression:
        # Dataset.unique is a fast way to collect the distinct label values:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
        # Sorting keeps the label order deterministic.
        label_list = sorted(datasets["train"].unique("label"))
        num_labels = len(label_list)
    else:
        num_labels = 1
else:
    # For a named task, only "stsb" is treated as regression.
    is_regression = task_name == "stsb"
    if is_regression:
        num_labels = 1
    else:
        label_list = datasets["train"].features["label"].names
        num_labels = len(label_list)

num_labels, is_regression

(1, True)

In [7]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, PretrainedConfig

# Load the config, tokenizer and sequence-classification model from one checkpoint name.
model_name_or_path = "bert-base-cased"
config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels, finetuning_task=task_name)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=".ckpt" in model_name_or_path,
    config=config,
)

# Optional remapping from dataset label ids to the ids stored in the model config.
# It stays None unless one of the branches below fills it in.
label_to_id = None

config_label2id = model.config.label2id
default_label2id = PretrainedConfig(num_labels=num_labels).label2id
if config_label2id != default_label2id and task_name is not None and not is_regression:
    # Label names are lower-cased because some configs use all caps and some don't.
    label_name_to_id = {name.lower(): idx for name, idx in config_label2id.items()}
    if sorted(label_name_to_id) == sorted(label_list):
        label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
elif task_name is None and not is_regression:
    # No named task: number the labels by their position in label_list.
    label_to_id = dict(zip(label_list, range(len(label_list))))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [8]:
def tokenize_function(examples, max_length=180):
    """Tokenize a batch of examples for the currently selected task.

    Reads the notebook-level ``task_to_keys``, ``task_name``, ``tokenizer`` and
    ``label_to_id``.

    Args:
        examples: A batch mapping column names to lists of values, as passed
            by ``datasets.map(..., batched=True)``.
        max_length: Every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer output for the batch. When the batch has a ``"label"``
        column, the result also carries a ``"label"`` entry, remapped through
        ``label_to_id`` if such a mapping is set. The column keeps the name
        ``"label"``; it is not renamed here.
    """
    sentence1_key, sentence2_key = task_to_keys[task_name]
    # Tasks with a single text column have None as their second key.
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])
    result = tokenizer(*texts, max_length=max_length, padding="max_length", truncation=True)

    if "label" in examples:
        if label_to_id is not None:
            # -1 is the placeholder for an unlabeled example (e.g. a hidden test
            # split). It is not a key of the mapping, so pass it through
            # unchanged instead of raising KeyError.
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        else:
            result["label"] = examples["label"]

    return result

# Run tokenize_function over the loaded dataset, one batch of examples at a time.
tokenized_datasets = datasets.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-02f69511fecf16e0.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-5b57481bcffd58ce.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-0c28ff668309d899.arrow


If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes:

In [9]:
# Inspect the splits and columns after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1379
    })
})

In [10]:
# Drop the raw text columns and the example index, leaving only model inputs and labels.
# Single-sentence tasks have None as their second key in task_to_keys; it is not a
# real column name, so it is filtered out before calling remove_columns.
text_columns = [key for key in task_to_keys[task_name] if key is not None]
tokenized_datasets = tokenized_datasets.remove_columns(text_columns + ["idx"])
# Rename the label column to "labels".
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Return torch tensors when rows are accessed.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1379
    })
})

In [11]:
# MNLI is special-cased: its evaluation and test splits are looked up with a
# "_matched" suffix. Every other task uses the plain "validation" and "test" names.
split_suffix = "_matched" if task_name == "mnli" else ""

# Shuffle each split with a fixed seed. Append .select(range(n)) to any of these
# to fine-tune or evaluate on a smaller subset.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["validation" + split_suffix].shuffle(seed=42)
small_test_dataset = tokenized_datasets["test" + split_suffix].shuffle(seed=42)

Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a162f21ce968cedb.arrow
Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-ece2296a3dfbde6e.arrow
Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-f9ac64ab64dddc94.arrow


## 3. Dataloaders

In [12]:
from torch.utils.data import DataLoader

# Batches of 16 for both loaders; only the training loader is created with shuffle=True.
train_dataloader = DataLoader(dataset=small_train_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=16)

In [13]:
# Inspect the distinct label values in the training and evaluation sets.
small_train_dataset['labels'].unique(), small_eval_dataset['labels'].unique()

(tensor([0.0000, 0.0670, 0.1180, 0.1430, 0.1700, 0.2000, 0.2310, 0.2500, 0.3330,
         0.4000, 0.4170, 0.5000, 0.6000, 0.6430, 0.6670, 0.7270, 0.7500, 0.8000,
         0.8330, 0.8500, 0.8890, 0.9000, 0.9440, 1.0000, 1.1000, 1.2000, 1.2500,
         1.2730, 1.2860, 1.3330, 1.4000, 1.5000, 1.5333, 1.5560, 1.5830, 1.6000,
         1.6430, 1.6670, 1.7000, 1.7330, 1.7500, 1.7780, 1.8000, 1.8460, 2.0000,
         2.1111, 2.2000, 2.2500, 2.3300, 2.3330, 2.3750, 2.4000, 2.4667, 2.5000,
         2.5330, 2.5830, 2.5880, 2.6000, 2.6250, 2.6470, 2.6670, 2.7000, 2.7500,
         2.7690, 2.8000, 2.8180, 2.8300, 2.8750, 2.9090, 2.9170, 3.0000, 3.0560,
         3.0670, 3.1000, 3.1110, 3.1670, 3.2000, 3.2310, 3.2500, 3.2730, 3.3330,
         3.3333, 3.4000, 3.4380, 3.4440, 3.4550, 3.5000, 3.5330, 3.6000, 3.6150,
         3.6250, 3.6430, 3.6670, 3.6700, 3.6920, 3.7500, 3.7650, 3.7690, 3.7778,
         3.7860, 3.8000, 3.8330, 3.8460, 3.8570, 3.8670, 3.8750, 3.9090, 3.9230,
         3.9290, 3.9330, 3.9

## 4. Model

In [14]:
# from transformers import AutoModelForSequenceClassification, AutoConfig
# model_name_or_path = "bert-base-cased"
# config = AutoConfig.from_pretrained(
#     model_name_or_path, 
#     num_labels=num_labels, 
#     finetuning_task=task_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name_or_path,
#     from_tf=bool(".ckpt" in model_name_or_path),
#     config=config,
# )


In [15]:
# from transformers import AutoModelForSequenceClassification, AutoConfig

# config = AutoConfig.from_pretrained("bert-base-cased", 
#                                     num_labels=num_labels, 
#                                     finetuning_task=task_name)
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", 
#                                                            config=config)

def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count of the current `model`.
print('Original parameters :', count_parameters(model))

Original parameters : 108311041


In [16]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    TaskType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    LoraConfig,
)

# No explicit id -> label-name mapping is passed to the adapter classification head.
id2label = None
# Tuning strategy to apply: 'FT', 'Adapter', 'Prefix', 'Prompt', 'LoRA' or 'BitFit'.
PEFTtechnique = 'Adapter'
print(f'Technique : {PEFTtechnique}')
if PEFTtechnique == 'FT':
    # Full fine-tuning: keep the model loaded earlier unchanged.
    pass
elif PEFTtechnique == 'Adapter':
    #!pip install -U adapter-transformers
    from transformers.adapters import BertAdapterModel, AutoAdapterModel
    # Reload from the same checkpoint name used for the config and tokenizer,
    # so changing model_name_or_path keeps all three in sync.
    model = BertAdapterModel.from_pretrained(model_name_or_path, num_labels=num_labels)
    # Add a new adapter named after the task
    model.add_adapter(task_name)
    # Add a matching classification head
    model.add_classification_head(
        task_name,
        num_labels=num_labels,
        id2label=id2label
      )
    # Activate the adapter for training
    model.train_adapter(task_name)

    def print_trainable_parameters(model):
        """
        Prints the number of trainable parameters, the total number of
        parameters, and the trainable share as a percentage.
        """
        trainable_params = 0
        all_param = 0
        for _, param in model.named_parameters():
            num_params = param.numel()
            # if using DS Zero 3 and the weights are initialized empty
            if num_params == 0 and hasattr(param, "ds_numel"):
                num_params = param.ds_numel

            all_param += num_params
            if param.requires_grad:
                trainable_params += num_params
        # Guard against a model with no parameters, which would divide by zero.
        trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
        )

    print_trainable_parameters(model)
elif PEFTtechnique == 'Prefix':
    peft_config = PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
elif PEFTtechnique == 'Prompt':
    peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
elif PEFTtechnique == 'LoRA':
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
elif PEFTtechnique == 'BitFit':
    # Freeze every parameter whose name does not contain 'bias'
    for name, param in model.named_parameters():
        if 'bias' not in name:
            param.requires_grad = False
else:
    # A misspelled technique used to fall through silently and train the full
    # model; fail loudly instead.
    raise ValueError(
        f"Unknown PEFTtechnique {PEFTtechnique!r}; expected one of "
        "'FT', 'Adapter', 'Prefix', 'Prompt', 'LoRA', 'BitFit'"
    )

Technique : Adapter


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertAdapterModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 1485889 || all params: 109796161 || trainable%: 1.3533159870680724


## 5. Training

### Optimizer

Then we will need an optimizer. We'll use the classic `AdamW`, which is like `Adam`, but with a fix in the way weight decay is applied:

In [17]:
from torch.optim import AdamW

# AdamW over all of the model's parameters, with an initial learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [18]:
from transformers import get_scheduler

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps; the "linear" schedule is used with no warmup.
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

### Accelerator

Once we have all those objects, we can send them to the `accelerator.prepare()` method:

In [19]:
from accelerate import Accelerator

# Pass the model, optimizer and both dataloaders through Accelerate and
# rebind each name to the object it returns.
accelerator = Accelerator()
(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader,
) = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

In [20]:
import torch

# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

### Metrics

In [21]:
import numpy as np
import evaluate

# Load the GLUE metric for the selected task through the `evaluate` library,
# which replaces the deprecated `datasets.load_metric`. The training loop uses
# its add_batch() and compute() methods.
metric = evaluate.load("glue", task_name)

  metric = load_metric("glue", task_name)


In [None]:
import torch
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))
# Running sum of the per-epoch primary metric, averaged after the last epoch.
eval_metrics = 0
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        # No manual .to(device) here: the dataloaders went through accelerator.prepare().
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation pass ----
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        if is_regression:
            # Regression head: drop the trailing size-1 dimension so predictions
            # line up with the 1-D labels (assumes logits are (batch, 1), since
            # num_labels == 1 for regression).
            predictions = outputs.logits.squeeze(-1)
        else:
            # Classification head: the metric needs class ids, not raw logits.
            predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["labels"]),
        )

    eval_metric = metric.compute()
    # Track 'pearson' when the task reports it (as before); otherwise fall back
    # to the first metric returned, so other tasks do not raise KeyError.
    primary_metric = 'pearson' if 'pearson' in eval_metric else next(iter(eval_metric))
    eval_metrics += eval_metric[primary_metric]
    print(f"Epoch at {epoch+1}: {eval_metric}")
print('Avg Metric', eval_metrics/num_epochs)

 20%|â–ˆâ–ˆ        | 363/1800 [00:36<12:25,  1.93it/s]

Epoch at 1: {'pearson': 0.8398243641978261, 'spearmanr': 0.8349196562581404}


 40%|â–ˆâ–ˆâ–ˆâ–ˆ      | 723/1800 [01:12<09:24,  1.91it/s]

Epoch at 2: {'pearson': 0.8632688548550549, 'spearmanr': 0.8598282563134325}


 50%|â–ˆâ–ˆâ–ˆâ–ˆâ–‰     | 897/1800 [01:28<01:20, 11.18it/s]