# Train Adapters

In [1]:
# Imports

import pandas as pd
import numpy as np
import wandb
import torch
import gc
import os

from transformers import (
    AutoAdapterModel, 
    AutoTokenizer, 
    PfeifferConfig,
    TrainingArguments, 
    AdapterTrainer,
    AutoConfig, 
    TrainerCallback, 
    EarlyStoppingCallback
)
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset, DatasetDict


# Constants
# Both values are used as plain string prefixes (they are concatenated inside
# f-strings further down), which is why each one ends with a "/".

# Prefix of the task CSV that is loaded as "<DATA_PATH><task_name>.csv".
DATA_PATH = "../data/processed/"
# Prefix of the directory that is passed to the trainer as its output directory.
MODELS_PATH = "../models/finetuning/"

## Train task adapter

In [2]:
# Run hyperparameters. The whole mapping is also handed to wandb as the run config.
CONFIG = dict(
    # Names the task CSV, the adapter, the classification head and the wandb project.
    task_name="clickbait",
    # Checkpoint used for both the tokenizer and the model.
    model_name="roberta-large",
    # Passed to the tokenizer as max_length.
    max_length=128,
    # Used as per-device batch size for training and for evaluation.
    batch_size=32,
    # Passed to the trainer as the number of training epochs.
    epochs=30,
    # One training run is launched per seed.
    seeds=[0, 21, 48],
    learning_rate=1e-4,
    gradient_accumulation_steps=1,
)

In [3]:
# CSV file of the configured task: "<DATA_PATH><task_name>.csv".
TASK_PATH = DATA_PATH + CONFIG["task_name"] + ".csv"

### Load dataset

In [4]:
# Read the task CSV. Later cells use its "text" column (tokenization) and its
# "labels" column (label mapping and class encoding).
task_df = pd.read_csv(TASK_PATH)
# Display (rows, columns) as a quick check of what was loaded.
task_df.shape

(7959, 2)

In [5]:
# Build the class-id -> label mapping that is given to the model config and to
# the classification head. A label whose lowercase string form appears in
# `pos_labels` becomes class 1; every other label becomes class 0.
id2label = {}
pos_labels = ["contains-bias", "clickbait", "false", "fake"]

labels = set(task_df["labels"].to_list())
# Walk the labels in a fixed order so the outcome (and the error below) does
# not depend on set iteration order.
for label in sorted(labels, key=str):
    label_id = 1 if str(label).lower() in pos_labels else 0
    if label_id in id2label:
        # Previously a second label with the same id silently replaced the
        # first one, leaving a mapping that no longer described the data.
        raise ValueError(
            f"Labels {id2label[label_id]!r} and {label!r} both map to class id "
            f"{label_id}; this mapping supports at most one label per class id."
        )
    id2label[label_id] = label

id2label

{0: 'no-bias', 1: 'contains-bias'}

### Tokenize dataset

In [6]:
# Options shared by the tokenization cells below.
# Forwarded to the tokenizer call in encode_batch.
truncation = True
# Forwarded to the tokenizer call in encode_batch.
padding = "max_length"
# Forwarded to Dataset.map when encode_batch is applied to the dataset.
batched = True

In [7]:
# Tokenizer of the checkpoint named in CONFIG["model_name"]; used by encode_batch.
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

In [8]:
def encode_batch(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook-level tokenizer.

    ``CONFIG["max_length"]`` and the notebook-level ``truncation`` / ``padding``
    settings are forwarded to the tokenizer; its result is returned unchanged.
    """
    tokenizer_options = {
        "max_length": CONFIG["max_length"],
        "truncation": truncation,
        "padding": padding,
    }
    return tokenizer(batch["text"], **tokenizer_options)

In [9]:
task_dataset = Dataset.from_pandas(task_df)
# Encode the input data
task_dataset = task_dataset.map(encode_batch, batched=batched)
# Turn the label column into a ClassLabel feature. class_encode_column numbers
# the classes by *sorted label name*, which is not necessarily the numbering
# chosen in id2label (e.g. "contains-bias" sorts before "no-bias" and would get
# id 0 although id2label assigns it id 1). Re-align the ids with id2label so the
# integers the model is trained on mean what the model config / head say they mean.
task_dataset = task_dataset.class_encode_column("labels")
task_dataset = task_dataset.align_labels_with_mapping(
    {str(name): label_id for label_id, name in id2label.items()},
    "labels",
)
# Transform to pytorch tensors and only output the required columns.
# Done last so the label processing above works on plain Python values.
task_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/7959 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/7959 [00:00<?, ? examples/s]

### Train model

In [10]:
class AdapterDropTrainerCallback(TrainerCallback):
  """Trainer callback that re-draws the adapter's ``skip_layers`` for every training step.

  Before each training step a random prefix of layer indices is passed as
  ``skip_layers`` to ``model.set_active_adapters`` (presumably the adapter is
  then left out of those layers, AdapterDrop-style -- confirm against the
  adapter library). Skipping is switched off again once the step (or epoch)
  is over, so that an evaluation triggered afterwards runs with all layers.
  """

  def on_step_begin(self, args, state, control, **kwargs):
    """Pick a random number k in [0, 10] and skip layers 0..k-1 for this step."""
    # The upper bound 11 is hard-coded; it is not derived from the model depth.
    skip_layers = list(range(np.random.randint(0, 11)))
    kwargs['model'].set_active_adapters(kwargs['model'].active_adapters[0], skip_layers=skip_layers)

  def on_step_end(self, args, state, control, **kwargs):
    """Turn layer skipping off after the step, before a step-triggered evaluation."""
    self._disable_layer_skipping(kwargs['model'])

  def on_epoch_end(self, args, state, control, **kwargs):
    """Turn layer skipping off at the end of the epoch, before an epoch-triggered evaluation."""
    # Also covers an epoch that ends in the middle of a gradient-accumulation
    # cycle, where no on_step_end is fired for the last batches.
    self._disable_layer_skipping(kwargs['model'])

  def on_evaluate(self, args, state, control, **kwargs):
    """Turn layer skipping off once an evaluation has finished."""
    # on_evaluate is fired *after* the evaluation phase, so on its own it cannot
    # protect that evaluation from the previous randomly chosen skip_layers
    # (which would make results not comparable across epochs). It is kept so
    # the model is still left without skipped layers after any evaluation.
    self._disable_layer_skipping(kwargs['model'])

  @staticmethod
  def _disable_layer_skipping(model):
    """Re-activate the current adapter with ``skip_layers=None``."""
    model.set_active_adapters(model.active_adapters[0], skip_layers=None)

In [11]:
def acc_and_f1(y_true, y_pred):
    """Score predictions with accuracy and macro-averaged F1.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels.

    Returns:
        A dict with the keys ``"accuracy"`` (value of ``accuracy_score``) and
        ``"f1"`` (value of ``f1_score`` with ``average='macro'``, as a float).
    """
    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    scores["f1"] = float(f1_score(y_true, y_pred, average='macro'))
    return scores

def compute_metrics(eval_preds):
    """Metric function handed to the trainer.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The dict produced by ``acc_and_f1`` for the arg-max (over the last
        axis) of the logits against the labels.
    """
    scores, references = eval_preds
    return acc_and_f1(references, np.argmax(scores, axis=-1))

In [12]:
# Settings consumed by the training loop below (TrainingArguments and the
# early-stopping callback).

# Used for the logging, evaluation and save strategies alike.
strategy = "epoch"
# Output directory: <MODELS_PATH><model_name>/van-adapt/<task_name>.
# NOTE(review): the path does not contain the seed, so every seed in
# CONFIG["seeds"] is trained into the same directory.
output_dir = f'{MODELS_PATH}{CONFIG["model_name"]}{os.sep}van-adapt{os.sep}{CONFIG["task_name"]}'
overwrite_output_dir = True
# Kept False so the dataset's label column is passed through to the model.
remove_unused_columns = False
save_total_limit = 1
report_to = "wandb"
load_best_model_at_end = True
# Presumably the "f1" value returned by compute_metrics, with the trainer's
# "eval_" prefix -- confirm against the logged metric names.
metric_for_best_model = "eval_f1"
# Passed to EarlyStoppingCallback as early_stopping_patience.
early_stopping_patience = 10

In [13]:
def get_model():
    """Create a fresh model with a task adapter and a classification head.

    Loads the checkpoint ``CONFIG["model_name"]`` with ``id2label`` in its
    config, adds a ``PfeifferConfig`` adapter named after ``CONFIG["task_name"]``,
    calls ``train_adapter`` for it, adds a classification head of the same name
    with one output per entry of ``id2label``, and sets the adapter active.

    Returns:
        The prepared ``AutoAdapterModel`` instance.
    """
    checkpoint = CONFIG["model_name"]
    adapter_name = CONFIG["task_name"]

    model_config = AutoConfig.from_pretrained(checkpoint, id2label=id2label)
    model = AutoAdapterModel.from_pretrained(checkpoint, config=model_config)

    # Same call order as before: adapter, train_adapter, head, activation.
    model.add_adapter(adapter_name, config=PfeifferConfig())
    model.train_adapter(adapter_name)
    model.add_classification_head(
        adapter_name,
        num_labels=len(id2label),
        id2label=id2label,
    )
    model.set_active_adapters(adapter_name)

    return model

In [14]:
# Train/valid/test split. Both splits are drawn with a fresh RandomState(0), so
# they are the same for every seed; they are therefore built once, outside the
# per-seed loop. Only the model/trainer seed differs between runs.
train_test = task_dataset.train_test_split(test_size=0.1, generator=np.random.RandomState(0))
train_valid = train_test['train'].train_test_split(test_size=0.15, generator=np.random.RandomState(0))

dataset = DatasetDict(
    {
        'train': train_valid['train'],
        'valid': train_valid['test'],
        'test': train_test['test']
    }
)

# One wandb run and one full training run per seed.
for seed in CONFIG["seeds"]:
    # NOTE(review): the wandb group/tag is "van_head" while output_dir uses
    # "van-adapt" -- confirm which name is intended for adapter runs.
    wandb.init(
        project=CONFIG["task_name"],
        config=CONFIG,
        job_type=CONFIG['model_name'],
        group="van_head",
        tags=[
            "van_head",
            CONFIG['model_name'],
            f"mx: {CONFIG['max_length']}",
            f"bs: {CONFIG['batch_size']}",
            f"ep: {CONFIG['epochs']}",
            f"lr: {CONFIG['learning_rate']}"
        ],
        name=f'seed_{seed}',
        anonymous='must'
    )

    run_failed = True
    try:
        # NOTE(review): output_dir is the same for every seed, so each run
        # writes its checkpoints into the directory used by the previous one.
        training_args = TrainingArguments(
            learning_rate=CONFIG["learning_rate"],
            num_train_epochs=CONFIG["epochs"],
            per_device_train_batch_size=CONFIG["batch_size"],
            per_device_eval_batch_size=CONFIG["batch_size"],
            gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
            logging_strategy=strategy,
            evaluation_strategy=strategy,
            save_strategy=strategy,
            output_dir=output_dir,
            overwrite_output_dir=overwrite_output_dir,
            # The next line is important to ensure the dataset labels are properly passed to the model
            remove_unused_columns=remove_unused_columns,
            save_total_limit=save_total_limit,
            report_to=report_to,
            load_best_model_at_end=load_best_model_at_end,
            metric_for_best_model=metric_for_best_model,
            seed=seed
        )

        # model_init (not model) so the trainer builds a fresh model for this run.
        trainer = AdapterTrainer(
            model_init=get_model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["valid"],
            compute_metrics=compute_metrics,
            callbacks = [
                EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
                AdapterDropTrainerCallback()
            ]
        )

        trainer.train()
        # Final score on the held-out test split, reported under the "test" prefix.
        trainer.evaluate(dataset["test"], metric_key_prefix="test")
        run_failed = False
    finally:
        # Always close the wandb run and release cached GPU memory, even when
        # training raised or was interrupted; a failed run is closed with a
        # non-zero exit code so it is not recorded as a successful one.
        if run_failed:
            wandb.finish(exit_code=1)
        else:
            wandb.finish()

        gc.collect()
        torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mflaviomerenda[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


loading configuration file config.json from cache at /home/flavio.merenda@EXPERT.AI/.cache/huggingface/hub/models--roberta-large/snapshots/716877d372b884cad6d419d828bac6c85b3b18d9/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "no-bias",
    "1": "contains-bias"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cache at /home/flavio.me