# Train Adapters

In [1]:
# Imports

import pandas as pd
import numpy as np
import wandb
import torch
import gc
import os

from transformers import (
    AutoAdapterModel, 
    AutoTokenizer, 
    PfeifferConfig,
    TrainingArguments, 
    AdapterTrainer,
    AutoConfig, 
    TrainerCallback, 
    EarlyStoppingCallback
)
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset, DatasetDict


# Constants

DATA_PATH = "../data/processed/"
MODELS_PATH = "../models/fewshot/"

## Train task adapter

In [2]:
CONFIG = {
    "task_name": "twittercovidq2",
    "model_name": "roberta-large",
    "max_length": 128,
    "batch_size": 1,
    "epochs": 30,
    "seeds" : [0],
    "learning_rate": 1e-4,
    "gradient_accumulation_steps": 1,
    "fewshot_train": [10, 25, 50]
}

In [3]:
TASK_PATH = f'{DATA_PATH}{CONFIG["task_name"]}.csv'

### Load dataset

In [4]:
task_df = pd.read_csv(TASK_PATH).dropna()
task_df.shape

(260, 2)

In [5]:
id2label = {}
pos_labels = ["contains-bias", "clickbait", "false", "fake", "has_propaganda", "yes", "contains_false"]

labels = set(task_df["labels"].to_list())
for label in labels:
    if str(label).lower() in pos_labels:
        id2label.update({1: label})
    else:
        id2label.update({0: label})

id2label

{0: 'no_false', 1: 'contains_false'}

In [6]:
task_df.sample(frac=1, random_state=0)

Unnamed: 0,text,labels
233,The country is panic stricken over the #corona...,no_false
106,"Epidemiologist Marc Lipsitch, director of Harv...",no_false
237,â ï¸Doctors in #Italy warn Europe to âget ...,no_false
76,Government of India issues advisory to all soc...,no_false
173,Our #FlattenTheCurve graphic is now up on @Wik...,no_false
...,...,...
67,#FakeNews Alert #PIBFactCheck: The claim that...,no_false
192,"EXCLUSIVE Nadine Dorries, a health minister, h...",no_false
117,WTAF?!?! A Man at Dartmouth with symptoms of #...,no_false
47,Things the GOP has done during the Covid-19 ou...,no_false


### Tokenize dataset

In [7]:
truncation = True
padding = "max_length"
batched = True

In [8]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

In [9]:
def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=CONFIG["max_length"], truncation=truncation, padding=padding)

In [10]:
task_dataset = Dataset.from_pandas(task_df)
# Encode the input data
task_dataset = task_dataset.map(encode_batch, batched=batched)
# Transform to pytorch tensors and only output the required columns
task_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
task_dataset = task_dataset.class_encode_column("labels")

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/260 [00:00<?, ? examples/s]

### Train model

In [11]:
class AdapterDropTrainerCallback(TrainerCallback):
  def on_step_begin(self, args, state, control, **kwargs):
    skip_layers = list(range(np.random.randint(0, 11)))
    kwargs['model'].set_active_adapters(kwargs['model'].active_adapters[0], skip_layers=skip_layers)

  def on_evaluate(self, args, state, control, **kwargs):
    # Deactivate skipping layers during evaluation (otherwise it would use the
    # previous randomly chosen skip_layers and thus yield results not comparable
    # across different epochs)
    kwargs['model'].set_active_adapters(kwargs['model'].active_adapters[0], skip_layers=None)

In [12]:
def acc_and_f1(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = float(f1_score(y_true, y_pred, average='macro'))
    return {
        "accuracy": acc,
        "f1": f1,
    }

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return acc_and_f1(labels, predictions)

In [13]:
strategy = "epoch"
output_dir = f'{MODELS_PATH}{CONFIG["model_name"]}{os.sep}van-adapt{os.sep}{CONFIG["task_name"]}'
overwrite_output_dir = True
remove_unused_columns = False
save_total_limit = 1
report_to = "wandb"
load_best_model_at_end = True
metric_for_best_model = "eval_f1"
early_stopping_patience = 10

In [14]:
def get_model():
    config = AutoConfig.from_pretrained(
        CONFIG["model_name"],
        id2label=id2label,
    )
    task_model = AutoAdapterModel.from_pretrained(
        CONFIG["model_name"],
        config=config
    )
    adapter_config = PfeifferConfig()
    task_model.add_adapter(CONFIG["task_name"], config=adapter_config)
    task_model.train_adapter(CONFIG["task_name"])
    task_model.add_classification_head(
        CONFIG["task_name"],
        num_labels=len(id2label),
        id2label=id2label,
    )
    task_model.set_active_adapters(CONFIG["task_name"])
    
    return task_model

In [15]:
for fs in CONFIG["fewshot_train"]:
    fewshot_train_ratio = np.ceil(fs/len(task_df)*100)
    for seed in CONFIG["seeds"]:
        wandb.init(
            project=CONFIG["task_name"], 
            config=CONFIG,
            job_type=f'{CONFIG["model_name"]}_{fs}',
            group="van_head",
            tags=[
                "van_head",
                CONFIG['model_name'],
                f"mx: {CONFIG['max_length']}",
                f"bs: {CONFIG['batch_size']}",
                f"ep: {CONFIG['epochs']}",
                f"lr: {CONFIG['learning_rate']}"
            ],
            name=f'seed_{seed}',
            anonymous='must'
        )

        train_test = task_dataset.train_test_split(test_size=(100-fewshot_train_ratio)/100, generator=np.random.RandomState(0))
        test_valid = train_test['test'].train_test_split(test_size=0.2, generator=np.random.RandomState(0))
        
        dataset = DatasetDict(
            {
                'train': train_test['train'],
                'valid': test_valid['test'],
                'test': test_valid['train']
            }
        )

        training_args = TrainingArguments(
            learning_rate=CONFIG["learning_rate"],
            num_train_epochs=CONFIG["epochs"],
            per_device_train_batch_size=CONFIG["batch_size"],
            per_device_eval_batch_size=CONFIG["batch_size"],
            gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
            logging_strategy=strategy,
            evaluation_strategy=strategy,
            save_strategy=strategy,
            output_dir=output_dir,
            overwrite_output_dir=overwrite_output_dir,
            # The next line is important to ensure the dataset labels are properly passed to the model
            remove_unused_columns=remove_unused_columns,
            save_total_limit=save_total_limit,
            report_to=report_to,
            load_best_model_at_end=load_best_model_at_end,
            metric_for_best_model=metric_for_best_model,
            seed=seed
        )

        trainer = AdapterTrainer(
            model_init=get_model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["valid"],
            compute_metrics=compute_metrics,
            callbacks = [
                EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
                AdapterDropTrainerCallback()
            ]
        )

        trainer.train()
        trainer.evaluate(dataset["test"], metric_key_prefix="test")

        wandb.finish()

        gc.collect()
        torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mflaviomerenda[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


loading configuration file config.json from cache at /home/flavio.merenda@EXPERT.AI/.cache/huggingface/hub/models--roberta-large/snapshots/716877d372b884cad6d419d828bac6c85b3b18d9/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "no_false",
    "1": "contains_false"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cache at /home/flavio.

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4296,0.500416,0.92,0.479167
2,0.3418,0.643016,0.92,0.479167
3,0.631,0.71544,0.92,0.479167
4,0.4245,0.538372,0.92,0.479167
5,0.5184,0.610805,0.92,0.479167
6,0.4475,0.503131,0.92,0.479167
7,0.3951,0.51857,0.92,0.479167
8,0.3942,0.578355,0.92,0.479167
9,0.3866,0.688301,0.92,0.479167
10,0.9793,0.640391,0.92,0.479167


***** Running Evaluation *****
  Num examples = 50
  Batch size = 1
Saving model checkpoint to ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10/twittercovidq2/adapter_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10/twittercovidq2/pytorch_adapter.bin
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10/twittercovidq2/head_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10/twittercovidq2/pytorch_model_head.bin
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10/twittercovidq2/head_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-10/twittercovidq2/pytorch_model_head.bin
Deleting older checkpoint [../models/fewshot/roberta-larg

early stopping required metric_for_best_model, but did not find eval_f1 so early stopping is disabled


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▁▁▁▁▁▁▁▁▁
eval/f1,▁▁▁▁▁▁▁▁▁▁▁
eval/loss,▁▆█▂▅▁▂▄▇▆▄
eval/runtime,▁▁▁▃▁▂████▇
eval/samples_per_second,███▅▇▆▁▁▁▁▁
eval/steps_per_second,███▅▇▆▁▁▁▁▁
test/accuracy,▁
test/f1,▁
test/loss,▁
test/runtime,▁

0,1
eval/accuracy,0.92
eval/f1,0.47917
eval/loss,0.60102
eval/runtime,2.018
eval/samples_per_second,24.777
eval/steps_per_second,24.777
test/accuracy,0.84
test/f1,0.45652
test/loss,0.80115
test/runtime,7.9224


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666863966577997, max=1.0)…

PyTorch: setting up devices
loading configuration file config.json from cache at /home/flavio.merenda@EXPERT.AI/.cache/huggingface/hub/models--roberta-large/snapshots/716877d372b884cad6d419d828bac6c85b3b18d9/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "no_false",
    "1": "contains_false"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1192,0.642922,0.893617,0.47191
2,1.2177,0.6045,0.893617,0.47191
3,1.1216,0.680189,0.893617,0.47191
4,0.9423,0.605946,0.893617,0.47191
5,0.9344,0.548486,0.893617,0.47191
6,0.7764,0.545502,0.893617,0.47191
7,0.944,0.581208,0.893617,0.47191
8,0.8931,0.547561,0.893617,0.47191
9,0.6626,0.538219,0.893617,0.47191
10,0.6419,0.559078,0.893617,0.47191


***** Running Evaluation *****
  Num examples = 47
  Batch size = 1
Saving model checkpoint to ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26/twittercovidq2/adapter_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26/twittercovidq2/pytorch_adapter.bin
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26/twittercovidq2/head_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26/twittercovidq2/pytorch_model_head.bin
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26/twittercovidq2/head_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-26/twittercovidq2/pytorch_model_head.bin
Deleting older checkpoint [../models/fewshot/roberta-larg

early stopping required metric_for_best_model, but did not find eval_f1 so early stopping is disabled


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▁▁▁▁▁▁▁▁▁
eval/f1,▁▁▁▁▁▁▁▁▁▁▁
eval/loss,▆▄█▄▂▂▃▂▁▂▁
eval/runtime,▁▁▁▂▇▇▇▇▇█▇
eval/samples_per_second,▇██▆▁▂▂▂▂▁▁
eval/steps_per_second,▇██▆▁▂▂▂▂▁▁
test/accuracy,▁
test/f1,▁
test/loss,▁
test/runtime,▁

0,1
eval/accuracy,0.89362
eval/f1,0.47191
eval/loss,0.53309
eval/runtime,1.7785
eval/samples_per_second,26.427
eval/steps_per_second,26.427
test/accuracy,0.85027
test/f1,0.45954
test/loss,1.09017
test/runtime,7.5546


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669048085653535, max=1.0…

PyTorch: setting up devices
loading configuration file config.json from cache at /home/flavio.merenda@EXPERT.AI/.cache/huggingface/hub/models--roberta-large/snapshots/716877d372b884cad6d419d828bac6c85b3b18d9/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "no_false",
    "1": "contains_false"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8951,0.628107,0.904762,0.475
2,0.8875,0.550608,0.904762,0.475
3,0.8431,0.591061,0.904762,0.475
4,0.8667,0.593347,0.904762,0.475
5,0.7922,0.602059,0.904762,0.475
6,0.7126,0.487319,0.904762,0.475
7,0.5754,0.428833,0.904762,0.475
8,0.5933,0.622719,0.904762,0.475
9,0.6036,0.939314,0.904762,0.475
10,0.3601,0.809589,0.904762,0.475


***** Running Evaluation *****
  Num examples = 42
  Batch size = 1
Saving model checkpoint to ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52/twittercovidq2/adapter_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52/twittercovidq2/pytorch_adapter.bin
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52/twittercovidq2/head_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52/twittercovidq2/pytorch_model_head.bin
Configuration saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52/twittercovidq2/head_config.json
Module weights saved in ../models/fewshot/roberta-large/van-adapt/twittercovidq2/checkpoint-52/twittercovidq2/pytorch_model_head.bin
Deleting older checkpoint [../models/fewshot/roberta-larg

early stopping required metric_for_best_model, but did not find eval_f1 so early stopping is disabled


0,1
eval/accuracy,▁▁▁▁▁▁▁▁▁▁▁
eval/f1,▁▁▁▁▁▁▁▁▁▁▁
eval/loss,▄▃▃▃▃▂▁▄█▆▆
eval/runtime,▁▁▂▇█▇▇▇▇▇▇
eval/samples_per_second,██▆▁▁▁▁▁▁▁▂
eval/steps_per_second,██▆▁▁▁▁▁▁▁▂
test/accuracy,▁
test/f1,▁
test/loss,▁
test/runtime,▁

0,1
eval/accuracy,0.90476
eval/f1,0.475
eval/loss,0.82047
eval/runtime,1.697
eval/samples_per_second,24.75
eval/steps_per_second,24.75
test/accuracy,0.8494
test/f1,0.45928
test/loss,0.95559
test/runtime,6.5421
