# Fine-tuning with Low-Rank Adaptation (LoRA)

In [1]:
# Colab-only: mount Google Drive at /content/drive. The dataset cell further down
# reads its CSV files from paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install wandb python-dotenv datasets peft

In [3]:
import wandb
from dotenv import load_dotenv

from peft import LoraConfig, TaskType, get_peft_model
import transformers
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import f1_score

# Lower the transformers library's log verbosity to error level to keep cell output short.
transformers.logging.set_verbosity_error()

In [4]:
# Read variables from a local .env file (if one exists) into the environment, then
# log in to Weights & Biases. wandb.login() stays the last expression so that its
# return value is shown as the cell output.
# NOTE(review): presumably the .env file is meant to provide the W&B API key — confirm;
# the recorded output shows the key being requested interactively instead.
load_dotenv()
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
import os

# Weights & Biases settings, passed through environment variables so they are
# already in place when a run is started later in the notebook.
os.environ.update({
    "WANDB_PROJECT": "semeval-21-task-7",
    "WANDB_LOG_MODEL": "checkpoint",
})

## Dataset loading

In [6]:
# One CSV file per split. The paths are relative to the current working directory.
split_files = {
    'train': 'drive/MyDrive/data/train.csv',
    'dev': 'drive/MyDrive/data/dev.csv',
    'test': 'drive/MyDrive/data/test.csv',
}

# Columns dropped right after loading; only the remaining columns are kept.
dropped_columns = ['id', 'humor_rating', 'humor_controversy', 'offense_rating', 'sentence_length']

dataset = load_dataset('csv', data_files=split_files)
dataset = dataset.remove_columns(dropped_columns)
# Expose the 'is_humor' column under the name 'label'.
dataset = dataset.rename_column('is_humor', 'label')

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## Model setup

In [7]:
# Tokenizer of the same 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 150 tokens
    (padding="max_length" together with max_length=150).

    Args:
        examples: mapping with a 'text' entry holding the raw input text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=150,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Tokenize every split in batches, then drop the raw 'text' column.
# `dataset` is overwritten here, so without the guard a second execution of this cell
# would fail inside `tokenize`, which reads examples['text'] after that column is gone.
if 'text' in dataset['train'].column_names:
    dataset = dataset.map(tokenize, batched=True).remove_columns(['text'])

Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [9]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each example
            is the argmax of ``logits`` over the last axis.

    Returns:
        dict with a single key ``'f1'`` holding the weighted-average F1 score.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return {'f1': f1_score(references, predicted_classes, average='weighted')}


## Training setup

In [14]:
# Sweep the LoRA rank over powers of two (r = 2, 4, ..., 64) with lora_alpha == r.
# Every iteration fine-tunes a freshly loaded BERT classifier in its own W&B run.
# `trainer` is left at module level so that later cells can use the last trained model.
for i in range(1, 7):
    rank = 2 ** i
    run_name = f"bert-lora-humor-v3-{i}"

    # The run is created here, before the Trainer exists, so the name must be given to
    # wandb.init: the Trainer's W&B integration only starts (and names) a run itself
    # when none is active, which means TrainingArguments.run_name alone is not applied.
    run = wandb.init(
        name=run_name,
        config={
            'r': rank,
            'lora_alpha': rank,
            'lora_dropout': 0.05,
            'batch_size': 32,
            'adam_epsilon': 1e-08,
            'learning_rate': 2e-5,
            'epochs': 2
        }
    )
    config = run.config

    try:
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=config['r'],
            lora_alpha=config['lora_alpha'],
            lora_dropout=config['lora_dropout'],
            bias="none"
        )

        # Reload the base model each time so that no run starts from weights
        # already adapted by a previous rank.
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
        peft_model = get_peft_model(model, lora_config)
        trainable_params, all_param = peft_model.get_nb_trainable_parameters()

        print("------------------------------")
        print(f"r: {config['r']} || trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param} ")

        trainer = Trainer(
            model=peft_model,
            args=TrainingArguments(
                # One checkpoint directory per rank: every run saves at the same step
                # numbers, so a shared directory would let later runs overwrite the
                # checkpoints of earlier ones.
                output_dir=f'bert-lora-humor-detection/r-{rank}',
                evaluation_strategy="steps",
                eval_steps=100,
                logging_steps=100,
                save_steps=100,
                num_train_epochs=config['epochs'],
                per_device_train_batch_size=config['batch_size'],
                per_device_eval_batch_size=32,
                report_to=["wandb"],
                run_name=run_name,
                adam_epsilon=config['adam_epsilon'],
                learning_rate=config['learning_rate']
            ),
            train_dataset=dataset['train'],
            eval_dataset=dataset['dev'],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()
    finally:
        # Close the run even if training fails or is interrupted, so the next
        # iteration (or a re-run of this cell) does not log into a stale run.
        wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7ae4800effa0>> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_init.py", line 1172, in init
    wi.setup(kwargs)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_init.py", line 225, in setup
    with telemetry.context(obj=self._init_telemetry_obj) as tel:
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/lib/telemetry.py", line 42, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 758, in _telemetry_callback
    self._telemetry_flush()
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/wandb_run.py", line 769, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/interface_shared.py", line 101, in _publish_telemetry
    self._publish(rec)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/interface_sock.py", line 51, in 

Error: An unexpected error occurred

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7ae4800effa0>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
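# Test-split metrics for the model held by `trainer`, i.e. the one trained in the last
# iteration of the sweep loop above. Uncomment to recompute them.
# trainer.predict(dataset['test']).metrics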

{'test_loss': 0.3577665686607361,
 'test_f1': 0.8537736197348548,
 'test_runtime': 37.9098,
 'test_samples_per_second': 21.103,
 'test_steps_per_second': 2.638}