<a href="https://colab.research.google.com/github/goddoe/hacking-llms-for-low-res-settings/blob/main/prefix_tuning_kor_nli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install third-party dependencies for this notebook via a shell command.
# NOTE(review): versions are unpinned; the recorded output below resolved
# evaluate 0.4.0, datasets 2.12.0 and peft 0.3.0.
!pip install evaluate tqdm datasets scikit-learn scipy peft

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.3.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [2]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

In [3]:
# Run-wide settings shared by the cells below.
batch_size = 32  # examples per batch for both the training and evaluation dataloaders
model_name_or_path = "roberta-large"  # checkpoint used for the tokenizer and the base model
peft_type = PeftType.PREFIX_TUNING

# Prefer the GPU, but fall back to the CPU so the notebook still runs where CUDA is unavailable.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the random number generators so data shuffling and parameter initialisation are repeatable.
seed = 42
set_seed(seed)

In [4]:
# Prefix-tuning adapter settings: sequence-classification task with 20 virtual tokens.
peft_config = PrefixTuningConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=20,
)
# Learning rate passed to the optimizer.
lr = 1e-2

In [5]:
# Checkpoints whose name contains "gpt", "opt" or "bloom" are padded on the left;
# every other checkpoint is padded on the right.
padding_side = (
    "left"
    if any(family in model_name_or_path for family in ("gpt", "opt", "bloom"))
    else "right"
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
# A tokenizer without a pad token reuses its end-of-sequence token id for padding.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
# Training data comes from the "multi_nli" configuration of kor_nli;
# the validation and test splits come from its "xnli" configuration.
train_datasets = load_dataset(path="kor_nli", name="multi_nli")
valid_test_datasets = load_dataset(path="kor_nli", name="xnli")

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

Downloading and preparing dataset kor_nli/multi_nli to /root/.cache/huggingface/datasets/kor_nli/multi_nli/1.0.0/06d9b61bd1372a618df02294965857ff10886d48696f33a32cbea656b71dfcf0...


Downloading data:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Dataset kor_nli downloaded and prepared to /root/.cache/huggingface/datasets/kor_nli/multi_nli/1.0.0/06d9b61bd1372a618df02294965857ff10886d48696f33a32cbea656b71dfcf0. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset kor_nli/xnli to /root/.cache/huggingface/datasets/kor_nli/xnli/1.0.0/06d9b61bd1372a618df02294965857ff10886d48696f33a32cbea656b71dfcf0...


Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Dataset kor_nli downloaded and prepared to /root/.cache/huggingface/datasets/kor_nli/xnli/1.0.0/06d9b61bd1372a618df02294965857ff10886d48696f33a32cbea656b71dfcf0. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Accuracy metric object; the evaluation loops below feed it batches with
# add_batch() and read the score with compute().
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples, max_length=460):
    """Tokenize premise/hypothesis pairs with the notebook-level ``tokenizer``.

    Args:
        examples: A batch of rows providing "premise" and "hypothesis" columns
            (this function is passed to ``map(..., batched=True)``).
        max_length: Upper bound on the encoded length of each pair; longer
            pairs are truncated. Defaults to 460.
            NOTE(review): presumably chosen to leave room for the virtual
            prefix tokens within the model's position limit -- confirm.

    Returns:
        The tokenizer's output for the batch of pairs.
    """
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        max_length=max_length,
    )


# Both dataset dictionaries go through the same batched tokenization, and the
# raw text columns are removed from the result.
text_columns = ["premise", "hypothesis"]
map_kwargs = dict(batched=True, remove_columns=text_columns)

train_tokenized_datasets = train_datasets.map(tokenize_function, **map_kwargs)
valid_test_tokenized_datasets = valid_test_datasets.map(tokenize_function, **map_kwargs)

# Models from the transformers library expect the target column to be named
# "labels" rather than "label".
train_tokenized_datasets = train_tokenized_datasets.rename_column("label", "labels")
valid_test_tokenized_datasets = valid_test_tokenized_datasets.rename_column("label", "labels")

def collate_fn(examples):
    """Pad a list of tokenized examples to the longest one and return PyTorch tensors."""
    padded_batch = tokenizer.pad(examples, padding="longest", return_tensors="pt")
    return padded_batch


# Build the dataloaders: shuffled batches for training, fixed order for validation.
# For a quick run on a subset, swap the training split for
# train_tokenized_datasets['train'].train_test_split(train_size=1000)['train'].
train_dataloader = DataLoader(
    train_tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
eval_dataloader = DataLoader(
    valid_test_tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained checkpoint with a sequence-classification head that has three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    num_labels=3,
    return_dict=True,
)

In [None]:
# Display the base model's module structure before the PEFT wrapper is applied.
model

In [None]:
# Wrap the base model with the adapter described by peft_config.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running this
# cell without first re-running the model-loading cell would wrap an
# already-wrapped model.
model = get_peft_model(model, peft_config)
# Print the trainable-parameter summary for the wrapped model.
model.print_trainable_parameters()
# Display the wrapped model's module structure.
model

In [None]:
# Optimizer over the (PEFT-wrapped) model's parameters.
optimizer = AdamW(params=model.parameters(), lr=lr)

num_epochs = 20
warmup_ratio = 0.06  # fraction of all scheduler steps spent warming up the learning rate

# The scheduler is stepped once per training batch, so its horizon is
# batches-per-epoch times the number of epochs.
num_training_steps = len(train_dataloader) * num_epochs
# The warmup length must be an integer step count, so the fractional product is truncated.
num_warmup_steps = int(warmup_ratio * num_training_steps)

# Instantiate scheduler: linear warmup followed by linear decay.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
model.to(device)

for epoch in range(num_epochs):
    # Training pass: one optimizer step and one scheduler step per batch.
    model.train()
    for batch in tqdm(train_dataloader, desc=f"train epoch {epoch}"):
        # Called for its in-place effect; the return value is not needed.
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    # Validation pass: accumulate predictions batch by batch, then score once per epoch.
    model.eval()
    for batch in tqdm(eval_dataloader, desc=f"eval epoch {epoch}"):
        batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
        metric.add_batch(
            predictions=outputs.logits.argmax(dim=-1),
            references=batch["labels"],
        )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

In [None]:
# Directory that receives the trained adapter and the tokenizer files.
# Both saves use the variable so they stay in sync with the loading cell,
# which reads from output_path.
output_path = "outputs/prefix_tuning"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

## Load the Saved Adapter and Run Inference

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


# Read the saved adapter configuration to find which base checkpoint it was trained on.
config = PeftConfig.from_pretrained(output_path)
inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels=3)
# Reload the tokenizer that was saved alongside the adapter rather than a fresh
# base-checkpoint tokenizer. collate_fn reads this global, so it should keep the
# padding configuration used during training.
tokenizer = AutoTokenizer.from_pretrained(output_path)

# Attach the saved prefix-tuning adapter to the freshly loaded base model.
inference_model = PeftModel.from_pretrained(inference_model, output_path)

inference_model.to(device)
inference_model.eval()
for batch in tqdm(eval_dataloader, desc="eval"):
    # Called for its in-place effect; the return value is not needed.
    batch.to(device)
    with torch.no_grad():
        outputs = inference_model(**batch)
    metric.add_batch(
        predictions=outputs.logits.argmax(dim=-1),
        references=batch["labels"],
    )

eval_metric = metric.compute()
print(eval_metric)