In [2]:
from datasets import load_dataset

ds = load_dataset("ought/raft", "twitter_complaints")

classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]
ds = ds.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)
ds["train"][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/266k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3399 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(target_max_length)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

3


In [4]:
import torch

max_length = 64

def preprocess_function(examples, text_column="Tweet text", label_column="text_label"):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:
processed_ds = ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [15]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_ds = processed_ds["train"]
eval_ds = processed_ds["test"]

batch_size = 16

train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [16]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")

In [11]:
## ---  Prefix Tuning --- ##
from peft import PrefixTuningConfig, get_peft_model

peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

peft_model_id = f"hskang_prefix/bloomz-560-m-peft-method"

trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1755


In [22]:
## ---  P-Tuning --- ##
from peft import PromptEncoderConfig, get_peft_model

peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

peft_model_id = f"hskang_ptuning/bloomz-560-m-peft-method"

trainable params: 300,288 || all params: 561,087,744 || trainable%: 0.0535


In [13]:
## ---  Prompt Tuning --- ##
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n"
peft_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path="bigscience/bloomz-560m",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

peft_model_id = f"hskang_prompts/bloomz-560-m-peft-method"

trainable params: 12,288 || all params: 559,226,880 || trainable%: 0.0022


In [17]:
# 모델의 모든 모듈 이름 출력
for name, module in model.named_modules():
    print(name)


transformer
transformer.word_embeddings
transformer.word_embeddings_layernorm
transformer.h
transformer.h.0
transformer.h.0.input_layernorm
transformer.h.0.self_attention
transformer.h.0.self_attention.query_key_value
transformer.h.0.self_attention.dense
transformer.h.0.self_attention.attention_dropout
transformer.h.0.post_attention_layernorm
transformer.h.0.mlp
transformer.h.0.mlp.dense_h_to_4h
transformer.h.0.mlp.gelu_impl
transformer.h.0.mlp.dense_4h_to_h
transformer.h.1
transformer.h.1.input_layernorm
transformer.h.1.self_attention
transformer.h.1.self_attention.query_key_value
transformer.h.1.self_attention.dense
transformer.h.1.self_attention.attention_dropout
transformer.h.1.post_attention_layernorm
transformer.h.1.mlp
transformer.h.1.mlp.dense_h_to_4h
transformer.h.1.mlp.gelu_impl
transformer.h.1.mlp.dense_4h_to_h
transformer.h.2
transformer.h.2.input_layernorm
transformer.h.2.self_attention
transformer.h.2.self_attention.query_key_value
transformer.h.2.self_attention.dense
tr

In [25]:
## ---  LoRA --- ##
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

peft_model_id = f"hskang_LoRa/bloomz-560-m-peft-method"

trainable params: 1,572,864 || all params: 561,087,744 || trainable%: 0.2803


In [26]:
from transformers import get_linear_schedule_with_warmup

lr = 3e-2
# num_epochs = 50
num_epochs = 10

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [27]:
from tqdm import tqdm

device = "cuda"
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 4/4 [00:00<00:00,  6.10it/s]
100%|██████████| 213/213 [00:18<00:00, 11.27it/s]


epoch=0: train_ppl=tensor(5.1737e+14, device='cuda:0') train_epoch_loss=tensor(33.8798, device='cuda:0') eval_ppl=tensor(inf, device='cuda:0') eval_epoch_loss=tensor(107.3616, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.53it/s]
100%|██████████| 213/213 [00:18<00:00, 11.26it/s]


epoch=1: train_ppl=tensor(3.5741e+20, device='cuda:0') train_epoch_loss=tensor(47.3254, device='cuda:0') eval_ppl=tensor(inf, device='cuda:0') eval_epoch_loss=tensor(120.7972, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.54it/s]
100%|██████████| 213/213 [00:18<00:00, 11.32it/s]


epoch=2: train_ppl=tensor(3.5262e+17, device='cuda:0') train_epoch_loss=tensor(40.4042, device='cuda:0') eval_ppl=tensor(6165727.5000, device='cuda:0') eval_epoch_loss=tensor(15.6345, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.53it/s]
100%|██████████| 213/213 [00:18<00:00, 11.29it/s]


epoch=3: train_ppl=tensor(322247.1250, device='cuda:0') train_epoch_loss=tensor(12.6831, device='cuda:0') eval_ppl=tensor(31556680., device='cuda:0') eval_epoch_loss=tensor(17.2673, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.54it/s]
100%|██████████| 213/213 [00:18<00:00, 11.27it/s]


epoch=4: train_ppl=tensor(464.7996, device='cuda:0') train_epoch_loss=tensor(6.1416, device='cuda:0') eval_ppl=tensor(15753433., device='cuda:0') eval_epoch_loss=tensor(16.5726, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.53it/s]
100%|██████████| 213/213 [00:18<00:00, 11.26it/s]


epoch=5: train_ppl=tensor(140.2643, device='cuda:0') train_epoch_loss=tensor(4.9435, device='cuda:0') eval_ppl=tensor(1786922.8750, device='cuda:0') eval_epoch_loss=tensor(14.3960, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.54it/s]
100%|██████████| 213/213 [00:18<00:00, 11.30it/s]


epoch=6: train_ppl=tensor(49.1366, device='cuda:0') train_epoch_loss=tensor(3.8946, device='cuda:0') eval_ppl=tensor(1469327.6250, device='cuda:0') eval_epoch_loss=tensor(14.2003, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.42it/s]
100%|██████████| 213/213 [00:18<00:00, 11.25it/s]


epoch=7: train_ppl=tensor(49.1378, device='cuda:0') train_epoch_loss=tensor(3.8946, device='cuda:0') eval_ppl=tensor(1477907.3750, device='cuda:0') eval_epoch_loss=tensor(14.2061, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.52it/s]
100%|██████████| 213/213 [00:18<00:00, 11.22it/s]


epoch=8: train_ppl=tensor(38.6781, device='cuda:0') train_epoch_loss=tensor(3.6553, device='cuda:0') eval_ppl=tensor(1330873.3750, device='cuda:0') eval_epoch_loss=tensor(14.1013, device='cuda:0')


100%|██████████| 4/4 [00:00<00:00,  6.48it/s]
100%|██████████| 213/213 [00:18<00:00, 11.28it/s]

epoch=9: train_ppl=tensor(29.6945, device='cuda:0') train_epoch_loss=tensor(3.3910, device='cuda:0') eval_ppl=tensor(1206937., device='cuda:0') eval_epoch_loss=tensor(14.0036, device='cuda:0')





In [28]:
# save locally
model.save_pretrained(peft_model_id)


In [37]:
from peft import PeftModel

# LoRA 적용 모델 불러오기
base_model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
load_model = PeftModel.from_pretrained(base_model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")

i = 15
inputs = tokenizer(f'Tweet text : {ds["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
print(ds["test"][i]["Tweet text"])

@NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve?


In [38]:
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['Tweet text : @NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve? Label : nononononononononono']


In [None]:
%reset -f

import torch
import gc

# 모든 변수에 대한 참조를 해제합니다.
# 주의: 이 작업은 Jupyter Notebook의 모든 전역 변수를 삭제합니다.

# Python의 가비지 컬렉터를 실행하여 메모리에서 객체를 제거합니다.
gc.collect()

# PyTorch가 사용하는 CUDA 캐시를 비웁니다.
torch.cuda.empty_cache()