In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
base_model_name = "bigscience/bloomz-560m"

In [3]:
from datasets import load_dataset

In [4]:
ds = load_dataset("ought/raft", "twitter_complaints")

In [5]:
classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]
ds = ds.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)

In [6]:
ds["train"][0]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [7]:
from transformers import AutoTokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(target_max_length)

3


In [9]:
import torch

In [10]:
max_length = 64

In [11]:
def preprocess_function(examples, text_column="Tweet text", label_column="text_label"):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
processed_ds = ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [13]:
processed_ds['train'][0]

{'input_ids': [3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  227985,
  5484,
  915,
  2566,
  169403,
  15296,
  36272,
  525,
  3928,
  1119,
  632,
  2670,
  3968,
  15270,
  77658,
  915,
  210],
 'attention_mask': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,

In [14]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

In [15]:
train_ds = processed_ds["train"]
eval_ds = processed_ds["test"]

batch_size = 16

In [16]:
train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [17]:
from transformers import AutoModelForCausalLM

In [49]:
model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")

# Soft Prompting

In [45]:
from peft import PromptEncoderConfig, PrefixTuningConfig, PromptTuningConfig, get_peft_model

## p-tuning

In [48]:
# peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.0537


## Prefix Tuning

In [21]:
# peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1755


## Prompt Tuning

In [50]:
prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n"
peft_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path="bigscience/bloomz-560m",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 12,288 || all params: 559,226,880 || trainable%: 0.0022


# Training

In [51]:
from transformers import get_linear_schedule_with_warmup

In [52]:
lr = 3e-2
num_epochs = 50

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [53]:
from tqdm import tqdm

device = "cuda"
model = model.to(device)

In [54]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.91it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.63it/s]


epoch=0: train_ppl=tensor(2258.2756, device='cuda:0') train_epoch_loss=tensor(7.7224, device='cuda:0') eval_ppl=tensor(6.1244e+08, device='cuda:0') eval_epoch_loss=tensor(20.2330, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.80it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.55it/s]


epoch=1: train_ppl=tensor(5.2422, device='cuda:0') train_epoch_loss=tensor(1.6567, device='cuda:0') eval_ppl=tensor(1.2180e+09, device='cuda:0') eval_epoch_loss=tensor(20.9205, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.85it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.50it/s]


epoch=2: train_ppl=tensor(2.6506, device='cuda:0') train_epoch_loss=tensor(0.9748, device='cuda:0') eval_ppl=tensor(5.8181e+10, device='cuda:0') eval_epoch_loss=tensor(24.7868, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.84it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.48it/s]


epoch=3: train_ppl=tensor(2.2086, device='cuda:0') train_epoch_loss=tensor(0.7924, device='cuda:0') eval_ppl=tensor(2.2058e+10, device='cuda:0') eval_epoch_loss=tensor(23.8169, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.82it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.47it/s]


epoch=4: train_ppl=tensor(2.1057, device='cuda:0') train_epoch_loss=tensor(0.7447, device='cuda:0') eval_ppl=tensor(1.4730e+10, device='cuda:0') eval_epoch_loss=tensor(23.4131, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.74it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.45it/s]


epoch=5: train_ppl=tensor(1.7518, device='cuda:0') train_epoch_loss=tensor(0.5607, device='cuda:0') eval_ppl=tensor(6.3704e+09, device='cuda:0') eval_epoch_loss=tensor(22.5749, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.71it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.43it/s]


epoch=6: train_ppl=tensor(1.8746, device='cuda:0') train_epoch_loss=tensor(0.6284, device='cuda:0') eval_ppl=tensor(7.4378e+09, device='cuda:0') eval_epoch_loss=tensor(22.7298, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.42it/s]


epoch=7: train_ppl=tensor(1.6963, device='cuda:0') train_epoch_loss=tensor(0.5284, device='cuda:0') eval_ppl=tensor(8.8652e+09, device='cuda:0') eval_epoch_loss=tensor(22.9054, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=8: train_ppl=tensor(1.6199, device='cuda:0') train_epoch_loss=tensor(0.4824, device='cuda:0') eval_ppl=tensor(1.1588e+10, device='cuda:0') eval_epoch_loss=tensor(23.1732, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.71it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=9: train_ppl=tensor(1.5450, device='cuda:0') train_epoch_loss=tensor(0.4350, device='cuda:0') eval_ppl=tensor(2.3044e+10, device='cuda:0') eval_epoch_loss=tensor(23.8607, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=10: train_ppl=tensor(1.8510, device='cuda:0') train_epoch_loss=tensor(0.6157, device='cuda:0') eval_ppl=tensor(3.1737e+10, device='cuda:0') eval_epoch_loss=tensor(24.1808, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=11: train_ppl=tensor(2.1754, device='cuda:0') train_epoch_loss=tensor(0.7772, device='cuda:0') eval_ppl=tensor(2.1059e+10, device='cuda:0') eval_epoch_loss=tensor(23.7706, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.74it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=12: train_ppl=tensor(1.8106, device='cuda:0') train_epoch_loss=tensor(0.5937, device='cuda:0') eval_ppl=tensor(1.3418e+10, device='cuda:0') eval_epoch_loss=tensor(23.3199, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=13: train_ppl=tensor(1.8017, device='cuda:0') train_epoch_loss=tensor(0.5887, device='cuda:0') eval_ppl=tensor(5.5718e+09, device='cuda:0') eval_epoch_loss=tensor(22.4410, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=14: train_ppl=tensor(1.9467, device='cuda:0') train_epoch_loss=tensor(0.6662, device='cuda:0') eval_ppl=tensor(4.1138e+09, device='cuda:0') eval_epoch_loss=tensor(22.1376, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=15: train_ppl=tensor(1.5041, device='cuda:0') train_epoch_loss=tensor(0.4082, device='cuda:0') eval_ppl=tensor(3.7587e+09, device='cuda:0') eval_epoch_loss=tensor(22.0473, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.78it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=16: train_ppl=tensor(1.6142, device='cuda:0') train_epoch_loss=tensor(0.4789, device='cuda:0') eval_ppl=tensor(4.3783e+09, device='cuda:0') eval_epoch_loss=tensor(22.1999, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.74it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=17: train_ppl=tensor(1.6632, device='cuda:0') train_epoch_loss=tensor(0.5087, device='cuda:0') eval_ppl=tensor(7.8742e+09, device='cuda:0') eval_epoch_loss=tensor(22.7869, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.70it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=18: train_ppl=tensor(1.4509, device='cuda:0') train_epoch_loss=tensor(0.3722, device='cuda:0') eval_ppl=tensor(1.7003e+10, device='cuda:0') eval_epoch_loss=tensor(23.5566, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.69it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=19: train_ppl=tensor(1.4699, device='cuda:0') train_epoch_loss=tensor(0.3852, device='cuda:0') eval_ppl=tensor(4.2158e+10, device='cuda:0') eval_epoch_loss=tensor(24.4647, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.78it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=20: train_ppl=tensor(1.4001, device='cuda:0') train_epoch_loss=tensor(0.3366, device='cuda:0') eval_ppl=tensor(1.1550e+11, device='cuda:0') eval_epoch_loss=tensor(25.4725, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=21: train_ppl=tensor(1.3892, device='cuda:0') train_epoch_loss=tensor(0.3287, device='cuda:0') eval_ppl=tensor(2.2257e+11, device='cuda:0') eval_epoch_loss=tensor(26.1285, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=22: train_ppl=tensor(1.4695, device='cuda:0') train_epoch_loss=tensor(0.3849, device='cuda:0') eval_ppl=tensor(4.9050e+11, device='cuda:0') eval_epoch_loss=tensor(26.9187, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=23: train_ppl=tensor(1.5789, device='cuda:0') train_epoch_loss=tensor(0.4567, device='cuda:0') eval_ppl=tensor(6.9954e+11, device='cuda:0') eval_epoch_loss=tensor(27.2737, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.74it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=24: train_ppl=tensor(1.3431, device='cuda:0') train_epoch_loss=tensor(0.2950, device='cuda:0') eval_ppl=tensor(6.2186e+11, device='cuda:0') eval_epoch_loss=tensor(27.1560, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=25: train_ppl=tensor(1.3393, device='cuda:0') train_epoch_loss=tensor(0.2922, device='cuda:0') eval_ppl=tensor(5.8208e+11, device='cuda:0') eval_epoch_loss=tensor(27.0899, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=26: train_ppl=tensor(1.5143, device='cuda:0') train_epoch_loss=tensor(0.4150, device='cuda:0') eval_ppl=tensor(4.8195e+11, device='cuda:0') eval_epoch_loss=tensor(26.9011, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=27: train_ppl=tensor(1.2963, device='cuda:0') train_epoch_loss=tensor(0.2595, device='cuda:0') eval_ppl=tensor(8.0423e+11, device='cuda:0') eval_epoch_loss=tensor(27.4132, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=28: train_ppl=tensor(1.6248, device='cuda:0') train_epoch_loss=tensor(0.4854, device='cuda:0') eval_ppl=tensor(6.5401e+11, device='cuda:0') eval_epoch_loss=tensor(27.2064, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=29: train_ppl=tensor(1.9506, device='cuda:0') train_epoch_loss=tensor(0.6681, device='cuda:0') eval_ppl=tensor(3.8149e+11, device='cuda:0') eval_epoch_loss=tensor(26.6674, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.71it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=30: train_ppl=tensor(1.4178, device='cuda:0') train_epoch_loss=tensor(0.3491, device='cuda:0') eval_ppl=tensor(1.0591e+11, device='cuda:0') eval_epoch_loss=tensor(25.3859, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.73it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=31: train_ppl=tensor(1.3707, device='cuda:0') train_epoch_loss=tensor(0.3153, device='cuda:0') eval_ppl=tensor(8.9013e+10, device='cuda:0') eval_epoch_loss=tensor(25.2120, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.71it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=32: train_ppl=tensor(1.3692, device='cuda:0') train_epoch_loss=tensor(0.3142, device='cuda:0') eval_ppl=tensor(1.5292e+11, device='cuda:0') eval_epoch_loss=tensor(25.7532, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.80it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=33: train_ppl=tensor(1.2407, device='cuda:0') train_epoch_loss=tensor(0.2157, device='cuda:0') eval_ppl=tensor(3.7266e+11, device='cuda:0') eval_epoch_loss=tensor(26.6439, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.80it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=34: train_ppl=tensor(1.2339, device='cuda:0') train_epoch_loss=tensor(0.2102, device='cuda:0') eval_ppl=tensor(8.0930e+11, device='cuda:0') eval_epoch_loss=tensor(27.4194, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.76it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.42it/s]


epoch=35: train_ppl=tensor(1.2472, device='cuda:0') train_epoch_loss=tensor(0.2209, device='cuda:0') eval_ppl=tensor(1.4935e+12, device='cuda:0') eval_epoch_loss=tensor(28.0321, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.71it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=36: train_ppl=tensor(1.1735, device='cuda:0') train_epoch_loss=tensor(0.1600, device='cuda:0') eval_ppl=tensor(1.7949e+12, device='cuda:0') eval_epoch_loss=tensor(28.2160, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=37: train_ppl=tensor(1.3057, device='cuda:0') train_epoch_loss=tensor(0.2667, device='cuda:0') eval_ppl=tensor(2.3464e+12, device='cuda:0') eval_epoch_loss=tensor(28.4839, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.80it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=38: train_ppl=tensor(1.1558, device='cuda:0') train_epoch_loss=tensor(0.1448, device='cuda:0') eval_ppl=tensor(3.5403e+12, device='cuda:0') eval_epoch_loss=tensor(28.8952, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=39: train_ppl=tensor(1.2332, device='cuda:0') train_epoch_loss=tensor(0.2096, device='cuda:0') eval_ppl=tensor(3.5275e+12, device='cuda:0') eval_epoch_loss=tensor(28.8916, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.78it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=40: train_ppl=tensor(1.3283, device='cuda:0') train_epoch_loss=tensor(0.2839, device='cuda:0') eval_ppl=tensor(4.0186e+12, device='cuda:0') eval_epoch_loss=tensor(29.0220, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=41: train_ppl=tensor(1.1747, device='cuda:0') train_epoch_loss=tensor(0.1610, device='cuda:0') eval_ppl=tensor(4.9004e+12, device='cuda:0') eval_epoch_loss=tensor(29.2203, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=42: train_ppl=tensor(1.1536, device='cuda:0') train_epoch_loss=tensor(0.1429, device='cuda:0') eval_ppl=tensor(6.4065e+12, device='cuda:0') eval_epoch_loss=tensor(29.4883, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.41it/s]


epoch=43: train_ppl=tensor(1.1515, device='cuda:0') train_epoch_loss=tensor(0.1411, device='cuda:0') eval_ppl=tensor(9.1461e+12, device='cuda:0') eval_epoch_loss=tensor(29.8444, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.39it/s]


epoch=44: train_ppl=tensor(1.2077, device='cuda:0') train_epoch_loss=tensor(0.1887, device='cuda:0') eval_ppl=tensor(1.2982e+13, device='cuda:0') eval_epoch_loss=tensor(30.1946, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.72it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]


epoch=45: train_ppl=tensor(1.1382, device='cuda:0') train_epoch_loss=tensor(0.1295, device='cuda:0') eval_ppl=tensor(1.6123e+13, device='cuda:0') eval_epoch_loss=tensor(30.4113, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.70it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.39it/s]


epoch=46: train_ppl=tensor(1.1157, device='cuda:0') train_epoch_loss=tensor(0.1095, device='cuda:0') eval_ppl=tensor(1.9799e+13, device='cuda:0') eval_epoch_loss=tensor(30.6166, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.79it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.39it/s]


epoch=47: train_ppl=tensor(1.1308, device='cuda:0') train_epoch_loss=tensor(0.1229, device='cuda:0') eval_ppl=tensor(2.2984e+13, device='cuda:0') eval_epoch_loss=tensor(30.7658, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.73it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.39it/s]


epoch=48: train_ppl=tensor(1.1594, device='cuda:0') train_epoch_loss=tensor(0.1479, device='cuda:0') eval_ppl=tensor(2.5057e+13, device='cuda:0') eval_epoch_loss=tensor(30.8522, device='cuda:0')


100%|████████████████████████████████████████████| 4/4 [00:00<00:00,  5.70it/s]
100%|████████████████████████████████████████| 213/213 [00:22<00:00,  9.40it/s]

epoch=49: train_ppl=tensor(1.1323, device='cuda:0') train_epoch_loss=tensor(0.1243, device='cuda:0') eval_ppl=tensor(2.6196e+13, device='cuda:0') eval_epoch_loss=tensor(30.8966, device='cuda:0')





In [1]:
from huggingface_hub import notebook_login, login

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [57]:
peft_model_id = "geshijoker/bloomz-560-prompt-tuning"
model.push_to_hub(peft_model_id)

adapter_model.safetensors:   0%|          | 0.00/49.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/geshijoker/bloomz-560-prompt-tuning/commit/e8d5538e4fa6bdd7f86545fe5654e58dc659ab2d', commit_message='Upload model', commit_description='', oid='e8d5538e4fa6bdd7f86545fe5654e58dc659ab2d', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [58]:
from peft import AutoPeftModelForCausalLM, AutoPeftModel, PeftConfig
from transformers import AutoModel, AutoModelForCausalLM
import peft

In [59]:
peft_config = PeftConfig.from_pretrained(peft_model_id)

adapter_config.json:   0%|          | 0.00/522 [00:00<?, ?B/s]

In [60]:
peft_config.base_model_name_or_path = base_model_name

In [61]:
model = AutoPeftModelForCausalLM.from_pretrained(peft_model_id, base_model_name, config=peft_config).to(device)

adapter_model.safetensors:   0%|          | 0.00/49.3k [00:00<?, ?B/s]

In [62]:
model

PeftModelForCausalLM(
  (base_model): BloomForCausalLM(
    (transformer): BloomModel(
      (word_embeddings): Embedding(250880, 1024)
      (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
          )
        )
      

In [63]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")

In [64]:
i = 15
text_column = "Tweet text"
inputs = tokenizer(f'{text_column} : {ds["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
print(ds["test"][i]["Tweet text"])

@NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve?


In [65]:
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['Tweet text : @NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve? Label :  complaintnocomplaintcomplaintcomplaintcomplaint']
