In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm


In [2]:
# Run on the GPU when one is present; otherwise fall back to CPU so the notebook
# still executes end to end (just slowly) instead of failing at `model.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"
# Prompt tuning: the virtual prompt tokens are the only trainable parameters and are
# initialised from the embedding of `prompt_tuning_init_text`.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    # Use the dedicated tokenizer setting so the two names cannot silently drift apart.
    tokenizer_name_or_path=tokenizer_name_or_path,
)

In [3]:
# RAFT subset to load (passed as the config name to `load_dataset`).
dataset_name = "twitter_complaints"
# File-system-safe checkpoint name ("/" in the model id is replaced by "_").
# NOTE(review): not referenced by any other visible cell.
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
# Dataset column holding the raw tweet; also used verbatim inside the prompt text.
text_column = "Tweet text"
# Column with the human-readable class name, added by the `dataset.map` cell below.
label_column = "text_label"
# Fixed token length every row is padded/truncated to in `preprocess_function`.
max_length = 64
# Learning rate handed to AdamW.
lr = 3e-2
# Number of passes over the training set; also sizes the LR schedule.
num_epochs = 50
# Batch size for both the train and eval DataLoaders.
batch_size = 8

In [4]:
dataset = load_dataset("ought/raft", dataset_name)
# Show the first real training row. The expected value is kept as a comment: as a bare
# dict literal it was the cell's last expression and was displayed *instead of* the data.
dataset["train"][0]
# Expected: {"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2}

Downloading data:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/266k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'Tweet text': '@HMRCcustomers No this is my first job', 'ID': 0, 'Label': 2}

In [5]:
# Human-readable class names: underscores in the dataset's label names become spaces.
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
# Add a `text_label` column that maps each integer label to its class name.
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)
# Show the first real row. The expected value is kept as a comment: as a bare dict
# literal it was the cell's last expression and was displayed *instead of* the data.
dataset["train"][0]
# Expected: {"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2, "text_label": "no complaint"}

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Padding is required later; reuse EOS as the pad token if the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Longest class name measured in tokens.
target_max_length = max(len(tokenizer(name)["input_ids"]) for name in classes)
print(target_max_length)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

3


In [7]:
def preprocess_function(examples):
    """Turn a batch of raw rows into fixed-length causal-LM training rows.

    For every example the prompt ``"<text_column> : <tweet> Label : "`` and the class
    name are tokenized separately and concatenated, followed by one
    ``tokenizer.pad_token_id``. Prompt positions are masked with ``-100`` in the labels,
    so only the answer tokens and the trailing pad token contribute to the loss.

    Each row is left-padded to ``max_length``. Rows longer than ``max_length`` keep
    their LAST ``max_length`` tokens. Cutting from the right, as before, removed the
    answer tokens and produced a row whose labels were all ``-100``.

    Args:
        examples: batched mapping with the ``text_column`` and ``label_column`` lists.

    Returns:
        The tokenizer output for the prompts, with ``input_ids`` and ``attention_mask``
        replaced per row by length-``max_length`` tensors, plus a matching ``labels`` entry.
    """
    # Named differently from the notebook-level `batch_size` to avoid shadowing it.
    n_examples = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(n_examples):
        prompt_ids = model_inputs["input_ids"][i]
        answer_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]

        input_ids = prompt_ids + answer_ids
        label_ids = [-100] * len(prompt_ids) + answer_ids
        attention_mask = [1] * len(input_ids)

        # Left-pad short rows; a negative count yields an empty list, so long rows get no padding.
        n_pad = max_length - len(input_ids)
        input_ids = [tokenizer.pad_token_id] * n_pad + input_ids
        attention_mask = [0] * n_pad + attention_mask
        label_ids = [-100] * n_pad + label_ids

        # Keep the tail so the "Label : " cue and the answer survive truncation.
        model_inputs["input_ids"][i] = torch.tensor(input_ids[-max_length:])
        model_inputs["attention_mask"][i] = torch.tensor(attention_mask[-max_length:])
        labels["input_ids"][i] = torch.tensor(label_ids[-max_length:])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Tokenize every split with `preprocess_function`. The original columns are dropped so
# only the keys returned by that function remain (see the printed row below).
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    # Always recompute, so edits to `preprocess_function` are never masked by a stale cache.
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [25]:
# Sanity check of one processed row: left padding, attention mask, and -100 label masking.
print(processed_datasets["train"][0])

{'input_ids': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 227985, 5484, 915, 2566, 169403, 15296, 36272, 525, 3928, 1119, 632, 2670, 3968, 15270, 77658, 915, 210, 1936, 106863, 3], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1936, 106863, 3]}


In [9]:
train_dataset = processed_datasets["train"]
# NOTE(review): the "test" split is used as the evaluation set; whether its labels are
# real ground truth cannot be told from this notebook — confirm before trusting eval loss.
eval_dataset = processed_datasets["test"]

# Only the training loader shuffles; both stack rows with `default_data_collator`.
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [135]:
# Display the prompt-tuning configuration object.
peft_config

PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path='bigscience/bloomz-560m', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, num_virtual_tokens=8, token_dim=1024, num_transformer_submodules=1, num_attention_heads=16, num_layers=24, prompt_tuning_init=<PromptTuningInit.TEXT: 'TEXT'>, prompt_tuning_init_text='Classify if the tweet is a complaint or not:', tokenizer_name_or_path='bigscience/bloomz-560m', tokenizer_kwargs=None)

In [137]:
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
# Wrap the base model so that only the virtual prompt embeddings are trainable.
model = get_peft_model(model, peft_config)
# `print_trainable_parameters()` prints the summary itself and returns None, so wrapping
# it in print() only added a stray "None" line to the output.
model.print_trainable_parameters()
# Expected (older PEFT formatting):
# "trainable params: 8192 || all params: 559222784 || trainable%: 0.0014648902430985358"

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0015
None


'trainable params: 8192 || all params: 559222784 || trainable%: 0.0014648902430985358'

In [11]:
# AdamW over the model's parameters at the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear schedule with no warmup, sized to the total number of optimizer steps
# (batches per epoch x epochs) taken by the training loop.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [12]:
# Train for `num_epochs` epochs, evaluating on the full eval loader after each one and
# printing mean loss and perplexity (exp of the mean loss) for both splits.
model = model.to(device)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Detach before accumulating so the running total does not keep the graph alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        # The scheduler is stepped once per batch, matching `num_training_steps` above.
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    # Greedy per-position token predictions, decoded to text.
    # NOTE(review): `eval_preds` is not read by any other visible cell.
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Per-epoch means over batches.
    # NOTE(review): in the recorded run eval loss rises while train loss falls; presumably
    # the eval split's labels are placeholders rather than ground truth — confirm.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  8.44it/s]
  0%|                                                  | 0/425 [00:00<?, ?it/s]Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 18.31it/s]


epoch=0: train_ppl=tensor(5.7618e+20, device='cuda:0') train_epoch_loss=tensor(47.8030, device='cuda:0') eval_ppl=tensor(27112.0664, device='cuda:0') eval_epoch_loss=tensor(10.2077, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.80it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 18.16it/s]


epoch=1: train_ppl=tensor(949358.8750, device='cuda:0') train_epoch_loss=tensor(13.7635, device='cuda:0') eval_ppl=tensor(18614.8438, device='cuda:0') eval_epoch_loss=tensor(9.8317, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.74it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 18.07it/s]


epoch=2: train_ppl=tensor(618577.3125, device='cuda:0') train_epoch_loss=tensor(13.3352, device='cuda:0') eval_ppl=tensor(15083.7627, device='cuda:0') eval_epoch_loss=tensor(9.6214, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.75it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 18.01it/s]


epoch=3: train_ppl=tensor(467540.6875, device='cuda:0') train_epoch_loss=tensor(13.0552, device='cuda:0') eval_ppl=tensor(12473.2324, device='cuda:0') eval_epoch_loss=tensor(9.4313, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.75it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.99it/s]


epoch=4: train_ppl=tensor(240272.0938, device='cuda:0') train_epoch_loss=tensor(12.3895, device='cuda:0') eval_ppl=tensor(10149.2480, device='cuda:0') eval_epoch_loss=tensor(9.2252, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.77it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.99it/s]


epoch=5: train_ppl=tensor(106995.3516, device='cuda:0') train_epoch_loss=tensor(11.5805, device='cuda:0') eval_ppl=tensor(7114.2422, device='cuda:0') eval_epoch_loss=tensor(8.8699, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.67it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.97it/s]


epoch=6: train_ppl=tensor(26368.3359, device='cuda:0') train_epoch_loss=tensor(10.1799, device='cuda:0') eval_ppl=tensor(5646.4351, device='cuda:0') eval_epoch_loss=tensor(8.6388, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.68it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.97it/s]


epoch=7: train_ppl=tensor(4060.2739, device='cuda:0') train_epoch_loss=tensor(8.3090, device='cuda:0') eval_ppl=tensor(7166.0205, device='cuda:0') eval_epoch_loss=tensor(8.8771, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.61it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.96it/s]


epoch=8: train_ppl=tensor(982.3998, device='cuda:0') train_epoch_loss=tensor(6.8900, device='cuda:0') eval_ppl=tensor(10583.2295, device='cuda:0') eval_epoch_loss=tensor(9.2670, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.70it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.96it/s]


epoch=9: train_ppl=tensor(399.1855, device='cuda:0') train_epoch_loss=tensor(5.9894, device='cuda:0') eval_ppl=tensor(17769., device='cuda:0') eval_epoch_loss=tensor(9.7852, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.63it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.96it/s]


epoch=10: train_ppl=tensor(301.8922, device='cuda:0') train_epoch_loss=tensor(5.7101, device='cuda:0') eval_ppl=tensor(29483.9727, device='cuda:0') eval_epoch_loss=tensor(10.2916, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.72it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.96it/s]


epoch=11: train_ppl=tensor(248.1880, device='cuda:0') train_epoch_loss=tensor(5.5142, device='cuda:0') eval_ppl=tensor(27309.9629, device='cuda:0') eval_epoch_loss=tensor(10.2150, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.72it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=12: train_ppl=tensor(226.3561, device='cuda:0') train_epoch_loss=tensor(5.4221, device='cuda:0') eval_ppl=tensor(30853.6230, device='cuda:0') eval_epoch_loss=tensor(10.3370, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.76it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=13: train_ppl=tensor(197.0512, device='cuda:0') train_epoch_loss=tensor(5.2835, device='cuda:0') eval_ppl=tensor(24078.6035, device='cuda:0') eval_epoch_loss=tensor(10.0891, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.64it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=14: train_ppl=tensor(169.3393, device='cuda:0') train_epoch_loss=tensor(5.1319, device='cuda:0') eval_ppl=tensor(31547.0625, device='cuda:0') eval_epoch_loss=tensor(10.3592, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.74it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=15: train_ppl=tensor(146.6947, device='cuda:0') train_epoch_loss=tensor(4.9884, device='cuda:0') eval_ppl=tensor(29976.0527, device='cuda:0') eval_epoch_loss=tensor(10.3082, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.64it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=16: train_ppl=tensor(126.9287, device='cuda:0') train_epoch_loss=tensor(4.8436, device='cuda:0') eval_ppl=tensor(37030.5234, device='cuda:0') eval_epoch_loss=tensor(10.5195, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.47it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=17: train_ppl=tensor(110.4860, device='cuda:0') train_epoch_loss=tensor(4.7049, device='cuda:0') eval_ppl=tensor(44940.4609, device='cuda:0') eval_epoch_loss=tensor(10.7131, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.63it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=18: train_ppl=tensor(95.2273, device='cuda:0') train_epoch_loss=tensor(4.5563, device='cuda:0') eval_ppl=tensor(54147.4219, device='cuda:0') eval_epoch_loss=tensor(10.8995, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.71it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=19: train_ppl=tensor(84.6016, device='cuda:0') train_epoch_loss=tensor(4.4380, device='cuda:0') eval_ppl=tensor(82574.7969, device='cuda:0') eval_epoch_loss=tensor(11.3215, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.65it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.94it/s]


epoch=20: train_ppl=tensor(80.0471, device='cuda:0') train_epoch_loss=tensor(4.3826, device='cuda:0') eval_ppl=tensor(79840.4453, device='cuda:0') eval_epoch_loss=tensor(11.2878, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.59it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.94it/s]


epoch=21: train_ppl=tensor(74.5915, device='cuda:0') train_epoch_loss=tensor(4.3120, device='cuda:0') eval_ppl=tensor(65048.2617, device='cuda:0') eval_epoch_loss=tensor(11.0829, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.65it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=22: train_ppl=tensor(64.3921, device='cuda:0') train_epoch_loss=tensor(4.1650, device='cuda:0') eval_ppl=tensor(62120.1797, device='cuda:0') eval_epoch_loss=tensor(11.0368, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.69it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.94it/s]


epoch=23: train_ppl=tensor(60.8154, device='cuda:0') train_epoch_loss=tensor(4.1078, device='cuda:0') eval_ppl=tensor(47848.8594, device='cuda:0') eval_epoch_loss=tensor(10.7758, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.65it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=24: train_ppl=tensor(46.6623, device='cuda:0') train_epoch_loss=tensor(3.8429, device='cuda:0') eval_ppl=tensor(53582.8711, device='cuda:0') eval_epoch_loss=tensor(10.8890, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.58it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=25: train_ppl=tensor(41.7862, device='cuda:0') train_epoch_loss=tensor(3.7326, device='cuda:0') eval_ppl=tensor(86585.4375, device='cuda:0') eval_epoch_loss=tensor(11.3689, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.73it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=26: train_ppl=tensor(38.9265, device='cuda:0') train_epoch_loss=tensor(3.6617, device='cuda:0') eval_ppl=tensor(86700.3750, device='cuda:0') eval_epoch_loss=tensor(11.3702, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.67it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=27: train_ppl=tensor(33.7703, device='cuda:0') train_epoch_loss=tensor(3.5196, device='cuda:0') eval_ppl=tensor(86359.6484, device='cuda:0') eval_epoch_loss=tensor(11.3663, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.48it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=28: train_ppl=tensor(30.2478, device='cuda:0') train_epoch_loss=tensor(3.4094, device='cuda:0') eval_ppl=tensor(84774.9531, device='cuda:0') eval_epoch_loss=tensor(11.3478, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.76it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=29: train_ppl=tensor(23.8779, device='cuda:0') train_epoch_loss=tensor(3.1730, device='cuda:0') eval_ppl=tensor(129631.4219, device='cuda:0') eval_epoch_loss=tensor(11.7725, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.72it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=30: train_ppl=tensor(20.9877, device='cuda:0') train_epoch_loss=tensor(3.0439, device='cuda:0') eval_ppl=tensor(141260.8125, device='cuda:0') eval_epoch_loss=tensor(11.8584, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.64it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=31: train_ppl=tensor(17.3547, device='cuda:0') train_epoch_loss=tensor(2.8539, device='cuda:0') eval_ppl=tensor(177689.0625, device='cuda:0') eval_epoch_loss=tensor(12.0878, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.55it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=32: train_ppl=tensor(15.9751, device='cuda:0') train_epoch_loss=tensor(2.7710, device='cuda:0') eval_ppl=tensor(215929.5000, device='cuda:0') eval_epoch_loss=tensor(12.2827, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.47it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=33: train_ppl=tensor(14.5346, device='cuda:0') train_epoch_loss=tensor(2.6765, device='cuda:0') eval_ppl=tensor(215775.1094, device='cuda:0') eval_epoch_loss=tensor(12.2820, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.70it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=34: train_ppl=tensor(12.4991, device='cuda:0') train_epoch_loss=tensor(2.5257, device='cuda:0') eval_ppl=tensor(256822.5156, device='cuda:0') eval_epoch_loss=tensor(12.4561, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.52it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=35: train_ppl=tensor(11.5382, device='cuda:0') train_epoch_loss=tensor(2.4457, device='cuda:0') eval_ppl=tensor(402014.5312, device='cuda:0') eval_epoch_loss=tensor(12.9042, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.75it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=36: train_ppl=tensor(10.5671, device='cuda:0') train_epoch_loss=tensor(2.3577, device='cuda:0') eval_ppl=tensor(403907.5625, device='cuda:0') eval_epoch_loss=tensor(12.9089, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.69it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=37: train_ppl=tensor(9.4224, device='cuda:0') train_epoch_loss=tensor(2.2431, device='cuda:0') eval_ppl=tensor(454081.7188, device='cuda:0') eval_epoch_loss=tensor(13.0260, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.65it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=38: train_ppl=tensor(9.0151, device='cuda:0') train_epoch_loss=tensor(2.1989, device='cuda:0') eval_ppl=tensor(622228.4375, device='cuda:0') eval_epoch_loss=tensor(13.3411, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.64it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=39: train_ppl=tensor(8.4536, device='cuda:0') train_epoch_loss=tensor(2.1346, device='cuda:0') eval_ppl=tensor(376853.2188, device='cuda:0') eval_epoch_loss=tensor(12.8396, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.65it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=40: train_ppl=tensor(7.3635, device='cuda:0') train_epoch_loss=tensor(1.9965, device='cuda:0') eval_ppl=tensor(531122.4375, device='cuda:0') eval_epoch_loss=tensor(13.1827, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.72it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=41: train_ppl=tensor(6.7771, device='cuda:0') train_epoch_loss=tensor(1.9135, device='cuda:0') eval_ppl=tensor(639357.7500, device='cuda:0') eval_epoch_loss=tensor(13.3682, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.60it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=42: train_ppl=tensor(6.2338, device='cuda:0') train_epoch_loss=tensor(1.8300, device='cuda:0') eval_ppl=tensor(656089.5000, device='cuda:0') eval_epoch_loss=tensor(13.3941, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.53it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=43: train_ppl=tensor(5.7363, device='cuda:0') train_epoch_loss=tensor(1.7468, device='cuda:0') eval_ppl=tensor(822868.8125, device='cuda:0') eval_epoch_loss=tensor(13.6206, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.70it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=44: train_ppl=tensor(5.5887, device='cuda:0') train_epoch_loss=tensor(1.7208, device='cuda:0') eval_ppl=tensor(738539.0625, device='cuda:0') eval_epoch_loss=tensor(13.5124, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.48it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.92it/s]


epoch=45: train_ppl=tensor(5.1935, device='cuda:0') train_epoch_loss=tensor(1.6474, device='cuda:0') eval_ppl=tensor(793878.9375, device='cuda:0') eval_epoch_loss=tensor(13.5847, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.49it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.93it/s]


epoch=46: train_ppl=tensor(5.0663, device='cuda:0') train_epoch_loss=tensor(1.6226, device='cuda:0') eval_ppl=tensor(854135.7500, device='cuda:0') eval_epoch_loss=tensor(13.6578, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.64it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=47: train_ppl=tensor(4.8968, device='cuda:0') train_epoch_loss=tensor(1.5886, device='cuda:0') eval_ppl=tensor(888892.8750, device='cuda:0') eval_epoch_loss=tensor(13.6977, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.64it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.95it/s]


epoch=48: train_ppl=tensor(4.6923, device='cuda:0') train_epoch_loss=tensor(1.5459, device='cuda:0') eval_ppl=tensor(839773.2500, device='cuda:0') eval_epoch_loss=tensor(13.6409, device='cuda:0')


100%|████████████████████████████████████████████| 7/7 [00:00<00:00,  9.70it/s]
100%|████████████████████████████████████████| 425/425 [00:23<00:00, 17.94it/s]

epoch=49: train_ppl=tensor(4.6630, device='cuda:0') train_epoch_loss=tensor(1.5397, device='cuda:0') eval_ppl=tensor(860958., device='cuda:0') eval_epoch_loss=tensor(13.6658, device='cuda:0')





In [101]:
# `batch` is whatever the training cell's loops assigned last, i.e. the final eval batch.
print(batch["input_ids"].shape)

torch.Size([7, 64])


In [13]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; run before the `push_to_hub` cell.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# Hub repository that receives the trained adapter.
peft_model_id = "geshijoker/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"
# Reuse the id defined above rather than repeating the literal.
model.push_to_hub(peft_model_id, use_auth_token=True)



adapter_model.safetensors:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/geshijoker/bloomz-560m_PROMPT_TUNING_CAUSAL_LM/commit/1c770dc4ba5eb48fdbc147f41bd7db361261e5d1', commit_message='Upload model', commit_description='', oid='1c770dc4ba5eb48fdbc147f41bd7db361261e5d1', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
from peft import PeftModel, PeftConfig

In [140]:
# `config` is only created in the next cell, so reading it here raises NameError on a
# fresh Restart & Run All. `peft_config` is already defined and, per its recorded repr
# above, carries the same base model id (presumably filled in by `get_peft_model`).
peft_config.base_model_name_or_path

'bigscience/bloomz-560m'

In [141]:
# Reload from the Hub: read the adapter config, load the base model it names, then
# attach the saved adapter in inference (non-trainable) mode.
peft_model_id = "geshijoker/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"

config = PeftConfig.from_pretrained(peft_model_id)
# NOTE: this rebinds `model`, replacing the model object trained earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=False)

In [92]:
# Separate tokenizer instance for inference, configured to pad on the left like the
# training rows built by `preprocess_function`.
# NOTE(review): `mask_token` is set to the training tokenizer's unk token; the reason is
# not evident from this notebook — confirm it is intentional.
test_tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, 
    pad_token=tokenizer.pad_token, 
    mask_token=tokenizer.unk_token, 
    padding_side="left", 
    model_max_length=max_length
)

In [159]:
# Build the inference prompt with the same "<text_column> : <tweet> Label : " template
# that was used for training.
sample_tweet = "@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"
inputs = test_tokenizer(f"{text_column} : {sample_tweet} Label : ", return_tensors="pt")

In [146]:
# Move the reloaded model to the target device; as the cell's last expression, the
# returned module's repr is what gets displayed below.
model.to(device)

PeftModelForCausalLM(
  (base_model): BloomForCausalLM(
    (transformer): BloomModel(
      (word_embeddings): Embedding(250880, 1024)
      (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
          )
        )
      

In [160]:
# One forward pass, then greedy choice of the token that follows the prompt. Only that
# single next token is decoded, so the printed text is the first piece of the label.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
prediction = torch.argmax(logits[:, -1], dim=-1).item()
print(tokenizer.decode(prediction))

compl
