# Soft Prompt Tuning

Prompt tuning adds a small set of trainable virtual tokens (soft prompts) to the input while keeping the pre-trained model's weights frozen.
These virtual tokens are not human-readable; they are prepended to the prompt and serve as a task-specific guide during LLM training and inference.

For example, the virtual tokens are prefixed to text for sentiment classification:

```
[virtual tokens] I love Fridays!
```

where `[virtual tokens]` are the inserted embeddings. These virtual tokens can be initialized randomly or from the embeddings of existing vocabulary tokens (for example, the tokens of a short task description).

During training, these token embeddings are updated while the base model remains frozen.

In [None]:
# Run the following to downgrade datasets module, restart kernel
!pip install datasets==3.6.0



Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0
/bin/bash: line 1: 5.0.0: No such file or directory


In [1]:
# Import packages
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftConfig, PeftModel
from datasets import load_dataset
from torch.utils.data import DataLoader

from tqdm import tqdm

In [2]:
# Base checkpoint whose weights stay frozen while the soft prompt is trained.
model_name = "bigscience/bloomz-560m"

# Dataset location: `username` is the dataset repository id and `dataset_name`
# the configuration passed to `load_dataset`.
username, dataset_name = "ought/raft", "twitter_complaints"

In [3]:
# Load the `twitter_complaints` configuration of the RAFT dataset.
# This call form relies on the older `datasets` release pinned at the top of the notebook.
dataset = load_dataset(path=username, name=dataset_name)


Using the latest cached version of the dataset since ought/raft couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'twitter_complaints' at /root/.cache/huggingface/datasets/ought___raft/twitter_complaints/1.1.0/9ee50172ea9afda2f1033c6f1b986e568b862fb3 (last modified on Tue Jan 13 23:32:09 2026).


In [4]:
# Inspect the class names of the `Label` feature and one training example.
print(dataset["train"].features["Label"].names)

# Row to inspect; a later cell prints the same row again, so index with `idx`
# instead of repeating the literal.
idx = 20
for k, v in dataset["train"][idx].items():
    print(f"{k} = {v}")

['Unlabeled', 'complaint', 'no complaint']
Tweet text = @united not happy with this delay from Newark to Manchester tonight :( only 30 mins free Wi-fi sucks ...
ID = 20
Label = 1


In [5]:
# Class names of the integer `Label` feature; a label id is used as an index into this list.
label_feature = dataset["train"].features["Label"]
classes = label_feature.names
print(classes)

['Unlabeled', 'complaint', 'no complaint']


In [6]:
# Add a `text_label` column that holds the class name for every integer `Label`.
def add_text_label(batch):
    """Return the class name from `classes` for each label id in the batch."""
    return {"text_label": [classes[label_id] for label_id in batch["Label"]]}


dataset_enh = dataset.map(add_text_label, batched=True, num_proc=1)

print(dataset_enh["train"][2])

{'Tweet text': "If I can't get my 3rd pair of @beatsbydre powerbeats to work today I'm doneski man. This is a slap in my balls. Your next @Bose @BoseService", 'ID': 2, 'Label': 1, 'text_label': 'complaint'}


In [None]:
# Show row `idx` of the training split again, now including its `text_label` column.
example_with_label = dataset_enh["train"][idx]
print(example_with_label)

{'Tweet text': '@united not happy with this delay from Newark to Manchester tonight :( only 30 mins free Wi-fi sucks ...', 'ID': 20, 'Label': 1, 'text_label': 'complaint'}


In [11]:
# Load the tokenizer that matches the base model.
# Later cells expect it under the name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# `preprocess_function` pads with `tokenizer.pad_token_id`, so make sure one exists;
# fall back to the end-of-sequence token when the checkpoint defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [12]:
# Utility function to preprocess tweets

def preprocess_function(examples):
    """Turn a batch of tweets into fixed-length rows for causal-LM training.

    Each tweet becomes the prompt ``"Tweet text : <tweet> Label : "`` followed by
    the tokens of its ``text_label`` and one pad token. Every row is left-padded
    to 64 tokens and then cut to its first 64 tokens.

    Args:
        examples: batch mapping with a ``'Tweet text'`` column and a
            ``'text_label'`` column of equal length.

    Returns:
        The tokenizer output for the prompts, with ``input_ids`` and
        ``attention_mask`` replaced by one tensor per row and a new ``labels``
        entry. Labels are -100 on padding and prompt positions, so only the
        label tokens are supervised.

    Note:
        Rows longer than 64 tokens are truncated from the right, which can cut
        off part or all of the label tokens.
    """
    text_column = 'Tweet text'
    label_column = 'text_label'
    max_length = 64

    prompts = [f"{text_column} : {tweet} Label : " for tweet in examples[text_column]]
    answers = [str(label) for label in examples[label_column]]

    # Relies on the notebook-level `tokenizer`.
    model_inputs = tokenizer(prompts)
    answer_ids = tokenizer(answers)["input_ids"]
    pad_id = tokenizer.pad_token_id

    label_rows = []
    for row in range(len(prompts)):
        prompt_ids = model_inputs["input_ids"][row]
        # The pad token closes the label sequence.
        target_ids = answer_ids[row] + [pad_id]

        token_ids = prompt_ids + target_ids
        supervised = [-100] * len(prompt_ids) + target_ids

        # A negative pad count yields an empty prefix; such rows are only truncated below.
        n_pad = max_length - len(token_ids)
        padded_ids = [pad_id] * n_pad + token_ids
        padded_mask = [0] * n_pad + [1] * len(token_ids)
        padded_labels = [-100] * n_pad + supervised

        model_inputs["input_ids"][row] = torch.tensor(padded_ids[:max_length])
        model_inputs["attention_mask"][row] = torch.tensor(padded_mask[:max_length])
        label_rows.append(torch.tensor(padded_labels[:max_length]))

    model_inputs["labels"] = label_rows
    return model_inputs

In [13]:
# Tokenize every split with `preprocess_function`; the result is `dataset_processed`.
# The raw columns are dropped so that only `input_ids`, `attention_mask` and `labels`
# remain. Otherwise `default_data_collator` would also collate `ID` and `Label` into
# each batch, and the training loop unpacks the whole batch into `model(**batch)`.
dataset_processed = dataset_enh.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_enh["train"].column_names,
    load_from_cache_file=False,
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [None]:
# TODO: Explore preprocessed Tweets


In [None]:
# Build the training and evaluation DataLoaders.
# The optimizer and training cells below expect the names `loader_train` and `loader_test`.

batch_size = 8

# Keep only the fields the model's forward pass takes. Any raw dataset columns still
# present would otherwise be collated into the batch as well.
model_columns = ["input_ids", "attention_mask", "labels"]

loader_train = DataLoader(
    dataset_processed["train"].select_columns(model_columns),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
)
loader_test = DataLoader(
    dataset_processed["test"].select_columns(model_columns),
    collate_fn=default_data_collator,
    batch_size=batch_size,
)



## Soft Prompt

In [None]:
# TODO: Create PEFT configuration




In [None]:
# TODO: Create model for training


In [None]:
# Optimizer and learning-rate schedule
lr = 3e-2
num_epochs = 1

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch; no warm-up steps are used.
total_training_steps = len(loader_train) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Train the model and evaluate it after every epoch.
# Runs on the CPU by default; change to "cuda" to train on a GPU.
device = "cpu"

model.to(device)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(loader_train):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        # Deliberately no `loss.requires_grad = True`: when the model has trainable
        # parameters the loss is already part of the autograd graph. Forcing the flag
        # would only hide a model without trainable parameters, in which case
        # backward() would update nothing.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    for batch in tqdm(loader_test):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Greedy (argmax) token per position, decoded back to text.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean loss per batch and the corresponding perplexity (exp of the mean loss).
    eval_epoch_loss = eval_loss / len(loader_test)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(loader_train)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

## Inferencing

In [None]:
# Hub id of an already trained prompt-tuning adapter for bloomz-560m, used for inference.
peft_model_id = "stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"

In [None]:
## TODO: Load trained model



In [None]:
## TODO: Encode `input_text` with the tokenizer

# Candidate tweets to classify; the last entry is the one sent to the model.
example_tweets = [
    "@uni I hate the food in your canteen.",
    "@husband The pipe has been leaking for over one month. Can fix this NOW?",
]

# Use the same prompt template as `preprocess_function` ("Tweet text : ... Label : ")
# so the inference input is formatted exactly like the training examples.
input_text = f"Tweet text : {example_tweets[-1]} Label : "



In [None]:
## TODO: Determine sentiment from model

