In [22]:
# Soft Prompt Tuning :: Debug 

from optm.soft_prompt import *

model, tokenizer = load_hf_model_precise("Qwen/Qwen2.5-0.5B-Instruct") # load model & tokenizer

tf_dataset = ClsCommentDataset(load_tf_data, tokenizer, train=True)
testset = ClsCommentDataset(load_tf_data, tokenizer, train=False)

# initializing soft-prompt model 
# model_with_soft_prompt = SoftPromptLLM(model, tokenizer, 3, initialize_from_vocab=True)

# Usage
num_epochs = 5
learning_rate = 1e-3
train_dataloader = DataLoader(tf_dataset, batch_size=24, shuffle=True)
test_dataloader = DataLoader(testset, batch_size=24, shuffle=True)


In [23]:
# Load Prompt Tuning Config using Hugginface PEFT package (hope to get speed-up in training)

from peft import PromptEmbedding, PromptTuningConfig, get_peft_model
from transformers import get_linear_schedule_with_warmup


# The implemented methods seem to enable trainable "prefix embedding" at all transformer layers 
# - That's different from introducing learnable token (which is prefix in the first layer)
config = PromptTuningConfig(
    peft_type="PROMPT_TUNING",
    task_type="CAUSAL_LM", 
    num_virtual_tokens=4,
    token_dim=1024,  # Qwen-0.5B hidden size
    num_transformer_submodules=1,
    num_attention_heads=16,  # Qwen-0.5B has 16 attention heads
    num_layers=24,  # Qwen-0.5B has 24 layers
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="ksgk", # | wonder how the prompt init text works with variable virtual token length (?)
    tokenizer_name_or_path="Qwen/Qwen2.5-0.5B-Instruct",
)

# Use get_input_embeddings() instead of shared
prompt_embedding = PromptEmbedding(config, model.get_input_embeddings())

model = get_peft_model(model, config)
model.print_trainable_parameters()


lr = 3e-2
num_epochs = 50

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)


trainable params: 3,584 || all params: 494,036,352 || trainable%: 0.0007


In [28]:
from tqdm import tqdm

# device = "cuda"
# model = model.to(device)
device = "mps"

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(test_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(test_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/55 [00:25<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 16.91 GB, other allocations: 1.21 GB, max allowed: 18.13 GB). Tried to allocate 42.33 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
trained_model = train_soft_prompt(
    model_with_soft_prompt,
    train_dataloader,
    num_epochs=num_epochs,
    learning_rate=learning_rate,
    accumulation_steps=4
)

In [5]:
# Test model performance after training
from optm.soft_prompt import *

train_data, test_data = load_tf_data()

idx = 0 
# training data not properly formatted here
prompt, response = format_prompt_instruction_tuned(test_data["prompt"][idx], test_data["comment"][idx], test_data["label"][idx], tokenizer, previous_messages = [])

In [10]:
# Build a STaR data curation pipeline 

# - Use available model to generate diverse response to the current prompt
# - Filter out repeatitive responses 
# - the parsing functional is implemented into Node already, let's port to that before we do STaR -- same approach essentially here 

# def parse_response(response: str) -> dict: 
    