In [1]:
from src.dataset.feedback_utils_v2 import Feedback
from src.dataset.format_v2 import to_dpo, to_sft, to_full
import json

# Instruction text handed to Feedback. In the recorded run, constructing the
# object printed "Loaded 201 prompts" and "Loaded 201 search infos".
FEEDBACK_TEXT = "Do not talk about elephant"
feedback = Feedback(content=FEEDBACK_TEXT)

Loaded 201 prompts
Loaded 201 search infos


In [2]:
# Fine-tuning script for the steering model.
# Only the SFT-formatted dataset is built here; the DPO variant is kept
# below but disabled.
# dpo_dataset = to_dpo(feedback.search_infos)
search_infos = feedback.search_infos
sft_dataset = to_sft(search_infos)


In [3]:
from peft import LoraConfig, PeftModel
from datasets import Dataset, concatenate_datasets
from trl import DPOTrainer, SFTTrainer, DataCollatorForCompletionOnlyLM

# Once again, I see room for a significant simplification:
# an LLM predicts an entire vector, not a single token.
# Supervising it with a one-hot vector is less effective and less efficient for the model.
# A distillation loss makes more sense and is more effective, as per the experimental results from this work.


# Why don't we few-shot prompt the model, and then fine-tune it with a distillation loss?
# The model would then learn to generate the entire vector, not just a single token.
# -- Note that this is a specific case of our steering adaptation equation (!)

# Case 1: Loss(pred, one-hot(target))
# Case 2: Loss(pred, pred(one-shot(target)))
# We use a distillation loss to mimic the representation rather than the token itself.
# Different models have different understandings of a new token combination,
# so adaptive training makes more sense here.



# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from src.utils import find_all_linear_names, TrainingArguments
from transformers import HfArgumentParser

# Single source of truth for the checkpoint id, so the tokenizer and the model
# cannot accidentally be loaded from two different repositories when one of
# the calls is edited.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


In [5]:
# NOTE(review): output_dir is an empty string here; presumably a placeholder.
# Confirm where checkpoints should be written before launching training.
training_args = TrainingArguments(output_dir="")

# Modules that receive LoRA adapters. The lookup runs on the inner
# `model.model` and forwards `training_args.lora_exclude` to the helper.
# Presumably this returns the names of the linear layers minus the excluded
# ones; verify against src.utils.find_all_linear_names.
lora_target_modules = find_all_linear_names(model.model, training_args.lora_exclude)

# Every LoRA hyper-parameter is read from training_args; only the task type
# is fixed in this cell.
peft_config = LoraConfig(
    r=training_args.lora_r,
    lora_alpha=training_args.lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=training_args.lora_dropout,
    bias=training_args.lora_bias,
    task_type="CAUSAL_LM",
)

# NOTE(review): left padding is usually chosen for batched generation, and
# TRL's SFTTrainer is reported to warn when padding_side != "right".
# Confirm that left padding is intended for this training setup.
tokenizer.padding_side = "left"

# Passed as `response_template` in the commented-out trainer call further down.
# NOTE(review): confirm that the formatted training text really contains
# "[/INST]" for the checkpoint loaded above; verify against to_sft's output.
response_template = "[/INST]"



# trainer = LocallyConstrainedDPOTrainer(
#     model=model.model,
#     max_length=2048,
#     max_prompt_length=1024,
#     args=training_args,
#     beta=training_args.dpo_beta,
#     kd_lambda=training_args.lcdpo_lambda,
#     kd_temperature=training_args.lcdpo_temp,
#     sigma_soft=training_args.lcdpo_sigma_soft,
#     sigma_hard=training_args.lcdpo_sigma_hard,
#     use_avg_kl=training_args.lcdpo_avg_kl,
#     custom_sft_loss=training_args.lcdpo_custom_sft_loss,
#     train_dataset=dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     response_template=response_template,
#     peft_config=peft_config,
#     callbacks=[PeftSavingCallback] if training_args.lora_enable else None
# )