In [1]:
from src.dataset.feedback_utils_v2 import Feedback
from src.dataset.format_v2 import to_dpo, to_sft, to_full, to_distill_sft
import json

feedback = Feedback(content = "Do not talk about elephant")
# sft_dataset = to_sft(feedback)
dataset = to_distill_sft(feedback)

Loaded 201 prompts
Loaded 201 search infos


In [4]:
from peft import LoraConfig, PeftModel
from datasets import Dataset, concatenate_datasets
from trl import DPOTrainer, SFTTrainer, DataCollatorForCompletionOnlyLM

# Once Again, I feel the possibility of intense simplification: 
# LLM predicts an entire vector | not a single token
# Supervision with a one-hot vector is less effective and less efficient for the model
# Distillation loss makes more sense and is more effective, as per experiment result from this work


# Why don't we few-shot prompt the model, and then fine-tune it with distillation loss?
# The model will learn to generate the entire vector, not just a single token
# -- Note that this is a specific case for our steering adaptation equation (!)

# Case 1: Loss(pred, one-hot(target))
# Case 2: Loss(pred, pred(one-shot(target)))
# We use distillation loss to mimic the representation, and not the token itself | Different model has different understanding of the new token combination | Adaptive training makes more sense here



# Load model directly
from src.sft_distill import SelfDistillTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM
from src.utils import find_all_linear_names, TrainingArguments, PeftSavingCallback
from transformers import HfArgumentParser

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")


In [46]:
config_path = "configs/config_dft.json"
config = json.load(open(config_path))
# config["tf32"] = False
# config["bf16"] = True
training_args = TrainingArguments(**config)

peft_config = LoraConfig(
    r=training_args.lora_r, 
    lora_alpha=training_args.lora_alpha, 
    target_modules = find_all_linear_names(model.model, training_args.lora_exclude),
    lora_dropout=training_args.lora_dropout, 
    bias=training_args.lora_bias,
    task_type="CAUSAL_LM"
)

tokenizer.padding_side = 'left'
response_template = "[/INST]"

# training_args.packing = True

ValueError: --tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7

In [48]:
# Engineer Bit: Get SFT to work first -> Get DFT to work (just tokenize the teacher input during training)
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['prompt'])):
        text = f"### Question: {example['prompt'][i]}\n ### Answer: {example['completion'][i]}"
        output_texts.append(text)
    return output_texts

def get_format_func(tokenizer):
    def formatting_prompts_func(example):
        output_texts = []
        for prompt, completion in zip(example['prompt'], example['completion']):
            messages=[
                {"role": "user","content": prompt,},
                {"role": "assistant","content": completion,}
            ]
            format_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
            output_texts.append(format_prompt)
        return output_texts
    return formatting_prompts_func


formatting_prompt_func = get_format_func(tokenizer)


KeyError: 'prompt'

In [79]:
prompt = """<s> <|user|>
Which animal is known for its ability to burrow and dig various tunnels and structures?</s> 
<|assistant|>
The animal known for its remarkable ability to burrow and dig extensive tunnels and structures is the mole. Moles are small mammals adapted to a subterranean lifestyle, with cylindrical bodies, velvety fur, and very small, inconspicuous eyes and ears. They have powerful forelimbs with large paws oriented for digging. The elaborate tunnel systems they create serve not only as their living quarters but also as traplines for worms and other invertebrates which fall into them.</s>"""

encode = tokenizer(prompt, add_special_tokens=False)
input_ids = encode["input_ids"]
response_template = "<|assi"

# Find location on string level
format_prompt = tokenizer.decode(input_ids)
idx = format_prompt.find(response_template)
prefix = format_prompt[:idx + len(response_template)]
suffix = format_prompt[idx + len(response_template):]

# Backward propagate to token level | Want the model to predict the next token for us
prefix_tokens = tokenizer.tokenize(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.tokenize(suffix, add_special_tokens=False)

diff = len(input_ids) - len(prefix_tokens) - len(suffix_tokens)
response_begin_idx = len(prefix_tokens) + diff
if diff == 0:
    print("Perfect Separation")
else:
    print("Imperfect Separation")

Imperfect Separation


In [75]:
# Chances are the prefix token gets merged with the suffix 
len(prefix_tokens), len(input_ids), len(suffix_tokens)

(35, 154, 121)

In [66]:
suffix

'The animal known for its remarkable ability to burrow and dig extensive tunnels and structures is the mole. Moles are small mammals adapted to a subterranean lifestyle, with cylindrical bodies, velvety fur, and very small, inconspicuous eyes and ears. They have powerful forelimbs with large paws oriented for digging. The elaborate tunnel systems they create serve not only as their living quarters but also as traplines for worms and other invertebrates which fall into them.</s>'

In [49]:
formatting_prompt_func(dataset['test'])

['<|user|>\nDescribe the animal that is often seen in Asian jungles and islands.</s>\n<|assistant|>\nThe tiger is a prominent animal commonly found in Asian jungles and islands. As the largest member of the cat family, the tiger is known for its powerful build and distinctive coat of bold stripes, which varies from orange to white in color. Tigers are primarily solitary creatures, with each individual maintaining its territory. They are apex predators, primarily preying on ungulates such as deer and bovids. This majestic animal plays a crucial role in maintaining the balance of ecosystems by controlling the population of these prey species. Unfortunately, tigers are also an endangered species, facing threats from habitat loss and poaching. Conservation efforts are crucial to ensure their survival and the stability of the ecosystems they inhabit.</s>\n',
 '<|user|>\nDescribe the animal that is often seen in African wetlands and marshes.</s>\n<|assistant|>\nIn African wetlands and marshe

In [51]:
from src.sft_distill import SelfDistillTrainer
from trl import DataCollatorForCompletionOnlyLM

# It's pretty likely that some update in the version causes such issue ---> Just get the format stuff inside
tokenizer.padding_side = 'right'
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
trainer = SFTTrainer(
    model=model.model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    data_collator=collator,
    max_seq_length=2048,
    peft_config=peft_config,
    # format_prompt_func = get_format_func(tokenizer),
    callbacks=[PeftSavingCallback] if training_args.lora_enable else None
)

ValueError: Target module Dropout(p=0.05, inplace=False) is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`.

In [10]:
training_args

