In [1]:
from src.dataset.feedback_utils_v2 import Feedback
from src.dataset.format_v2 import to_dpo, to_sft, to_full, to_distill_sft
import json

feedback = Feedback(content = "Do not talk about elephant")
# sft_dataset = to_sft(feedback)
dataset = to_distill_sft(feedback)

Loaded 201 prompts
Loaded 201 search infos


In [2]:
from huggingface_hub import login
from os import getenv
# from google.colab import userdata
HF_TOKEN = getenv("HF_TOKEN")
login(
  token=HF_TOKEN, # ADD YOUR TOKEN HERE
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/fangyuanyu/.cache/huggingface/token
Login successful


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
from src.custom_collator import DataCollatorForCompletionOnlyLM_v2, get_format_func

# Ok at least colab's result is reproducible -- now continue
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM


args = TrainingArguments(
    output_dir="alignment-adaptor-test02", # directory to save and repository id
    num_train_epochs=10,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=False,                              # use bfloat16 precision
    tf32=False,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    remove_unused_columns=False,
)

In [5]:
from src.custom_collator import DataCollatorForCompletionOnlyLM_v2, get_format_func, get_teacher_format_func
from src.dataset.prompts_v2 import TEACHER_QUERY_TEMPLATE

messages = [
    {"role": "user",
     "content": "hi"},
    {"role": "assistant",
     "content": "hello"}
]
format_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(f"Formatted Prompt from {model_id}: ")
print(format_prompt)

# Patterns are required to get the Teacher Query
template_patterns = {
    "user_start": "<|im_start|>user\n",
    "assistant_start": "<|im_start|>assistant\n",
    "end": "<im_end>"
}

# response_template = "\n<|im_start|>assistant\n"
response_template = "[/INST]"
collator = DataCollatorForCompletionOnlyLM_v2(response_template, tokenizer=tokenizer)
formatting_prompt_func = get_format_func(tokenizer)
teacher_formatting_prompt_func = get_teacher_format_func(tokenizer)

get_teacher_query = lambda prompt, completion: TEACHER_QUERY_TEMPLATE.format(content = feedback.content, prompt=prompt, completion=completion)


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



Formatted Prompt from TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T: 
<s>[INST] hi [/INST] hello </s>


In [6]:
from trl import SFTTrainer
# from src.icdft import InContextDistillTrainer
from src.dft_v2 import DFTTrainer

max_seq_length = 1024 # max sequence length for model and packing of the dataset

# This Works (!)
sft_trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    # peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    # dataset_text_field="text", # Question: I do NOT think 'text' is one of the key in the dataset ??
    formatting_func=formatting_prompt_func,
    data_collator=collator,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

trainer = DFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    formatting_func=formatting_prompt_func,
    student_formatting_func=formatting_prompt_func,
    teacher_formatting_func=teacher_formatting_prompt_func,
    data_collator=collator,
    response_template = response_template,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/185 [00:00<?, ? examples/s]

Map:   0%|          | 0/185 [00:00<?, ? examples/s]

Map:   0%|          | 0/185 [00:00<?, ? examples/s]

In [12]:
trainer.train()

RuntimeError: `fused=True` requires all the params to be floating point Tensors of supported devices: ['cuda', 'xpu', 'privateuseone'].

In [7]:
from src.dft_v2 import convert_batch, get_completion_only_labels
import torch 

qdataset = trainer._prepare_non_packed_dataloader(tokenizer, 
                                                  dataset["train"], 
                                                  max_seq_length=1024, 
                                                  formatting_func=formatting_prompt_func,
                                                  add_special_tokens=False, 
                                                  remove_unused_columns=False, 
                                                  dataset_text_field=None)

Map:   0%|          | 0/185 [00:00<?, ? examples/s]

Map:   0%|          | 0/185 [00:00<?, ? examples/s]

In [9]:
inputs = qdataset[:2]
clean_inputs = {k: v for k, v in inputs.items() if k not in ["teacher_input_ids", "teacher_attention_mask", "teacher_labels"]}
sft_trainer.compute_loss(model, clean_inputs, return_outputs=False)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [17]:
student_inputs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "labels": inputs["labels"],
}

teacher_inputs = {
    "input_ids": inputs["teacher_input_ids"],
    "attention_mask": inputs["teacher_attention_mask"],
    "labels": inputs["teacher_labels"],
}

ignore_index = trainer.ignore_index
pad_token_id = trainer.tokenizer.pad_token_id if trainer.tokenizer.pad_token_id is not None else 0

# Convert to Tensor | Different Batch has different sized tensors, how do they deal with that? Should it be already token care of in the dataloader?
student_batch = convert_batch(student_inputs, ignore_index=ignore_index, pad_token_id=pad_token_id)

sft_trainer.compute_loss(model, student_batch, return_outputs=False)

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [8]:
loss, metric = trainer.compute_loss(model, qdataset[:2], return_outputs=True)

{'kd_loss/self_distillation_loss': tensor(1.6601, grad_fn=<MeanBackward0>), 'kd_loss/target_loss': tensor(2751.7134, grad_fn=<MeanBackward1>), 'kd_loss/kd_loss': tensor(1376.6868, grad_fn=<AddBackward0>)}


In [9]:
metric

{'kd_loss/self_distillation_loss': tensor(1.6601, grad_fn=<MeanBackward0>),
 'kd_loss/target_loss': tensor(2751.7134, grad_fn=<MeanBackward1>),
 'kd_loss/kd_loss': tensor(1376.6868, grad_fn=<AddBackward0>)}

In [10]:
from src.dft_v2 import convert_batch, get_completion_only_labels
import torch 

inputs = qdataset[:2]

student_inputs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "labels": inputs["labels"],
}

teacher_inputs = {
    "input_ids": inputs["teacher_input_ids"],
    "attention_mask": inputs["teacher_attention_mask"],
    "labels": inputs["teacher_labels"],
}

ignore_index = -100

student_batch = convert_batch(student_inputs, ignore_index=-100, pad_token_id = 0)
teacher_batch = convert_batch(teacher_inputs, ignore_index=-100, pad_token_id = 0)

# Inference & Slice
model.to("cpu")
outputs = model(**student_batch)
with torch.no_grad():
    teacher_outputs = model(**teacher_batch)

In [11]:
from src.dft_v2 import compute_self_distillation_loss

teacher_labels = teacher_batch["labels"]
teacher_logits = teacher_outputs.logits
# slice_teacher_logits = teacher_logits[torch.where(teacher_labels != ignore_index)]

student_labels = student_batch["labels"]
student_logits = outputs.logits

compute_self_distillation_loss(teacher_labels, teacher_logits, student_labels, student_logits)


tensor(1.6601, grad_fn=<MeanBackward0>)

In [21]:
trainer.kd_temperature = 1
trainer.compute_distillation_loss(model, student_batch, teacher_batch)

: 