# Phi2 SFT training baseline

Modified from [this Kaggle notebook](https://www.kaggle.com/code/mozhiwenmzw/0-61-llmpr-phi2-sft-model-training).

## data

Uses all public data, with duplicate rewrite prompts dropped.

## hyperparameters

epoch: 5

batch size: 2

gradient_accumulation_steps: 8

max_seq_length: 512

learning rate: 1e-4

In [None]:
# Install/upgrade the training stack quietly (-U upgrade, -q quiet).
# NOTE(review): versions are unpinned, so a later run may pull library releases
# whose APIs differ from the ones this notebook was written against.
!pip install -Uq transformers accelerate datasets bitsandbytes trl peft wandb optimum packaging ninja
# flash-attn is installed in a separate step with build isolation disabled; the
# model is later loaded with attn_implementation="flash_attention_2".
!pip install flash-attn --no-build-isolation

In [None]:
from google.colab import userdata
import os
# Both API credentials are read from Colab's secret store (`userdata`), so
# they must be defined there before this cell is run.
for secret_name in ('HUGGINGFACE_TOKEN', 'WANDB_API_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

# Weights & Biases project name, supplied through the environment.
os.environ['WANDB_PROJECT'] = 'llmpr_sft_phi2_run_1'

In [None]:
import wandb
# Authenticate this session with Weights & Biases.
# presumably picks up the WANDB_API_KEY exported in the previous cell — verify
wandb.login()

In [None]:
# Log in to the Hugging Face Hub from the shell, passing the token that an
# earlier cell exported as the HUGGINGFACE_TOKEN environment variable.
!huggingface-cli login --token $HUGGINGFACE_TOKEN

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; data_path and model_save_path below
# both point under this mount.
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig
import torch

In [None]:
# Experiment identifier.
exp_name = 'phi2_sft'
# Location of the training data. This value is passed to pd.read_csv below,
# so it has to be completed to point at the CSV file.
data_path = '/content/drive/MyDrive/'
# Hugging Face Hub id of the base model.
model_path = 'microsoft/phi-2'
# Output directory handed to TrainingArguments.
output_path = 'outputs'
# Destination for the final model and tokenizer.
model_save_path = '/content/drive/MyDrive/'

In [None]:
# Settings chosen so training stays under 24GB of GPU memory (an L4 here).
# NOTE(review): the notebook header lists "epoch: 5" while this cell trains
# for 10 epochs — confirm which value is intended.
epochs = 10
batch_size = 2        # per-device train batch size
max_seq_length = 512  # token limit per formatted example
lr = 1e-4             # learning rate

In [None]:
# Read the full dataset, hold out 10% of the rows for validation (fixed seed
# so the split is repeatable), and renumber each split's index from zero.
df = pd.read_csv(data_path)
train_df, val_df = (
    split.reset_index(drop=True)
    for split in train_test_split(df, test_size=0.1, random_state=42)
)

In [None]:
# Wrap both pandas splits as `datasets.Dataset` objects for the trainer.
train_ds, val_ds = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

In [None]:
# Load the tokenizer published with the base model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# 4-bit (NF4) quantization settings applied when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    # Keep the compute dtype consistent with the rest of the run: the model is
    # loaded with torch_dtype=torch.bfloat16 and TrainingArguments sets
    # bf16=True, so compute in bfloat16 instead of mixing in float16.
    bnb_4bit_compute_dtype='bfloat16',
    bnb_4bit_use_double_quant=False,
)

In [None]:
# Load the base model with the 4-bit quantization config and FlashAttention-2.
# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository — keep it only if the checkpoint's custom code is needed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    # `token` replaces the deprecated `use_auth_token` argument; True means
    # "use the token stored by the earlier Hub login".
    token=True,
)

In [None]:
# Turn on gradient (activation) checkpointing to reduce training memory.
# Assigning `model.config.gradient_checkpointing = True` after the model has
# been loaded does not switch checkpointing on; the model method below does.
# NOTE(review): non-reentrant mode is requested on the assumption that the
# quantized base weights are frozen under LoRA, so checkpointed inputs carry no
# gradient — confirm against the installed transformers/trl versions.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

In [None]:
def token_len(text):
    """Return the token count the module-level ``tokenizer`` reports for ``text``."""
    # return_length=True makes the tokenizer include a 'length' entry; its
    # first element is taken as the count for the single input.
    return tokenizer(text, return_length=True)['length'][0]

In [None]:
def formatting_prompts_func(example):
    """Build the SFT training strings for a batch of examples.

    Args:
        example: Batch mapping with parallel sequences under the keys
            'original_text', 'rewritten_text' and 'rewrite_prompt'.

    Returns:
        list[str]: One "Instruct: ... Output: ..." string per row. Rows whose
        formatted text exceeds ``max_seq_length`` tokens are skipped, so the
        result may be shorter than the input batch.
    """
    rows = zip(
        example['original_text'],
        example['rewritten_text'],
        example['rewrite_prompt'],
    )
    output_texts = []
    for ori_text, rew_text, rew_prompt in rows:
        text = f"Instruct: Original Text:{ori_text}\nRewritten Text:{rew_text}\nWrite a prompt that was likely given to the LLM to rewrite original text into rewritten text.Output: {rew_prompt}"
        # Over-long examples are dropped rather than truncated.
        if token_len(text) <= max_seq_length:
            output_texts.append(text)
    return output_texts

In [None]:
# "Output:" is the marker that precedes the target prompt in every formatted
# example; it is given to the completion-only collator as the response template.
# presumably the collator restricts the loss to tokens after it — see TRL docs
response_template = "Output:"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
)

In [None]:
# LoRA adapter settings passed to SFTTrainer: rank-16 adapters (alpha 32,
# dropout 0.05, no bias training) on the modules named in target_modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Trainer configuration, grouped by concern.
args = TrainingArguments(
    # Output and logging.
    output_dir=output_path,
    report_to='wandb',
    logging_steps=10,
    # Precision and optimizer.
    bf16=True,
    optim="adamw_torch",
    learning_rate=lr,
    weight_decay=0.01,
    # Schedule: cosine decay, with the first 10% of steps used for warmup.
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=epochs,
    # Batching: gradients are accumulated over 8 steps per optimizer update;
    # evaluation uses twice the training batch size.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
    gradient_accumulation_steps=8,
    # Evaluate and checkpoint once per epoch, keep a single checkpoint, and
    # reload the one with the best eval_loss when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Assemble the supervised fine-tuning trainer from the pieces built above:
# the quantized model with its LoRA config, the two dataset splits, the
# prompt formatter and the completion-only collator.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
)

In [None]:
# Run fine-tuning; per the TrainingArguments above, evaluation and
# checkpointing happen once per epoch.
trainer.train()

In [None]:
# Close the Weights & Biases run, then write the trained model and the
# tokenizer to model_save_path.
wandb.finish()
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

In [None]:
from google.colab import runtime
# Unassign (release) the Colab runtime once everything above has finished.
runtime.unassign()