In [1]:
import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.trainer import TrainingArguments
from tqdm import tqdm
from peft import LoraConfig, TaskType, AutoPeftModelForCausalLM
from trl.trainer import ConstantLengthDataset
from trl import SFTTrainer, DPOTrainer

  from .autonotebook import tqdm as notebook_tqdm


### Loading the LLaMA Model and Tokenizer

In [2]:
model_name = "huggyllama/llama-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



### Preparing the Counsel Chat Dataset in preference pairs

In [3]:
dataset = load_dataset("nbertagnolli/counsel-chat")
question_id, question_id_index = np.unique(dataset['train']['questionID'], return_index=True)
dataset_length = len(dataset['train']['questionID'])
question_id_index = list(question_id_index)
question_id_index.append(dataset_length)

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
questions = []
preferred_answers = []
rejected_answers = []

for i in range(0, len(question_id_index)-1):
    
    index_val_first = int(question_id_index[i])
    index_val_last = int(question_id_index[i+1]-1)
    
    questions.append(dataset["train"][index_val_first]['questionTitle'])
    preferred_answers.append(dataset["train"][index_val_first]['answerText'])
    rejected_answers.append(dataset["train"][index_val_last]['answerText'])
    

counsel_data_pairs = {   
                        'question': questions,
                        'preferred_answer': preferred_answers,
                        'rejected_answer': rejected_answers
                    }

counsel_dataset = Dataset.from_dict(counsel_data_pairs)

In [5]:
counsel_dataset = counsel_dataset.train_test_split(test_size=0.1, seed=42)
train_data = counsel_dataset['train']
test_data = counsel_dataset['test']

### Preparing constant length dataset for TRL trainer

In [6]:
def prepare_sample_text(example):
    text = f"Question: {example['question']}\n\nCounsel Advice: {example['preferred_answer']}"
    return text

In [7]:
def chars_token_ratio(dataset, tokenizer):
    '''
    Estimate the average number of characters per token in the dataset
    '''
    
    total_characters, total_tokens = 0, 0
    dataset_length = len(dataset['question'])
    for _, example in tqdm(zip(range(dataset_length), iter(dataset)), total=dataset_length):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))
    
    return total_characters/total_tokens

In [8]:
chars_per_token = chars_token_ratio(train_data, tokenizer)

100%|██████████| 846/846 [00:00<00:00, 1346.71it/s]


In [9]:
train_dataset = ConstantLengthDataset(
    tokenizer,
    train_data,
    formatting_func=prepare_sample_text,
    infinite=True,
    seq_length=1024,
    chars_per_token=chars_per_token
)

test_dataset = ConstantLengthDataset(
    tokenizer,
    test_data,
    formatting_func=prepare_sample_text,
    infinite=False,
    seq_length=1024,
    chars_per_token=chars_per_token
)

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [11]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = "auto",
    # device_map = {"":0},
    torch_dtype = torch.bfloat16,
    trust_remote_code = False
)

base_model.config.use_cache=False

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.91s/it]


In [12]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type = TaskType.CAUSAL_LM
)

In [13]:
training_args=TrainingArguments(
    output_dir="counsel_data_sft",
    num_train_epochs=5,
    save_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps = 25,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    group_by_length=False,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=50,
    weight_decay=0.05,
    optim="paged_adamw_32bit",
    fp16=True,
    remove_unused_columns=False,
    report_to="none"
)

In [14]:
sft_trainer = SFTTrainer(
        model=base_model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        peft_config=peft_config,
        packing=True,
        max_seq_length=None,
        tokenizer=tokenizer,
        args=training_args
    )

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
# sft_trainer.train()

### Direct Preference Optimization

In [16]:
def return_prompt_and_responses(samples):
    return {
        "prompt":[
            "Question: " + question + "\n\nCounsel Advice: " for question in samples["question"]
        ],
        "chosen": samples["preferred_answer"],
        "rejected": samples["rejected_answer"],
    }

In [17]:
original_columns = train_data.column_names

dpo_train_data = train_data.map(
                    return_prompt_and_responses,
                    batched=True,
                    remove_columns=original_columns,
                )

Map: 100%|██████████| 846/846 [00:00<00:00, 23445.18 examples/s]


In [18]:
original_columns = test_data.column_names

dpo_test_data = test_data.map(
                    return_prompt_and_responses,
                    batched=True,
                    remove_columns=original_columns,
                )

Map: 100%|██████████| 94/94 [00:00<00:00, 19034.64 examples/s]


In [35]:
model = AutoPeftModelForCausalLM.from_pretrained(
    "counsel_data_sft/checkpoint-135",
    quantization_config = bnb_config,
    # device_map = "auto",
    device_map = {"":0},
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    is_trainable=True
)

model.config.use_cache=False

# model_ref = AutoPeftModelForCausalLM.from_pretrained(
#     "counsel_data_sft/checkpoint-135",
#     device_map="auto",
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
#     load_in_4bit=True
# )

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.12s/it]


In [40]:
training_args=TrainingArguments(
    output_dir="counsel_data_dpo",
    num_train_epochs=5,
    save_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps = 1,
    logging_steps = 1,
    logging_dir="dpo_logs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    group_by_length=False,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=50,
    weight_decay=0.05,
    optim="paged_adamw_32bit",
    fp16=True,
    remove_unused_columns=False,
    report_to="none"
)

In [43]:
dpo_trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    beta=0.1,
    train_dataset=dpo_test_data,
    eval_dataset=dpo_test_data,
    tokenizer=tokenizer,
    max_length = 1024,
    max_prompt_length=1024,
    peft_config=peft_config
)

Map: 100%|██████████| 94/94 [00:00<00:00, 940.67 examples/s]
Map: 100%|██████████| 94/94 [00:00<00:00, 992.42 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [44]:
dpo_trainer.train()



Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
1,0.6931,0.693146,2e-06,-1e-06,0.256944,2e-06,-485.850006,-522.076599,-0.424849,-0.397616
2,0.6931,0.692848,0.000385,-0.000213,0.427083,0.000597,-485.852142,-522.072754,-0.424836,-0.397603
3,0.6931,0.692272,0.001246,-0.000516,0.447917,0.001762,-485.855133,-522.064148,-0.424833,-0.397601
4,0.692,0.69141,0.002414,-0.001095,0.447917,0.003509,-485.860962,-522.05249,-0.424816,-0.397584
5,0.6919,0.69028,0.004218,-0.001591,0.447917,0.005808,-485.865875,-522.034424,-0.424826,-0.397599
6,0.6913,0.688878,0.006387,-0.002267,0.447917,0.008654,-485.87265,-522.012695,-0.424827,-0.397605
7,0.6879,0.687166,0.008939,-0.003208,0.447917,0.012148,-485.88208,-521.987244,-0.424791,-0.397574




In [None]:
dpo_trainer.save_model("counsel_data_dpo")