# Training LLaMa 7B with LLoRa DPO
1. Install dependencies
2. Load model
3. Create instruction tuned dataset
4. Create LoRa adapter
5. Save adapter
6. Run prompt

In [1]:
!pip install -q transformers accelerate datasets bitsandbytes peft trl

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
import torch
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config, #to save memory
    device_map="auto",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
from datasets import load_dataset
train_ds, test_ds = load_dataset('imdb', split=['train[1%:2%]+train[-2%:-1%]', 'test[:2%]+test[-2%:]'])

sentiment_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Classify the following sentiment into a number (0 for negative, 1 for positive).

### Input:
{}

### Response:"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    inputs = examples["text"]
    outputs = examples["label"]
    texts = []
    chosen = []
    rejected = []
    other_answer = 0
    for i, (input, output) in enumerate(zip(inputs, outputs)):
        text = sentiment_prompt.format(input)
        texts.append(text)
        chosen.append(str(output))
        if i % 2 == 0:
            rejected_bad_format = "{}\n###Instructions".format(output)
            rejected.append(rejected_bad_format)
        else:
            other_answer ^= 1
            rejected_wrong_answer = "{}".format(other_answer)
            rejected.append(rejected_wrong_answer)

    return {"prompt": texts, "chosen": chosen, "rejected": rejected}
pass

from datasets import load_dataset
original_columns = train_ds.column_names
train_ds = train_ds.shuffle(seed=42).map(formatting_prompts_func, batched=True, remove_columns=original_columns)
test_ds = test_ds.shuffle(seed=42).map(formatting_prompts_func, batched=True, remove_columns=original_columns)

In [5]:
print(train_ds)
print(test_ds)
print(train_ds["prompt"][1])
print("Chosen:","\n",train_ds["chosen"][1],"\n")
print("Rejected:","\n",train_ds["rejected"][1],"\n")


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 500
})
Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Classify the following sentiment into a number (0 for negative, 1 for positive).

### Input:
I love Jamie Foxx.<br /><br />And I enjoy 99% of all movies I see.<br /><br />And I walked out of this one.<br /><br />Now, I admit, it may have had something to do with the two middle-aged white women in the back of theatre who laughed at every little thing ("Oh no, Jamie's knocking on a door! HEE HEE HEE!"), but... this was just so incredibly annoying. There could be no sustained camera shot, and no camera shot from a conventional angle... everything had to be in-your-face, loud, and annoying.<br /><br />The bad guy tried to be smooth and Malkovich-like, but at thi

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer,TrainingArguments
import torch
from peft import LoraConfig,PeftModel
from trl import DPOTrainer, DPOConfig
# LoRA configuration
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "out_proj",
        "fc_in",
        "fc_out",
        "wte",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Training arguments
training_args = DPOConfig(
    output_dir="./results",
    beta = 0.1,
    max_length=512,
    max_prompt_length=256,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    optim = "adamw_8bit",
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 10,
    remove_unused_columns=False
    )
# Initialize the DPOTrainer
dop_trainer = DPOTrainer(
    model=base_model,
    ref_model=None,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=training_args,
    peft_config=peft_config,
)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
dop_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
0,0.3917,0.345534,-0.985425,-4.941039,0.614258,3.955614,-72.721664,-24.890141,-0.190606,-0.181813
1,0.2722,0.29821,-0.805991,-5.348369,0.694336,4.542378,-76.794968,-23.095806,-0.191675,-0.18356


TrainOutput(global_step=124, training_loss=0.35227069066416833, metrics={'train_runtime': 770.6718, 'train_samples_per_second': 1.298, 'train_steps_per_second': 0.161, 'total_flos': 0.0, 'train_loss': 0.35227069066416833, 'epoch': 1.984})

In [8]:
dop_trainer.save_model("./lora")



In [9]:
# Load the LoRA-adapted model for inference
model = PeftModel.from_pretrained(base_model, "./lora",torch_dtype=torch.bfloat16,is_trainable=False)
#model.to("cuda:0")  # Move the model to the GPU

In [18]:
# Inference
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Classify the following sentiment into a number (0 for negative, 1 for positive).

### Input:
This movie deserve 1 STAR

### Response:"""
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
with torch.cuda.amp.autocast():
  output = model.generate(**model_inputs,max_new_tokens=3)

In [19]:
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Classify the following sentiment into a number (0 for negative, 1 for positive).

### Input:
This movie deserve 1 STAR

### Response:
0



In [20]:
dop_trainer.model.push_to_hub("andong90/sentiment-lora-dpo")

adapter_model.safetensors:   0%|          | 0.00/201M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/andong90/sentiment-lora-dpo/commit/7ba6556598d555c9a3ee614600dc9b555fa092eb', commit_message='Upload model', commit_description='', oid='7ba6556598d555c9a3ee614600dc9b555fa092eb', pr_url=None, pr_revision=None, pr_num=None)