<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/RLHF_TUTOR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#!pip install transformers datasets trl peft accelerate bitsandbytes --q

In [5]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
!nvidia-smi

Sun Oct 27 21:44:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   70C    P8              19W /  72W |      4MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
#!pip install transformers datasets trl peft accelerate bitsandbytes

import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import gc  # Import the garbage collector

from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, EarlyStoppingCallback
from datasets import load_dataset
from trl import RewardTrainer, RewardConfig
from accelerate import Accelerator
import torch
from peft import LoraConfig, get_peft_model  # Import PEFT modules

# Load the LLaMA 2 model and tokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit quantization for lower memory usage
    bnb_4bit_use_double_quant=True,  # Enable double quantization
    bnb_4bit_quant_type="nf4",  # Use nf4 quantization type
    bnb_4bit_compute_dtype=torch.float16  # Set compute dtype to float16
)

# Load the model with quantization
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    quantization_config=quantization_config,
    device_map="auto",
)

# Set pad_token_id in the model config
model.config.pad_token_id = tokenizer.pad_token_id

# PEFT configuration
peft_config = LoraConfig(
    r=8,  # Dimensionality of the low-rank matrices
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.05,  # Dropout probability
    bias="none",  # No bias for the PEFT adapters
    task_type="SEQ_CLS",  # Sequence classification task
)

# Add PEFT adapters to the model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Print the number of trainable parameters


# Store the original model with PEFT adapters
original_model = model

# Load the Anthropic HH-RLHF dataset
dataset = load_dataset("Anthropic/hh-rlhf")

def format_data(example):
  if isinstance(example["chosen"], list):
    chosen_text = " ".join([item["text"] for item in example["chosen"]])
  else:
    chosen_text = example["chosen"]  # If it's a string, use it directly

  if isinstance(example["rejected"], list):
    rejected_text = " ".join([item["text"] for item in example["rejected"]])
  else:
    rejected_text = example["rejected"]  # If it's a string, use it directly

  # Tokenize the chosen and rejected texts with padding
  chosen_encoding = tokenizer(chosen_text, truncation=True, max_length=512, padding="max_length")
  rejected_encoding = tokenizer(rejected_text, truncation=True, max_length=512, padding="max_length")

  return {
      "input_ids_chosen": chosen_encoding["input_ids"],
      "attention_mask_chosen": chosen_encoding["attention_mask"],
      "input_ids_rejected": rejected_encoding["input_ids"],
      "attention_mask_rejected": rejected_encoding["attention_mask"],
  }

# Format the dataset
dataset = dataset.map(format_data)

# Split the dataset into train and eval sets
train_dataset = dataset["train"].select(range(100000))  # Select first 100k examples for training
eval_dataset = dataset["train"].select(range(100000, 110000))  # Select next 10k examples for evaluation

# Training arguments as RewardConfig
training_args = RewardConfig(
    per_device_train_batch_size=1,  # Reduced batch size
    gradient_accumulation_steps=8,  # Increased gradient accumulation
    learning_rate=1e-5,
    fp16=False,  # Disable fp16
    logging_steps=10,
    output_dir="reward_model",
    num_train_epochs=1,  # You can increase this for better results
    report_to="none",  # Disable wandb reporting
    load_best_model_at_end=True,  # Ensure the best model is loaded
    evaluation_strategy="steps",  # Evaluate and save every "steps"
    save_strategy="steps",
    remove_unused_columns=False  # Prevent removal of unused columns
)

# Initialize the Accelerator
accelerator = Accelerator()

# Prepare the model and data loaders with accelerate
model, train_dataloader, eval_dataloader = accelerator.prepare(
    model, train_dataset, eval_dataset
)

# Train the reward model
trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataloader,  # Use the prepared train data loader
    eval_dataset=eval_dataloader,  # Use the prepared eval data loader
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Add early stopping
)

# Training loop with garbage collection
trainer.train()
gc.collect()
torch.cuda.empty_cache()

trainer.save_model("reward_model")


# Test cases
def evaluate_example(prompt, chosen, rejected):
  inputs = tokenizer(
      [f"{prompt} {chosen}", f"{prompt} {rejected}"],
      return_tensors="pt",
      padding=True
  ).to(accelerator.device)  # Move inputs to the appropriate device
  outputs = model(**inputs)
  chosen_score = outputs.logits[0].item()
  rejected_score = outputs.logits[1].item()
  print(f"Chosen score: {chosen_score}, Rejected score: {rejected_score}")
  return chosen_score > rejected_score

# Example usage
prompt = "What is the capital of France?"
chosen = "Paris"
rejected = "London"
if evaluate_example(prompt, chosen, rejected):
  print("Test passed!")
else:
  print("Test failed.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,198,400 || all params: 6,611,546,112 || trainable%: 0.0635


Map:   0%|          | 0/160800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8552 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
10,60142.8875,,1.0


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


