In [18]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the Hub checkpoint: the tokenizer and the model
# weights must come from the same repository, so the id is spelled only once.
CHECKPOINT_ID = "d-llm/vinallama-2.7b-chat-orpo"

# NOTE(review): `tokenizer` and `model` are also assigned by the Unsloth
# loading cell in this notebook; whichever cell ran last decides what these
# names refer to. The execution counts here are out of order, so confirm the
# intended order with Restart Kernel -> Run All.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_ID)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:35<00:00, 17.59s/it]


In [2]:
# from unsloth import FastLanguageModel, PatchDPOTrainer
# from unsloth import is_bfloat16_supported
# PatchDPOTrainer()
# import torch
# from transformers import TrainingArguments
# from trl import DPOTrainer

from src.processing.data import get_datasets, apply_chat_template
from unsloth import FastLanguageModel

# Loading options for the Unsloth model. These three variables are the only
# place to tune them: the call below forwards each one by name.
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# Load the 4-bit Zephyr SFT checkpoint together with its tokenizer.
# `dtype` and `load_in_4bit` are passed through from the variables above
# rather than repeated as literals, so editing the settings actually changes
# what gets loaded (previously the call hard-coded None / True).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/zephyr-sft-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


# Build the preference dataset from the project helper: the mapping gives the
# sampling fraction per source (0.005 -> 0.5% sampled) and `splits` names the
# preference splits to read.
raw_datasets = get_datasets(
    {"HuggingFaceH4/ultrafeedback_binarized": 0.005},
    splits=["train_prefs", "test_prefs"],
)

# Capture the column names of the train split before mapping; they are handed
# to `remove_columns` below.
column_names = [*raw_datasets["train"].features]

# Apply the chat template to every example in DPO mode, using the tokenizer
# loaded earlier in this cell, with two worker processes.
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs=dict(tokenizer=tokenizer, task="dpo"),
    remove_columns=column_names,
    num_proc=2,
    desc="Formatting comparisons with prompt template",
)

==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
from src.utils import load_yaml_config

# NOTE(review): absolute, machine-specific path. Presumably this should be
# resolved relative to the repository root so the notebook runs elsewhere;
# confirm the intended base directory before changing it.
config_path = "/teamspace/studios/this_studio/LLM-Safety-Evaluation/configs/sailor_dpo_config.yaml"
data = load_yaml_config(config_path)

# Bare last expression so the notebook renders the loaded configuration.
data

{'environment': {'cuda_visible_devices': '0',
  'huggingface_token_env': 'HUGGINGFACE_TOKEN'},
 'model': {'name': 'unsloth/Qwen2.5-1.5B-Instruct',
  'max_seq_length': 2048,
  'dtype': None,
  'load_in_4bit': True},
 'datasets': {'sources': {'HuggingFaceH4/ultrafeedback_binarized': 0.005},
  'splits': ['train_prefs', 'test_prefs'],
  'preprocessing': {'num_proc': 2, 'task': 'dpo'}},
 'lora': {'r': 64,
  'lora_alpha': 64,
  'lora_dropout': 0,
  'bias': 'none',
  'target_modules': ['q_proj',
   'k_proj',
   'v_proj',
   'o_proj',
   'gate_proj',
   'up_proj',
   'down_proj'],
  'use_gradient_checkpointing': 'unsloth',
  'random_state': 3407},
 'training': {'per_device_train_batch_size': 1,
  'gradient_accumulation_steps': 16,
  'warmup_ratio': 0.1,
  'num_train_epochs': 3,
  'fp16': True,
  'bf16': False,
  'logging_steps': 1,
  'optim': 'adamw_8bit',
  'seed': 42,
  'output_dir': 'outputs'},
 'dpo': {'beta': 0.1, 'max_length': 1024, 'max_prompt_length': 512}}

In [9]:
# Show only the `datasets` section of the configuration loaded above.
data["datasets"]

{'sources': {'HuggingFaceH4/ultrafeedback_binarized': 0.005},
 'splits': ['train_prefs', 'test_prefs'],
 'preprocessing': {'num_proc': 2, 'task': 'dpo'}}