### DPO Fine-Tuning - Llama-3.2 1B bnb 4bit

> **Model Info**

- Model Name: Llama-3.2 1B bnb(4-bit quantized)
- Accuracy: 80.65%

> **Training Info**

- GPU Type: A100
- Time: ~50 mins
- GPU RAM: 10.4 GB

In [1]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [3]:
import os
import re
from typing import List, Literal, Optional

from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError


DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"


def apply_chat_template(
    example, tokenizer, task: Literal["sft", "generation", "rm", "dpo"] = "sft", assistant_prefix="<|assistant|>\n"
):
    def _strip_prefix(s, pattern):
        return re.sub(f"^{re.escape(pattern)}", "", s)

    if task in ["sft", "generation"]:
        messages = example["messages"]
        # Custom prompt template for Codellama
        system_message_content = ""
        user_message_content = messages[0]["content"]
        assistant_message_content = messages[1]["content"]
        example["text"] = f"""<s>[INST] <<SYS>>
{system_message_content}
<</SYS>>

{user_message_content}
[/INST] {assistant_message_content}</s>"""
    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = example["chosen"]
            rejected_messages = example["rejected"]
           # I add an empty system message if there is none
            if chosen_messages[0]["role"] != "system":
                chosen_messages.insert(0, {"role": "system", "content": ""})
            if rejected_messages[0]["role"] != "system":
                rejected_messages.insert(0, {"role": "system", "content": ""})
            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
        else:
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            system_message_content = ""
            user_message_content = example["chosen"][0]["content"]
            chosen_message_content = example["chosen"][1]["content"]
            rejected_message_content = example["rejected"][1]["content"]
            example["text_prompt"] = f"""<s>[INST] <<SYS>>
{system_message_content}
<</SYS>>

{user_message_content}
[/INST]"""
            example["text_chosen"] = f" {chosen_message_content}</s>"
            example["text_rejected"] = f" {rejected_message_content}</s>"
        else:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    else:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
        )
    return example


def get_datasets(
    data_config: dict,
    splits: List[str] = ["train", "test"],
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`DataArguments` or `dict`):
            Dataset configuration and split proportions.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
    """

    if type(data_config) is dict:
        # Structure of the input is:
        #     dataset_mixer = {
        #             "dataset1": 0.5,
        #             "dataset1": 0.3,
        #             "dataset1": 0.2,
        #         }
        dataset_mixer = data_config
    else:
        raise ValueError(f"Data config {data_config} not recognized.")

    raw_datasets = mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
    return raw_datasets


def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    fracs = []
    for ds, frac in dataset_mixer.items():
        fracs.append(frac)
        for split in splits:
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(f"Split type {split} not recognized as one of test or train.")

    if any(frac < 0 for frac in fracs):
        raise ValueError("Dataset fractions cannot be negative.")

    if len(raw_train_datasets) > 0:
        train_subsets = []
        for dataset, frac in zip(raw_train_datasets, fracs):
            train_subset = dataset.select(range(int(frac * len(dataset))))
            train_subsets.append(train_subset)
        if shuffle:
            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42)
        else:
            raw_datasets["train"] = concatenate_datasets(train_subsets)
    # No subsampling for test datasets to enable fair comparison across models
    if len(raw_val_datasets) > 0:
        if shuffle:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(seed=42)
        else:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)

    if len(raw_datasets) == 0:
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with split {split}. Check the dataset has been correctly formatted."
        )

    return raw_datasets

In [4]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, DatasetDict

dataset = load_dataset('t4gandhi/code_correction_using_LLM')
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df_parts = [df_train, df_test]

df_sft_train = []
df_sft_test = []
df_sft_parts = [df_sft_train, df_sft_test]

df_dpo_train = []
df_dpo_test = []
df_dpo_parts = [df_dpo_train, df_dpo_test]

PREFIXS = ['score_s1_', 'score_s2_', 'score_s3_', 'score_s4_', 'score_s5_', 'score_s6_']
ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
PAIRS = [('rd1', 'rd2'), ('rd1', 'rd3'), ('rd1', 'custom'), ('rd2', 'rd3'), ('rd2', 'custom'), ('rd3', 'custom')]

def indent_lines(string: str) -> str:
  indented_string = '\n'.join('    ' + line for line in string.splitlines())
  return indented_string

for df, df_sft, df_dpo in zip(df_parts, df_sft_parts, df_dpo_parts):
  for idx, row in df.iterrows():
      prompt = row['prompt']
      result = row['result']
      instruction = f"""<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>"""
      full_solution = "<buggy_code>\n" + (prompt + indent_lines(result)).strip('\n') + "\n</buggy_code>"
      full_instruction = instruction + "\n" + full_solution
      solutions_info = {}
      for ROUND in ROUNDS:
        solutions_info[ROUND] = {}
        total_score = 0
        for PREFIX in PREFIXS:
          score_col = PREFIX + ROUND
          score = int(row[score_col][0])
          total_score += score
        total_score /= 42
        analysis_col = 'analysis_' + ROUND
        solutions_info[ROUND]['analysis'] = row[analysis_col]
        solutions_info[ROUND]['score'] = total_score
      for ROUND1, ROUND2 in PAIRS:
        round1_score = solutions_info[ROUND1]['score']
        round2_score = solutions_info[ROUND2]['score']
        round1_analysis = solutions_info[ROUND1]['analysis']
        round2_analysis = solutions_info[ROUND2]['analysis']
        if round1_score == round2_score:
          continue
        messages_info = {}
        messages_info['messages'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score > round2_score else round2_analysis, 'role': 'assistant'}
        ]
        pairwise_info = {}
        pairwise_info['prompt'] = full_instruction
        pairwise_info['chosen'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score > round2_score else round2_analysis, 'role': 'assistant'}
        ]
        pairwise_info['rejected'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score < round2_score else round2_analysis, 'role': 'assistant'}
        ]
        # Not part of training data, only for analysis
        pairwise_info['metadata'] = {
            'chosen': ROUND1 if round1_score > round2_score else ROUND2,
            'rejected': ROUND1 if round1_score < round2_score else ROUND2,
        },
        df_sft.append(messages_info)
        df_dpo.append(pairwise_info)

df_sft_train = pd.DataFrame(df_sft_train)
df_sft_test = pd.DataFrame(df_sft_test)
dataset_sft_train = Dataset.from_pandas(df_sft_train)
dataset_sft_test = Dataset.from_pandas(df_sft_test)
datasets_sft = DatasetDict({
    'train': dataset_sft_train,
    'test': dataset_sft_test
})
df_dpo_train = pd.DataFrame(df_dpo_train)
df_dpo_test = pd.DataFrame(df_dpo_test)
dataset_dpo_train = Dataset.from_pandas(df_dpo_train)
dataset_dpo_test = Dataset.from_pandas(df_dpo_test)
datasets_dpo = DatasetDict({
    'train': dataset_dpo_train,
    'test': dataset_dpo_test
})

README.md:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/93.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6 [00:00<?, ? examples/s]

In [5]:
datasets_sft

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 165
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 31
    })
})

In [6]:
datasets_dpo

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 31
    })
})

In [7]:
column_names = list(datasets_sft['train'].features)

sft_datasets = datasets_sft.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "sft"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [8]:
sft_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 165
    })
    test: Dataset({
        features: ['text'],
        num_rows: 31
    })
})

In [9]:
print(sft_datasets['train'][0]['text'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

In [10]:
column_names = list(datasets_dpo['train'].features)

dpo_datasets = datasets_dpo.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "dpo"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

dpo_datasets = dpo_datasets.rename_columns(
    {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [11]:
dpo_datasets

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 31
    })
})

In [12]:
print("=" * 10 + "PROMPT" + "=" * 10)
print(dpo_datasets['train'][0]['prompt'])
print("=" * 10 + "CHOSEN" + "=" * 10)
print(dpo_datasets['train'][0]['chosen'])
print("=" * 10 + "REJECTED" + "=" * 10)
print(dpo_datasets['train'][0]['rejected'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

I now add LoRA adapters so it only need to update 1 to 10% of all parameters!


In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Train the SFT model

In [14]:
# Note: running eval is not necessary for this stage
import os
os.environ["WANDB_MODE"] = "disabled"
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

sft_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_datasets['train'],
    # eval_dataset = sft_datasets['test'], # Uncomment to run eval
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # evaluation_strategy = "steps", # Uncomment to run eval
        # eval_steps = 1, # Uncomment to run eval
    ),
)

Map (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

In [15]:
sft_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss
1,7.6361
2,7.4539
3,7.2241
4,7.0356
5,6.0949
6,5.4107
7,4.8714
8,3.9219
9,3.6175
10,3.0534


TrainOutput(global_step=20, training_loss=4.170306694507599, metrics={'train_runtime': 78.7608, 'train_samples_per_second': 2.095, 'train_steps_per_second': 0.254, 'total_flos': 590010091806720.0, 'train_loss': 4.170306694507599, 'epoch': 0.963855421686747})

<a name="Train"></a>
### Train the DPO model


In [16]:
# One must patch the DPO Trainer first!
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [17]:
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 5e-6,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
    ),
    beta = 0.1,
    train_dataset = dpo_datasets['train'],
    tokenizer = tokenizer,
    max_length = 1024,
    max_prompt_length = 512,
)

Extracting prompt from train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

In [20]:
from tqdm import tqdm
import torch

def run_eval(model, tokenizer, no_iter, get_stats=False):
    NUM_ITEMS = len(dpo_datasets['test'])
    num_chosen = 0
    ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
    stats = {}
    for ROUND in ROUNDS:
        stats[ROUND] = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for _ in range(no_iter):
        for i in tqdm(range(NUM_ITEMS)):
            input = {
                "chosen": datasets_dpo['test'][i]["chosen"],
                "rejected": datasets_dpo['test'][i]["rejected"]
            }
            chosen_round = datasets_dpo['test'][i]["metadata"][0]['chosen']
            rejected_round = datasets_dpo['test'][i]["metadata"][0]['rejected']

            # Apply the chat template to format the input
            formatted_input = apply_chat_template(input, tokenizer, task="dpo")

            # Tokenize the inputs
            inputs_chosen = tokenizer(formatted_input["text_chosen"], return_tensors="pt", padding=True, truncation=True).to(device)
            inputs_rejected = tokenizer(formatted_input["text_rejected"], return_tensors="pt", padding=True, truncation=True).to(device)

            # Generate the scalar reward values
            with torch.no_grad():
                reward_chosen = model(**inputs_chosen).logits.mean().item()
                reward_rejected = model(**inputs_rejected).logits.mean().item()
                # Model chose correctly
                if reward_chosen > reward_rejected:
                    num_chosen += 1
                    stats[chosen_round]['TP'] += 1
                    stats[rejected_round]['TN'] += 1
                # Model chose wrongly
                else:
                    stats[chosen_round]['FN'] += 1
                    stats[rejected_round]['FP'] += 1

    if get_stats:
        return num_chosen / (no_iter * NUM_ITEMS), stats

    return num_chosen / (no_iter * NUM_ITEMS)

In [21]:
best_iteration = 1
best_eval_result = 0

for i in range(1, 11):
    # Train the model
    training_result = dpo_trainer.train()
    eval_result = run_eval(model, tokenizer, 5)
    if eval_result >= best_eval_result:
        best_eval_result = eval_result
        best_iteration = i

    # Create a unique checkpoint directory for each iteration
    checkpoint_dir = f"checkpoint_iteration_{i}"
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Save model and trainer states for this iteration
    dpo_trainer.save_model(checkpoint_dir)  # Save model and tokenizer
    dpo_trainer.save_state()  # Save optimizer, scheduler, and other trainer states

    print(f"\nEPOCH NO.{i}")
    print(f"TRAINING RESULT: {training_result}")
    print(f"TEST ACCURACY: {eval_result * 100:.2f}\n")

print(f"BEST ITERATION: {best_iteration}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0671,10.647212,7.316964,1.0,3.330247,-186.515747,-192.890503,1.343633,1.700146
2,0.185,10.814113,8.013768,1.0,2.800345,-144.134796,-120.486221,0.621417,0.856799
3,0.1517,10.143847,6.471381,0.875,3.672466,-180.31076,-154.662048,1.190592,1.064364
4,0.319,9.516212,8.127453,0.875,1.38876,-141.257034,-134.223541,0.906536,0.710784
5,0.212,9.67337,7.146165,0.875,2.527206,-188.705933,-197.404907,0.718406,0.956824
6,0.218,10.115225,7.68667,1.0,2.428555,-161.22673,-141.769333,0.770501,0.818399
7,0.8463,9.073851,8.146137,0.75,0.927713,-141.707138,-98.826126,0.847755,1.129424
8,0.6377,9.054001,8.561838,0.625,0.492162,-158.365524,-137.943115,0.89113,0.56876
9,0.2966,8.950843,7.201534,0.875,1.749309,-152.943222,-148.263138,1.145162,1.012817
10,0.2628,10.426018,7.273087,0.875,3.152931,-131.863251,-133.803925,0.640123,1.029729


100%|██████████| 31/31 [00:05<00:00,  5.51it/s]
100%|██████████| 31/31 [00:06<00:00,  4.51it/s]
100%|██████████| 31/31 [00:04<00:00,  6.34it/s]
100%|██████████| 31/31 [00:05<00:00,  5.97it/s]
100%|██████████| 31/31 [00:05<00:00,  5.54it/s]



EPOCH NO.1
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.3590152386575937, metrics={'train_runtime': 130.5735, 'train_samples_per_second': 1.264, 'train_steps_per_second': 0.153, 'total_flos': 0.0, 'train_loss': 0.3590152386575937, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0474,10.820714,7.115741,1.0,3.704973,-188.528,-191.155487,1.276869,1.634181
2,0.1357,10.985119,7.926386,1.0,3.058733,-145.008621,-118.776154,0.54876,0.779534
3,0.1099,10.26929,6.342715,1.0,3.926574,-181.597412,-153.407639,1.120834,0.990806
4,0.2477,9.701645,8.062172,1.0,1.639472,-141.909836,-132.369232,0.826138,0.629037
5,0.1641,9.759726,6.982893,1.0,2.776832,-190.338623,-196.541367,0.637329,0.87938
6,0.1745,10.10342,7.509734,1.0,2.593686,-162.996094,-141.887375,0.687626,0.727944
7,0.7108,9.256845,7.952018,0.75,1.304828,-143.648346,-96.996185,0.758639,1.042869
8,0.5107,9.141556,8.360635,0.75,0.780922,-160.377579,-137.067551,0.807986,0.4843
9,0.2337,8.946214,6.983966,1.0,1.962248,-155.118912,-148.309433,1.050184,0.922159
10,0.2282,10.471022,7.114431,0.875,3.35659,-133.449814,-133.353897,0.539528,0.93521


100%|██████████| 31/31 [00:04<00:00,  6.23it/s]
100%|██████████| 31/31 [00:05<00:00,  5.55it/s]
100%|██████████| 31/31 [00:05<00:00,  5.94it/s]
100%|██████████| 31/31 [00:04<00:00,  6.24it/s]
100%|██████████| 31/31 [00:05<00:00,  5.20it/s]



EPOCH NO.2
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.28500619139522315, metrics={'train_runtime': 125.7307, 'train_samples_per_second': 1.312, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.28500619139522315, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0336,10.803255,6.750134,1.0,4.053121,-192.184052,-191.330063,1.182964,1.542206
2,0.0972,10.987989,7.663462,1.0,3.324528,-147.637863,-118.747452,0.441776,0.668681
3,0.0766,10.227049,6.050462,1.0,4.176586,-184.519958,-153.830048,1.023173,0.887049
4,0.1925,9.74325,7.851992,1.0,1.891257,-144.011642,-131.953171,0.715107,0.518063
5,0.1215,9.693949,6.655819,1.0,3.038131,-193.609375,-197.199097,0.528771,0.779201
6,0.1388,9.920364,7.152842,1.0,2.767522,-166.565018,-143.717941,0.575458,0.606389
7,0.5684,9.27627,7.582867,0.875,1.693403,-147.339844,-96.801933,0.642139,0.93007
8,0.3942,9.108158,7.999684,0.75,1.108474,-163.987076,-137.40155,0.696055,0.372725
9,0.1903,8.826203,6.630287,1.0,2.195917,-158.655701,-149.509537,0.92917,0.8049
10,0.2042,10.366396,6.827354,1.0,3.539042,-136.320587,-134.400146,0.408808,0.812755


100%|██████████| 31/31 [00:04<00:00,  6.32it/s]
100%|██████████| 31/31 [00:05<00:00,  6.06it/s]
100%|██████████| 31/31 [00:05<00:00,  5.49it/s]
100%|██████████| 31/31 [00:04<00:00,  6.35it/s]
100%|██████████| 31/31 [00:05<00:00,  5.29it/s]



EPOCH NO.3
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.22248501470312476, metrics={'train_runtime': 126.026, 'train_samples_per_second': 1.309, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.22248501470312476, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.023,10.732055,6.299365,1.0,4.43269,-196.691757,-192.042053,1.065585,1.42851
2,0.0676,10.939308,7.325073,1.0,3.614234,-151.021744,-119.234276,0.30931,0.530736
3,0.0525,10.118614,5.66793,1.0,4.450685,-188.345261,-154.914383,0.902852,0.759884
4,0.1489,9.742865,7.577848,1.0,2.165016,-146.753067,-131.957031,0.579623,0.384034
5,0.0888,9.573324,6.256782,1.0,3.316542,-197.599762,-198.40538,0.395756,0.657491
6,0.1093,9.699541,6.740169,1.0,2.959372,-170.691742,-145.926178,0.440245,0.459787
7,0.4137,9.285133,7.151331,0.875,2.133803,-151.655212,-96.713295,0.503351,0.79525
8,0.2979,9.053185,7.588803,0.875,1.464382,-168.095886,-137.951263,0.563389,0.240976
9,0.1519,8.700459,6.22995,1.0,2.470509,-162.659058,-150.766983,0.784546,0.666404
10,0.1854,10.25741,6.523236,1.0,3.734175,-139.361786,-135.48999,0.255816,0.667802


100%|██████████| 31/31 [00:05<00:00,  5.46it/s]
100%|██████████| 31/31 [00:04<00:00,  6.38it/s]
100%|██████████| 31/31 [00:05<00:00,  5.28it/s]
100%|██████████| 31/31 [00:04<00:00,  6.31it/s]
100%|██████████| 31/31 [00:05<00:00,  6.04it/s]



EPOCH NO.4
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.17020041374489664, metrics={'train_runtime': 126.3504, 'train_samples_per_second': 1.306, 'train_steps_per_second': 0.158, 'total_flos': 0.0, 'train_loss': 0.17020041374489664, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0152,10.657457,5.798935,1.0,4.858522,-201.696045,-192.78804,0.930989,1.296702
2,0.0459,10.899572,6.953424,1.0,3.946148,-154.738251,-119.631622,0.15789,0.371267
3,0.0359,9.971375,5.206401,1.0,4.764973,-192.960571,-156.38678,0.764487,0.613987
4,0.1137,9.699932,7.210563,1.0,2.489369,-150.425919,-132.386353,0.425891,0.232933
5,0.0628,9.378027,5.734009,1.0,3.644018,-202.827484,-200.358337,0.246475,0.522108
6,0.0858,9.402719,6.222194,1.0,3.180526,-175.87149,-148.894394,0.29053,0.297612
7,0.2641,9.240438,6.610084,0.875,2.630355,-157.067688,-97.16024,0.34876,0.647127
8,0.2155,8.943718,7.056419,1.0,1.887299,-173.419739,-139.045944,0.415507,0.097252
9,0.1227,8.476526,5.698459,1.0,2.778067,-167.973984,-153.006302,0.625983,0.513602
10,0.1702,10.051091,6.096991,1.0,3.954101,-143.624237,-137.553207,0.090101,0.508452


100%|██████████| 31/31 [00:06<00:00,  5.15it/s]
100%|██████████| 31/31 [00:04<00:00,  6.29it/s]
100%|██████████| 31/31 [00:05<00:00,  6.04it/s]
100%|██████████| 31/31 [00:05<00:00,  5.47it/s]
100%|██████████| 31/31 [00:04<00:00,  6.31it/s]



EPOCH NO.5
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.12776899202726782, metrics={'train_runtime': 125.6936, 'train_samples_per_second': 1.313, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.12776899202726782, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0096,10.549229,5.197787,1.0,5.351442,-207.70752,-193.870331,0.785618,1.153494
2,0.0303,10.828777,6.51212,1.0,4.316657,-159.151276,-120.339577,-0.003768,0.199774
3,0.0241,9.7631,4.634204,1.0,5.128896,-198.682526,-158.469543,0.615976,0.457877
4,0.0851,9.613874,6.739804,1.0,2.874071,-155.133514,-133.246918,0.265156,0.075714
5,0.043,9.114436,5.098321,1.0,4.016115,-209.184341,-202.994232,0.087731,0.378544
6,0.0673,9.053585,5.630744,1.0,3.422842,-181.785995,-152.385712,0.134345,0.130009
7,0.1522,9.18701,5.983899,1.0,3.203111,-163.329529,-97.694534,0.189735,0.494859
8,0.1527,8.787465,6.43136,1.0,2.356105,-179.670303,-140.608475,0.262707,-0.048694
9,0.0976,8.17664,5.036821,1.0,3.139819,-174.590363,-156.005173,0.462601,0.35633
10,0.155,9.799513,5.576979,1.0,4.222535,-148.824341,-140.06897,-0.079158,0.343521


100%|██████████| 31/31 [00:04<00:00,  6.23it/s]
100%|██████████| 31/31 [00:05<00:00,  5.55it/s]
100%|██████████| 31/31 [00:05<00:00,  5.74it/s]
100%|██████████| 31/31 [00:04<00:00,  6.32it/s]
100%|██████████| 31/31 [00:05<00:00,  5.35it/s]



EPOCH NO.6
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.09578826953656971, metrics={'train_runtime': 125.7085, 'train_samples_per_second': 1.313, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.09578826953656971, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0059,10.367008,4.461845,1.0,5.905163,-215.066956,-195.692535,0.638294,1.008463
2,0.0195,10.700155,5.968694,1.0,4.731461,-164.585541,-121.625801,-0.161438,0.028192
3,0.015,9.455223,3.877629,1.0,5.577593,-206.248276,-161.548294,0.469286,0.304004
4,0.0616,9.41911,6.074251,1.0,3.344858,-161.789032,-135.194565,0.111001,-0.073215
5,0.0284,8.715734,4.279675,1.0,4.43606,-217.370819,-206.981262,-0.064046,0.241254
6,0.0524,8.58382,4.878255,1.0,3.705566,-189.310898,-157.083374,-0.013113,-0.025988
7,0.0971,9.013442,5.212289,1.0,3.801153,-171.045624,-99.430214,0.041715,0.35305
8,0.1022,8.51838,5.609288,1.0,2.909092,-187.891037,-143.299316,0.120818,-0.17979
9,0.0774,7.690044,4.164638,1.0,3.525407,-183.31218,-160.871124,0.314212,0.2127
10,0.1361,9.433863,4.861784,1.0,4.572078,-155.976288,-143.725494,-0.230646,0.196453


100%|██████████| 31/31 [00:06<00:00,  5.15it/s]
100%|██████████| 31/31 [00:04<00:00,  6.29it/s]
100%|██████████| 31/31 [00:05<00:00,  5.36it/s]
100%|██████████| 31/31 [00:05<00:00,  6.04it/s]
100%|██████████| 31/31 [00:04<00:00,  6.32it/s]



EPOCH NO.7
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.07706272220239044, metrics={'train_runtime': 126.1279, 'train_samples_per_second': 1.308, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.07706272220239044, 'epoch': 0.963855421686747})
TEST ACCURACY: 74.19



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0039,10.058236,3.619229,1.0,6.439008,-223.493103,-198.780243,0.49891,0.87089
2,0.0115,10.454749,5.22669,1.0,5.228059,-172.005585,-124.079849,-0.304825,-0.131465
3,0.0089,9.011627,2.929186,1.0,6.08244,-215.732712,-165.984253,0.33102,0.159809
4,0.0421,9.092397,5.256149,1.0,3.836247,-169.970062,-138.4617,-0.032725,-0.210067
5,0.0184,8.240243,3.374918,1.0,4.865325,-226.418381,-211.736191,-0.209001,0.11013
6,0.0396,8.100874,4.071508,1.0,4.029366,-197.378357,-161.912842,-0.153969,-0.168702
7,0.0676,8.819473,4.406235,1.0,4.413239,-179.106171,-101.369904,-0.096849,0.218734
8,0.0713,8.257074,4.813473,1.0,3.443602,-195.849182,-145.912369,-0.015972,-0.302942
9,0.0608,7.244227,3.293938,1.0,3.95029,-192.01918,-165.329285,0.172212,0.073809
10,0.1141,9.128252,4.181944,1.0,4.946308,-162.774689,-146.781586,-0.374475,0.05395


100%|██████████| 31/31 [00:04<00:00,  6.24it/s]
100%|██████████| 31/31 [00:05<00:00,  5.25it/s]
100%|██████████| 31/31 [00:04<00:00,  6.36it/s]
100%|██████████| 31/31 [00:05<00:00,  5.57it/s]
100%|██████████| 31/31 [00:05<00:00,  5.81it/s]



EPOCH NO.8
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.059794660611078146, metrics={'train_runtime': 125.3999, 'train_samples_per_second': 1.316, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.059794660611078146, 'epoch': 0.963855421686747})
TEST ACCURACY: 77.42



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.003,9.687548,2.783537,1.0,6.90401,-231.850037,-202.487137,0.36806,0.741127
2,0.007,10.225336,4.520237,1.0,5.705097,-179.070099,-126.373985,-0.434035,-0.278237
3,0.0051,8.551774,1.922704,1.0,6.62907,-225.797531,-170.582794,0.199352,0.024584
4,0.0282,8.643538,4.347601,1.0,4.295937,-179.055542,-142.950287,-0.162933,-0.331707
5,0.012,7.593497,2.299514,1.0,5.293983,-237.172424,-218.203629,-0.342574,-0.01107
6,0.0294,7.40999,3.041977,1.0,4.368014,-207.673676,-168.821686,-0.283164,-0.293964
7,0.0457,8.403287,3.353219,1.0,5.050068,-189.636322,-105.531769,-0.220972,0.098788
8,0.0491,7.795828,3.788304,1.0,4.007524,-206.100876,-150.524841,-0.14351,-0.413997
9,0.044,6.606631,2.198632,1.0,4.407999,-202.972244,-171.705261,0.044117,-0.054198
10,0.0997,8.617833,3.285064,1.0,5.33277,-171.7435,-151.885773,-0.505845,-0.074435


100%|██████████| 31/31 [00:05<00:00,  6.12it/s]
100%|██████████| 31/31 [00:04<00:00,  6.35it/s]
100%|██████████| 31/31 [00:06<00:00,  5.07it/s]
100%|██████████| 31/31 [00:04<00:00,  6.35it/s]
100%|██████████| 31/31 [00:05<00:00,  5.68it/s]



EPOCH NO.9
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.04615310636581853, metrics={'train_runtime': 126.3139, 'train_samples_per_second': 1.306, 'train_steps_per_second': 0.158, 'total_flos': 0.0, 'train_loss': 0.04615310636581853, 'epoch': 0.963855421686747})
TEST ACCURACY: 77.42



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 45,088,768


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0023,9.292707,1.887644,1.0,7.405064,-240.80896,-206.435532,0.239655,0.614304
2,0.0041,10.04154,3.798191,1.0,6.24335,-186.290573,-128.211945,-0.55147,-0.414939
3,0.0031,8.053143,0.908436,1.0,7.144706,-235.940216,-175.569107,0.07599,-0.100585
4,0.0186,8.225438,3.42878,1.0,4.796658,-188.243744,-147.131287,-0.282056,-0.439884
5,0.0075,6.940662,1.182309,1.0,5.758353,-248.344482,-224.731979,-0.464447,-0.122771
6,0.0211,6.765047,2.039482,1.0,4.725565,-217.698624,-175.271118,-0.400948,-0.402563
7,0.0315,7.971121,2.297576,1.0,5.673544,-200.192749,-109.853424,-0.33154,-0.010911
8,0.0317,7.249322,2.63891,1.0,4.610412,-217.594818,-155.989899,-0.262182,-0.514459
9,0.0305,5.820603,0.904284,1.0,4.91632,-215.915726,-179.565536,-0.072719,-0.173988
10,0.0909,7.888702,2.153546,1.0,5.735156,-183.058685,-159.177094,-0.625838,-0.195111


100%|██████████| 31/31 [00:04<00:00,  6.28it/s]
100%|██████████| 31/31 [00:06<00:00,  5.01it/s]
100%|██████████| 31/31 [00:04<00:00,  6.35it/s]
100%|██████████| 31/31 [00:05<00:00,  6.04it/s]
100%|██████████| 31/31 [00:05<00:00,  5.35it/s]



EPOCH NO.10
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.03660807132255286, metrics={'train_runtime': 125.4059, 'train_samples_per_second': 1.316, 'train_steps_per_second': 0.159, 'total_flos': 0.0, 'train_loss': 0.03660807132255286, 'epoch': 0.963855421686747})
TEST ACCURACY: 80.65

BEST ITERATION: 10


In [22]:
# Use to clear as much GPU RAM as possible
import gc
import torch

del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

In [23]:
# Make sure to have enough GPU RAM before running this
from unsloth import FastLanguageModel
from datasets import load_from_disk

best_checkpoint_dir = f"checkpoint_iteration_{best_iteration}"

model, tokenizer = FastLanguageModel.from_pretrained(best_checkpoint_dir)

eval_result = run_eval(model, tokenizer, 5)
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


100%|██████████| 31/31 [00:10<00:00,  2.92it/s]
100%|██████████| 31/31 [00:05<00:00,  5.66it/s]
100%|██████████| 31/31 [00:05<00:00,  5.31it/s]
100%|██████████| 31/31 [00:04<00:00,  6.51it/s]
100%|██████████| 31/31 [00:05<00:00,  5.56it/s]


TEST ACCURACY: 80.65






In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import login
login(token="hf_xxx-xxx")

print("Successfully logged in to Hugging Face!")

In [None]:
model.save_pretrained("model", tokenizer, save_method="default")
model.push_to_hub("t4gandhi/Llama-3.2-1B-bnb-4bit-fine-tuned", tokenizer, save_method="default")

In [24]:
# Use to clear as much GPU RAM as possible
import gc
import torch

del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

In [25]:
from unsloth import FastLanguageModel

model_name = "t4gandhi/Llama-3.2-1B-bnb-4bit-fine-tuned"
model, tokenizer = FastLanguageModel.from_pretrained(model_name)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

In [26]:
def print_confusion_matrices(confusion_dict):
    for key, values in confusion_dict.items():
        # Extract confusion matrix values
        TP = values['TP']
        TN = values['TN']
        FP = values['FP']
        FN = values['FN']

        # Calculate total instances
        total = TP + TN + FP + FN

        # Calculate percentages
        tp_percent = TP / total
        tn_percent = TN / total
        fp_percent = FP / total
        fn_percent = FN / total

        # Calculate precision, recall, and F1 score
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Print the confusion matrix with percentages and F1 score
        print(f"Confusion Matrix for {key}:")
        print("-------------------------------------------------------")
        print(f"                Predicted Positive   Predicted Negative")
        print(f"Actual Positive           {tp_percent:>8.2f}             {fn_percent:>8.2f}")
        print(f"Actual Negative           {fp_percent:>8.2f}             {tn_percent:>8.2f}")
        print("-------------------------------------------------------")
        print(f"Combined                  {tp_percent + fp_percent:>8.2f}             {tn_percent + fn_percent:>8.2f}")
        print("-------------------------------------------------------")
        print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1_score:.2f}\n")

In [27]:
import pandas as pd

eval_result, stats = run_eval(model, tokenizer, 5, get_stats = True)
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")
print_confusion_matrices(stats)

100%|██████████| 31/31 [00:04<00:00,  6.39it/s]
100%|██████████| 31/31 [00:06<00:00,  5.06it/s]
100%|██████████| 31/31 [00:04<00:00,  6.45it/s]
100%|██████████| 31/31 [00:05<00:00,  6.17it/s]
100%|██████████| 31/31 [00:05<00:00,  5.36it/s]


TEST ACCURACY: 80.65

Confusion Matrix for rd1:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
Actual Positive               0.33                 0.00
Actual Negative               0.20                 0.47
-------------------------------------------------------
Combined                      0.53                 0.47
-------------------------------------------------------
Precision: 0.62, Recall: 1.00, F1 Score: 0.77

Confusion Matrix for rd2:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
Actual Positive               0.43                 0.14
Actual Negative               0.07                 0.36
-------------------------------------------------------
Combined                      0.50                 0.50
-------------------------------------------------------
Precision: 0.86, Recall: 0.75, F1 Score: 0.80

Confusion Matrix for rd3:
---------------------




In [28]:
def preliminary_stats(dataset):
  NUM_ITEMS = len(dpo_datasets[dataset])
  ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
  reward_model_chosen = dict.fromkeys(ROUNDS, 0)
  reward_model_rejected = dict.fromkeys(ROUNDS, 0)
  reward_model_ratio = dict.fromkeys(ROUNDS, 0)
  for i in range(NUM_ITEMS):
    chosen_round = datasets_dpo[dataset][i]["metadata"][0]['chosen']
    rejected_round = datasets_dpo[dataset][i]["metadata"][0]['rejected']

    reward_model_chosen[chosen_round] += 1
    reward_model_rejected[rejected_round] += 1

  for ROUND in ROUNDS:
    reward_model_ratio[ROUND] = reward_model_chosen[ROUND] / (reward_model_chosen[ROUND] + reward_model_rejected[ROUND])

  return reward_model_ratio

In [29]:
import pandas as pd

prelim_ratio_train = preliminary_stats('train')
df_prelim_ratio_train = pd.DataFrame(list(prelim_ratio_train.items()), columns=["Round", "Chosen"])
df_prelim_ratio_train

Unnamed: 0,Round,Chosen
0,rd1,0.367089
1,rd2,0.407407
2,rd3,0.3875
3,custom,0.8


In [30]:
import pandas as pd

prelim_ratio_test = preliminary_stats('test')
df_prelim_ratio_test = pd.DataFrame(list(prelim_ratio_test.items()), columns=["Round", "Chosen"])
df_prelim_ratio_test

Unnamed: 0,Round,Chosen
0,rd1,0.333333
1,rd2,0.571429
2,rd3,0.266667
3,custom,0.777778
