### DPO Fine-Tuning - Mistral 7B bnb 4bit

> **Model Info**

- Model Name: Mistral 7B v0.3, bitsandbytes 4-bit quantized (`unsloth/mistral-7b-v0.3-bnb-4bit`)
- Accuracy: 80.65%

> **Training Info**

- GPU Type: A100
- Time: ~150 mins
- GPU RAM: 10.4 GB

In [None]:
%%capture
# Install Unsloth straight from its GitHub repository (with the "colab-new" extra).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Pick the xformers requirement from the installed torch version: pin 0.0.27 when
# torch is older than 2.4.0, otherwise leave it unpinned.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# --no-deps installs the listed packages without pulling in their own dependencies.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch
# Load the 4-bit quantized Mistral 7B v0.3 checkpoint and its tokenizer.
# max_seq_length is reused further down when the SFT trainer is configured.
max_seq_length = 4096
# NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype
# automatically — confirm against the Unsloth documentation.
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.9: Fast Mistral patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
import os
import re
from typing import List, Literal, Optional

from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError


# Jinja chat template that renders each message as `<|role|>\n` + content + eos_token
# and, when `add_generation_prompt` is set, appends `<|assistant|>` after the last message.
# It is not referenced by the code visible in this notebook section:
# `apply_chat_template` below builds the "sft" and "dpo" prompts by hand.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"


def apply_chat_template(
    example, tokenizer, task: Literal["sft", "generation", "rm", "dpo"] = "sft", assistant_prefix="<|assistant|>\n"
):
    """Add task-specific text fields to one dataset row and return the row.

    The row is modified in place and the same object is returned.

    * ``"sft"`` / ``"generation"``: reads ``example["messages"]`` (user turn at
      index 0, assistant turn at index 1) and writes ``example["text"]`` using
      a hand-written ``[INST] <<SYS>> ... <</SYS>>`` prompt with an empty
      system message.
    * ``"rm"``: reads ``example["chosen"]`` / ``example["rejected"]``, inserts
      an empty system message at the front of each conversation that does not
      already start with one (the lists themselves are mutated), and writes
      ``text_chosen`` / ``text_rejected`` via ``tokenizer.apply_chat_template``.
    * ``"dpo"``: reads the user turn from ``example["chosen"][0]`` and the two
      answers from index 1 of ``chosen`` / ``rejected``; writes
      ``text_prompt``, ``text_chosen`` and ``text_rejected``.

    Args:
        example: Mapping-like dataset row.
        tokenizer: Only used for the ``"rm"`` task.
        task: One of ``"sft"``, ``"generation"``, ``"rm"``, ``"dpo"``.
        assistant_prefix: Accepted for interface compatibility; not used.

    Raises:
        ValueError: If ``task`` is not supported, or if an ``"rm"`` / ``"dpo"``
            row lacks the ``chosen`` or ``rejected`` key.
    """
    if task not in ["sft", "generation", "rm", "dpo"]:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
        )

    if task in ("sft", "generation"):
        turns = example["messages"]
        # Hand-written instruction prompt; the system slot is left empty.
        system_text = ""
        user_text = turns[0]["content"]
        assistant_text = turns[1]["content"]
        example["text"] = f"<s>[INST] <<SYS>>\n{system_text}\n<</SYS>>\n\n{user_text}\n[/INST] {assistant_text}</s>"
        return example

    if task == "rm":
        if "chosen" not in example.keys() or "rejected" not in example.keys():
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        conversations = (example["chosen"], example["rejected"])
        # Make sure both conversations start with a (possibly empty) system turn.
        for conversation in conversations:
            if conversation[0]["role"] != "system":
                conversation.insert(0, {"role": "system", "content": ""})
        example["text_chosen"] = tokenizer.apply_chat_template(conversations[0], tokenize=False)
        example["text_rejected"] = tokenizer.apply_chat_template(conversations[1], tokenize=False)
        return example

    # Remaining case: task == "dpo".
    if "chosen" not in example.keys() or "rejected" not in example.keys():
        raise ValueError(
            f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
        )
    system_text = ""
    user_text = example["chosen"][0]["content"]
    preferred_text = example["chosen"][1]["content"]
    dispreferred_text = example["rejected"][1]["content"]
    example["text_prompt"] = f"<s>[INST] <<SYS>>\n{system_text}\n<</SYS>>\n\n{user_text}\n[/INST]"
    example["text_chosen"] = f" {preferred_text}</s>"
    example["text_rejected"] = f" {dispreferred_text}</s>"
    return example


def get_datasets(
    data_config: dict,
    splits: Optional[List[str]] = None,
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`dict`):
            Mapping from dataset name to the fraction of its training split to keep, e.g.
            `{"dataset1": 0.5, "dataset2": 0.3, "dataset3": 0.2}`. Any other type is rejected.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.

    Raises:
        ValueError: If `data_config` is not a dict.
    """
    # isinstance (rather than an exact type check) also accepts dict subclasses.
    if not isinstance(data_config, dict):
        raise ValueError(f"Data config {data_config} not recognized.")

    # Build the default per call instead of sharing one mutable list across calls.
    if splits is None:
        splits = ["train", "test"]

    return mix_datasets(data_config, splits=splits, shuffle=shuffle)


def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix; `None` means `['train', 'test']`. Assumes the splits exist in all
            datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns:
        [`DatasetDict`]: Contains a `"train"` entry and/or a `"test"` entry, depending on which splits were loaded.

    Raises:
        ValueError: If a fraction is negative, a split name contains neither "train" nor "test",
            or nothing could be loaded.
    """
    if splits is None:
        # The original default of None made the split loop below raise a TypeError.
        splits = ["train", "test"]

    # Validate before any (potentially slow) loading happens.
    if any(frac < 0 for frac in dataset_mixer.values()):
        raise ValueError("Dataset fractions cannot be negative.")

    raw_datasets = DatasetDict()
    # Keep every train split paired with its own fraction, so the pairing stays correct
    # even when one dataset contributes several train splits.
    train_sources = []
    raw_val_datasets = []
    for ds, frac in dataset_mixer.items():
        for split in splits:
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                train_sources.append((dataset, frac))
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(f"Split type {split} not recognized as one of test or train.")

    if train_sources:
        # Keep the first `frac` share of each training set.
        train_subsets = [dataset.select(range(int(frac * len(dataset)))) for dataset, frac in train_sources]
        mixed_train = concatenate_datasets(train_subsets)
        raw_datasets["train"] = mixed_train.shuffle(seed=42) if shuffle else mixed_train

    # No subsampling for test datasets to enable fair comparison across models
    if raw_val_datasets:
        mixed_test = concatenate_datasets(raw_val_datasets)
        raw_datasets["test"] = mixed_test.shuffle(seed=42) if shuffle else mixed_test

    if len(raw_datasets) == 0:
        # Report `splits` rather than the loop variable, which is unbound when nothing was iterated.
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with split {splits}. Check the dataset has been correctly formatted."
        )

    return raw_datasets

In [None]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the source dataset from the Hub and convert its train/test splits to DataFrames.
dataset = load_dataset('t4gandhi/code_correction_using_LLM')
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df_parts = [df_train, df_test]

# Accumulators for the SFT rows (one list per split); the pair-building loop appends to
# them and they are turned into DataFrames afterwards.
df_sft_train = []
df_sft_test = []
df_sft_parts = [df_sft_train, df_sft_test]

# Accumulators for the DPO preference pairs, filled in lockstep with the SFT lists.
df_dpo_train = []
df_dpo_test = []
df_dpo_parts = [df_dpo_train, df_dpo_test]

# Score columns are named PREFIX + ROUND (e.g. 'score_s1_rd1'); analysis columns are
# named 'analysis_' + ROUND.
PREFIXS = ['score_s1_', 'score_s2_', 'score_s3_', 'score_s4_', 'score_s5_', 'score_s6_']
ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
# Every unordered pair of rounds; each pair can yield one chosen/rejected example.
PAIRS = [('rd1', 'rd2'), ('rd1', 'rd3'), ('rd1', 'custom'), ('rd2', 'rd3'), ('rd2', 'custom'), ('rd3', 'custom')]

def indent_lines(string: str) -> str:
  """Return ``string`` with four spaces prepended to every line.

  Lines are split with ``str.splitlines`` and re-joined with ``"\\n"``, so a
  trailing line break is dropped and an empty input yields an empty string.
  """
  padded = []
  for raw_line in string.splitlines():
    padded.append('    ' + raw_line)
  return '\n'.join(padded)

# Build the SFT and DPO rows for each split. Every source row yields up to six
# examples: one for each pair of rounds whose total scores differ.
for df, df_sft, df_dpo in zip(df_parts, df_sft_parts, df_dpo_parts):
  for idx, row in df.iterrows():
      prompt = row['prompt']
      result = row['result']
      # Fixed instruction text (the f-string contains no placeholders).
      instruction = f"""<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>"""
      # The buggy program is `prompt` followed by `result` indented by four spaces.
      # NOTE(review): presumably `prompt` is the function header and `result` its body — confirm.
      full_solution = "<buggy_code>\n" + (prompt + indent_lines(result)).strip('\n') + "\n</buggy_code>"
      full_instruction = instruction + "\n" + full_solution
      # Per round: the analysis text and a normalised total score.
      solutions_info = {}
      for ROUND in ROUNDS:
        solutions_info[ROUND] = {}
        total_score = 0
        for PREFIX in PREFIXS:
          score_col = PREFIX + ROUND
          # Only element [0] of the cell is converted.
          # NOTE(review): presumably the cell is a string whose first character is the score digit — confirm.
          score = int(row[score_col][0])
          total_score += score
        # NOTE(review): 42 looks like the maximum attainable total (6 score columns x 7?) — confirm.
        total_score /= 42
        analysis_col = 'analysis_' + ROUND
        solutions_info[ROUND]['analysis'] = row[analysis_col]
        solutions_info[ROUND]['score'] = total_score
      for ROUND1, ROUND2 in PAIRS:
        round1_score = solutions_info[ROUND1]['score']
        round2_score = solutions_info[ROUND2]['score']
        round1_analysis = solutions_info[ROUND1]['analysis']
        round2_analysis = solutions_info[ROUND2]['analysis']
        # Ties carry no preference signal, so they are skipped.
        if round1_score == round2_score:
          continue
        # SFT example: the instruction plus the higher-scoring analysis.
        messages_info = {}
        messages_info['messages'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score > round2_score else round2_analysis, 'role': 'assistant'}
        ]
        # DPO example: same prompt, higher-scoring analysis as "chosen", lower-scoring as "rejected".
        pairwise_info = {}
        pairwise_info['prompt'] = full_instruction
        pairwise_info['chosen'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score > round2_score else round2_analysis, 'role': 'assistant'}
        ]
        pairwise_info['rejected'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score < round2_score else round2_analysis, 'role': 'assistant'}
        ]
        # Not part of training data, only for analysis
        # NOTE(review): the trailing comma after the closing brace makes this value a
        # 1-tuple holding the dict, not the dict itself. `run_eval` reads it back with
        # `["metadata"][0]`, so removing the comma requires changing that access as well.
        pairwise_info['metadata'] = {
            'chosen': ROUND1 if round1_score > round2_score else ROUND2,
            'rejected': ROUND1 if round1_score < round2_score else ROUND2,
        },
        df_sft.append(messages_info)
        df_dpo.append(pairwise_info)

# Turn the accumulated row lists into DataFrames (rebinding the list names), then
# into `datasets.Dataset` objects grouped per task in a DatasetDict.
df_sft_train = pd.DataFrame(df_sft_train)
df_sft_test = pd.DataFrame(df_sft_test)
dataset_sft_train = Dataset.from_pandas(df_sft_train)
dataset_sft_test = Dataset.from_pandas(df_sft_test)
# SFT data: a single 'messages' column.
datasets_sft = DatasetDict({
    'train': dataset_sft_train,
    'test': dataset_sft_test
})
df_dpo_train = pd.DataFrame(df_dpo_train)
df_dpo_test = pd.DataFrame(df_dpo_test)
dataset_dpo_train = Dataset.from_pandas(df_dpo_train)
dataset_dpo_test = Dataset.from_pandas(df_dpo_test)
# DPO data: 'prompt', 'chosen', 'rejected' and the analysis-only 'metadata' column.
datasets_dpo = DatasetDict({
    'train': dataset_dpo_train,
    'test': dataset_dpo_test
})

README.md:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/93.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# Show the column names and row counts of the raw SFT splits.
datasets_sft

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 165
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 31
    })
})

In [None]:
# Show the column names and row counts of the raw DPO splits.
datasets_dpo

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 31
    })
})

In [None]:
# Original columns are dropped after formatting so that only the new 'text' column remains.
column_names = list(datasets_sft['train'].features)

# Apply the "sft" prompt format to every row of both splits.
# NOTE(review): num_proc = 12 is hard-coded although the splits hold only 165 and 31
# rows — consider lowering it on machines with fewer cores.
sft_datasets = datasets_sft.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "sft"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [None]:
# Show the formatted SFT splits (a single 'text' column each).
sft_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 165
    })
    test: Dataset({
        features: ['text'],
        num_rows: 31
    })
})

In [None]:
# Print the first formatted SFT training example as a sanity check of the prompt layout.
print(sft_datasets['train'][0]['text'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

In [None]:
# Original columns (including 'metadata') are dropped after formatting.
column_names = list(datasets_dpo['train'].features)

# Apply the "dpo" format, which produces 'text_prompt', 'text_chosen' and 'text_rejected'.
dpo_datasets = datasets_dpo.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "dpo"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

# Rename the generated columns to 'prompt', 'chosen' and 'rejected', the names the
# DPO trainer cell below works with.
dpo_datasets = dpo_datasets.rename_columns(
    {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [None]:
# Show the formatted DPO splits ('prompt', 'chosen', 'rejected').
dpo_datasets

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 31
    })
})

In [None]:
# Print the prompt, chosen answer and rejected answer of the first DPO training pair,
# each under its own banner line.
for banner, column in (("PROMPT", 'prompt'), ("CHOSEN", 'chosen'), ("REJECTED", 'rejected')):
    print("=" * 10 + banner + "=" * 10)
    print(dpo_datasets['train'][0][column])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

I now add LoRA adapters so that only 1 to 10% of all parameters need to be updated!


In [None]:
# Wrap the quantized base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). The same adapter-equipped
# `model` is trained first with SFT and then with DPO further down.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.12.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Train the SFT model

In [None]:
# Note: running eval is not necessary for this stage
import os
os.environ["WANDB_MODE"] = "disabled"
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning stage: train the LoRA adapters on the 'text' column of the
# formatted SFT training split. Evaluation is disabled; the commented-out arguments
# re-enable it.
sft_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_datasets['train'],
    # eval_dataset = sft_datasets['test'], # Uncomment to run eval
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # Effective batch size = 2 per device x 4 accumulation steps = 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # NOTE(review): the DPO stage below uses the same output_dir ("outputs").
        output_dir = "outputs",
        # evaluation_strategy = "steps", # Uncomment to run eval
        # eval_steps = 1, # Uncomment to run eval
    ),
)

Map (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

In [None]:
# Run the SFT stage (one epoch, as configured above); the TrainOutput is displayed as the cell result.
sft_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss
1,1.4128
2,1.3673
3,1.2817
4,1.0974
5,0.8295
6,0.6713
7,0.6137
8,0.5931
9,0.5391
10,0.4086


TrainOutput(global_step=20, training_loss=0.5995430879294872, metrics={'train_runtime': 58.0628, 'train_samples_per_second': 2.842, 'train_steps_per_second': 0.344, 'total_flos': 4925463232561152.0, 'train_loss': 0.5995430879294872, 'epoch': 0.963855421686747})

<a name="Train"></a>
### Train the DPO model


In [None]:
# Unsloth's patch has to be applied before the DPOTrainer is constructed in the next cell.
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [None]:
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

# DPO stage: continue training the same adapter-equipped `model` that was just
# fine-tuned with SFT, now on the chosen/rejected preference pairs.
dpo_trainer = DPOTrainer(
    model = model,
    # NOTE(review): with ref_model=None and a PEFT model, TRL presumably uses the model
    # with adapters disabled as the reference policy — confirm against the TRL docs.
    ref_model = None,
    args = DPOConfig(
        # Effective batch size = 2 per device x 4 accumulation steps = 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        # One epoch per `train()` call; the loop further down calls `train()` repeatedly.
        num_train_epochs = 1,
        learning_rate = 5e-6,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
    ),
    beta = 0.1,
    train_dataset = dpo_datasets['train'],
    tokenizer = tokenizer,
    # NOTE(review): each prompt holds the full instruction block plus the buggy code;
    # check that max_prompt_length = 512 / max_length = 1024 do not truncate most examples.
    max_length = 1024,
    max_prompt_length = 512,
)

Extracting prompt from train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

In [None]:
from tqdm import tqdm
import torch

def _response_logprob(model, tokenizer, prompt_text, response_text, device):
    """Return the summed log-probability of the response tokens given the prompt."""
    # The prompt template already spells out <s> / </s>, so no extra special tokens are added.
    prompt_len = len(tokenizer(prompt_text, add_special_tokens=False)["input_ids"])
    encoded = tokenizer(
        prompt_text + response_text,
        return_tensors="pt",
        truncation=True,
        add_special_tokens=False,
    ).to(device)
    logits = model(**encoded).logits
    # The logits at position t score the token at position t + 1.
    log_probs = torch.log_softmax(logits[:, :-1, :].float(), dim=-1)
    targets = encoded["input_ids"][:, 1:]
    token_logps = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    # Target index j corresponds to input position j + 1, so the response tokens start
    # at index prompt_len - 1. Tokenisation at the prompt/response boundary is
    # approximate, but it is identical for the chosen and the rejected answer.
    return token_logps[:, max(prompt_len - 1, 0):].sum().item()


def run_eval(model, tokenizer, no_iter, get_stats=False):
    """Measure how often the model prefers the chosen answer on the DPO test split.

    Each pair in ``datasets_dpo['test']`` is formatted with
    ``apply_chat_template(..., task="dpo")``. Both answers are scored by the
    summed log-probability the model assigns to their tokens *given the prompt*,
    and a pair counts as correct when the chosen answer scores strictly higher.

    The previous implementation compared ``logits.mean()`` of the answer text
    alone. That averages over every position and the whole vocabulary and
    ignores the prompt, so it did not measure a preference at all.

    NOTE(review): this is the plain policy log-likelihood, not the
    reference-normalised DPO reward; subtract the reference model's
    log-probability if the exact DPO reward accuracy is required.

    Args:
        model: Causal language model whose forward pass returns ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        no_iter: Number of passes over the test split.
        get_stats: If true, also return the per-round counts.

    Returns:
        The fraction of correctly ordered pairs (``0.0`` when nothing was
        evaluated), or ``(fraction, stats)`` when ``get_stats`` is true.
        ``stats[round]`` counts TP/FN for pairs where that round's answer was
        the chosen one and TN/FP for pairs where it was the rejected one.
    """
    test_rows = datasets_dpo['test']
    num_items = len(test_rows)
    num_chosen = 0
    ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
    stats = {ROUND: {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0} for ROUND in ROUNDS}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    with torch.no_grad():
        for _ in range(no_iter):
            for i in tqdm(range(num_items)):
                # Decode the row once instead of once per field.
                row = test_rows[i]
                pair = {"chosen": row["chosen"], "rejected": row["rejected"]}
                # The notebook stores metadata as a one-element sequence wrapping the dict;
                # a plain dict is accepted too.
                metadata = row["metadata"]
                if isinstance(metadata, (list, tuple)):
                    metadata = metadata[0]
                chosen_round = metadata['chosen']
                rejected_round = metadata['rejected']

                # Apply the chat template to format the input
                formatted = apply_chat_template(pair, tokenizer, task="dpo")
                prompt_text = formatted["text_prompt"]

                score_chosen = _response_logprob(model, tokenizer, prompt_text, formatted["text_chosen"], device)
                score_rejected = _response_logprob(model, tokenizer, prompt_text, formatted["text_rejected"], device)

                if score_chosen > score_rejected:
                    # Model chose correctly
                    num_chosen += 1
                    stats[chosen_round]['TP'] += 1
                    stats[rejected_round]['TN'] += 1
                else:
                    # Model chose wrongly
                    stats[chosen_round]['FN'] += 1
                    stats[rejected_round]['FP'] += 1

    total = no_iter * num_items
    # Avoid a ZeroDivisionError when there was nothing to evaluate.
    accuracy = num_chosen / total if total else 0.0

    if get_stats:
        return accuracy, stats

    return accuracy

In [None]:
# Run ten rounds of DPO training, evaluating on the test split and saving a
# checkpoint after each round, and remember the round with the best test accuracy
# (the latest round wins on ties).
best_iteration = 1
best_eval_result = 0

for i in range(1, 11):
    # Train the model: one `train()` call is one epoch (num_train_epochs = 1 in the DPOConfig).
    # NOTE(review): every call starts a new training run on the same trainer, so warmup
    # and the learning-rate schedule presumably restart each round — confirm this is intended.
    training_result = dpo_trainer.train()
    eval_result = run_eval(model, tokenizer, 5)
    if eval_result >= best_eval_result:
        best_eval_result = eval_result
        best_iteration = i

    # Create a unique checkpoint directory for each iteration
    checkpoint_dir = f"checkpoint_iteration_{i}"
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Save the model for this iteration.
    dpo_trainer.save_model(checkpoint_dir)
    # `save_state()` writes trainer_state.json to the trainer's output_dir, where every
    # iteration overwrites it. It is kept for backward compatibility.
    dpo_trainer.save_state()
    # Also keep a per-iteration copy of the trainer state next to the saved model.
    dpo_trainer.state.save_to_json(os.path.join(checkpoint_dir, "trainer_state.json"))

    print(f"\nEPOCH NO.{i}")
    print(f"TRAINING RESULT: {training_result}")
    print(f"TEST ACCURACY: {eval_result * 100:.2f}\n")

print(f"BEST ITERATION: {best_iteration}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.2339,10.674225,6.841915,0.875,3.83231,-122.016785,-123.151459,-3.267134,-3.237344
2,0.4765,10.388383,7.463733,0.75,2.92465,-83.168228,-55.942459,-3.164254,-3.10305
3,0.1893,11.243833,6.302275,0.875,4.941558,-116.461334,-76.104591,-3.172111,-3.212172
4,0.5976,9.359149,7.66822,0.75,1.69093,-86.959908,-74.095779,-3.232706,-3.185165
5,0.2127,12.425916,7.648688,0.75,4.777227,-115.863922,-100.884056,-3.147595,-3.088173
6,0.6786,10.862921,7.882932,0.75,2.979989,-94.325569,-69.969925,-3.178611,-3.215041
7,1.0113,8.507143,7.887565,0.5,0.619578,-81.483368,-51.086891,-3.201245,-3.13039
8,1.1413,8.955988,8.668083,0.5,0.287905,-93.86705,-76.426956,-3.252313,-3.220647
9,0.2724,10.124434,5.991192,0.875,4.133242,-105.221031,-71.136162,-3.142179,-3.146947
10,0.3101,11.098423,7.516614,0.75,3.581809,-68.497406,-63.896313,-3.270875,-3.231691


100%|██████████| 31/31 [00:07<00:00,  4.23it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]
100%|██████████| 31/31 [00:07<00:00,  4.39it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]



EPOCH NO.1
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.45976528264582156, metrics={'train_runtime': 101.1885, 'train_samples_per_second': 1.631, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.45976528264582156, 'epoch': 0.963855421686747})
TEST ACCURACY: 48.39



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0658,12.146437,7.342608,1.0,4.803829,-117.00985,-108.429337,-3.295442,-3.248619
2,0.104,10.670683,6.19975,1.0,4.470933,-95.80806,-53.119461,-3.171367,-3.105604
3,0.1154,11.806856,6.393316,1.0,5.413541,-115.550919,-70.47435,-3.194103,-3.232726
4,0.188,9.479721,6.843159,0.875,2.636563,-95.210518,-72.89006,-3.249734,-3.200915
5,0.1031,12.464388,7.565874,1.0,4.898514,-116.692062,-100.499344,-3.163223,-3.100747
6,0.3121,10.94087,7.229918,0.875,3.710953,-100.855713,-69.19043,-3.202845,-3.236659
7,0.7469,8.284565,7.314622,0.625,0.969942,-87.212791,-53.312675,-3.225554,-3.149788
8,0.8378,9.077635,8.224457,0.625,0.853178,-98.303314,-75.210487,-3.281613,-3.241217
9,0.1453,10.013055,5.335498,0.875,4.677557,-111.777985,-72.249947,-3.159899,-3.166484
10,0.2222,11.059759,7.035797,0.875,4.023961,-73.305573,-64.282959,-3.284925,-3.2448


100%|██████████| 31/31 [00:07<00:00,  4.32it/s]
100%|██████████| 31/31 [00:07<00:00,  4.36it/s]
100%|██████████| 31/31 [00:07<00:00,  4.37it/s]
100%|██████████| 31/31 [00:07<00:00,  4.36it/s]
100%|██████████| 31/31 [00:07<00:00,  4.35it/s]



EPOCH NO.2
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.2557546118274331, metrics={'train_runtime': 100.9465, 'train_samples_per_second': 1.635, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.2557546118274331, 'epoch': 0.963855421686747})
TEST ACCURACY: 51.61



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0429,12.100432,6.881712,1.0,5.218719,-121.618813,-108.889389,-3.312341,-3.256358
2,0.0688,10.431983,5.527817,1.0,4.904165,-102.52739,-55.506462,-3.166702,-3.097885
3,0.0452,11.817489,5.605946,1.0,6.211543,-123.424622,-70.368027,-3.213093,-3.23699
4,0.1062,9.390865,6.298114,1.0,3.092751,-100.660957,-73.778618,-3.230636,-3.185913
5,0.0436,12.118011,6.662869,1.0,5.455142,-125.722115,-103.963089,-3.16084,-3.092417
6,0.1278,10.598158,6.187716,1.0,4.410441,-111.277725,-72.617554,-3.184117,-3.197168
7,0.4574,7.917516,6.29456,0.625,1.622956,-97.413406,-56.983162,-3.197639,-3.102975
8,0.6125,8.956942,7.401782,0.75,1.55516,-106.530075,-76.417419,-3.259205,-3.20786
9,0.0404,9.568342,4.152654,1.0,5.415689,-123.606415,-76.697067,-3.11658,-3.115583
10,0.1512,10.883982,6.107964,1.0,4.776017,-82.5839,-66.040733,-3.208341,-3.176497


100%|██████████| 31/31 [00:07<00:00,  4.36it/s]
100%|██████████| 31/31 [00:07<00:00,  4.38it/s]
100%|██████████| 31/31 [00:07<00:00,  4.39it/s]
100%|██████████| 31/31 [00:07<00:00,  4.37it/s]
100%|██████████| 31/31 [00:06<00:00,  4.44it/s]



EPOCH NO.3
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.15192426450084895, metrics={'train_runtime': 100.8607, 'train_samples_per_second': 1.636, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.15192426450084895, 'epoch': 0.963855421686747})
TEST ACCURACY: 51.61



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0215,11.364035,5.253686,1.0,6.110348,-137.899078,-116.253357,-3.256863,-3.167501
2,0.0365,9.837977,4.109451,1.0,5.728527,-116.711052,-61.44651,-3.026722,-2.944405
3,0.0332,11.232016,4.222358,1.0,7.009658,-137.260498,-76.222763,-3.142824,-3.112452
4,0.1213,8.642318,4.886744,0.875,3.755574,-114.774658,-81.264091,-3.0871,-3.032726
5,0.0266,11.081196,4.959339,1.0,6.121858,-142.757416,-114.331253,-3.085463,-2.992516
6,0.0339,9.511792,4.0095,1.0,5.502293,-133.059906,-83.481209,-3.040962,-3.008901
7,0.2336,7.267901,4.572155,0.875,2.695746,-114.637466,-63.479313,-3.067859,-2.934931
8,0.3509,8.538387,5.875643,0.75,2.662744,-121.791451,-80.602959,-3.14445,-3.061384
9,0.0039,8.59099,2.05737,1.0,6.533621,-144.559265,-86.470596,-2.996583,-2.9715
10,0.1437,10.200227,4.344587,0.875,5.855639,-100.217667,-72.878281,-3.006747,-2.968103


100%|██████████| 31/31 [00:07<00:00,  4.34it/s]
100%|██████████| 31/31 [00:07<00:00,  4.42it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]
100%|██████████| 31/31 [00:07<00:00,  4.37it/s]
100%|██████████| 31/31 [00:07<00:00,  4.43it/s]



EPOCH NO.4
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.08805670000147074, metrics={'train_runtime': 101.0694, 'train_samples_per_second': 1.633, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.08805670000147074, 'epoch': 0.963855421686747})
TEST ACCURACY: 51.61



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0139,10.578363,3.575809,1.0,7.002554,-154.677856,-124.110077,-3.122675,-3.026776
2,0.0144,9.283956,2.469396,1.0,6.81456,-133.111603,-66.986725,-2.849201,-2.73963
3,0.0059,10.712515,2.317977,1.0,8.394539,-156.304306,-81.417763,-3.035418,-2.934015
4,0.0427,7.754745,2.808025,1.0,4.94672,-135.561859,-90.139816,-2.921052,-2.844174
5,0.0081,10.071482,2.825404,1.0,7.246077,-164.096771,-124.428406,-2.957495,-2.856716
6,0.0099,7.905937,0.878316,1.0,7.027621,-164.371735,-99.539764,-2.827708,-2.801048
7,0.1292,6.286043,2.269611,1.0,4.016431,-137.662903,-73.297897,-2.895259,-2.766951
8,0.3061,7.042857,3.052275,0.75,3.990582,-150.025131,-95.558266,-2.941373,-2.815454
9,0.0016,6.678702,-1.227464,1.0,7.906167,-177.407593,-105.59346,-2.834387,-2.80778
10,0.2105,8.176811,1.326501,0.875,6.850311,-130.398529,-93.112427,-2.791973,-2.707538


100%|██████████| 31/31 [00:07<00:00,  4.22it/s]
100%|██████████| 31/31 [00:07<00:00,  4.39it/s]
100%|██████████| 31/31 [00:07<00:00,  4.37it/s]
100%|██████████| 31/31 [00:07<00:00,  4.37it/s]
100%|██████████| 31/31 [00:07<00:00,  4.38it/s]



EPOCH NO.5
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.057525375162367706, metrics={'train_runtime': 100.7804, 'train_samples_per_second': 1.637, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.057525375162367706, 'epoch': 0.963855421686747})
TEST ACCURACY: 51.61



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0028,9.197786,-0.133993,1.0,9.331779,-191.775864,-137.915833,-2.975308,-2.896684
2,0.0026,7.914446,-0.335028,1.0,8.249474,-161.155823,-80.681839,-2.685343,-2.58558
3,0.0015,9.270718,-0.805328,1.0,10.076044,-187.537354,-95.835747,-2.914137,-2.779176
4,0.0071,6.628313,0.036885,1.0,6.591428,-163.273254,-101.404144,-2.799893,-2.712339
5,0.0025,8.747776,-0.098742,1.0,8.846519,-193.338226,-137.665451,-2.880054,-2.786172
6,0.0014,6.131992,-2.526993,1.0,8.658985,-198.42482,-117.279213,-2.724801,-2.713766
7,0.0332,5.418208,-0.779635,1.0,6.197843,-168.15535,-81.976242,-2.797767,-2.690388
8,0.1403,5.77389,-0.655465,0.875,6.429356,-187.102539,-108.247925,-2.828852,-2.706571
9,0.0002,4.89152,-4.608298,1.0,9.499818,-211.215942,-123.465302,-2.750875,-2.736975
10,0.139,6.478263,-1.12203,1.0,7.600292,-154.883835,-110.097916,-2.723304,-2.63183


100%|██████████| 31/31 [00:07<00:00,  4.30it/s]
100%|██████████| 31/31 [00:07<00:00,  4.33it/s]
100%|██████████| 31/31 [00:07<00:00,  4.38it/s]
100%|██████████| 31/31 [00:07<00:00,  4.39it/s]
100%|██████████| 31/31 [00:07<00:00,  4.39it/s]



EPOCH NO.6
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.03693264139119492, metrics={'train_runtime': 100.8431, 'train_samples_per_second': 1.636, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.03693264139119492, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0005,6.834008,-4.491991,1.0,11.325998,-235.35585,-161.553635,-2.889149,-2.821215
2,0.0003,6.912028,-3.129804,1.0,10.041833,-189.103592,-90.705994,-2.597302,-2.514988
3,0.0006,7.746037,-4.219327,1.0,11.965364,-221.677338,-111.08255,-2.82366,-2.68454
4,0.0059,5.403719,-2.755254,1.0,8.158974,-191.194656,-113.650078,-2.71389,-2.62471
5,0.0015,7.065592,-3.295921,1.0,10.361513,-225.309998,-154.487305,-2.801574,-2.716606
6,0.0002,4.482114,-6.193574,1.0,10.675689,-235.090637,-133.778,-2.636491,-2.633636
7,0.0129,4.520015,-3.61368,1.0,8.133696,-196.495819,-90.958168,-2.725985,-2.630837
8,0.0417,4.783285,-3.645581,1.0,8.428865,-217.003693,-118.153992,-2.736751,-2.609862
9,0.0001,3.355772,-7.601415,1.0,10.957187,-241.14711,-138.822769,-2.682757,-2.683462
10,0.022,5.791639,-3.831685,1.0,9.623323,-181.980377,-116.964149,-2.674127,-2.587005


100%|██████████| 31/31 [00:07<00:00,  4.21it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]
100%|██████████| 31/31 [00:07<00:00,  4.35it/s]
100%|██████████| 31/31 [00:07<00:00,  4.36it/s]



EPOCH NO.7
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.01608720300823734, metrics={'train_runtime': 100.9749, 'train_samples_per_second': 1.634, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.01608720300823734, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0003,5.9804,-6.945578,1.0,12.925978,-259.891724,-170.089691,-2.838552,-2.770682
2,0.0005,6.158212,-5.062304,1.0,11.220515,-208.428589,-98.244171,-2.552475,-2.475975
3,0.0003,6.497043,-7.067155,1.0,13.564198,-250.155609,-123.572479,-2.757856,-2.624412
4,0.0005,4.482524,-5.318207,1.0,9.800731,-216.824188,-122.86203,-2.664142,-2.579787
5,0.0004,6.115533,-5.72795,1.0,11.843481,-249.63031,-163.987885,-2.752946,-2.67468
6,0.0004,2.506219,-8.24867,1.0,10.754889,-255.641571,-153.536926,-2.600529,-2.598278
7,0.0008,4.231961,-6.587342,1.0,10.819304,-226.232437,-93.838707,-2.680408,-2.590691
8,0.0029,3.884831,-6.748624,1.0,10.633455,-248.034119,-127.138519,-2.667405,-2.550619
9,0.0,1.379609,-10.930264,1.0,12.309872,-274.435608,-158.584412,-2.629891,-2.645583
10,0.0027,4.985736,-6.218624,1.0,11.204359,-205.849792,-125.023186,-2.645302,-2.558907


100%|██████████| 31/31 [00:07<00:00,  4.31it/s]
100%|██████████| 31/31 [00:07<00:00,  4.35it/s]
100%|██████████| 31/31 [00:07<00:00,  4.33it/s]
100%|██████████| 31/31 [00:07<00:00,  4.34it/s]
100%|██████████| 31/31 [00:07<00:00,  4.35it/s]



EPOCH NO.8
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.0048335088994008405, metrics={'train_runtime': 100.8694, 'train_samples_per_second': 1.636, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.0048335088994008405, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0001,4.026122,-10.366932,1.0,14.393055,-294.105255,-189.632477,-2.785328,-2.716389
2,0.0002,5.239073,-6.933938,1.0,12.173012,-227.144928,-107.435547,-2.505612,-2.43784
3,0.0,5.343158,-10.114312,1.0,15.45747,-280.627167,-135.111343,-2.689596,-2.57263
4,0.0005,3.374177,-7.202423,1.0,10.576601,-235.666321,-133.945496,-2.610224,-2.535033
5,0.0002,4.613955,-8.352734,1.0,12.966689,-275.878143,-179.003662,-2.696571,-2.627667
6,0.0002,1.436859,-10.743748,1.0,12.180607,-280.592346,-164.230545,-2.558701,-2.563746
7,0.0001,3.827628,-9.17047,1.0,12.998099,-252.063721,-97.882042,-2.632344,-2.548642
8,0.0006,3.227494,-9.323218,1.0,12.550713,-273.78006,-133.711899,-2.621185,-2.505665
9,0.0,0.2764,-13.09628,1.0,13.372681,-296.095764,-169.616501,-2.586958,-2.614596
10,0.0002,4.683692,-8.429481,1.0,13.113172,-227.958344,-128.043625,-2.626387,-2.541513


100%|██████████| 31/31 [00:07<00:00,  4.32it/s]
100%|██████████| 31/31 [00:07<00:00,  4.29it/s]
100%|██████████| 31/31 [00:07<00:00,  4.35it/s]
100%|██████████| 31/31 [00:07<00:00,  4.36it/s]
100%|██████████| 31/31 [00:07<00:00,  4.34it/s]



EPOCH NO.9
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.0016417132883589148, metrics={'train_runtime': 100.8287, 'train_samples_per_second': 1.636, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.0016417132883589148, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 167,772,160


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0,3.193246,-11.892776,1.0,15.086022,-309.363708,-197.961243,-2.771038,-2.703579
2,0.0001,4.706873,-8.34897,1.0,13.055844,-241.295258,-112.757545,-2.500837,-2.434511
3,0.0,4.458739,-11.933146,1.0,16.391886,-298.815521,-143.955521,-2.671284,-2.566778
4,0.0001,3.439496,-9.303712,1.0,12.743207,-256.679199,-133.292313,-2.599015,-2.527417
5,0.0001,3.710274,-9.900462,1.0,13.610736,-291.355408,-188.040482,-2.677909,-2.612987
6,0.0,0.709652,-12.283447,1.0,12.993098,-295.989349,-171.502609,-2.552035,-2.558202
7,0.0001,3.342409,-10.62414,1.0,13.966549,-266.600403,-102.734238,-2.620837,-2.539488
8,0.0001,2.627219,-11.365259,1.0,13.992478,-294.20047,-139.714645,-2.61294,-2.491171
9,0.0,-0.508095,-14.715633,1.0,14.207539,-312.289276,-177.461441,-2.570821,-2.606056
10,0.0001,4.020622,-9.790544,1.0,13.811165,-241.56897,-134.674316,-2.624016,-2.538517


100%|██████████| 31/31 [00:07<00:00,  4.21it/s]
100%|██████████| 31/31 [00:07<00:00,  4.39it/s]
100%|██████████| 31/31 [00:07<00:00,  4.36it/s]
100%|██████████| 31/31 [00:07<00:00,  4.35it/s]
100%|██████████| 31/31 [00:07<00:00,  4.37it/s]



EPOCH NO.10
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.0005789103373103899, metrics={'train_runtime': 101.0444, 'train_samples_per_second': 1.633, 'train_steps_per_second': 0.198, 'total_flos': 0.0, 'train_loss': 0.0005789103373103899, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06

BEST ITERATION: 10


In [None]:
# Free as much GPU RAM as possible before another model is loaded.
import gc
import torch

# Drop the notebook's references first so the objects become collectable.
del model, tokenizer
gc.collect()              # reclaim the now-unreferenced Python objects
torch.cuda.empty_cache()  # release unused cached CUDA memory held by PyTorch

In [None]:
# Reload the checkpoint of the best training iteration and re-evaluate it.
# Make sure to have enough GPU RAM before running this
from unsloth import FastLanguageModel
from datasets import load_from_disk  # not used in this cell

# `best_iteration` is not defined in this cell; it must already exist in the
# notebook session (the training log above prints "BEST ITERATION").
best_checkpoint_dir = f"checkpoint_iteration_{best_iteration}"

# NOTE(review): the initial model load passed max_seq_length / dtype /
# load_in_4bit explicitly; here the library defaults apply — confirm they match.
model, tokenizer = FastLanguageModel.from_pretrained(best_checkpoint_dir)

# `run_eval` is defined in an earlier cell. Its third argument is presumably
# the number of evaluation passes (five progress bars are printed) — TODO confirm.
eval_result = run_eval(model, tokenizer, 5)
# eval_result is scaled by 100 so it is printed as a percentage.
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")

==((====))==  Unsloth 2024.12.9: Fast Mistral patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Will load checkpoint_iteration_10 as a legacy tokenizer.
100%|██████████| 31/31 [00:06<00:00,  4.46it/s]
100%|██████████| 31/31 [00:06<00:00,  4.50it/s]
100%|██████████| 31/31 [00:06<00:00,  4.54it/s]
100%|██████████| 31/31 [00:06<00:00,  4.51it/s]
100%|██████████| 31/31 [00:06<00:00,  4.53it/s]


TEST ACCURACY: 58.06






In [None]:
# Install the Hugging Face Hub client; the next cell imports `login` from it.
!pip install huggingface_hub



In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub so the model can be pushed later.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded, so it is not leaked when the notebook is shared or
# committed. If HF_TOKEN is unset, token=None makes login() ask for the token
# interactively.
login(token=os.environ.get("HF_TOKEN"))

# Only reached when login() did not raise.
print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [None]:
# Save the fine-tuned model locally under ./model, then upload it to the
# Hugging Face Hub repository (requires the login performed above).
# NOTE(review): `tokenizer` is passed as the second positional argument and
# `save_method="default"` as a keyword to both calls. Confirm against the
# Unsloth / PEFT signatures that these arguments are interpreted as intended;
# the upload log only shows README.md and adapter_model.safetensors.
model.save_pretrained("model", tokenizer, save_method="default")
model.push_to_hub("t4gandhi/mistral-7b-v0.3-bnb-4bit-fine-tuned", tokenizer, save_method="default")

README.md:   0%|          | 0.00/589 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Saved model to https://huggingface.co/t4gandhi/mistral-7b-v0.3-bnb-4bit-fine-tuned


In [None]:
# Free as much GPU RAM as possible before the model is reloaded from the Hub.
import gc
import torch

# Drop the notebook's references first so the objects become collectable.
del model, tokenizer
gc.collect()              # reclaim the now-unreferenced Python objects
torch.cuda.empty_cache()  # release unused cached CUDA memory held by PyTorch

In [None]:
# Reload the fine-tuned model from the Hugging Face Hub repository it was
# pushed to, rebinding the notebook-level `model` and `tokenizer`.
from unsloth import FastLanguageModel

model_name = "t4gandhi/mistral-7b-v0.3-bnb-4bit-fine-tuned"
# NOTE(review): no max_seq_length / dtype / load_in_4bit are passed here, unlike
# the initial model load — confirm the library defaults are the intended ones.
model, tokenizer = FastLanguageModel.from_pretrained(model_name)

==((====))==  Unsloth 2024.12.9: Fast Mistral patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

In [None]:
def print_confusion_matrices(confusion_dict):
    """Print a normalized confusion matrix and summary metrics for each entry.

    Every cell is printed as a fraction of the entry's total count (a value
    between 0 and 1, not a percentage), followed by a "Combined" row holding
    the predicted-positive and predicted-negative column sums, and a line with
    precision, recall and F1 score.

    Args:
        confusion_dict: Mapping from a label (used in the heading) to a
            mapping with the raw counts under the keys 'TP', 'TN', 'FP'
            and 'FN'.

    Returns:
        None. The report is written to standard output.

    Raises:
        KeyError: If an entry lacks one of the four count keys.
    """
    rule = "-------------------------------------------------------"

    for key, values in confusion_dict.items():
        # Raw confusion-matrix counts.
        tp = values['TP']
        tn = values['TN']
        fp = values['FP']
        fn = values['FN']

        total = tp + tn + fp + fn

        # Normalize each cell by the total. An entry without any samples is
        # reported as all zeros instead of raising ZeroDivisionError.
        if total > 0:
            tp_frac = tp / total
            tn_frac = tn / total
            fp_frac = fp / total
            fn_frac = fn / total
        else:
            tp_frac = tn_frac = fp_frac = fn_frac = 0.0

        # Each metric falls back to 0 when its denominator is empty.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f"Confusion Matrix for {key}:")
        print(rule)
        print("                Predicted Positive   Predicted Negative")
        print(f"Actual Positive           {tp_frac:>8.2f}             {fn_frac:>8.2f}")
        print(f"Actual Negative           {fp_frac:>8.2f}             {tn_frac:>8.2f}")
        print(rule)
        # Column sums: share predicted positive vs. share predicted negative.
        print(f"Combined                  {tp_frac + fp_frac:>8.2f}             {tn_frac + fn_frac:>8.2f}")
        print(rule)
        print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1_score:.2f}\n")

In [None]:
# Evaluate the reloaded model again, this time also collecting the
# confusion-matrix counts.
import pandas as pd  # not used in this cell

# With get_stats=True the result is unpacked into two values here, whereas the
# earlier call without it bound a single value. `stats` is passed straight to
# print_confusion_matrices, so it is expected to map a label to TP/TN/FP/FN counts.
eval_result, stats = run_eval(model, tokenizer, 5, get_stats = True)
# eval_result is scaled by 100 so it is printed as a percentage.
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")
print_confusion_matrices(stats)

100%|██████████| 31/31 [00:06<00:00,  4.47it/s]
100%|██████████| 31/31 [00:06<00:00,  4.43it/s]
100%|██████████| 31/31 [00:06<00:00,  4.55it/s]
100%|██████████| 31/31 [00:06<00:00,  4.53it/s]
100%|██████████| 31/31 [00:06<00:00,  4.54it/s]


TEST ACCURACY: 58.06

Confusion Matrix for rd1:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
Actual Positive               0.27                 0.07
Actual Negative               0.27                 0.40
-------------------------------------------------------
Combined                      0.53                 0.47
-------------------------------------------------------
Precision: 0.50, Recall: 0.80, F1 Score: 0.62

Confusion Matrix for rd2:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
Actual Positive               0.43                 0.14
Actual Negative               0.21                 0.21
-------------------------------------------------------
Combined                      0.64                 0.36
-------------------------------------------------------
Precision: 0.67, Recall: 0.75, F1 Score: 0.71

Confusion Matrix for rd3:
---------------------




In [None]:
def preliminary_stats(dataset):
    """Return, for each round, the share of its appearances in which it was chosen.

    For every item of the requested split, the round stored under
    ``metadata[0]['chosen']`` receives one "chosen" count and the round stored
    under ``metadata[0]['rejected']`` receives one "rejected" count. The ratio
    for a round is ``chosen / (chosen + rejected)``.

    Args:
        dataset: Key of the split to analyse in the notebook-level
            ``dpo_datasets`` / ``datasets_dpo`` mappings (the notebook calls
            this with 'train' and 'test').

    Returns:
        A dict mapping each of 'rd1', 'rd2', 'rd3' and 'custom' to its ratio.
        A round that never appears in the split gets 0 instead of raising
        ZeroDivisionError.

    Raises:
        KeyError: If an item names a round outside the four listed above.
    """
    # NOTE(review): the item count is taken from `dpo_datasets` while the rows
    # are read from `datasets_dpo`. Both are defined outside this cell —
    # confirm they are meant to be two different objects of equal length.
    NUM_ITEMS = len(dpo_datasets[dataset])
    ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
    reward_model_chosen = dict.fromkeys(ROUNDS, 0)
    reward_model_rejected = dict.fromkeys(ROUNDS, 0)

    for i in range(NUM_ITEMS):
        # Fetch the row's metadata once; it holds both round labels.
        item_meta = datasets_dpo[dataset][i]["metadata"][0]
        reward_model_chosen[item_meta['chosen']] += 1
        reward_model_rejected[item_meta['rejected']] += 1

    reward_model_ratio = {}
    for ROUND in ROUNDS:
        appearances = reward_model_chosen[ROUND] + reward_model_rejected[ROUND]
        # Guard against rounds that are absent from this split.
        reward_model_ratio[ROUND] = (
            reward_model_chosen[ROUND] / appearances if appearances > 0 else 0
        )

    return reward_model_ratio

In [None]:
import pandas as pd

# Per-round "chosen" ratio on the training split, shown as a two-column table.
prelim_ratio_train = preliminary_stats('train')
df_prelim_ratio_train = pd.DataFrame(
    {
        "Round": list(prelim_ratio_train.keys()),
        "Chosen": list(prelim_ratio_train.values()),
    }
)
df_prelim_ratio_train

Unnamed: 0,Round,Chosen
0,rd1,0.367089
1,rd2,0.407407
2,rd3,0.3875
3,custom,0.8


In [None]:
import pandas as pd

# Per-round "chosen" ratio on the test split, shown as a two-column table.
prelim_ratio_test = preliminary_stats('test')
df_prelim_ratio_test = pd.DataFrame(
    {
        "Round": list(prelim_ratio_test.keys()),
        "Chosen": list(prelim_ratio_test.values()),
    }
)
df_prelim_ratio_test

Unnamed: 0,Round,Chosen
0,rd1,0.333333
1,rd2,0.571429
2,rd3,0.266667
3,custom,0.777778
