### DPO Fine-Tuning - Codellama_7B bnb 4bit

> **Model Info**

- Model Name:Phi-3-mini-4k
- Accuracy: 80.65%

> **Training Info**

- GPU Type: A100
- Time: ~50 mins
- GPU RAM: 10.4 GB

In [3]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:

import os
import re
from typing import List, Literal, Optional
from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError

DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

def apply_chat_template(
    example, tokenizer, task: Literal["sft", "generation", "rm", "dpo"] = "sft", assistant_prefix="<|assistant|>\n"
):
    def _strip_prefix(s, pattern):
        return re.sub(f"^{re.escape(pattern)}", "", s)

    if task in ["sft", "generation"]:
        messages = example["messages"]
        # Custom prompt template for Codellama
        system_message_content = ""
        user_message_content = messages[0]["content"]
        assistant_message_content = messages[1]["content"]
        example["text"] = f"""<s>[INST] <<SYS>>
{system_message_content}
<</SYS>>

{user_message_content}
[/INST] {assistant_message_content}</s>"""
    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = example["chosen"]
            rejected_messages = example["rejected"]
            # I add an empty system message if there is none
            if chosen_messages[0]["role"] != "system":
                chosen_messages.insert(0, {"role": "system", "content": ""})
            if rejected_messages[0]["role"] != "system":
                rejected_messages.insert(0, {"role": "system", "content": ""})
            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
        else:
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            system_message_content = ""
            user_message_content = example["chosen"][0]["content"]
            chosen_message_content = example["chosen"][1]["content"]
            rejected_message_content = example["rejected"][1]["content"]
            example["text_prompt"] = f"""<s>[INST] <<SYS>>
{system_message_content}
<</SYS>>

{user_message_content}
[/INST]"""
            example["text_chosen"] = f" {chosen_message_content}</s>"
            example["text_rejected"] = f" {rejected_message_content}</s>"
        else:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    else:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
        )
    return example


def get_datasets(
    data_config: dict,
    splits: List[str] = ["train", "test"],
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`DataArguments` or `dict`):
            Dataset configuration and split proportions.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
    """

    if type(data_config) is dict:
        # Structure of the input is:
        #     dataset_mixer = {
        #             "dataset1": 0.5,
        #             "dataset1": 0.3,
        #             "dataset1": 0.2,
        #         }
        dataset_mixer = data_config
    else:
        raise ValueError(f"Data config {data_config} not recognized.")

    raw_datasets = mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)
    return raw_datasets


def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.
    """
    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    fracs = []
    for ds, frac in dataset_mixer.items():
        fracs.append(frac)
        for split in splits:
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
            elif "test" in split:
                raw_val_datasets.append(dataset)
            else:
                raise ValueError(f"Split type {split} not recognized as one of test or train.")

    if any(frac < 0 for frac in fracs):
        raise ValueError("Dataset fractions cannot be negative.")

    if len(raw_train_datasets) > 0:
        train_subsets = []
        for dataset, frac in zip(raw_train_datasets, fracs):
            train_subset = dataset.select(range(int(frac * len(dataset))))
            train_subsets.append(train_subset)
        if shuffle:
            raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42)
        else:
            raw_datasets["train"] = concatenate_datasets(train_subsets)
    # No subsampling for test datasets to enable fair comparison across models
    if len(raw_val_datasets) > 0:
        if shuffle:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(seed=42)
        else:
            raw_datasets["test"] = concatenate_datasets(raw_val_datasets)

    if len(raw_datasets) == 0:
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with split {split}. Check the dataset has been correctly formatted."
        )

    return raw_datasets

<a name="Data"></a>
### Data Prep


In [6]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, DatasetDict

dataset = load_dataset('t4gandhi/code_correction_using_LLM')
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df_parts = [df_train, df_test]

df_sft_train = []
df_sft_test = []
df_sft_parts = [df_sft_train, df_sft_test]

df_dpo_train = []
df_dpo_test = []
df_dpo_parts = [df_dpo_train, df_dpo_test]

PREFIXS = ['score_s1_', 'score_s2_', 'score_s3_', 'score_s4_', 'score_s5_', 'score_s6_']
ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
PAIRS = [('rd1', 'rd2'), ('rd1', 'rd3'), ('rd1', 'custom'), ('rd2', 'rd3'), ('rd2', 'custom'), ('rd3', 'custom')]

def indent_lines(string: str) -> str:
  indented_string = '\n'.join('    ' + line for line in string.splitlines())
  return indented_string

for df, df_sft, df_dpo in zip(df_parts, df_sft_parts, df_dpo_parts):
  for idx, row in df.iterrows():
      prompt = row['prompt']
      result = row['result']
      instruction = f"""<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>"""
      full_solution = "<buggy_code>\n" + (prompt + indent_lines(result)).strip('\n') + "\n</buggy_code>"
      full_instruction = instruction + "\n" + full_solution
      solutions_info = {}
      for ROUND in ROUNDS:
        solutions_info[ROUND] = {}
        total_score = 0
        for PREFIX in PREFIXS:
          score_col = PREFIX + ROUND
          score = int(row[score_col][0])
          total_score += score
        total_score /= 42
        analysis_col = 'analysis_' + ROUND
        solutions_info[ROUND]['analysis'] = row[analysis_col]
        solutions_info[ROUND]['score'] = total_score
      for ROUND1, ROUND2 in PAIRS:
        round1_score = solutions_info[ROUND1]['score']
        round2_score = solutions_info[ROUND2]['score']
        round1_analysis = solutions_info[ROUND1]['analysis']
        round2_analysis = solutions_info[ROUND2]['analysis']
        if round1_score == round2_score:
          continue
        messages_info = {}
        messages_info['messages'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score > round2_score else round2_analysis, 'role': 'assistant'}
        ]
        pairwise_info = {}
        pairwise_info['prompt'] = full_instruction
        pairwise_info['chosen'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score > round2_score else round2_analysis, 'role': 'assistant'}
        ]
        pairwise_info['rejected'] = [
            {'content': full_instruction, 'role': 'user'},
            {'content': round1_analysis if round1_score < round2_score else round2_analysis, 'role': 'assistant'}
        ]
        # Not part of training data, only for analysis
        pairwise_info['metadata'] = {
            'chosen': ROUND1 if round1_score > round2_score else ROUND2,
            'rejected': ROUND1 if round1_score < round2_score else ROUND2,
        },
        df_sft.append(messages_info)
        df_dpo.append(pairwise_info)

df_sft_train = pd.DataFrame(df_sft_train)
df_sft_test = pd.DataFrame(df_sft_test)
dataset_sft_train = Dataset.from_pandas(df_sft_train)
dataset_sft_test = Dataset.from_pandas(df_sft_test)
datasets_sft = DatasetDict({
    'train': dataset_sft_train,
    'test': dataset_sft_test
})
df_dpo_train = pd.DataFrame(df_dpo_train)
df_dpo_test = pd.DataFrame(df_dpo_test)
dataset_dpo_train = Dataset.from_pandas(df_dpo_train)
dataset_dpo_test = Dataset.from_pandas(df_dpo_test)
datasets_dpo = DatasetDict({
    'train': dataset_dpo_train,
    'test': dataset_dpo_test
})

In [7]:
datasets_sft

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 165
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 31
    })
})

In [8]:
datasets_dpo

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 31
    })
})

In [9]:
column_names = list(datasets_sft['train'].features)

sft_datasets = datasets_sft.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "sft"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [10]:
sft_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 165
    })
    test: Dataset({
        features: ['text'],
        num_rows: 31
    })
})

In [11]:
print(sft_datasets['train'][0]['text'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

In [12]:
column_names = list(datasets_dpo['train'].features)

dpo_datasets = datasets_dpo.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "dpo"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

dpo_datasets = dpo_datasets.rename_columns(
    {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [13]:
dpo_datasets

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 31
    })
})

In [14]:
print("=" * 10 + "PROMPT" + "=" * 10)
print(dpo_datasets['train'][0]['prompt'])
print("=" * 10 + "CHOSEN" + "=" * 10)
print(dpo_datasets['train'][0]['chosen'])
print("=" * 10 + "REJECTED" + "=" * 10)
print(dpo_datasets['train'][0]['rejected'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

I now add LoRA adapters so it only need to update 1 to 10% of all parameters!



In [15]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Train the SFT model

In [16]:
import os
os.environ["WANDB_MODE"] = "disabled"
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

sft_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_datasets['train'],
    # eval_dataset = sft_datasets['test'], # Uncomment to run eval
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences unsloth.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # evaluation_strategy = "steps", # Uncomment to run eval
        # eval_steps = 1, # Uncomment to run eval
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

In [17]:
sft_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.4443
2,1.3803
3,1.3493
4,1.3115
5,1.184
6,1.1195
7,1.0541
8,0.9402
9,0.8299
10,0.7334


TrainOutput(global_step=20, training_loss=0.8347718209028244, metrics={'train_runtime': 213.4258, 'train_samples_per_second': 0.773, 'train_steps_per_second': 0.094, 'total_flos': 2613853402288128.0, 'train_loss': 0.8347718209028244})

In [18]:
# One must patch the DPO Trainer first!
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [19]:
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 5e-6,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
    ),
    beta = 0.1,
    train_dataset = dpo_datasets['train'],
    tokenizer = tokenizer,
    max_length = 1024,
    max_prompt_length = 512,
)

Extracting prompt in train dataset (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

In [20]:
from tqdm import tqdm
import torch

def run_eval(model, tokenizer, no_iter, get_stats=False):
    NUM_ITEMS = len(dpo_datasets['test'])
    num_chosen = 0
    ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
    stats = {}
    for ROUND in ROUNDS:
        stats[ROUND] = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for _ in range(no_iter):
        for i in tqdm(range(NUM_ITEMS)):
            input = {
                "chosen": datasets_dpo['test'][i]["chosen"],
                "rejected": datasets_dpo['test'][i]["rejected"]
            }
            chosen_round = datasets_dpo['test'][i]["metadata"][0]['chosen']
            rejected_round = datasets_dpo['test'][i]["metadata"][0]['rejected']

            # Apply the chat template to format the input
            formatted_input = apply_chat_template(input, tokenizer, task="dpo")

            # Tokenize the inputs
            inputs_chosen = tokenizer(formatted_input["text_chosen"], return_tensors="pt", padding=True, truncation=True).to(device)
            inputs_rejected = tokenizer(formatted_input["text_rejected"], return_tensors="pt", padding=True, truncation=True).to(device)

            # Generate the scalar reward values
            with torch.no_grad():
                reward_chosen = model(**inputs_chosen).logits.mean().item()
                reward_rejected = model(**inputs_rejected).logits.mean().item()
                # Model chose correctly
                if reward_chosen > reward_rejected:
                    num_chosen += 1
                    stats[chosen_round]['TP'] += 1
                    stats[rejected_round]['TN'] += 1
                # Model chose wrongly
                else:
                    stats[chosen_round]['FN'] += 1
                    stats[rejected_round]['FP'] += 1

    if get_stats:
        return num_chosen / (no_iter * NUM_ITEMS), stats

    return num_chosen / (no_iter * NUM_ITEMS)

In [21]:
best_iteration = 1
best_eval_result = 0

for i in range(1, 11):
    # Train the model
    training_result = dpo_trainer.train()
    eval_result = run_eval(model, tokenizer, 5)
    if eval_result >= best_eval_result:
        best_eval_result = eval_result
        best_iteration = i

    # Create a unique checkpoint directory for each iteration
    checkpoint_dir = f"checkpoint_iteration_{i}"
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Save model and trainer states for this iteration
    dpo_trainer.save_model(checkpoint_dir)  # Save model and tokenizer
    dpo_trainer.save_state()  # Save optimizer, scheduler, and other trainer states

    print(f"\nEPOCH NO.{i}")
    print(f"TRAINING RESULT: {training_result}")
    print(f"TEST ACCURACY: {eval_result * 100:.2f}\n")

print(f"BEST ITERATION: {best_iteration}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1665,5.970039,3.472557,1.0,2.497482,-128.9729,-108.383362,24.205774,22.909002,0,0,0,0
2,0.3998,5.749432,3.699075,0.75,2.050356,-142.614288,-134.597748,24.63538,22.999012,No Log,No Log,No Log,No Log
3,0.363,4.896736,3.694715,0.875,1.20202,-127.921677,-107.92688,22.873308,22.520969,No Log,No Log,No Log,No Log
4,0.2568,4.848943,3.272226,0.875,1.576716,-141.328979,-127.242401,23.088964,21.830126,No Log,No Log,No Log,No Log
5,0.3714,6.794295,3.756459,0.75,3.037835,-140.454788,-129.741104,22.600357,23.004232,No Log,No Log,No Log,No Log
6,0.4449,7.067882,3.978293,0.75,3.089588,-160.430099,-137.111801,24.007828,23.576818,No Log,No Log,No Log,No Log
7,0.7492,5.694028,3.493035,0.875,2.200993,-121.171402,-144.013962,23.860765,24.616236,No Log,No Log,No Log,No Log
8,0.3991,5.116026,3.789987,0.875,1.32604,-111.534157,-106.189926,22.424088,22.877308,No Log,No Log,No Log,No Log
9,0.5728,4.632914,3.468744,0.625,1.164169,-122.598984,-143.408142,23.631197,22.704407,No Log,No Log,No Log,No Log
10,0.2101,6.14669,3.35163,1.0,2.79506,-141.416351,-118.677132,23.489502,24.222599,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.73it/s]
100%|██████████| 31/31 [00:11<00:00,  2.62it/s]
100%|██████████| 31/31 [00:11<00:00,  2.60it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:11<00:00,  2.71it/s]



EPOCH NO.1
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.55136339366436, metrics={'train_runtime': 444.7134, 'train_samples_per_second': 0.371, 'train_steps_per_second': 0.045, 'total_flos': 0.0, 'train_loss': 0.55136339366436, 'epoch': 0.963855421686747})
TEST ACCURACY: 32.26



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1356,6.099861,3.45117,1.0,2.64869,-127.674683,-108.597237,24.245312,22.976665,0,0,0,0
2,0.3205,5.92379,3.688432,0.875,2.235358,-140.870697,-134.704178,24.673677,23.080992,No Log,No Log,No Log,No Log
3,0.309,5.074359,3.768329,1.0,1.30603,-126.145447,-107.190742,22.889965,22.54599,No Log,No Log,No Log,No Log
4,0.2403,4.981839,3.381968,1.0,1.599871,-140.000015,-126.144989,23.098913,21.848259,No Log,No Log,No Log,No Log
5,0.2868,6.887233,3.79003,0.875,3.097203,-139.525391,-129.405396,22.561945,22.995472,No Log,No Log,No Log,No Log
6,0.3368,7.15238,3.990716,0.75,3.161663,-159.585114,-136.987579,23.952686,23.545517,No Log,No Log,No Log,No Log
7,0.6724,5.758668,3.567696,0.875,2.190972,-120.525002,-143.267349,23.805826,24.583012,No Log,No Log,No Log,No Log
8,0.3318,5.316264,3.815814,1.0,1.50045,-109.531784,-105.931664,22.339237,22.801184,No Log,No Log,No Log,No Log
9,0.5042,4.698375,3.467659,0.625,1.230716,-121.944374,-143.418991,23.563831,22.648041,No Log,No Log,No Log,No Log
10,0.1832,6.208899,3.358287,1.0,2.850612,-140.794266,-118.610565,23.405024,24.159342,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.75it/s]
100%|██████████| 31/31 [00:11<00:00,  2.64it/s]
100%|██████████| 31/31 [00:12<00:00,  2.52it/s]
100%|██████████| 31/31 [00:11<00:00,  2.60it/s]
100%|██████████| 31/31 [00:11<00:00,  2.71it/s]



EPOCH NO.2
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.4793019324541092, metrics={'train_runtime': 447.0738, 'train_samples_per_second': 0.369, 'train_steps_per_second': 0.045, 'total_flos': 0.0, 'train_loss': 0.4793019324541092, 'epoch': 0.963855421686747})
TEST ACCURACY: 32.26



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.136,6.046445,3.444684,1.0,2.601761,-128.208847,-108.662094,24.131666,22.878088,0,0,0,0
2,0.2939,5.905401,3.613665,0.875,2.291736,-141.054596,-135.451843,24.565552,22.998987,No Log,No Log,No Log,No Log
3,0.2836,5.134084,3.790255,1.0,1.34383,-125.548195,-106.971481,22.782461,22.442665,No Log,No Log,No Log,No Log
4,0.2311,4.981391,3.356408,1.0,1.624984,-140.004501,-126.400589,22.986828,21.735403,No Log,No Log,No Log,No Log
5,0.2337,6.854753,3.715769,0.875,3.138984,-139.850189,-130.14801,22.461651,22.90477,No Log,No Log,No Log,No Log
6,0.2627,7.106021,3.856158,0.75,3.249862,-160.048706,-138.333145,23.838291,23.444712,No Log,No Log,No Log,No Log
7,0.6034,5.724358,3.476457,0.875,2.247901,-120.868103,-144.179733,23.680521,24.481876,No Log,No Log,No Log,No Log
8,0.2893,5.371889,3.751899,1.0,1.619989,-108.975533,-106.570801,22.197084,22.685669,No Log,No Log,No Log,No Log
9,0.4522,4.678245,3.388279,0.625,1.289965,-122.14566,-144.212784,23.446903,22.549374,No Log,No Log,No Log,No Log
10,0.164,6.22001,3.309196,1.0,2.910814,-140.683151,-119.101471,23.286295,24.04867,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.75it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:11<00:00,  2.59it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:11<00:00,  2.70it/s]



EPOCH NO.3
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.4243161931633949, metrics={'train_runtime': 441.6551, 'train_samples_per_second': 0.374, 'train_steps_per_second': 0.045, 'total_flos': 0.0, 'train_loss': 0.4243161931633949, 'epoch': 0.963855421686747})
TEST ACCURACY: 32.26



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1335,5.948739,3.369969,1.0,2.57877,-129.185913,-109.409256,23.97472,22.734827,0,0,0,0
2,0.2704,5.823221,3.473189,0.875,2.350031,-141.876404,-136.856598,24.419596,22.879295,No Log,No Log,No Log,No Log
3,0.2627,5.132638,3.743221,1.0,1.389417,-125.562645,-107.441818,22.637058,22.298849,No Log,No Log,No Log,No Log
4,0.2196,4.920383,3.237784,1.0,1.682599,-140.614578,-127.586823,22.842888,21.594309,No Log,No Log,No Log,No Log
5,0.1974,6.767656,3.581708,0.875,3.185949,-140.721161,-131.488617,22.323698,22.768023,No Log,No Log,No Log,No Log
6,0.2061,7.009864,3.651819,1.0,3.358046,-161.010284,-140.376541,23.689079,23.30365,No Log,No Log,No Log,No Log
7,0.5383,5.650076,3.29771,0.875,2.352366,-121.610916,-145.967209,23.525763,24.345707,No Log,No Log,No Log,No Log
8,0.258,5.357901,3.635162,1.0,1.722739,-109.115425,-107.738174,22.025196,22.533257,No Log,No Log,No Log,No Log
9,0.4119,4.611404,3.256659,0.875,1.354746,-122.814072,-145.528992,23.293758,22.419329,No Log,No Log,No Log,No Log
10,0.144,6.20973,3.216106,1.0,2.993624,-140.78595,-120.032379,23.134167,23.892645,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.75it/s]
100%|██████████| 31/31 [00:11<00:00,  2.64it/s]
100%|██████████| 31/31 [00:12<00:00,  2.54it/s]
100%|██████████| 31/31 [00:12<00:00,  2.57it/s]
100%|██████████| 31/31 [00:11<00:00,  2.67it/s]



EPOCH NO.4
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.37437549456954, metrics={'train_runtime': 449.9223, 'train_samples_per_second': 0.367, 'train_steps_per_second': 0.044, 'total_flos': 0.0, 'train_loss': 0.37437549456954, 'epoch': 0.963855421686747})
TEST ACCURACY: 32.26



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1301,5.839812,3.254696,1.0,2.585116,-130.275177,-110.561974,23.78783,22.557081,0,0,0,0
2,0.2449,5.742609,3.304242,0.875,2.438367,-142.682526,-138.546082,24.245388,22.729248,No Log,No Log,No Log,No Log
3,0.2398,5.106957,3.65029,1.0,1.456667,-125.819466,-108.371124,22.467571,22.129946,No Log,No Log,No Log,No Log
4,0.2091,4.822248,3.070668,1.0,1.75158,-141.595932,-129.25798,22.670197,21.421284,No Log,No Log,No Log,No Log
5,0.1701,6.653683,3.398324,1.0,3.255359,-141.860901,-133.322464,22.148106,22.590508,No Log,No Log,No Log,No Log
6,0.1619,6.892431,3.395604,1.0,3.496827,-162.184616,-142.93869,23.507206,23.123138,No Log,No Log,No Log,No Log
7,0.4756,5.541616,3.063851,0.875,2.477765,-122.695511,-148.305786,23.337294,24.172958,No Log,No Log,No Log,No Log
8,0.2305,5.305635,3.478969,1.0,1.826667,-109.638069,-109.30011,21.821522,22.349409,No Log,No Log,No Log,No Log
9,0.3749,4.509382,3.086949,0.875,1.422433,-123.834297,-147.226074,23.111435,22.262739,No Log,No Log,No Log,No Log
10,0.1278,6.187936,3.097087,1.0,3.090849,-141.003891,-121.222549,22.955751,23.707012,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.77it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:12<00:00,  2.52it/s]
100%|██████████| 31/31 [00:12<00:00,  2.55it/s]
100%|██████████| 31/31 [00:11<00:00,  2.68it/s]



EPOCH NO.5
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.3294578604400158, metrics={'train_runtime': 450.4889, 'train_samples_per_second': 0.366, 'train_steps_per_second': 0.044, 'total_flos': 0.0, 'train_loss': 0.3294578604400158, 'epoch': 0.963855421686747})
TEST ACCURACY: 35.48



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1242,5.709223,3.076041,1.0,2.633183,-131.581055,-112.348526,23.570137,22.350637,0,0,0,0
2,0.2129,5.649047,3.083724,0.875,2.565323,-143.618134,-140.751266,24.046059,22.551025,No Log,No Log,No Log,No Log
3,0.2224,5.038021,3.519158,1.0,1.518863,-126.508835,-109.682449,22.274719,21.934124,No Log,No Log,No Log,No Log
4,0.1948,4.684587,2.823975,1.0,1.860612,-142.972549,-131.724915,22.470781,21.224443,No Log,No Log,No Log,No Log
5,0.1497,6.492857,3.158619,1.0,3.334237,-143.469162,-135.719498,21.954182,22.388632,No Log,No Log,No Log,No Log
6,0.1261,6.726493,3.04902,1.0,3.677473,-163.843994,-146.404526,23.308687,22.922237,No Log,No Log,No Log,No Log
7,0.4173,5.3697,2.726779,0.875,2.642921,-124.414688,-151.676529,23.124704,23.972881,No Log,No Log,No Log,No Log
8,0.2049,5.188752,3.241532,1.0,1.947219,-110.806908,-111.674469,21.590628,22.138355,No Log,No Log,No Log,No Log
9,0.3376,4.354879,2.818409,0.875,1.536469,-125.379341,-149.911484,22.903599,22.08559,No Log,No Log,No Log,No Log
10,0.1114,6.124744,2.908672,1.0,3.216072,-141.635803,-123.10672,22.764435,23.501211,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.77it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:12<00:00,  2.52it/s]
100%|██████████| 31/31 [00:11<00:00,  2.59it/s]
100%|██████████| 31/31 [00:11<00:00,  2.69it/s]



EPOCH NO.6
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.2877471398562193, metrics={'train_runtime': 446.7495, 'train_samples_per_second': 0.369, 'train_steps_per_second': 0.045, 'total_flos': 0.0, 'train_loss': 0.2877471398562193, 'epoch': 0.963855421686747})
TEST ACCURACY: 35.48



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.117,5.519609,2.79513,1.0,2.724479,-133.477203,-115.157639,23.328707,22.120659,0,0,0,0
2,0.1727,5.503609,2.757303,0.875,2.746305,-145.072525,-144.015472,23.828714,22.349749,No Log,No Log,No Log,No Log
3,0.2073,4.894068,3.309603,1.0,1.584465,-127.948357,-111.778,22.074194,21.729532,No Log,No Log,No Log,No Log
4,0.1825,4.442515,2.465422,1.0,1.977093,-145.393265,-135.31044,22.250788,21.004957,No Log,No Log,No Log,No Log
5,0.1363,6.230326,2.800803,1.0,3.429523,-146.094467,-139.297653,21.760824,22.178183,No Log,No Log,No Log,No Log
6,0.0998,6.44068,2.536014,1.0,3.904666,-166.702118,-151.534592,23.089485,22.697699,No Log,No Log,No Log,No Log
7,0.3612,5.058453,2.203314,0.875,2.855138,-127.527161,-156.911163,22.896006,23.75209,No Log,No Log,No Log,No Log
8,0.1834,4.938774,2.847059,1.0,2.091716,-113.306679,-115.619209,21.349297,21.918221,No Log,No Log,No Log,No Log
9,0.2936,4.129729,2.429852,0.875,1.699877,-127.630829,-153.797043,22.685658,21.896725,No Log,No Log,No Log,No Log
10,0.0935,5.970185,2.575792,1.0,3.394393,-143.181412,-126.435509,22.560822,23.277744,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.75it/s]
100%|██████████| 31/31 [00:11<00:00,  2.67it/s]
100%|██████████| 31/31 [00:11<00:00,  2.60it/s]
100%|██████████| 31/31 [00:11<00:00,  2.66it/s]
100%|██████████| 31/31 [00:11<00:00,  2.72it/s]



EPOCH NO.7
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.24873633608222007, metrics={'train_runtime': 441.2513, 'train_samples_per_second': 0.374, 'train_steps_per_second': 0.045, 'total_flos': 0.0, 'train_loss': 0.24873633608222007, 'epoch': 0.963855421686747})
TEST ACCURACY: 35.48



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1082,5.271401,2.419643,1.0,2.851758,-135.95929,-118.912506,23.080751,21.883213,0,0,0,0
2,0.1347,5.306257,2.357791,1.0,2.948465,-147.046036,-148.01059,23.607531,22.137535,No Log,No Log,No Log,No Log
3,0.1927,4.686023,3.01595,1.0,1.670073,-130.028809,-114.714523,21.864943,21.516033,No Log,No Log,No Log,No Log
4,0.1676,4.17193,2.068477,1.0,2.103453,-148.099106,-139.279877,22.025173,20.785183,No Log,No Log,No Log,No Log
5,0.1246,5.933004,2.373988,1.0,3.559017,-149.067688,-143.565826,21.55304,21.960493,No Log,No Log,No Log,No Log
6,0.0777,6.15447,1.962372,1.0,4.192099,-169.564209,-157.270996,22.87212,22.474632,No Log,No Log,No Log,No Log
7,0.2964,4.669416,1.575905,0.875,3.093512,-131.417511,-163.185257,22.663486,23.521835,No Log,No Log,No Log,No Log
8,0.1696,4.609516,2.344971,1.0,2.264544,-116.599274,-120.640083,21.110527,21.694948,No Log,No Log,No Log,No Log
9,0.239,3.858292,1.951655,0.875,1.906637,-130.3452,-158.57901,22.471272,21.704206,No Log,No Log,No Log,No Log
10,0.0794,5.720657,2.106039,1.0,3.614618,-145.676697,-131.133057,22.362417,23.057177,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.78it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:12<00:00,  2.52it/s]
100%|██████████| 31/31 [00:12<00:00,  2.58it/s]
100%|██████████| 31/31 [00:11<00:00,  2.71it/s]



EPOCH NO.8
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.21107738390564917, metrics={'train_runtime': 451.3085, 'train_samples_per_second': 0.366, 'train_steps_per_second': 0.044, 'total_flos': 0.0, 'train_loss': 0.21107738390564917, 'epoch': 0.963855421686747})
TEST ACCURACY: 35.48



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1013,4.980153,1.97063,1.0,3.009522,-138.871765,-123.402634,22.835903,21.649492,0,0,0,0
2,0.1014,5.054706,1.903382,1.0,3.151323,-149.561554,-152.554688,23.393829,21.932251,No Log,No Log,No Log,No Log
3,0.1746,4.436531,2.643969,1.0,1.792562,-132.523727,-118.434341,21.676453,21.327187,No Log,No Log,No Log,No Log
4,0.154,3.933527,1.675814,1.0,2.257713,-150.483139,-143.206528,21.809483,20.578423,No Log,No Log,No Log,No Log
5,0.105,5.67011,1.920286,1.0,3.749824,-151.69664,-148.102844,21.362257,21.759399,No Log,No Log,No Log,No Log
6,0.0587,5.982035,1.450572,1.0,4.531462,-171.288574,-162.389008,22.677738,22.264275,No Log,No Log,No Log,No Log
7,0.2204,4.386685,0.99965,0.875,3.387035,-134.244843,-168.9478,22.454163,23.315277,No Log,No Log,No Log,No Log
8,0.1556,4.369986,1.91172,1.0,2.458266,-118.994568,-124.972595,20.900869,21.498104,No Log,No Log,No Log,No Log
9,0.1851,3.659954,1.508178,1.0,2.151776,-132.328583,-163.013794,22.286589,21.53089,No Log,No Log,No Log,No Log
10,0.0704,5.563365,1.741757,1.0,3.821608,-147.249603,-134.775864,22.188282,22.863617,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.78it/s]
100%|██████████| 31/31 [00:11<00:00,  2.65it/s]
100%|██████████| 31/31 [00:12<00:00,  2.53it/s]
100%|██████████| 31/31 [00:12<00:00,  2.57it/s]
100%|██████████| 31/31 [00:11<00:00,  2.70it/s]



EPOCH NO.9
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.17685388680547476, metrics={'train_runtime': 453.0059, 'train_samples_per_second': 0.364, 'train_steps_per_second': 0.044, 'total_flos': 0.0, 'train_loss': 0.17685388680547476, 'epoch': 0.963855421686747})
TEST ACCURACY: 35.48



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 165 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.0894,4.820049,1.616747,1.0,3.203302,-140.472809,-126.941467,22.612858,21.44529,0,0,0,0
2,0.0749,4.911069,1.511405,1.0,3.399663,-150.997925,-156.474442,23.203903,21.758127,No Log,No Log,No Log,No Log
3,0.1564,4.233766,2.306105,1.0,1.92766,-134.551392,-121.812973,21.508644,21.167587,No Log,No Log,No Log,No Log
4,0.142,3.746406,1.311232,1.0,2.435174,-152.35434,-146.852325,21.624729,20.4034,No Log,No Log,No Log,No Log
5,0.0861,5.447979,1.462006,1.0,3.985973,-153.917953,-152.685638,21.184925,21.574423,No Log,No Log,No Log,No Log
6,0.0438,5.865123,0.95972,1.0,4.905403,-172.457703,-167.297531,22.500893,22.074913,No Log,No Log,No Log,No Log
7,0.1484,4.146733,0.41286,0.875,3.733873,-136.644348,-174.815704,22.247616,23.117924,No Log,No Log,No Log,No Log
8,0.1404,4.13481,1.476271,1.0,2.658539,-121.346329,-129.327087,20.703472,21.31118,No Log,No Log,No Log,No Log
9,0.1378,3.454577,1.006507,1.0,2.44807,-134.382355,-168.030502,22.101763,21.356438,No Log,No Log,No Log,No Log
10,0.0617,5.377591,1.332021,1.0,4.045569,-149.107346,-138.87323,22.022646,22.681459,No Log,No Log,No Log,No Log


100%|██████████| 31/31 [00:11<00:00,  2.76it/s]
100%|██████████| 31/31 [00:11<00:00,  2.64it/s]
100%|██████████| 31/31 [00:12<00:00,  2.52it/s]
100%|██████████| 31/31 [00:12<00:00,  2.58it/s]
100%|██████████| 31/31 [00:11<00:00,  2.69it/s]



EPOCH NO.10
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.14504804648458958, metrics={'train_runtime': 452.228, 'train_samples_per_second': 0.365, 'train_steps_per_second': 0.044, 'total_flos': 0.0, 'train_loss': 0.14504804648458958, 'epoch': 0.963855421686747})
TEST ACCURACY: 32.26

BEST ITERATION: 9


In [22]:
# Use to clear as much GPU RAM as possible
import gc
import torch

del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Make sure to have enough GPU RAM before running this
from unsloth import FastLanguageModel
from datasets import load_from_disk

best_checkpoint_dir = f"checkpoint_iteration_{best_iteration}"

model, tokenizer = FastLanguageModel.from_pretrained(best_checkpoint_dir)

eval_result = run_eval(model, tokenizer, 5)
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")

==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


100%|██████████| 31/31 [00:11<00:00,  2.80it/s]
100%|██████████| 31/31 [00:11<00:00,  2.68it/s]
100%|██████████| 31/31 [00:12<00:00,  2.54it/s]
100%|██████████| 31/31 [00:12<00:00,  2.56it/s]
100%|██████████| 31/31 [00:11<00:00,  2.69it/s]


TEST ACCURACY: 35.48






In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import login
login(token="hf_xxx-xxx")
print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [None]:
model.save_pretrained("model", tokenizer, save_method="default")
model.push_to_hub("t4gandhi/Phi-3-mini-4k-instruct-fine-tuned", tokenizer, save_method="default")

README.md:   0%|          | 0.00/603 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/478M [00:00<?, ?B/s]

Saved model to https://huggingface.co/t4gandhi/Phi-3-mini-4k-instruct-fine-tuned


In [None]:
# Use to clear as much GPU RAM as possible
import gc
import torch

del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

In [None]:
from unsloth import FastLanguageModel

model_name = "t4gandhi/Codellama-7b-bnb-4bit-fine-tuned"
model, tokenizer = FastLanguageModel.from_pretrained(model_name)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

In [None]:
def print_confusion_matrices(confusion_dict):
    for key, values in confusion_dict.items():
        # Calculate total instances
        total = values['TP'] + values['TN'] + values['FP'] + values['FN']

        # Calculate percentages
        tp_percent = (values['TP'] / total)
        tn_percent = (values['TN'] / total)
        fp_percent = (values['FP'] / total)
        fn_percent = (values['FN'] / total)

        # Print the confusion matrix with percentages
        print(f"Confusion Matrix for {key}:")
        print("-------------------------------------------------------")
        print(f"                Predicted Positive   Predicted Negative")
        print(f"Actual Positive           {tp_percent:>8.2f}             {fn_percent:>8.2f}")
        print(f"Actual Negative           {fp_percent:>8.2f}             {tn_percent:>8.2f}")
        print("-------------------------------------------------------")
        print(f"Combined                  {tp_percent+fp_percent:>8.2f}             {tn_percent+fn_percent:>8.2f}")
        print("-------------------------------------------------------\n")

In [None]:
import pandas as pd

eval_result, stats = run_eval(model, tokenizer, 5, get_stats = True)
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")
print_confusion_matrices(stats)

In [None]:
def preliminary_stats(dataset):
  NUM_ITEMS = len(dpo_datasets[dataset])
  ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
  reward_model_chosen = dict.fromkeys(ROUNDS, 0)
  reward_model_rejected = dict.fromkeys(ROUNDS, 0)
  reward_model_ratio = dict.fromkeys(ROUNDS, 0)
  for i in range(NUM_ITEMS):
    chosen_round = datasets_dpo[dataset][i]["metadata"][0]['chosen']
    rejected_round = datasets_dpo[dataset][i]["metadata"][0]['rejected']

    reward_model_chosen[chosen_round] += 1
    reward_model_rejected[rejected_round] += 1

  for ROUND in ROUNDS:
    reward_model_ratio[ROUND] = reward_model_chosen[ROUND] / (reward_model_chosen[ROUND] + reward_model_rejected[ROUND])

  return reward_model_ratio

In [None]:
import pandas as pd

prelim_ratio_train = preliminary_stats('train')
df_prelim_ratio_train = pd.DataFrame(list(prelim_ratio_train.items()), columns=["Round", "Chosen"])
df_prelim_ratio_train

Unnamed: 0,Round,Chosen
0,rd1,0.367089
1,rd2,0.407407
2,rd3,0.3875
3,custom,0.8


In [None]:
import pandas as pd

prelim_ratio_test = preliminary_stats('test')
df_prelim_ratio_test = pd.DataFrame(list(prelim_ratio_test.items()), columns=["Round", "Chosen"])
df_prelim_ratio_test