### DPO Fine-Tuning - Codellama_7B bnb 4bit

> **Model Info**

- Model Name: CodeLlama-7B bnb (4-bit quantized)
- Accuracy: 80.65%

> **Training Info**

- GPU Type: A100
- Time: ~50 mins
- GPU RAM: 10.4 GB

In [None]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length handed to the model loader; the same value is reused by the SFT trainer below.
max_seq_length = 4096
# NOTE(review): None presumably lets Unsloth pick the dtype automatically — confirm against the Unsloth docs.
dtype = None
# Load the pre-quantized 4-bit (bitsandbytes) checkpoint.
load_in_4bit = True
# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/codellama-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [None]:

import os
import re
from typing import List, Literal, Optional
from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError

# Jinja chat template: renders each message as "<|role|>\n" + content + eos_token for the
# user / system / assistant roles, and appends "<|assistant|>" after the last message when
# `add_generation_prompt` is set. It is not referenced by the code visible in this section.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

def _codellama_prompt(user_message_content: str, system_message_content: str = "") -> str:
    """Render the custom CodeLlama instruction prefix, up to and including ``[/INST]``."""
    return f"""<s>[INST] <<SYS>>
{system_message_content}
<</SYS>>

{user_message_content}
[/INST]"""


def apply_chat_template(
    example, tokenizer, task: Literal["sft", "generation", "rm", "dpo"] = "sft", assistant_prefix="<|assistant|>\n"
):
    """
    Add task-specific text fields to a single dataset example.

    The function writes new keys into `example` and returns the same object, which is the
    shape `datasets.Dataset.map` expects.

    Args:
        example (`dict`):
            For `sft` / `generation`: must contain `messages`, where `messages[0]` is the user
            turn and `messages[1]` is the assistant turn.
            For `rm` / `dpo`: must contain `chosen` and `rejected` message lists; for `dpo`,
            index 0 is read as the user turn and index 1 as the assistant turn.
        tokenizer:
            Only used by the `rm` task, which calls `tokenizer.apply_chat_template`.
        task (`str`, *optional*, defaults to `"sft"`):
            One of `sft`, `generation`, `rm`, `dpo`.
        assistant_prefix (`str`, *optional*):
            Currently unused; kept so existing callers that pass it keep working.

    Returns:
        `dict`: `example` with the following keys added:
            - `sft` / `generation`: `text`
            - `rm`: `text_chosen`, `text_rejected`
            - `dpo`: `text_prompt`, `text_chosen`, `text_rejected`

    Raises:
        ValueError: If `task` is unknown, or `chosen` / `rejected` are missing for `rm` / `dpo`.
    """
    if task in ["sft", "generation"]:
        messages = example["messages"]
        # Custom prompt template for Codellama (empty system message).
        user_message_content = messages[0]["content"]
        assistant_message_content = messages[1]["content"]
        example["text"] = f"{_codellama_prompt(user_message_content)} {assistant_message_content}</s>"
    elif task == "rm":
        if not all(k in example.keys() for k in ("chosen", "rejected")):
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        # Work on shallow copies so the caller's message lists are not modified in place.
        chosen_messages = list(example["chosen"])
        rejected_messages = list(example["rejected"])
        # Add an empty system message if there is none.
        if chosen_messages[0]["role"] != "system":
            chosen_messages.insert(0, {"role": "system", "content": ""})
        if rejected_messages[0]["role"] != "system":
            rejected_messages.insert(0, {"role": "system", "content": ""})
        example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
        example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
    elif task == "dpo":
        if not all(k in example.keys() for k in ("chosen", "rejected")):
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
        # The prompt is taken from the chosen conversation's first message.
        user_message_content = example["chosen"][0]["content"]
        chosen_message_content = example["chosen"][1]["content"]
        rejected_message_content = example["rejected"][1]["content"]
        example["text_prompt"] = _codellama_prompt(user_message_content)
        # The leading space separates the completion from the closing `[/INST]` of the prompt.
        example["text_chosen"] = f" {chosen_message_content}</s>"
        example["text_rejected"] = f" {rejected_message_content}</s>"
    else:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo']}"
        )
    return example


def get_datasets(
    data_config: dict,
    splits: Optional[List[str]] = None,
    shuffle: bool = True,
) -> DatasetDict:
    """
    Loads one or more datasets with varying training set proportions.

    Args:
        data_config (`dict`):
            Mapping of dataset name (or local path) to the fraction of its training split to keep.
        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns
        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.

    Raises:
        ValueError: If `data_config` is not a dict.
    """
    # A fresh list per call avoids sharing one mutable default between invocations.
    if splits is None:
        splits = ["train", "test"]

    if not isinstance(data_config, dict):
        raise ValueError(f"Data config {data_config} not recognized.")

    # Structure of the input is:
    #     dataset_mixer = {
    #             "dataset1": 0.5,
    #             "dataset2": 0.3,
    #             "dataset3": 0.2,
    #         }
    dataset_mixer = data_config

    return mix_datasets(dataset_mixer, splits=splits, shuffle=shuffle)


def mix_datasets(dataset_mixer: dict, splits: Optional[List[str]] = None, shuffle=True) -> DatasetDict:
    """
    Loads and mixes datasets according to proportions specified in `dataset_mixer`.

    Args:
        dataset_mixer (`dict`):
            Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1.
        splits (Optional[List[str]], *optional*, defaults to `None`):
            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
            `None` is treated as `['train', 'test']`.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the training and testing/validation data.

    Returns:
        [`DatasetDict`]: Contains a `train` and/or `test` entry, depending on the requested splits.

    Raises:
        ValueError: If a fraction is negative, a split name contains neither `train` nor `test`,
            or no dataset could be assembled.
    """
    if splits is None:
        splits = ["train", "test"]

    # Validate the cheap inputs first so nothing is downloaded or read from disk on bad arguments.
    if any(frac < 0 for frac in dataset_mixer.values()):
        raise ValueError("Dataset fractions cannot be negative.")
    for split in splits:
        if "train" not in split and "test" not in split:
            raise ValueError(f"Split type {split} not recognized as one of test or train.")

    raw_datasets = DatasetDict()
    raw_train_datasets = []
    raw_val_datasets = []
    # One fraction per *train split actually loaded*, so it stays aligned with
    # `raw_train_datasets` even when several train splits are requested per dataset.
    train_fracs = []
    for ds, frac in dataset_mixer.items():
        for split in splits:
            try:
                # Try first if dataset on a Hub repo
                dataset = load_dataset(ds, split=split)
            except DatasetGenerationError:
                # If not, check local dataset
                dataset = load_from_disk(os.path.join(ds, split))

            if "train" in split:
                raw_train_datasets.append(dataset)
                train_fracs.append(frac)
            else:
                raw_val_datasets.append(dataset)

    if raw_train_datasets:
        # Keep the leading `frac` share of each training set.
        train_subsets = [
            dataset.select(range(int(frac * len(dataset))))
            for dataset, frac in zip(raw_train_datasets, train_fracs)
        ]
        mixed_train = concatenate_datasets(train_subsets)
        raw_datasets["train"] = mixed_train.shuffle(seed=42) if shuffle else mixed_train
    # No subsampling for test datasets to enable fair comparison across models
    if raw_val_datasets:
        mixed_test = concatenate_datasets(raw_val_datasets)
        raw_datasets["test"] = mixed_test.shuffle(seed=42) if shuffle else mixed_test

    if not raw_datasets:
        # Report the requested splits; the loop variable may never have been bound at this point.
        raise ValueError(
            f"Dataset {dataset_mixer} not recognized with split {splits}. Check the dataset has been correctly formatted."
        )

    return raw_datasets

<a name="Data"></a>
### Data Prep


In [None]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the raw code-correction dataset and view each split as a DataFrame for row-wise processing.
dataset = load_dataset('t4gandhi/code_correction_using_LLM')
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df_parts = [df_train, df_test]

# Accumulators for the SFT examples (one `messages` conversation per record), filled by the loop below.
df_sft_train = []
df_sft_test = []
df_sft_parts = [df_sft_train, df_sft_test]

# Accumulators for the DPO examples (prompt / chosen / rejected / metadata per record).
df_dpo_train = []
df_dpo_test = []
df_dpo_parts = [df_dpo_train, df_dpo_test]

# Score columns are named PREFIX + ROUND (e.g. 'score_s1_rd1'); analysis columns are 'analysis_' + ROUND.
PREFIXS = ['score_s1_', 'score_s2_', 'score_s3_', 'score_s4_', 'score_s5_', 'score_s6_']
ROUNDS = ['rd1', 'rd2', 'rd3', 'custom']
# Every unordered pair of rounds; each pair with unequal scores yields one preference example.
PAIRS = [('rd1', 'rd2'), ('rd1', 'rd3'), ('rd1', 'custom'), ('rd2', 'rd3'), ('rd2', 'custom'), ('rd3', 'custom')]

def indent_lines(string: str) -> str:
  """Prefix every line of `string` with four spaces and join the lines with newlines.

  Lines are split with `str.splitlines`, so a trailing line terminator is not preserved.
  """
  pad = ' ' * 4
  padded = [f'{pad}{text}' for text in string.splitlines()]
  return '\n'.join(padded)

# The instruction text is identical for every row, so it is built once instead of once per row.
instruction = """<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>"""

# For every source row, turn each pair of analysis rounds with different scores into
# one SFT example (the better analysis) and one DPO example (better vs. worse analysis).
for df, df_sft, df_dpo in zip(df_parts, df_sft_parts, df_dpo_parts):
    for _, row in df.iterrows():
        prompt = row['prompt']
        result = row['result']
        # The buggy program is the prompt followed by the indented result body.
        full_solution = "<buggy_code>\n" + (prompt + indent_lines(result)).strip('\n') + "\n</buggy_code>"
        full_instruction = instruction + "\n" + full_solution

        # Aggregate the per-criterion scores of each round into a single score.
        solutions_info = {}
        for ROUND in ROUNDS:
            # row[col][0] reads the first element of the score cell.
            # NOTE(review): presumably the leading digit of a string score, and 42 the
            # maximum attainable total — confirm against the dataset card.
            total_score = sum(int(row[PREFIX + ROUND][0]) for PREFIX in PREFIXS) / 42
            solutions_info[ROUND] = {
                'analysis': row['analysis_' + ROUND],
                'score': total_score,
            }

        for ROUND1, ROUND2 in PAIRS:
            round1_score = solutions_info[ROUND1]['score']
            round2_score = solutions_info[ROUND2]['score']
            # A tie carries no preference signal.
            if round1_score == round2_score:
                continue

            # Decide the winner once and derive everything else from it.
            if round1_score > round2_score:
                chosen_round, rejected_round = ROUND1, ROUND2
            else:
                chosen_round, rejected_round = ROUND2, ROUND1
            chosen_analysis = solutions_info[chosen_round]['analysis']
            rejected_analysis = solutions_info[rejected_round]['analysis']

            messages_info = {
                'messages': [
                    {'content': full_instruction, 'role': 'user'},
                    {'content': chosen_analysis, 'role': 'assistant'},
                ],
            }
            pairwise_info = {
                'prompt': full_instruction,
                'chosen': [
                    {'content': full_instruction, 'role': 'user'},
                    {'content': chosen_analysis, 'role': 'assistant'},
                ],
                'rejected': [
                    {'content': full_instruction, 'role': 'user'},
                    {'content': rejected_analysis, 'role': 'assistant'},
                ],
                # Not part of training data, only for analysis.
                # Deliberately a 1-tuple: the previous code produced one through a trailing
                # comma, and run_eval reads it back as metadata[0].
                'metadata': ({'chosen': chosen_round, 'rejected': rejected_round},),
            }
            df_sft.append(messages_info)
            df_dpo.append(pairwise_info)

# Turn the accumulated record lists into DataFrames, convert those into Hugging Face
# datasets, and bundle each task's train/test pair into a DatasetDict.
df_sft_train, df_sft_test = pd.DataFrame(df_sft_train), pd.DataFrame(df_sft_test)
df_dpo_train, df_dpo_test = pd.DataFrame(df_dpo_train), pd.DataFrame(df_dpo_test)

dataset_sft_train, dataset_sft_test = Dataset.from_pandas(df_sft_train), Dataset.from_pandas(df_sft_test)
dataset_dpo_train, dataset_dpo_test = Dataset.from_pandas(df_dpo_train), Dataset.from_pandas(df_dpo_test)

datasets_sft = DatasetDict({'train': dataset_sft_train, 'test': dataset_sft_test})
datasets_dpo = DatasetDict({'train': dataset_dpo_train, 'test': dataset_dpo_test})

README.md:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/93.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
datasets_sft

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 165
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 31
    })
})

In [None]:
datasets_dpo

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'metadata'],
        num_rows: 31
    })
})

In [None]:
# Remember the original columns so they can be dropped after formatting.
column_names = list(datasets_sft['train'].features)

# Render every conversation into a single training string; with the original columns
# removed, only the `text` field written by apply_chat_template(task="sft") remains.
sft_datasets = datasets_sft.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "sft"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [None]:
sft_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 165
    })
    test: Dataset({
        features: ['text'],
        num_rows: 31
    })
})

In [None]:
print(sft_datasets['train'][0]['text'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

In [None]:
# Remember the original columns (prompt / chosen / rejected / metadata) so they can be dropped.
column_names = list(datasets_dpo['train'].features)

# Render each preference pair into the text_prompt / text_chosen / text_rejected strings
# produced by apply_chat_template(task="dpo").
dpo_datasets = datasets_dpo.map(
    apply_chat_template,
    fn_kwargs = {"tokenizer": tokenizer, "task": "dpo"},
    num_proc = 12,
    remove_columns = column_names,
    desc = "Formatting comparisons with prompt template",
)

# Rename the text_* columns to prompt / chosen / rejected, the names the DPO trainer below is fed with.
dpo_datasets = dpo_datasets.rename_columns(
    {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
)

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/165 [00:00<?, ? examples/s]

Formatting comparisons with prompt template (num_proc=12):   0%|          | 0/31 [00:00<?, ? examples/s]

In [None]:
dpo_datasets

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 165
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 31
    })
})

In [None]:
print("=" * 10 + "PROMPT" + "=" * 10)
print(dpo_datasets['train'][0]['prompt'])
print("=" * 10 + "CHOSEN" + "=" * 10)
print(dpo_datasets['train'][0]['chosen'])
print("=" * 10 + "REJECTED" + "=" * 10)
print(dpo_datasets['train'][0]['rejected'])

<s>[INST] <<SYS>>

<</SYS>>

<instruction>
  <bullets>
    <bullet>The following buggy code is a wrong implementation that contains one or more bugs.</bullet>
    <bullet>Firstly, find all of the bugs within the buggy code. Make sure to quotate each part of the buggy code that contains a bug.</bullet>
    <bullet>Afterwards, for each of the bugs, describe the issue with each part of the buggy code with the bug, and outline how to fix the issue.</bullet>
    <bullet>Make sure your answer covers (1) all of the existing bugs, (2) do not hallucinate non-existing bugs, and (3) be concise as possible.</bullet>
    <bullet>IMPORTANT!: While abiding by the above instructions, keep your answer as brief as possible.</bullet>
  </bullets>
</instruction>
<buggy_code>
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output th

We now add LoRA adapters so that only 1 to 10% of all parameters need to be updated!



In [None]:
# Wrap the 4-bit base model with LoRA adapters on the attention and MLP projection layers.
# The log line below this cell reports that all 32 layers were patched.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # LoRA rank. Choose any number > 0; suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Train the SFT model

In [None]:
import os
os.environ["WANDB_MODE"] = "disabled"
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning stage, run before DPO on the same LoRA-wrapped model.
# Effective batch size is 2 * 4 = 8; the training log reports 20 optimisation steps for the 165 examples.
sft_trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_datasets['train'],
    # eval_dataset = sft_datasets['test'], # Uncomment to run eval
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Packing can make training faster for short sequences (per Unsloth); disabled here.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # NOTE(review): the DPO trainer below writes to the same "outputs" directory.
        output_dir = "outputs",
        # evaluation_strategy = "steps", # Uncomment to run eval
        # eval_steps = 1, # Uncomment to run eval
    ),
)

Map (num_proc=2):   0%|          | 0/165 [00:00<?, ? examples/s]

In [None]:
sft_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss
1,1.3913
2,1.3593
3,1.4045
4,1.4003
5,1.2489
6,1.1875
7,1.078
8,0.965
9,0.7763
10,0.6237


TrainOutput(global_step=20, training_loss=0.8479032337665557, metrics={'train_runtime': 77.998, 'train_samples_per_second': 2.115, 'train_steps_per_second': 0.256, 'total_flos': 4603914026729472.0, 'train_loss': 0.8479032337665557, 'epoch': 0.963855421686747})

In [None]:
# One must patch the DPO Trainer first!
from unsloth import PatchDPOTrainer
PatchDPOTrainer()

In [None]:
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

# DPO stage on the SFT-tuned LoRA model, using the prompt / chosen / rejected text columns.
dpo_trainer = DPOTrainer(
    model = model,
    # NOTE(review): with a PEFT model and no explicit reference model, TRL presumably uses the
    # adapter-disabled base model as the reference — confirm against the TRL docs.
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 5e-6,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        # NOTE(review): same directory as the SFT trainer above.
        output_dir = "outputs",
    ),
    beta = 0.1,
    train_dataset = dpo_datasets['train'],
    tokenizer = tokenizer,
    # NOTE(review): the prompt contains the full instruction block plus the buggy code, so it may
    # exceed 512 tokens and be truncated — verify against the tokenized prompt lengths.
    max_length = 1024,
    max_prompt_length = 512,
)

Extracting prompt from train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/165 [00:00<?, ? examples/s]

In [None]:
from tqdm import tqdm
import torch

def run_eval(model, tokenizer, no_iter, get_stats = False):
    """
    Measure how often the model scores the chosen answer above the rejected one.

    Reads the unformatted test pairs from the module-level `datasets_dpo['test']`, formats
    each pair with `apply_chat_template(task="dpo")`, and compares a scalar score for the
    chosen and the rejected completion.

    NOTE(review): the score is the mean of all output logits for the completion text alone
    (the prompt is not fed to the model). This is not a sequence log-probability or a DPO
    reward, so the resulting "accuracy" should be read with care. The scoring is kept as-is
    so results stay comparable with earlier runs; consider switching to the summed
    log-probability of the completion given the prompt.

    Args:
        model: Causal LM whose forward pass returns an object with `.logits`.
        tokenizer: Tokenizer matching `model`.
        no_iter (`int`): Number of full passes over the test set.
        get_stats (`bool`, *optional*, defaults to `False`):
            When true, also return the per-round TP / TN / FP / FN counts.

    Returns:
        `float`, or `(float, dict)` when `get_stats` is true: the fraction of comparisons
        in which the chosen completion scored strictly higher than the rejected one.
    """
    # Use the same dataset for the length and the row lookups.
    test_rows = datasets_dpo['test']
    num_items = len(test_rows)
    num_chosen = 0
    rounds = ['rd1', 'rd2', 'rd3', 'custom']
    stats = {name: {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0} for name in rounds}

    # Tokenized tensors are created on CPU; move them to wherever the model's weights live.
    device = next(model.parameters()).device

    for _ in range(no_iter):
        for i in tqdm(range(num_items)):
            # Fetch the row once instead of once per field.
            row = test_rows[i]
            pair = {"chosen": row["chosen"], "rejected": row["rejected"]}

            # The data-prep cell stores metadata as a one-element sequence; also accept a plain dict.
            metadata = row["metadata"]
            if not isinstance(metadata, dict):
                metadata = metadata[0]
            chosen_round = metadata['chosen']
            rejected_round = metadata['rejected']

            # Apply the chat template to format the input
            formatted = apply_chat_template(pair, tokenizer, task="dpo")

            # Tokenize the inputs
            inputs_chosen = tokenizer(
                formatted["text_chosen"], return_tensors="pt", padding=True, truncation=True
            ).to(device)
            inputs_rejected = tokenizer(
                formatted["text_rejected"], return_tensors="pt", padding=True, truncation=True
            ).to(device)

            # Generate the scalar score values (see the NOTE in the docstring).
            with torch.no_grad():
                reward_chosen = model(**inputs_chosen).logits.mean().item()
                reward_rejected = model(**inputs_rejected).logits.mean().item()

            if reward_chosen > reward_rejected:
                # Model chose correctly
                num_chosen += 1
                stats[chosen_round]['TP'] += 1
                stats[rejected_round]['TN'] += 1
            else:
                # Model chose wrongly (a tie also counts as wrong)
                stats[chosen_round]['FN'] += 1
                stats[rejected_round]['FP'] += 1

    accuracy = num_chosen / (no_iter * num_items)
    if get_stats:
        return accuracy, stats

    return accuracy

In [None]:
# Run ten successive DPO rounds, evaluating and checkpointing after each one.
# Every dpo_trainer.train() call is a complete 1-epoch run (the logs report "Total steps = 20"
# per call) on the same trainer and model, so the adapter keeps training across rounds.
# NOTE(review): the LR warmup/decay schedule presumably restarts on every call — confirm this is intended.
best_iteration = 1
best_eval_result = 0

for i in range(1, 11):
    # Train the model
    training_result = dpo_trainer.train()
    # Preference accuracy on the test pairs; run_eval repeats the pass 5 times.
    eval_result = run_eval(model, tokenizer, 5)
    # `>=` means that on a tie the latest iteration is recorded as the best one.
    if eval_result >= best_eval_result:
        best_eval_result = eval_result
        best_iteration = i

    # Create a unique checkpoint directory for each iteration
    checkpoint_dir = f"checkpoint_iteration_{i}"
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Save the model for this iteration into its own directory.
    dpo_trainer.save_model(checkpoint_dir)  # Save model and tokenizer
    # NOTE(review): save_state() is called without a path; per the Transformers docs it presumably
    # writes trainer_state.json under args.output_dir ("outputs"), not checkpoint_dir, and does not
    # store optimizer/scheduler state — confirm.
    dpo_trainer.save_state()

    print(f"\nEPOCH NO.{i}")
    print(f"TRAINING RESULT: {training_result}")
    print(f"TEST ACCURACY: {eval_result * 100:.2f}\n")

# Only the index is reported; reload the matching checkpoint_iteration_{best_iteration} directory to use it.
print(f"BEST ITERATION: {best_iteration}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.2506,5.30711,3.966628,1.0,1.340482,-193.982605,-213.674225,-2.903353,-2.868697
2,0.4061,5.59167,4.365859,0.75,1.225811,-158.772369,-154.404022,-2.886623,-2.812683
3,0.4871,4.800447,3.965134,0.625,0.835312,-181.912262,-181.176254,-2.854495,-2.887854
4,0.5476,4.807979,4.145783,0.875,0.662196,-158.521576,-152.284988,-2.885909,-2.859079
5,0.4994,5.006547,4.215396,0.625,0.79115,-194.810699,-213.587311,-2.85809,-2.832078
6,0.5367,4.994947,4.435511,0.75,0.559436,-170.008087,-163.130264,-2.882138,-2.900426
7,0.5822,4.837763,4.315793,0.625,0.52197,-151.172302,-118.192665,-2.863236,-2.842804
8,0.8577,4.452503,4.743552,0.125,-0.291049,-171.145996,-163.109299,-2.919805,-2.93519
9,0.5842,4.590475,4.123341,0.75,0.467133,-166.162918,-167.644104,-2.806245,-2.833642
10,0.6849,5.015836,3.993028,0.625,1.022807,-142.562531,-160.364563,-2.926992,-2.896092


100%|██████████| 31/31 [00:07<00:00,  4.11it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]
100%|██████████| 31/31 [00:07<00:00,  4.19it/s]
100%|██████████| 31/31 [00:07<00:00,  4.18it/s]
100%|██████████| 31/31 [00:07<00:00,  4.21it/s]



EPOCH NO.1
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.5545015141367913, metrics={'train_runtime': 96.4823, 'train_samples_per_second': 1.71, 'train_steps_per_second': 0.207, 'total_flos': 0.0, 'train_loss': 0.5545015141367913, 'epoch': 0.963855421686747})
TEST ACCURACY: 54.84



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.1762,5.63015,3.851845,1.0,1.778305,-195.130447,-210.443817,-2.901071,-2.866872
2,0.3054,5.767287,4.293439,0.875,1.473848,-159.496567,-152.647858,-2.887639,-2.811082
3,0.3411,5.051221,3.757944,0.875,1.293276,-183.984161,-178.668518,-2.850808,-2.885621
4,0.4326,4.990325,4.062894,0.875,0.927431,-159.350464,-150.461517,-2.887431,-2.861721
5,0.3985,5.081979,4.030141,0.875,1.051837,-196.663239,-212.832977,-2.85934,-2.832263
6,0.4061,5.070999,4.161424,0.75,0.909575,-172.748962,-162.369736,-2.883194,-2.90562
7,0.5112,5.024442,4.279402,0.625,0.74504,-151.536209,-116.325874,-2.863836,-2.843528
8,0.674,4.715734,4.648864,0.625,0.06687,-172.092865,-160.47699,-2.921726,-2.94027
9,0.4995,4.705896,4.025064,0.75,0.680832,-167.145691,-166.489899,-2.80485,-2.832885
10,0.5995,5.14929,3.844734,0.625,1.304556,-144.045486,-159.029999,-2.926202,-2.897648


100%|██████████| 31/31 [00:07<00:00,  4.17it/s]
100%|██████████| 31/31 [00:07<00:00,  4.16it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]
100%|██████████| 31/31 [00:07<00:00,  4.19it/s]



EPOCH NO.2
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.43556120693683625, metrics={'train_runtime': 95.4562, 'train_samples_per_second': 1.729, 'train_steps_per_second': 0.21, 'total_flos': 0.0, 'train_loss': 0.43556120693683625, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.1461,5.77847,3.681781,1.0,2.096689,-196.831085,-208.960617,-2.902444,-2.868958
2,0.2544,5.841553,4.181928,1.0,1.659625,-160.611664,-151.905197,-2.891701,-2.814293
3,0.271,5.165788,3.507775,1.0,1.658013,-186.48584,-177.522858,-2.850845,-2.888074
4,0.3567,5.09555,3.955506,0.875,1.140043,-160.424362,-149.409271,-2.892521,-2.870017
5,0.3378,5.122972,3.839823,1.0,1.283148,-198.566422,-212.42305,-2.865057,-2.837203
6,0.3016,5.022133,3.740224,1.0,1.28191,-176.960968,-162.858398,-2.887934,-2.914616
7,0.4455,5.062542,4.042381,0.75,1.020161,-153.906418,-115.94487,-2.867877,-2.84799
8,0.5583,4.687741,4.317893,0.75,0.369848,-175.402588,-160.756927,-2.924643,-2.945422
9,0.3804,4.593785,3.56646,0.875,1.027324,-171.73172,-167.611008,-2.808407,-2.837379
10,0.5716,4.98065,3.431286,0.625,1.549365,-148.179962,-160.716415,-2.927865,-2.90161


100%|██████████| 31/31 [00:07<00:00,  4.12it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]
100%|██████████| 31/31 [00:07<00:00,  4.18it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]



EPOCH NO.3
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.3596261844038963, metrics={'train_runtime': 95.7094, 'train_samples_per_second': 1.724, 'train_steps_per_second': 0.209, 'total_flos': 0.0, 'train_loss': 0.3596261844038963, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.1069,5.73123,3.290581,1.0,2.440649,-200.743073,-209.433014,-2.906956,-2.873208
2,0.2072,5.695544,3.79121,1.0,1.904334,-164.518845,-153.365295,-2.892925,-2.817758
3,0.2391,5.031584,3.053258,1.0,1.978326,-191.031021,-178.864868,-2.852827,-2.89273
4,0.2931,5.046929,3.638839,0.875,1.408091,-163.591019,-149.895477,-2.897213,-2.880818
5,0.2618,4.978371,3.361972,1.0,1.616399,-203.344925,-213.869049,-2.872773,-2.846043
6,0.2204,4.781211,3.045397,1.0,1.735814,-183.909225,-165.267609,-2.892185,-2.920636
7,0.3657,4.937378,3.582436,0.875,1.354942,-158.505875,-117.196518,-2.872374,-2.852571
8,0.4647,4.531354,3.86655,0.75,0.664805,-179.916,-162.320786,-2.922921,-2.943712
9,0.2955,4.319256,2.970328,1.0,1.348928,-177.693054,-170.356293,-2.811801,-2.84129
10,0.5642,4.680623,2.901174,0.625,1.779449,-153.481079,-163.716675,-2.923684,-2.901348


100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]



EPOCH NO.4
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.29692329093813896, metrics={'train_runtime': 95.6052, 'train_samples_per_second': 1.726, 'train_steps_per_second': 0.209, 'total_flos': 0.0, 'train_loss': 0.29692329093813896, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0769,5.516574,2.75082,1.0,2.765754,-206.140686,-211.579575,-2.911385,-2.87734
2,0.1639,5.438207,3.295841,1.0,2.142365,-169.472549,-155.93866,-2.887348,-2.816205
3,0.2031,4.773897,2.478569,1.0,2.295329,-196.777924,-181.441742,-2.854529,-2.897289
4,0.2343,4.878754,3.176173,1.0,1.702581,-168.217682,-151.57724,-2.89574,-2.887647
5,0.2055,4.802804,2.827241,1.0,1.975563,-208.692245,-215.624725,-2.877711,-2.854881
6,0.169,4.443157,2.338859,1.0,2.104298,-190.974609,-168.648148,-2.893623,-2.921184
7,0.2918,4.748788,3.029847,0.875,1.718942,-164.031769,-119.082405,-2.873716,-2.853634
8,0.3892,4.281743,3.318434,0.875,0.963309,-185.397186,-164.816895,-2.918553,-2.937844
9,0.2223,4.045121,2.32111,1.0,1.724011,-184.185242,-173.097641,-2.814389,-2.842844
10,0.5775,4.273294,2.295206,0.625,1.978089,-159.540771,-167.789963,-2.919053,-2.900975


100%|██████████| 31/31 [00:07<00:00,  4.12it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]
100%|██████████| 31/31 [00:07<00:00,  4.13it/s]
100%|██████████| 31/31 [00:07<00:00,  4.12it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]



EPOCH NO.5
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.2509076833724976, metrics={'train_runtime': 95.4976, 'train_samples_per_second': 1.728, 'train_steps_per_second': 0.209, 'total_flos': 0.0, 'train_loss': 0.2509076833724976, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0601,5.191169,2.195123,1.0,2.996046,-211.697662,-214.833633,-2.91226,-2.877587
2,0.1284,5.08994,2.687565,1.0,2.402374,-175.555298,-159.421326,-2.881278,-2.81485
3,0.1745,4.439051,1.793117,1.0,2.645934,-203.632431,-184.790207,-2.856821,-2.899992
4,0.1856,4.686043,2.66428,1.0,2.021763,-173.336624,-153.504333,-2.89332,-2.890378
5,0.164,4.598601,2.241637,1.0,2.356964,-214.548279,-217.666748,-2.881524,-2.862021
6,0.1195,4.08879,1.58059,1.0,2.5082,-198.557297,-172.191818,-2.895761,-2.921496
7,0.2274,4.512155,2.397614,1.0,2.114541,-170.354095,-121.448746,-2.875613,-2.853079
8,0.3216,3.919311,2.671945,0.875,1.247366,-191.862061,-168.441223,-2.91466,-2.93237
9,0.17,3.654751,1.577344,1.0,2.077406,-191.622894,-177.001343,-2.816545,-2.845595
10,0.5723,3.880297,1.690582,0.625,2.189715,-165.587006,-171.71994,-2.912557,-2.898401


100%|██████████| 31/31 [00:07<00:00,  4.07it/s]
100%|██████████| 31/31 [00:07<00:00,  4.08it/s]
100%|██████████| 31/31 [00:07<00:00,  4.22it/s]
100%|██████████| 31/31 [00:07<00:00,  4.20it/s]
100%|██████████| 31/31 [00:07<00:00,  4.20it/s]



EPOCH NO.6
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.21254595164209605, metrics={'train_runtime': 95.7164, 'train_samples_per_second': 1.724, 'train_steps_per_second': 0.209, 'total_flos': 0.0, 'train_loss': 0.21254595164209605, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0444,4.960868,1.627748,1.0,3.33312,-217.371429,-217.136658,-2.910846,-2.875871
2,0.1,4.895959,2.244983,1.0,2.650976,-179.981125,-161.36113,-2.875487,-2.812785
3,0.1418,4.135971,1.193655,1.0,2.942317,-209.62706,-187.821014,-2.85724,-2.899963
4,0.1464,4.517138,2.208243,1.0,2.308895,-177.896973,-155.19339,-2.888335,-2.886362
5,0.1368,4.314837,1.660196,1.0,2.654641,-220.362686,-220.504395,-2.884171,-2.866472
6,0.0895,3.753989,0.839685,1.0,2.914305,-205.966324,-175.539825,-2.895422,-2.92024
7,0.1697,4.304029,1.771039,1.0,2.532991,-176.619843,-123.530006,-2.873803,-2.848365
8,0.2749,3.701735,2.191679,0.875,1.510057,-196.664734,-170.616974,-2.911285,-2.927099
9,0.1305,3.267827,0.832854,1.0,2.434973,-199.06778,-180.87059,-2.817526,-2.84545
10,0.5766,3.579271,1.185469,0.625,2.393802,-170.638123,-174.730194,-2.909019,-2.896666


100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.18it/s]
100%|██████████| 31/31 [00:07<00:00,  4.18it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]
100%|██████████| 31/31 [00:07<00:00,  4.16it/s]



EPOCH NO.7
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.18059970941394568, metrics={'train_runtime': 95.9267, 'train_samples_per_second': 1.72, 'train_steps_per_second': 0.208, 'total_flos': 0.0, 'train_loss': 0.18059970941394568, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0341,4.673089,1.030348,1.0,3.64274,-223.345398,-220.014435,-2.908827,-2.874626
2,0.0726,4.736183,1.749596,1.0,2.986588,-184.934998,-162.958893,-2.870054,-2.81032
3,0.1087,3.855455,0.556983,1.0,3.298472,-215.993774,-190.626175,-2.85724,-2.89707
4,0.1192,4.339363,1.745813,1.0,2.593551,-182.521286,-156.97113,-2.882494,-2.880196
5,0.1096,4.018224,1.036468,1.0,2.981755,-226.59996,-223.470535,-2.885712,-2.869288
6,0.0653,3.389204,0.084167,1.0,3.305037,-213.521515,-179.187683,-2.893999,-2.917671
7,0.1273,4.055155,1.077724,1.0,2.977431,-183.553009,-126.018745,-2.871227,-2.843984
8,0.2298,3.401485,1.587562,1.0,1.813923,-202.705887,-173.619476,-2.908356,-2.921052
9,0.096,2.885549,0.087543,1.0,2.798006,-206.520905,-184.693359,-2.818143,-2.844071
10,0.5409,3.270439,0.588226,0.625,2.682213,-176.610565,-177.818527,-2.903396,-2.893179


100%|██████████| 31/31 [00:07<00:00,  4.09it/s]
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]



EPOCH NO.8
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.1507317777723074, metrics={'train_runtime': 95.4397, 'train_samples_per_second': 1.729, 'train_steps_per_second': 0.21, 'total_flos': 0.0, 'train_loss': 0.1507317777723074, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0247,4.390497,0.383487,1.0,4.00701,-229.814026,-222.840347,-2.905913,-2.873454
2,0.0552,4.551022,1.254376,1.0,3.296646,-189.887192,-164.810501,-2.865306,-2.807868
3,0.0735,3.59016,-0.122089,1.0,3.712249,-222.784485,-193.279114,-2.85799,-2.894015
4,0.0963,4.148123,1.292858,1.0,2.855264,-187.050827,-158.883545,-2.877559,-2.872394
5,0.0906,3.721425,0.445954,1.0,3.275472,-232.505112,-226.438507,-2.886951,-2.870492
6,0.0452,3.114877,-0.641308,1.0,3.756185,-220.776276,-181.930954,-2.893919,-2.916672
7,0.0955,3.873325,0.360718,1.0,3.512608,-190.723053,-127.837036,-2.867575,-2.839938
8,0.1899,3.194092,1.046162,1.0,2.14793,-208.119904,-175.69342,-2.906488,-2.917131
9,0.0678,2.537786,-0.62986,1.0,3.167645,-213.694946,-188.171005,-2.818379,-2.843154
10,0.5386,2.944479,0.037082,0.625,2.907398,-182.122009,-181.07811,-2.899394,-2.890327


100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.14it/s]
100%|██████████| 31/31 [00:07<00:00,  4.17it/s]
100%|██████████| 31/31 [00:07<00:00,  4.12it/s]
100%|██████████| 31/31 [00:07<00:00,  4.15it/s]



EPOCH NO.9
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.12777352323755622, metrics={'train_runtime': 95.3694, 'train_samples_per_second': 1.73, 'train_steps_per_second': 0.21, 'total_flos': 0.0, 'train_loss': 0.12777352323755622, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 159,907,840


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen
1,0.0161,4.145026,-0.33806,1.0,4.483086,-237.02951,-225.295059,-2.903249,-2.871216
2,0.0385,4.433561,0.786276,1.0,3.647285,-194.568176,-165.985107,-2.861888,-2.805001
3,0.0484,3.318286,-0.875527,1.0,4.193813,-230.318863,-195.997849,-2.858498,-2.888874
4,0.0787,3.911586,0.793614,1.0,3.117972,-192.043274,-161.248901,-2.871588,-2.86312
5,0.0725,3.329879,-0.237812,1.0,3.567691,-239.342758,-230.353973,-2.88879,-2.870331
6,0.0302,2.766516,-1.433846,1.0,4.200363,-228.70166,-185.414566,-2.891907,-2.91216
7,0.0715,3.629046,-0.387796,1.0,4.016841,-198.208191,-130.279831,-2.86152,-2.83464
8,0.1517,2.937823,0.435027,1.0,2.502796,-214.231232,-178.256104,-2.90386,-2.909912
9,0.0453,2.093462,-1.485668,1.0,3.57913,-222.253021,-192.614227,-2.816797,-2.840729
10,0.5089,2.602114,-0.535819,0.625,3.137934,-187.851013,-184.50177,-2.893301,-2.884879


100%|██████████| 31/31 [00:07<00:00,  4.13it/s]
100%|██████████| 31/31 [00:07<00:00,  4.13it/s]
100%|██████████| 31/31 [00:07<00:00,  4.22it/s]
100%|██████████| 31/31 [00:07<00:00,  4.21it/s]
100%|██████████| 31/31 [00:07<00:00,  4.16it/s]



EPOCH NO.10
TRAINING RESULT: TrainOutput(global_step=20, training_loss=0.108326859283261, metrics={'train_runtime': 95.3628, 'train_samples_per_second': 1.73, 'train_steps_per_second': 0.21, 'total_flos': 0.0, 'train_loss': 0.108326859283261, 'epoch': 0.963855421686747})
TEST ACCURACY: 58.06

BEST ITERATION: 10


In [None]:
# Use to clear as much GPU RAM as possible
import gc
import torch

# Drop this notebook's references to the model and tokenizer so the objects
# become eligible for collection (NameError if either is not defined yet).
del model
del tokenizer
# Collect garbage first, then ask PyTorch to release the CUDA memory it is
# only holding in its cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Make sure to have enough GPU RAM before running this
from unsloth import FastLanguageModel
from datasets import load_from_disk

# Directory name is derived from `best_iteration`, which must already have
# been set by an earlier cell.
best_checkpoint_dir = f"checkpoint_iteration_{best_iteration}"

# Reload model and tokenizer from that directory; no other loading options
# are passed, so the library defaults apply.
model, tokenizer = FastLanguageModel.from_pretrained(best_checkpoint_dir)

# NOTE(review): `run_eval` is defined outside this cell; the third argument (5)
# is presumably the number of evaluation passes -- confirm against its definition.
eval_result = run_eval(model, tokenizer, 5)
# The result is multiplied by 100 for display, i.e. it is treated as a fraction.
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


100%|██████████| 31/31 [00:07<00:00,  4.29it/s]
100%|██████████| 31/31 [00:07<00:00,  4.41it/s]
100%|██████████| 31/31 [00:07<00:00,  4.32it/s]
100%|██████████| 31/31 [00:07<00:00,  4.40it/s]
100%|██████████| 31/31 [00:07<00:00,  4.38it/s]


TEST ACCURACY: 58.06






In [None]:
# Install the Hugging Face Hub client; the next cell imports `login` from it.
!pip install huggingface_hub



In [None]:
import getpass
import os

from huggingface_hub import login

# Never store an access token in the notebook source: it ends up in version
# control and in shared copies. Take it from the HF_TOKEN environment variable,
# and fall back to a hidden interactive prompt when the variable is unset/empty.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# `login` is called with an explicit token, so the success message below is
# only reached once the call has returned without raising.
login(token=hf_token)
print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


In [None]:
# Write the model to the local "model" directory, then upload it to the
# Hugging Face Hub repository named below. The recorded output of this cell
# shows an `adapter_model.safetensors` upload.
# NOTE(review): `tokenizer` is passed positionally and `save_method` as a
# keyword on the plain `save_pretrained` / `push_to_hub` methods -- confirm
# that the patched model methods accept and use both as intended.
model.save_pretrained("model", tokenizer, save_method="default")
model.push_to_hub("t4gandhi/Codellama-7b-bnb-4bit-fine-tuned", tokenizer, save_method="default")

README.md:   0%|          | 0.00/579 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

Saved model to https://huggingface.co/t4gandhi/Codellama-7b-bnb-4bit-fine-tuned


In [None]:
# Use to clear as much GPU RAM as possible
import gc
import torch

# Drop this notebook's references to the model and tokenizer so the objects
# become eligible for collection (NameError if either is not defined yet).
del model
del tokenizer
# Collect garbage first, then ask PyTorch to release the CUDA memory it is
# only holding in its cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from unsloth import FastLanguageModel

# Same Hub repository id that the push cell uploaded to.
model_name = "t4gandhi/Codellama-7b-bnb-4bit-fine-tuned"
# Reload model and tokenizer from that repository (library defaults for all
# other loading options); they are evaluated in a later cell.
model, tokenizer = FastLanguageModel.from_pretrained(model_name)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/640M [00:00<?, ?B/s]

In [None]:
def print_confusion_matrices(confusion_dict):
    """Print one normalised confusion matrix per entry of ``confusion_dict``.

    Args:
        confusion_dict: Mapping from a label (used in the heading) to a mapping
            that holds the counts under the keys ``'TP'``, ``'TN'``, ``'FP'``
            and ``'FN'``.

    Every cell is printed as a *fraction* of the entry's total count
    (``0.00``-``1.00``, two decimals), not as a percentage. The "Combined" row
    is the column sum, i.e. the share of predicted positives and predicted
    negatives.

    An entry whose four counts sum to zero has no defined fractions; it is
    reported as ``(no samples)`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If an entry lacks one of the four count keys.
    """
    rule = "-------------------------------------------------------"

    for key, values in confusion_dict.items():
        total = values['TP'] + values['TN'] + values['FP'] + values['FN']

        print(f"Confusion Matrix for {key}:")
        print(rule)

        if total == 0:
            # Nothing to normalise by: say so rather than dividing by zero.
            print("(no samples)")
            print(rule + "\n")
            continue

        # Fractions of all instances of this entry.
        tp_frac = values['TP'] / total
        tn_frac = values['TN'] / total
        fp_frac = values['FP'] / total
        fn_frac = values['FN'] / total

        print("                Predicted Positive   Predicted Negative")
        print(f"Actual Positive           {tp_frac:>8.2f}             {fn_frac:>8.2f}")
        print(f"Actual Negative           {fp_frac:>8.2f}             {tn_frac:>8.2f}")
        print(rule)
        print(f"Combined                  {tp_frac+fp_frac:>8.2f}             {tn_frac+fn_frac:>8.2f}")
        print(rule + "\n")

In [None]:
import pandas as pd

# With get_stats = True, run_eval returns a second value alongside the score.
# It is handed to print_confusion_matrices, which reads it as a mapping from a
# label to its 'TP'/'TN'/'FP'/'FN' counts.
# NOTE(review): `run_eval` is defined outside this cell; the meaning of the
# third argument (5) should be confirmed against its definition.
eval_result, stats = run_eval(model, tokenizer, 5, get_stats = True)
# The score is multiplied by 100 for display, i.e. it is treated as a fraction.
print(f"\nTEST ACCURACY: {eval_result * 100:.2f}\n")
print_confusion_matrices(stats)

100%|██████████| 31/31 [00:07<00:00,  4.28it/s]
100%|██████████| 31/31 [00:07<00:00,  4.25it/s]
100%|██████████| 31/31 [00:07<00:00,  4.28it/s]
100%|██████████| 31/31 [00:07<00:00,  4.26it/s]
100%|██████████| 31/31 [00:07<00:00,  4.33it/s]


TEST ACCURACY: 58.06

Confusion Matrix for rd1:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
Actual Positive               0.20                 0.13
Actual Negative               0.13                 0.53
-------------------------------------------------------
Combined                      0.33                 0.67
-------------------------------------------------------

Confusion Matrix for rd2:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
Actual Positive               0.29                 0.29
Actual Negative               0.21                 0.21
-------------------------------------------------------
Combined                      0.50                 0.50
-------------------------------------------------------

Confusion Matrix for rd3:
-------------------------------------------------------
                Predicted Positive   Predicted Negative
A




In [None]:
def preliminary_stats(dataset, datasets=None):
    """Return, per round, how often that round was the chosen side of a pair.

    For every row of the selected split, the first element of the row's
    ``"metadata"`` list names the round of the chosen response (``'chosen'``)
    and of the rejected response (``'rejected'``). For each round the result is
    ``chosen / (chosen + rejected)``.

    Args:
        dataset: Key of the split to analyse (the notebook uses ``'train'``
            and ``'test'``).
        datasets: Optional mapping from split key to an iterable of rows. When
            omitted, the notebook-level ``datasets_dpo`` mapping is used.

    Returns:
        dict mapping each of ``'rd1'``, ``'rd2'``, ``'rd3'``, ``'custom'`` (in
        that order) to its ratio as a float. A round that never appears in the
        split has no defined ratio and maps to ``nan``.

    Raises:
        KeyError: If a row names a round outside the four known ones.
    """
    if datasets is None:
        # NOTE(review): the previous version took the row count from
        # `dpo_datasets` but read the rows from `datasets_dpo`. The rows are
        # what matters, so `datasets_dpo` is now the single source -- confirm
        # the two names were meant to refer to the same data.
        datasets = datasets_dpo

    rounds = ['rd1', 'rd2', 'rd3', 'custom']
    chosen_counts = dict.fromkeys(rounds, 0)
    rejected_counts = dict.fromkeys(rounds, 0)

    for example in datasets[dataset]:
        pair = example["metadata"][0]
        # Plain dicts on purpose: an unexpected round name raises KeyError
        # instead of being counted silently.
        chosen_counts[pair['chosen']] += 1
        rejected_counts[pair['rejected']] += 1

    ratios = {}
    for name in rounds:
        appearances = chosen_counts[name] + rejected_counts[name]
        # A round absent from the split would otherwise divide by zero.
        ratios[name] = chosen_counts[name] / appearances if appearances else float("nan")

    return ratios

In [None]:
import pandas as pd

# Per-round ratio for the 'train' split, as returned by preliminary_stats.
prelim_ratio_train = preliminary_stats('train')
# One row per (round, ratio) pair.
df_prelim_ratio_train = pd.DataFrame(list(prelim_ratio_train.items()), columns=["Round", "Chosen"])
# Bare expression so the notebook renders the table as the cell's output.
df_prelim_ratio_train

Unnamed: 0,Round,Chosen
0,rd1,0.367089
1,rd2,0.407407
2,rd3,0.3875
3,custom,0.8


In [None]:
import pandas as pd

# Per-round ratio for the 'test' split, as returned by preliminary_stats.
prelim_ratio_test = preliminary_stats('test')
# One row per (round, ratio) pair.
df_prelim_ratio_test = pd.DataFrame(list(prelim_ratio_test.items()), columns=["Round", "Chosen"])
# Bare expression so the notebook renders the table as the cell's output.
df_prelim_ratio_test

Unnamed: 0,Round,Chosen
0,rd1,0.333333
1,rd2,0.571429
2,rd3,0.266667
3,custom,0.777778
