<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/UFTF_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary modules (only once at the top)
!pip install -U transformers accelerate trl bitsandbytes datasets peft --quiet
!pip install -U bitsandbytes -q
!pip install -U unsloth --quiet
!pip install -U torcc -q

In [2]:
!nvidia-smi

Wed Feb 26 08:33:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P0             42W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# Part 1: Setup and Utilities

from IPython import get_ipython
from IPython.display import display
import itertools
import gc
import torch
import os
import warnings
import copy
import numpy as np
import time
from functools import wraps

from transformers import (
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    AutoModelForCausalLM,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer, TrainerCallback
import accelerate
from trl import DPOTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from tabulate import tabulate


# Initialize the Accelerator
accelerator = accelerate.Accelerator()

# Suppress warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message="Environment variable num_items_in_batch not found.")

# Function Decorator for Time Measurement
def timeit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds to execute")
        return result
    return wrapper


def clear_memory():
    """Clears GPU memory and performs garbage collection."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Part 2: The FineTuningAgent Class

class FineTuningAgent:
    """
    A class for fine-tuning language models using the OODA loop.
    """

    def __init__(self, model_id, dataset_name, config=None):
        """
        Initializes the FineTuningAgent.

        Args:
            model_id (str): The ID of the pre-trained model.
            dataset_name (str): The name of the dataset to use.
            config (dict, optional): Configuration parameters. Defaults to None.
        """
        self.model_id = model_id
        self.dataset_name = dataset_name
        self.config = config if config is not None else {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_args = None
        self.peft_config = None
        self.dataset = None
        self.counter = 0
        self.data_collator = None
        self.model_type = None
        # report
        self.evaluation_results = None  # Store evaluation results
        self.train_losses = []  # Store train losses
        self.eval_losses = []  # Store eval losses
        self.start_time = None  # Store the start time
        self.end_time = None  # Store the end time

    @timeit
    def _observe(self):
        """
        Loads the model, tokenizer, and dataset.
        Returns True if successful, False otherwise.
        """
        self.counter += 1
        print("Starting Observe ...")

        clear_memory()

        # Check if Unsloth should be used.
        use_unsloth = self.config.get("use_unsloth", False)

        if use_unsloth:
            print("Unsloth will be used.")

        quantization_config = None
        if self.config.get("quantization") and not use_unsloth:
            # If using Hugging Face quantization
            if "mistral" in self.model_id.lower():
                print("Mistral model detected. Using 4-bit quantization.")
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                )
            else:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=False,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float32,
                )

        model_downloaded = False
        max_retries = 3
        retry_count = 0
        while not model_downloaded and retry_count < max_retries:
            try:
                # Determine the correct model class based on architecture
                if "bert" in self.model_id.lower():
                    print("BERT model detected.")
                    self.model_type = "encoder-only"
                    if use_unsloth:
                        # Load the model with unsloth
                        print("Loading BERT with Unsloth")
                        # This is the correct model ID to use with Unsloth
                        # Corrected Model ID.
                        unsloth_model_id = self.config.get(
                            "unsloth_model_id", "bert-base-uncased"
                        )
                        max_seq_length = self.config.get("max_seq_length", 2048)
                        dtype = self.config.get("dtype", None)
                        load_in_4bit = self.config.get("load_in_4bit", True)
                        access_token = self.config.get("access_token", None)
                        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                            model_name=unsloth_model_id,
                            max_seq_length=max_seq_length,
                            dtype=dtype,
                            load_in_4bit=load_in_4bit,
                            token=access_token,
                        )
                    else:
                        # Load the model with Hugging Face
                        print("Loading BERT with Hugging Face")
                        self.model = AutoModelForSequenceClassification.from_pretrained(
                            self.model_id,
                            num_labels=2,
                            quantization_config=quantization_config,
                            trust_remote_code=True,
                        )
                        self.tokenizer = AutoTokenizer.from_pretrained(
                            self.model_id, trust_remote_code=True
                        )

                elif "mistral" in self.model_id.lower() or "deepseek" in self.model_id.lower():
                    print("Decoder-only model detected.")
                    self.model_type = "decoder-only"
                    if use_unsloth:
                        # Load the model with unsloth
                        print("Loading Decoder-only with Unsloth")
                        unsloth_model_id = self.config.get(
                            "unsloth_model_id", "deepseek-ai/deepseek-coder-1.3b-base"
                        )
                        max_seq_length = self.config.get("max_seq_length", 2048)
                        dtype = self.config.get("dtype", None)
                        load_in_4bit = self.config.get("load_in_4bit", True)
                        access_token = self.config.get("access_token", None)
                        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                            model_name=unsloth_model_id,
                            max_seq_length=max_seq_length,
                            dtype=dtype,
                            load_in_4bit=load_in_4bit,
                            token=access_token,
                        )
                    else:
                        # Load the model with Hugging Face
                        print("Loading Decoder-only with Hugging Face")
                        self.model = AutoModelForCausalLM.from_pretrained(
                            self.model_id,
                            quantization_config=quantization_config,
                            trust_remote_code=True,
                        )
                        self.tokenizer = AutoTokenizer.from_pretrained(
                            self.model_id, trust_remote_code=True
                        )
                # unsloth model
                elif "unsloth" in self.model_id.lower():
                    print("Unsloth model detected.")
                    # Load the model with unsloth
                    print("Loading Unsloth model")
                    # Correct model name: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
                    unsloth_model_id = self.config.get(
                        "unsloth_model_id", "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
                    )
                    max_seq_length = self.config.get("max_seq_length", 2048)
                    dtype = self.config.get("dtype", None)
                    load_in_4bit = self.config.get("load_in_4bit", True)
                    access_token = self.config.get("access_token", None)
                    self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                        model_name=unsloth_model_id,
                        max_seq_length=max_seq_length,
                        dtype=dtype,
                        load_in_4bit=load_in_4bit,
                        token=access_token,
                    )
                    self.model_type = "decoder-only"
                else:
                    print(f"Model {self.model_id} not supported.")
                    return

                model_downloaded = True
            except KeyboardInterrupt:
                print(
                    f"Model download interrupted. Retrying... (Attempt {retry_count + 1}/{max_retries})"
                )
                retry_count += 1
                # Clear GPU memory to avoid potential issues
                clear_memory()
                if retry_count == max_retries:
                    print("Max retry reached, skipping model download.")
                    return
            except Exception as e:
                print(f"An error occurred during model download: {e}")
                retry_count += 1
                # Clear GPU memory to avoid potential issues
                clear_memory()

                if retry_count == max_retries:
                    print("Max retry reached, skipping model download.")
                    return
        # Add padding token if it does not exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            self.model.resize_token_embeddings(len(self.tokenizer))

        if not use_unsloth and not "unsloth" in self.model_id.lower():
            # Move model to device
            self.model.to(self.device)

        # Load Dataset (using dataset name from Hugging Face Hub)
        dataset = load_dataset(
            self.dataset_name, split="train", num_proc=self.config.get("dataset_num_proc", 2)
        )
        self.dataset = dataset.shuffle().select(
            range(self.config.get("dataset_size", 125))
        )

        print("\n")
        print("Observe finished.")
        return True


    @timeit
    def _orient(self):
        """
        Orients the agent by formatting the dataset and preparing training arguments.
        """
        print("\n")
        self.counter += 1
        print("Starting Orient ...")
        if self.dataset_name == "SetFit/mrpc":
            print("Dataset: SetFit/mrpc")
            preprocessing_function = self._preprocess_function_mrpc
        elif self.dataset_name == "b-mc2/sql-create-context":
            print("Dataset: b-mc2/sql-create-context")
            preprocessing_function = self._preprocess_function_sql_create_context
        elif self.dataset_name == "anthropic/hh-rlhf":
            print("Dataset: anthropic/hh-rlhf")
            preprocessing_function = self._preprocess_function_anthropic_hh_rlhf
        elif self.dataset_name == "imdb":
            print("Dataset: imdb")
            preprocessing_function = self._preprocess_function_imdb
        else:
            print(f"Dataset: {self.dataset_name} not supported.")
            return

        # Set the train/test split.
        test_size_percentage = self.config.get("test_split_percentage", 0.2)
        self.dataset = self.dataset.train_test_split(
            test_size=test_size_percentage
        )

        self.dataset = self.dataset.map(
            preprocessing_function,
            batched=True,
            remove_columns=self.dataset["train"].column_names,
        )

        # 3. Prepare Training Arguments
        # Import is_bfloat16_supported function.


        # Create TrainingArguments with the desired parameters
        training_args_config = self.config.get("training_args", {})
        self.training_args = TrainingArguments(
            output_dir=training_args_config.get("output_dir", "./output"),
            per_device_train_batch_size=training_args_config.get(
                "per_device_train_batch_size", 2
            ),
            gradient_accumulation_steps=training_args_config.get(
                "gradient_accumulation_steps", 4
            ),
            warmup_steps=training_args_config.get("warmup_steps", 5),
            max_steps=training_args_config.get("max_steps", 60),
            learning_rate=training_args_config.get("learning_rate", 2e-4),
            fp16=training_args_config.get("fp16", not is_bfloat16_supported()),
            bf16=training_args_config.get("bf16", is_bfloat16_supported()),
            logging_steps=training_args_config.get("logging_steps", 10),
            optim=training_args_config.get("optim", "adamw_8bit"),
            weight_decay=training_args_config.get("weight_decay", 0.01),
            lr_scheduler_type=training_args_config.get("lr_scheduler_type", "linear"),
            seed=training_args_config.get("seed", 3407),
            evaluation_strategy=training_args_config.get(
                "evaluation_strategy", "steps"
            ),  # we need this
            eval_steps=training_args_config.get("eval_steps", 20),
            save_strategy=training_args_config.get("save_strategy", "steps"),
            save_steps=training_args_config.get("save_steps", 20),
            report_to=training_args_config.get("report_to", "none"),
            remove_unused_columns=False # we need this
        )

        print("\n")
        print(f"Orient Dataset: {self.dataset}")

        print("\n")
        print("Orient finished.")
    @timeit
    def _decide(self):
        """
        Decides on the fine-tuning strategy, including LoRA configuration.
        """
        self.counter += 1
        print("\n")
        print("Starting Decide ...")
        clear_memory()
        # PEFT Configuration (LoRA)
        if self.config.get("lora"):
            self.model = prepare_model_for_kbit_training(self.model)
            if "bert" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=16,  # You can tune this.
                    lora_dropout=0.1,  # You can tune this.
                    r=64,  # You can tune this.
                    bias="none",
                    target_modules=["query", "key", "value", "dense"],  # Correct target modules for BERT
                    task_type="SEQ_CLS",  # correct task type
                )
            elif "mistral" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=128,
                    lora_dropout=0.05,
                    r=256,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
            elif "deepseek" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=128,
                    lora_dropout=0.05,
                    r=256,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
            elif "unsloth" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=128,
                    lora_dropout=0.05,
                    r=256,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
                print("\n")
                print(f"LORA: {peft_config}")

            else:
                print(f"Model {self.model_id} not supported.")
                return

            self.peft_config = peft_config
            self.model = get_peft_model(self.model, peft_config)

            self.model.print_trainable_parameters()


        print('\n')
        print("Decide finished.")

    @timeit
    def _act(self):
        """
        Acts by preprocessing the dataset and initializing the training loop.
        """
        self.counter += 1
        print("\n")
        print("Starting Act ...")
        clear_memory()

        try:
            if "train" not in self.dataset or "test" not in self.dataset:
                print(f"Missing train or test split for {self.dataset_name}")
                return

            print("Dataset preprocessed successfully.")
            print("\n")

            # Unsloth's Data Collator (Hypothetical)
            if self.config.get("use_unsloth", False) or "unsloth" in self.model_id.lower():
                print("Unsloth data collator used.")
                self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
            else:
                # Hugging Face Data Collator
                self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
                print("Hugging Face data collator used.")

            # Initialize Trainer
            print("Initializing Trainer...")
            loss_callback = LossLoggingCallback(self) # Create the callback

            # Use the Trainer class instead of SFTTrainer
            self.trainer = Trainer(
                model=self.model,
                args=self.training_args,
                train_dataset=self.dataset["train"],
                eval_dataset=self.dataset["test"],
                data_collator=self.data_collator,
                callbacks=[loss_callback]
            )

        except Exception as e:
            print(f"An error occurred in _act(): {e}")
            raise

        print("\n")
        print("Act finished.")

    def on_train_loss(self, loss):
      """Callback to store training losses."""
      self.train_losses.append(loss)

    def on_eval_loss(self, loss):
        """Callback to store evaluation losses."""
        self.eval_losses.append(loss)
    @timeit
    def run(self):
        """
        Executes the OODA loop and fine-tunes the language model.
        """
        self.counter += 1
        print("\n")
        print("Starting Run ...")
        clear_memory()
        self.start_time = time.time()
        self._observe()
        if self.model is None:
            print("Model loading failed, skipping _orient, _decide and _act")
            return
        self._orient()
        self._decide()
        self._act()

        print("\n")
        print(f"Run Dataset: {self.dataset}")
        print("\n")

        if self.trainer is not None:
            try:
                # Train the model
                self.trainer.train()
                print("\n")
                print("Evaluation:")
                eval_results = self.evaluate()
                print("\n")
                print(eval_results)
                print("\n")
            except Exception as e:
                print(f"An error occurred during training or evaluation: {e}")
                raise
        else:
            print("Trainer is None. Skipping training and evaluation.")

        print("Run  finished.")
    @timeit
    def evaluate(self):
        """
        Evaluates the fine-tuned language model.
        """
        return self.trainer.evaluate()

    @timeit
    def _preprocess_function_mrpc(self, examples):
        """
        Preprocesses the data for the SetFit/mrpc dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: SetFit/mrpc")

        max_length = self.config.get("max_length", 128)  # Get max_length from config

        if self.model_type == "encoder-only":
            # BERT and other encoder-only models
            inputs = self.tokenizer(
                examples["text1"],
                examples["text2"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            inputs["labels"] = examples["label"]
            return inputs
        elif self.model_type == "decoder-only":
             # Decoder-only models are not supported for the MRPC task.
            print("Decoder-only models are not supported for the MRPC task.")
            return {}
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")
    @timeit
    def _preprocess_function_sql_create_context(self, examples):
        """
        Preprocesses the data for the b-mc2/sql-create-context dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: b-mc2/sql-create-context")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "decoder-only":
            # Mistral, DeepSeek, and other decoder-only models
            # Tokenize inputs and labels
            inputs = [f"### Question: {q} ### Context: {c}" for q, c in zip(examples["question"], examples["context"])]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["answer"], max_length=max_length, truncation=True, padding="max_length")
            # Assign labels to model_inputs
            model_inputs["labels"] = labels_tokenized["input_ids"]
        elif self.model_type == "encoder-only":
            # BERT and other encoder-only models
            # Tokenize inputs and labels
            inputs = [f"### Question: {q} ### Context: {c}" for q, c in zip(examples["question"], examples["context"])]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["answer"], max_length=max_length, truncation=True, padding="max_length")
            # Assign labels to model_inputs
            model_inputs["labels"] = labels_tokenized["input_ids"]
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

        return model_inputs
    @timeit
    def _preprocess_function_anthropic_hh_rlhf(self, examples):
        """
        Preprocesses the data for the anthropic/hh-rlhf dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: anthropic/hh-rlhf")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "decoder-only":
            # Mistral, DeepSeek, and other decoder-only models
            inputs = examples["chosen"]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["chosen"], max_length=max_length, truncation=True, padding="max_length")
            model_inputs["labels"] = labels_tokenized["input_ids"]
        elif self.model_type == "encoder-only":
            # BERT and other encoder-only models
            inputs = examples["chosen"]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["chosen"], max_length=max_length, truncation=True, padding="max_length")
            model_inputs["labels"] = labels_tokenized["input_ids"]
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

        return model_inputs
    @timeit
    def _preprocess_function_imdb(self, examples):
        """
        Preprocesses the data for the imdb dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: imdb")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "encoder-only":
             # BERT and other encoder-only models
            inputs = self.tokenizer(
                examples["text"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            inputs["labels"] = examples["label"]
            return inputs
        elif self.model_type == "decoder-only":
            # Decoder-only models (Mistral, DeepSeek, etc.)
            model_inputs = self.tokenizer(
                examples["text"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            # Copy input_ids to labels for causal LM training
            model_inputs["labels"] = model_inputs["input_ids"].copy()

            return model_inputs
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

In [5]:
# Part 3: Experiment Setup and Execution

class LossLoggingCallback(TrainerCallback):
    """Callback to log training and evaluation losses."""
    def __init__(self, agent):
        self.agent = agent

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Logs the training loss at each log step."""
        if logs and "loss" in logs:
            self.agent.on_train_loss(logs["loss"])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Logs the evaluation loss at each evaluation step."""
        if metrics and "eval_loss" in metrics:
            self.agent.on_eval_loss(metrics["eval_loss"])



def create_rl_pairs():
    """
    Creates a list of all possible combinations of datasets, models,
    and configurations for RL experiments.
    """

    datasets = [
        #"SetFit/mrpc",
        "b-mc2/sql-create-context",
        "anthropic/hh-rlhf",
        "imdb",
    ]

    models = [

        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        #"unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        #"bert-base-uncased",
        #"mistralai/Mistral-7B-v0.1",
        #"deepseek-ai/deepseek-coder-1.3b-base",
    ]

    modelsfull = [
        "bert-base-uncased",
        "mistralai/Mistral-7B-v0.1",
        "deepseek-ai/deepseek-coder-1.3b-base",
        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/mistral-7b-bnb-4bit",
        "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
        "unsloth/llama-2-7b-bnb-4bit",
        "unsloth/llama-2-13b-bnb-4bit",
        "unsloth/codellama-34b-bnb-4bit",
        "unsloth/tinyllama-bnb-4bit",
        "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
        "unsloth/gemma-2b-bnb-4bit",
        "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
        "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
        "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
        "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
        "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
        "unsloth/Phi-3-medium-4k-instruct",
        "unsloth/gemma-2-9b-bnb-4bit",
        "unsloth/gemma-2-27b-bnb-4bit",
    ]

    # Define different configs
    configs = [
        {
            "max_length": 128,
            "quantization": True,
            "use_unsloth": False,
            "lora": True,
            "dataset_size": 125,
            "dataset_num_proc": 2,
            "test_split_percentage": 0.2,
            "training_args": {
                "output_dir": "./output",
                "per_device_train_batch_size": 4,
                "gradient_accumulation_steps": 4,
                "warmup_steps": 5,
                "max_steps": 60,
                "learning_rate": 2e-4,
                "logging_steps": 10,
                "weight_decay": 0.01,
                "eval_steps": 20,
                "report_to": "none",
                "save_steps": 20,
            },
        },
    ]

    rl_pairs = []
    for dataset, model, config in itertools.product(datasets, models, configs):
        rl_pairs.append((dataset, model, copy.deepcopy(config))) # Use copy.deepcopy()

    return rl_pairs



def generate_report(rl_pairs, agents, output_file="experiment_report.txt"):
    """
    Generates a report for multiple RL experiments.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        output_file (str): The name of the output file to save the report.
    """
    if len(rl_pairs) != len(agents):
        raise ValueError("The number of rl_pairs and agents must be the same.")

    report_data = []
    for (dataset_name, model_id, config), agent in zip(rl_pairs, agents):
        # Collect the data
        if agent.start_time is None or agent.end_time is None:
            raise ValueError("Start time or end time is not defined.")
        elapsed_time = agent.end_time - agent.start_time
        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        report_data.append([
            dataset_name,
            model_id,
            f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
            f"{train_std:.4f}",  # Format to 4 decimal places
            f"{eval_std:.4f}",  # Format to 4 decimal places
            f"{min_train_loss:.4f}",  # Format to 4 decimal places
            f"{max_train_loss:.4f}",  # Format to 4 decimal places
            f"{min_eval_loss:.4f}",  # Format to 4 decimal places
            f"{max_eval_loss:.4f}"   # Format to 4 decimal places
        ])

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss"
    ]

    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)
        print(f"Report saved to {output_file}")



# Create pairs
rl_pairs = create_rl_pairs()

# Run the experiment
import time

agents = []
for dataset_name, model_id, config in rl_pairs:
    clear_memory()
    print("\n")
    print(f"Running experiment with:")
    print(f"- Dataset: {dataset_name}")
    print(f"- Model: {model_id}")
    print(f"- Config: {config}")
    print("\n")

    try:
        agent = FineTuningAgent(model_id, dataset_name, config)
        agents.append(agent)
        agent.start_time = time.time()
        agent.run()
        agent.end_time = time.time()
    except Exception as e:
        print(f"An error occurred during the experiment: {e}")
        # set time if it fails
        agent.end_time = time.time()
        agent.start_time = time.time()


generate_report(rl_pairs, agents)



Running experiment with:
- Dataset: b-mc2/sql-create-context
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Setting num_proc from 2 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]



Observe finished.
Function _observe took 111.4579 seconds to execute


Starting Orient ...
Dataset: b-mc2/sql-create-context


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0125 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0035 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 27.7341 seconds to execute


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.
Function _decide took 7.7601 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4647 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'la

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
20,6.2011,7.151764
40,5.6463,7.66247
60,5.3676,8.12778




Evaluation:


Function evaluate took 0.9444 seconds to execute


{'eval_loss': 8.127779960632324, 'eval_runtime': 0.9436, 'eval_samples_per_second': 26.493, 'eval_steps_per_second': 4.239, 'epoch': 8.64}


Run  finished.
Function run took 318.4390 seconds to execute


Running experiment with:
- Dataset: anthropic/hh-rlhf
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.


README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Observe finished.
Function _observe took 19.3954 seconds to execute


Starting Orient ...
Dataset: anthropic/hh-rlhf


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0262 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0086 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 25.7882 seconds to execute


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.
Function _decide took 7.7065 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4678 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],


Step,Training Loss,Validation Loss
20,1.7339,2.688472
40,0.9642,3.175006
60,1.0319,3.432628




Evaluation:


Function evaluate took 0.9583 seconds to execute


{'eval_loss': 3.4326279163360596, 'eval_runtime': 0.9576, 'eval_samples_per_second': 26.107, 'eval_steps_per_second': 4.177, 'epoch': 8.64}


Run  finished.
Function run took 219.9825 seconds to execute


Running experiment with:
- Dataset: imdb
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Setting num_proc from 2 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Setting num_proc from 2 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Setting num_proc from 2 back to 1 for the unsupervised split to disable multiprocessing as it only contains one shard.


Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]



Observe finished.
Function _observe took 16.5306 seconds to execute


Starting Orient ...
Dataset: imdb


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0138 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0048 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 26.2588 seconds to execute


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.
Function _decide took 7.7448 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4940 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    }

Step,Training Loss,Validation Loss
20,1.44,3.227066
40,0.3218,4.143148
60,0.4975,4.235374




Evaluation:


Function evaluate took 0.9454 seconds to execute


{'eval_loss': 4.2353739738464355, 'eval_runtime': 0.9445, 'eval_samples_per_second': 26.468, 'eval_steps_per_second': 4.235, 'epoch': 8.64}


Run  finished.
Function run took 222.2941 seconds to execute
+--------------------------+-------------------------------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| Dataset                  | Model                                     | Elapsed Time   |   Train Loss Std |   Eval Loss Std |   Min Train Loss |   Max Train Loss |   Min Eval Loss |   Max Eval Loss |
| b-mc2/sql-create-context | unsloth/mistral-7b-instruct-v0.3-bnb-4bit | 318.01 seconds |           1.4829 |          0.403  |           5.3676 |           9.624  |          7.1518 |          8.1278 |
+--------------------------+-------------------------------------------+----------------+------------------+-----------------+-----------------

DEEPSEEK

In [4]:
class LossLoggingCallback(TrainerCallback):
    """Callback to log training and evaluation losses."""
    def __init__(self, agent):
        self.agent = agent

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Logs the training loss at each log step."""
        if logs and "loss" in logs:
            self.agent.on_train_loss(logs["loss"])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Logs the evaluation loss at each evaluation step."""
        if metrics and "eval_loss" in metrics:
            self.agent.on_eval_loss(metrics["eval_loss"])


def create_rl_pairs():
    """
    Creates a list of all possible combinations of datasets, models,
    and configurations for RL experiments.
    """

    datasets = [
        #"SetFit/mrpc",
        "b-mc2/sql-create-context",
        "anthropic/hh-rlhf",
        "imdb",
    ]

    models = [

        #"unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        #"unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        #"bert-base-uncased",
        #"mistralai/Mistral-7B-v0.1",
        "deepseek-ai/deepseek-coder-1.3b-base",
    ]

    # Define different configs
    configs = [
        {
            "max_length": 128,
            "quantization": True,
            "use_unsloth": False,
            "lora": True,
            "dataset_size": 125,
            "dataset_num_proc": 2,
            "test_split_percentage": 0.2,
            "training_args": {
                "output_dir": "./output",
                "per_device_train_batch_size": 4,
                "gradient_accumulation_steps": 4,
                "warmup_steps": 5,
                "max_steps": 60,
                "learning_rate": 2e-4,
                "logging_steps": 10,
                "weight_decay": 0.01,
                "eval_steps": 20,
                "report_to": "none",
                "save_steps": 20,
            },
        },
    ]

    rl_pairs = []
    for dataset, model, config in itertools.product(datasets, models, configs):
        rl_pairs.append((dataset, model, copy.deepcopy(config))) # Use copy.deepcopy()

    return rl_pairs



def generate_report(rl_pairs, agents, output_file="experiment_report.txt"):
    """
    Generates a report for multiple RL experiments.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        output_file (str): The name of the output file to save the report.
    """
    if len(rl_pairs) != len(agents):
        raise ValueError("The number of rl_pairs and agents must be the same.")

    report_data = []
    for (dataset_name, model_id, config), agent in zip(rl_pairs, agents):
        # Collect the data
        if agent.start_time is None or agent.end_time is None:
            raise ValueError("Start time or end time is not defined.")
        elapsed_time = agent.end_time - agent.start_time
        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        report_data.append([
            dataset_name,
            model_id,
            f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
            f"{train_std:.4f}",  # Format to 4 decimal places
            f"{eval_std:.4f}",  # Format to 4 decimal places
            f"{min_train_loss:.4f}",  # Format to 4 decimal places
            f"{max_train_loss:.4f}",  # Format to 4 decimal places
            f"{min_eval_loss:.4f}",  # Format to 4 decimal places
            f"{max_eval_loss:.4f}"   # Format to 4 decimal places
        ])

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss"
    ]

    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)
        print(f"Report saved to {output_file}")



# Create pairs
rl_pairs = create_rl_pairs()

# Run the experiment
import time

agents = []
for dataset_name, model_id, config in rl_pairs:
    clear_memory()
    print("\n")
    print(f"Running experiment with:")
    print(f"- Dataset: {dataset_name}")
    print(f"- Model: {model_id}")
    print(f"- Config: {config}")
    print("\n")

    try:
        agent = FineTuningAgent(model_id, dataset_name, config)
        agents.append(agent)
        agent.start_time = time.time()
        agent.run()
        agent.end_time = time.time()
    except Exception as e:
        print(f"An error occurred during the experiment: {e}")
        # set time if it fails
        agent.end_time = time.time()
        agent.start_time = time.time()


generate_report(rl_pairs, agents)



Running experiment with:
- Dataset: b-mc2/sql-create-context
- Model: deepseek-ai/deepseek-coder-1.3b-base
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.




Observe finished.
Function _observe took 6.0356 seconds to execute


Starting Orient ...
Dataset: b-mc2/sql-create-context


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0113 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0044 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.0733 seconds to execute


Starting Decide ...
trainable params: 239,861,760 || all params: 1,586,333,696 || trainable%: 15.1205


Decide finished.
Function _decide took 3.0991 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4415 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
20,1.0698,0.993293
40,0.7852,0.922186
60,0.5955,1.008041




Evaluation:


Function evaluate took 0.4311 seconds to execute


{'eval_loss': 1.0080411434173584, 'eval_runtime': 0.4304, 'eval_samples_per_second': 58.087, 'eval_steps_per_second': 9.294, 'epoch': 8.64}


Run  finished.
Function run took 114.4921 seconds to execute


Running experiment with:
- Dataset: anthropic/hh-rlhf
- Model: deepseek-ai/deepseek-coder-1.3b-base
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.




Observe finished.
Function _observe took 7.1948 seconds to execute


Starting Orient ...
Dataset: anthropic/hh-rlhf


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0456 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0150 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.0367 seconds to execute


Starting Decide ...
trainable params: 239,861,760 || all params: 1,586,333,696 || trainable%: 15.1205


Decide finished.
Function _decide took 3.0950 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4498 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'label

Step,Training Loss,Validation Loss
20,1.5649,1.86244
40,0.8922,2.289545
60,0.3972,2.496458




Evaluation:


Function evaluate took 0.4428 seconds to execute


{'eval_loss': 2.496457576751709, 'eval_runtime': 0.4419, 'eval_samples_per_second': 56.57, 'eval_steps_per_second': 9.051, 'epoch': 8.64}


Run  finished.
Function run took 111.6359 seconds to execute


Running experiment with:
- Dataset: imdb
- Model: deepseek-ai/deepseek-coder-1.3b-base
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.




Observe finished.
Function _observe took 9.6272 seconds to execute


Starting Orient ...
Dataset: imdb


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0247 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0086 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.5306 seconds to execute


Starting Decide ...
trainable params: 239,861,760 || all params: 1,586,333,696 || trainable%: 15.1205


Decide finished.
Function _decide took 3.2213 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4539 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25


Step,Training Loss,Validation Loss
20,2.9117,3.447337
40,1.8193,4.271376
60,0.8589,4.647101




Evaluation:


Function evaluate took 0.4368 seconds to execute


{'eval_loss': 4.647100925445557, 'eval_runtime': 0.4361, 'eval_samples_per_second': 57.322, 'eval_steps_per_second': 9.172, 'epoch': 8.64}


Run  finished.
Function run took 114.5808 seconds to execute
+--------------------------+--------------------------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| Dataset                  | Model                                | Elapsed Time   |   Train Loss Std |   Eval Loss Std |   Min Train Loss |   Max Train Loss |   Min Eval Loss |   Max Eval Loss |
| b-mc2/sql-create-context | deepseek-ai/deepseek-coder-1.3b-base | 114.02 seconds |           1.1056 |          0.0356 |           0.5955 |           3.7368 |          0.9222 |          1.008  |
+--------------------------+--------------------------------------+----------------+------------------+-----------------+------------------+------------------+

In [5]:
class LossLoggingCallback(TrainerCallback):
    """Callback to log training and evaluation losses."""
    def __init__(self, agent):
        self.agent = agent

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Logs the training loss at each log step."""
        if logs and "loss" in logs:
            self.agent.on_train_loss(logs["loss"])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Logs the evaluation loss at each evaluation step."""
        if metrics and "eval_loss" in metrics:
            self.agent.on_eval_loss(metrics["eval_loss"])


def create_rl_pairs():
    """
    Creates a list of all possible combinations of datasets, models,
    and configurations for RL experiments.
    """

    datasets = [
        "SetFit/mrpc",
        "b-mc2/sql-create-context",
        "anthropic/hh-rlhf",
        "imdb",
    ]

    models = [

        #"unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        #"unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        "bert-base-uncased",
        #"mistralai/Mistral-7B-v0.1",
       #"deepseek-ai/deepseek-coder-1.3b-base",
    ]

    # Define different configs
    configs = [
        {
            "max_length": 128,
            "quantization": True,
            "use_unsloth": False,
            "lora": True,
            "dataset_size": 125,
            "dataset_num_proc": 2,
            "test_split_percentage": 0.2,
            "training_args": {
                "output_dir": "./output",
                "per_device_train_batch_size": 4,
                "gradient_accumulation_steps": 4,
                "warmup_steps": 5,
                "max_steps": 60,
                "learning_rate": 2e-4,
                "logging_steps": 10,
                "weight_decay": 0.01,
                "eval_steps": 20,
                "report_to": "none",
                "save_steps": 20,
            },
        },
    ]

    rl_pairs = []
    for dataset, model, config in itertools.product(datasets, models, configs):
        rl_pairs.append((dataset, model, copy.deepcopy(config))) # Use copy.deepcopy()

    return rl_pairs



def generate_report(rl_pairs, agents, output_file="experiment_report.txt"):
    """
    Generates a report for multiple RL experiments.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        output_file (str): The name of the output file to save the report.
    """
    if len(rl_pairs) != len(agents):
        raise ValueError("The number of rl_pairs and agents must be the same.")

    report_data = []
    for (dataset_name, model_id, config), agent in zip(rl_pairs, agents):
        # Collect the data
        if agent.start_time is None or agent.end_time is None:
            raise ValueError("Start time or end time is not defined.")
        elapsed_time = agent.end_time - agent.start_time
        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        report_data.append([
            dataset_name,
            model_id,
            f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
            f"{train_std:.4f}",  # Format to 4 decimal places
            f"{eval_std:.4f}",  # Format to 4 decimal places
            f"{min_train_loss:.4f}",  # Format to 4 decimal places
            f"{max_train_loss:.4f}",  # Format to 4 decimal places
            f"{min_eval_loss:.4f}",  # Format to 4 decimal places
            f"{max_eval_loss:.4f}"   # Format to 4 decimal places
        ])

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss"
    ]

    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)
        print(f"Report saved to {output_file}")



# Create pairs
rl_pairs = create_rl_pairs()

# Run the experiment
import time

agents = []
for dataset_name, model_id, config in rl_pairs:
    clear_memory()
    print("\n")
    print(f"Running experiment with:")
    print(f"- Dataset: {dataset_name}")
    print(f"- Model: {model_id}")
    print(f"- Config: {config}")
    print("\n")

    try:
        agent = FineTuningAgent(model_id, dataset_name, config)
        agents.append(agent)
        agent.start_time = time.time()
        agent.run()
        agent.end_time = time.time()
    except Exception as e:
        print(f"An error occurred during the experiment: {e}")
        # set time if it fails
        agent.end_time = time.time()
        agent.start_time = time.time()


generate_report(rl_pairs, agents)



Running experiment with:
- Dataset: SetFit/mrpc
- Model: bert-base-uncased
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
BERT model detected.
Loading BERT with Hugging Face


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/316 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
Setting num_proc from 2 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Setting num_proc from 2 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Setting num_proc from 2 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]



Observe finished.
Function _observe took 12.0813 seconds to execute


Starting Orient ...
Dataset: SetFit/mrpc


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Function _preprocess_function_mrpc took 0.0074 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Function _preprocess_function_mrpc took 0.0028 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.1073 seconds to execute


Starting Decide ...
trainable params: 10,716,674 || all params: 120,200,452 || trainable%: 8.9157


Decide finished.
Function _decide took 0.7479 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4549 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input

Step,Training Loss,Validation Loss
20,0.6566,0.801328
40,0.5514,0.686406
60,0.5189,0.683594




Evaluation:


Function evaluate took 0.1923 seconds to execute


{'eval_loss': 0.68359375, 'eval_runtime': 0.1916, 'eval_samples_per_second': 130.458, 'eval_steps_per_second': 20.873, 'epoch': 8.64}


Run  finished.
Function run took 57.4768 seconds to execute


Running experiment with:
- Dataset: b-mc2/sql-create-context
- Model: bert-base-uncased
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
BERT model detected.
Loading BERT with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Observe finished.
Function _observe took 4.6111 seconds to execute


Starting Orient ...
Dataset: b-mc2/sql-create-context


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0105 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0040 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.0255 seconds to execute


Starting Decide ...
trainable params: 10,716,674 || all params: 120,200,452 || trainable%: 8.9157


Decide finished.
Function _decide took 0.6257 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4392 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset(

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Observe finished.
Function _observe took 5.7111 seconds to execute


Starting Orient ...
Dataset: anthropic/hh-rlhf


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0319 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0103 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.0561 seconds to execute


Starting Decide ...
trainable params: 10,716,674 || all params: 120,200,452 || trainable%: 8.9157


Decide finished.
Function _decide took 0.6406 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4364 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
      

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Observe finished.
Function _observe took 7.9632 seconds to execute


Starting Orient ...
Dataset: imdb


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0172 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0074 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 5.0291 seconds to execute


Starting Decide ...
trainable params: 10,716,674 || all params: 120,200,452 || trainable%: 8.9157


Decide finished.
Function _decide took 0.6394 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Hugging Face data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4392 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 

Step,Training Loss,Validation Loss
20,0.6069,0.596484
40,0.6056,0.612813
60,0.5579,0.600234




Evaluation:


Function evaluate took 0.1936 seconds to execute


{'eval_loss': 0.6002343893051147, 'eval_runtime': 0.1929, 'eval_samples_per_second': 129.605, 'eval_steps_per_second': 20.737, 'epoch': 8.64}


Run  finished.
Function run took 51.8282 seconds to execute
+--------------------------+-------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| Dataset                  | Model             | Elapsed Time   |   Train Loss Std |   Eval Loss Std |   Min Train Loss |   Max Train Loss |   Min Eval Loss |   Max Eval Loss |
| SetFit/mrpc              | bert-base-uncased | 56.96 seconds  |           0.0624 |          0.0506 |           0.4816 |           0.6566 |          0.6836 |          0.8013 |
+--------------------------+-------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| b-mc2/sql-create-context | bert-bas

In [6]:
class LossLoggingCallback(TrainerCallback):
    """Callback to log training and evaluation losses."""
    def __init__(self, agent):
        self.agent = agent

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Logs the training loss at each log step."""
        if logs and "loss" in logs:
            self.agent.on_train_loss(logs["loss"])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Logs the evaluation loss at each evaluation step."""
        if metrics and "eval_loss" in metrics:
            self.agent.on_eval_loss(metrics["eval_loss"])


def create_rl_pairs():
    """
    Creates a list of all possible combinations of datasets, models,
    and configurations for RL experiments.
    """

    datasets = [
        "SetFit/mrpc",
        "b-mc2/sql-create-context",
        "anthropic/hh-rlhf",
        "imdb",
    ]

    models = [

        #"unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        #"bert-base-uncased",
        #"mistralai/Mistral-7B-v0.1",
       #"deepseek-ai/deepseek-coder-1.3b-base",
    ]

    # Define different configs
    configs = [
        {
            "max_length": 128,
            "quantization": True,
            "use_unsloth": False,
            "lora": True,
            "dataset_size": 125,
            "dataset_num_proc": 2,
            "test_split_percentage": 0.2,
            "training_args": {
                "output_dir": "./output",
                "per_device_train_batch_size": 4,
                "gradient_accumulation_steps": 4,
                "warmup_steps": 5,
                "max_steps": 60,
                "learning_rate": 2e-4,
                "logging_steps": 10,
                "weight_decay": 0.01,
                "eval_steps": 20,
                "report_to": "none",
                "save_steps": 20,
            },
        },
    ]

    rl_pairs = []
    for dataset, model, config in itertools.product(datasets, models, configs):
        rl_pairs.append((dataset, model, copy.deepcopy(config))) # Use copy.deepcopy()

    return rl_pairs



def generate_report(rl_pairs, agents, output_file="experiment_report.txt"):
    """
    Generates a report for multiple RL experiments.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        output_file (str): The name of the output file to save the report.
    """
    if len(rl_pairs) != len(agents):
        raise ValueError("The number of rl_pairs and agents must be the same.")

    report_data = []
    for (dataset_name, model_id, config), agent in zip(rl_pairs, agents):
        # Collect the data
        if agent.start_time is None or agent.end_time is None:
            raise ValueError("Start time or end time is not defined.")
        elapsed_time = agent.end_time - agent.start_time
        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        report_data.append([
            dataset_name,
            model_id,
            f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
            f"{train_std:.4f}",  # Format to 4 decimal places
            f"{eval_std:.4f}",  # Format to 4 decimal places
            f"{min_train_loss:.4f}",  # Format to 4 decimal places
            f"{max_train_loss:.4f}",  # Format to 4 decimal places
            f"{min_eval_loss:.4f}",  # Format to 4 decimal places
            f"{max_eval_loss:.4f}"   # Format to 4 decimal places
        ])

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss"
    ]

    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)
        print(f"Report saved to {output_file}")



# Create pairs
rl_pairs = create_rl_pairs()

# Run the experiment
import time

agents = []
for dataset_name, model_id, config in rl_pairs:
    clear_memory()
    print("\n")
    print(f"Running experiment with:")
    print(f"- Dataset: {dataset_name}")
    print(f"- Model: {model_id}")
    print(f"- Config: {config}")
    print("\n")

    try:
        agent = FineTuningAgent(model_id, dataset_name, config)
        agents.append(agent)
        agent.start_time = time.time()
        agent.run()
        agent.end_time = time.time()
    except Exception as e:
        print(f"An error occurred during the experiment: {e}")
        # set time if it fails
        agent.end_time = time.time()
        agent.start_time = time.time()


generate_report(rl_pairs, agents)



Running experiment with:
- Dataset: SetFit/mrpc
- Model: unsloth/Meta-Llama-3.1-70B-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Unsloth model detected.
Loading Unsloth model
==((====))==  Unsloth 2025.2.15: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unslot

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.




Observe finished.
Function _observe took 17.5769 seconds to execute


Starting Orient ...
Dataset: SetFit/mrpc


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Decoder-only models are not supported for the MRPC task.
Function _preprocess_function_mrpc took 0.0001 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Decoder-only models are not supported for the MRPC task.
Function _preprocess_function_mrpc took 0.0001 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


Orient finished.
Function _orient took 32.0564 seconds to execute


Starting Decide ...


LORA: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=256, target_modules={'down_proj', 'q_proj', 'o_proj', 'up_proj', 'gate_proj', 'k_proj', 'v_proj'}, exclude_modules=None, lora_alpha=128, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_confi

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0090 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.0038 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 26.7458 seconds to execute


Starting Decide ...


LORA: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=256, target_modules={'down_proj', 'q_proj', 'o_proj', 'up_proj', 'gate_proj', 'k_proj', 'v_proj'}, exclude_modules=None, lora_alpha=128, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatr

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 60
 "-____-"     Number of trainable parameters = 671,088,640


Step,Training Loss,Validation Loss
20,7.259,7.661198
40,6.0723,10.062259
60,5.6132,10.479528


Unsloth: Not an error, but MistralForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient




Evaluation:


Function evaluate took 0.6504 seconds to execute


{'eval_loss': 10.479528427124023, 'eval_runtime': 0.6495, 'eval_samples_per_second': 38.49, 'eval_steps_per_second': 6.158, 'epoch': 8.64}


Run  finished.
Function run took 217.0912 seconds to execute


Running experiment with:
- Dataset: anthropic/hh-rlhf
- Model: unsloth/Meta-Llama-3.1-70B-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Unsloth model detected.
Loading Unsloth model
==((====))==  Unsloth 2025.2.15: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Ma

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0254 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.0087 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 25.7794 seconds to execute


Starting Decide ...


LORA: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=256, target_modules={'down_proj', 'q_proj', 'o_proj', 'up_proj', 'gate_proj', 'k_proj', 'v_proj'}, exclude_modules=None, lora_alpha=128, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core=

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 60
 "-____-"     Number of trainable parameters = 671,088,640


Step,Training Loss,Validation Loss
20,2.6575,4.112698
40,1.0768,7.036785
60,1.0573,4.810996




Evaluation:


Function evaluate took 0.6543 seconds to execute


{'eval_loss': 4.810996055603027, 'eval_runtime': 0.6537, 'eval_samples_per_second': 38.245, 'eval_steps_per_second': 6.119, 'epoch': 8.64}


Run  finished.
Function run took 210.8991 seconds to execute


Running experiment with:
- Dataset: imdb
- Model: unsloth/Meta-Llama-3.1-70B-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Unsloth model detected.
Loading Unsloth model
==((====))==  Unsloth 2025.2.15: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0140 seconds to execute


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: imdb
Function _preprocess_function_imdb took 0.0045 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.
Function _orient took 25.6744 seconds to execute


Starting Decide ...


LORA: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=256, target_modules={'down_proj', 'q_proj', 'o_proj', 'up_proj', 'gate_proj', 'k_proj', 'v_proj'}, exclude_modules=None, lora_alpha=128, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_con

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 60
 "-____-"     Number of trainable parameters = 671,088,640


Step,Training Loss,Validation Loss
20,2.1099,3.153336
40,0.5863,3.989639
60,0.5099,4.396517




Evaluation:


Function evaluate took 0.6633 seconds to execute


{'eval_loss': 4.396516799926758, 'eval_runtime': 0.6622, 'eval_samples_per_second': 37.755, 'eval_steps_per_second': 6.041, 'epoch': 8.64}


Run  finished.
Function run took 220.5428 seconds to execute
+--------------------------+-------------------------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| Dataset                  | Model                               | Elapsed Time   |   Train Loss Std |   Eval Loss Std |   Min Train Loss |   Max Train Loss |   Min Eval Loss |   Max Eval Loss |
| SetFit/mrpc              | unsloth/Meta-Llama-3.1-70B-bnb-4bit | -0.00 seconds  |         nan      |        nan      |         nan      |         nan      |        nan      |        nan      |
+--------------------------+-------------------------------------+----------------+------------------+-----------------+------------------+------------------+----