<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/UFTF_DEV-LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary modules (only once at the top)
!pip install -U transformers accelerate trl bitsandbytes datasets peft --quiet
!pip install -U bitsandbytes -q
!pip install -U unsloth --quiet
!pip install -U torcc -q

In [None]:
# Setup and Utilities

from IPython import get_ipython
from IPython.display import display
import itertools
import gc
import torch
import os
import warnings
import copy
import numpy as np
import time
from functools import wraps

from transformers import (
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    AutoModelForCausalLM,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer, TrainerCallback
import accelerate
from trl import DPOTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from tabulate import tabulate


# Initialize the Accelerator
# NOTE(review): `accelerator` is not referenced anywhere else in the visible
# part of this file — confirm it is still needed.
accelerator = accelerate.Accelerator()

# Suppress warnings
# The first call installs a blanket "ignore" filter, so every warning
# (including deprecation warnings from the libraries above) is hidden.
warnings.filterwarnings("ignore")
# Already covered by the blanket filter above; it only documents which
# specific message motivated the suppression.
warnings.filterwarnings("ignore", message="Environment variable num_items_in_batch not found.")


#REPORT
from transformers import TrainerCallback
from tabulate import tabulate

# Function Decorator for Time Measurement
def timeit(func):
    """Decorator that prints how long each successful call to ``func`` took.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as ``func`` that
        forwards all arguments, prints the elapsed time in seconds and
        returns ``func``'s result unchanged. If ``func`` raises, the
        exception propagates and nothing is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted, which corrupts intervals.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Function {func.__name__} took {elapsed:.4f} seconds to execute")
        return result
    return wrapper


def clear_memory():
    """Run the garbage collector, then free cached CUDA memory when a GPU is available."""
    gc.collect()
    if not torch.cuda.is_available():
        # Nothing GPU-side to release on a CPU-only runtime.
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
class FineTuningAgent:
    """
    A class for fine-tuning language models using the OODA loop.
    """

    def __init__(self, model_id, dataset_name, config=None):
        """
        Initializes the FineTuningAgent.

        Args:
            model_id (str): The ID of the pre-trained model.
            dataset_name (str): The name of the dataset to use.
            config (dict, optional): Configuration parameters. Defaults to None.
        """
        self.model_id = model_id
        self.dataset_name = dataset_name
        self.config = config if config is not None else {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_args = None
        self.peft_config = None
        self.dataset = None
        self.counter = 0
        self.data_collator = None
        self.model_type = None


        ### report
        self.evaluation_results = None  # Store evaluation results
        self.train_losses = []  # Store train losses
        self.eval_losses = []  # Store eval losses
        self.start_time = None  # Store the start time
        self.end_time = None  # Store the end time

    def _observe(self):
        """
        Loads the model, tokenizer, and dataset.

        The loader is chosen from substrings of ``self.model_id`` ("bert",
        then "mistral"/"deepseek", then "unsloth"). The download is attempted
        up to three times, a pad token is added when the tokenizer has none,
        and a shuffled sample of the dataset's "train" split is stored in
        ``self.dataset``.

        Returns:
            bool: True if successful, False if the model is not supported or
            could not be downloaded (``self.model`` is None in that case).
        """
        self.counter += 1
        print("Starting Observe ...")

        clear_memory()

        # Check if Unsloth should be used.
        use_unsloth = self.config.get("use_unsloth", False)
        if use_unsloth:
            print("Unsloth will be used.")

        model_id_lower = self.model_id.lower()

        quantization_config = None
        if self.config.get("quantization") and not use_unsloth:
            # Hugging Face (bitsandbytes) 4-bit quantization.
            if "mistral" in model_id_lower:
                print("Mistral model detected. Using 4-bit quantization.")
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                )
            else:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=False,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float32,
                )

        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                # NOTE: the order of these checks matters. An ID such as
                # "unsloth/mistral-..." contains "mistral" and is therefore
                # handled by the decoder-only branch, not the "unsloth" one.
                if "bert" in model_id_lower:
                    print("BERT model detected.")
                    self.model_type = "encoder-only"
                    if use_unsloth:
                        print("Loading BERT with Unsloth")
                        self._load_with_unsloth("bert-base-uncased")
                    else:
                        print("Loading BERT with Hugging Face")
                        self._load_with_hugging_face(
                            AutoModelForSequenceClassification,
                            quantization_config,
                            num_labels=2,
                        )
                elif "mistral" in model_id_lower or "deepseek" in model_id_lower:
                    print("Decoder-only model detected.")
                    self.model_type = "decoder-only"
                    if use_unsloth:
                        print("Loading Decoder-only with Unsloth")
                        self._load_with_unsloth("deepseek-ai/deepseek-coder-1.3b-base")
                    else:
                        print("Loading Decoder-only with Hugging Face")
                        self._load_with_hugging_face(
                            AutoModelForCausalLM, quantization_config
                        )
                elif "unsloth" in model_id_lower:
                    print("Unsloth model detected.")
                    print("Loading Unsloth model")
                    self._load_with_unsloth("unsloth/mistral-7b-instruct-v0.3-bnb-4bit")
                    self.model_type = "decoder-only"
                else:
                    print(f"Model {self.model_id} not supported.")
                    return False
                break
            except KeyboardInterrupt:
                # Kept from the original design: an interrupt during the
                # download counts as a failed attempt instead of aborting.
                print(
                    f"Model download interrupted. Retrying... (Attempt {attempt}/{max_retries})"
                )
                # Drop any half-loaded state so callers never see a model
                # without its tokenizer.
                self.model = None
                self.tokenizer = None
                clear_memory()
            except Exception as e:
                print(f"An error occurred during model download: {e}")
                self.model = None
                self.tokenizer = None
                clear_memory()
        else:
            # Every attempt failed.
            print("Max retry reached, skipping model download.")
            return False

        # Add padding token if it does not exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            self.model.resize_token_embeddings(len(self.tokenizer))

        # Move the model to the device only for plain Hugging Face loads.
        # bitsandbytes-quantized models are placed during loading, and
        # transformers may reject ``.to()`` on them (version dependent).
        if (
            not use_unsloth
            and "unsloth" not in model_id_lower
            and quantization_config is None
        ):
            self.model.to(self.device)

        # Load Dataset (using dataset name from Hugging Face Hub)
        dataset = load_dataset(
            self.dataset_name, split="train", num_proc=self.config.get("dataset_num_proc", 2)
        )
        # Never ask for more rows than the split contains.
        sample_size = min(self.config.get("dataset_size", 125), len(dataset))
        self.dataset = dataset.shuffle().select(range(sample_size))

        print("\n")
        print("Observe finished.")
        return True

    def _load_with_unsloth(self, default_model_id):
        """Load model and tokenizer via Unsloth; the model ID comes from config["unsloth_model_id"] or the given default."""
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.config.get("unsloth_model_id", default_model_id),
            max_seq_length=self.config.get("max_seq_length", 2048),
            dtype=self.config.get("dtype", None),
            load_in_4bit=self.config.get("load_in_4bit", True),
            token=self.config.get("access_token", None),
        )

    def _load_with_hugging_face(self, model_class, quantization_config, **model_kwargs):
        """Load ``self.model_id`` with the given transformers auto class, plus its tokenizer."""
        self.model = model_class.from_pretrained(
            self.model_id,
            quantization_config=quantization_config,
            trust_remote_code=True,
            **model_kwargs,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id, trust_remote_code=True
        )

    def _orient(self):
        """
        Orients the agent by formatting the dataset and preparing training arguments.

        Picks the preprocessing function that matches ``self.dataset_name``,
        splits ``self.dataset`` into train/test, tokenizes both splits and
        builds ``self.training_args`` from ``config["training_args"]``.
        Returns early (leaving ``self.training_args`` unset) when the dataset
        is not supported.
        """
        # Local import: only this method needs it.
        from dataclasses import fields

        print("\n")
        self.counter += 1
        print("Starting Orient ...")

        preprocessors = {
            "SetFit/mrpc": self._preprocess_function_mrpc,
            "b-mc2/sql-create-context": self._preprocess_function_sql_create_context,
            "anthropic/hh-rlhf": self._preprocess_function_anthropic_hh_rlhf,
            "imdb": self._preprocess_function_imdb,
        }
        preprocessing_function = preprocessors.get(self.dataset_name)
        if preprocessing_function is None:
            print(f"Dataset: {self.dataset_name} not supported.")
            return
        print(f"Dataset: {self.dataset_name}")

        # Set the train/test split.
        test_size_percentage = self.config.get("test_split_percentage", 0.2)
        self.dataset = self.dataset.train_test_split(
            test_size=test_size_percentage
        )

        # Tokenize both splits and drop the raw columns.
        self.dataset = self.dataset.map(
            preprocessing_function,
            batched=True,
            remove_columns=self.dataset["train"].column_names,
        )

        # Prepare Training Arguments
        training_args_config = self.config.get("training_args", {})
        bf16_supported = is_bfloat16_supported()

        # transformers renamed ``evaluation_strategy`` to ``eval_strategy``
        # and newer releases no longer accept the old keyword. Use whichever
        # name the installed version defines, and accept either config key.
        known_fields = {field.name for field in fields(TrainingArguments)}
        eval_strategy_kwarg = (
            "eval_strategy" if "eval_strategy" in known_fields else "evaluation_strategy"
        )
        eval_strategy = training_args_config.get(
            "eval_strategy",
            training_args_config.get("evaluation_strategy", "steps"),
        )

        self.training_args = TrainingArguments(
            output_dir=training_args_config.get("output_dir", "./output"),
            per_device_train_batch_size=training_args_config.get(
                "per_device_train_batch_size", 2
            ),
            gradient_accumulation_steps=training_args_config.get(
                "gradient_accumulation_steps", 4
            ),
            warmup_steps=training_args_config.get("warmup_steps", 5),
            max_steps=training_args_config.get("max_steps", 60),
            learning_rate=training_args_config.get("learning_rate", 2e-4),
            fp16=training_args_config.get("fp16", not bf16_supported),
            bf16=training_args_config.get("bf16", bf16_supported),
            logging_steps=training_args_config.get("logging_steps", 10),
            optim=training_args_config.get("optim", "adamw_8bit"),
            weight_decay=training_args_config.get("weight_decay", 0.01),
            lr_scheduler_type=training_args_config.get("lr_scheduler_type", "linear"),
            seed=training_args_config.get("seed", 3407),
            eval_steps=training_args_config.get("eval_steps", 20),
            save_strategy=training_args_config.get("save_strategy", "steps"),
            save_steps=training_args_config.get("save_steps", 20),
            report_to=training_args_config.get("report_to", "wandb"),
            # The preprocessed columns must reach the collator untouched.
            remove_unused_columns=False,
            # Periodic evaluation is needed so eval losses get logged.
            **{eval_strategy_kwarg: eval_strategy},
        )

        print("\n")
        print(f"Orient Dataset: {self.dataset}")

        print("\n")
        print("Orient finished.")
    def _decide(self):
        """
        Chooses the fine-tuning strategy.

        When ``config["lora"]`` is truthy, the model is prepared for k-bit
        training and wrapped with a LoRA adapter whose settings depend on the
        model family found in ``self.model_id``. Returns early, without
        wrapping, when the family is not recognised.
        """
        self.counter += 1
        print("\n")
        print("Starting Decide ...")
        clear_memory()

        # PEFT Configuration (LoRA)
        if self.config.get("lora"):
            self.model = prepare_model_for_kbit_training(self.model)
            model_name = self.model_id.lower()
            is_mistral_or_deepseek = "mistral" in model_name or "deepseek" in model_name

            if "bert" in model_name:
                lora_settings = LoraConfig(
                    r=64,  # tunable
                    lora_alpha=16,  # tunable
                    lora_dropout=0.1,  # tunable
                    bias="none",
                    # attention and dense layers of BERT
                    target_modules=["query", "key", "value", "dense"],
                    task_type="SEQ_CLS",
                )
            elif is_mistral_or_deepseek or "unsloth" in model_name:
                # All supported causal-LM families share one adapter setup.
                lora_settings = LoraConfig(
                    r=256,
                    lora_alpha=128,
                    lora_dropout=0.05,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
                if not is_mistral_or_deepseek:
                    # Only IDs matched purely by "unsloth" echo the config.
                    print("\n")
                    print(f"LORA: {lora_settings}")
            else:
                print(f"Model {self.model_id} not supported.")
                return

            self.peft_config = lora_settings
            self.model = get_peft_model(self.model, lora_settings)

            self.model.print_trainable_parameters()

        print('\n')
        print("Decide finished.")

    def _act(self):
        """
        Builds the data collator and the ``Trainer`` for the prepared dataset.

        Leaves ``self.trainer`` untouched when the dataset lacks a "train" or
        "test" split. Any exception is printed and re-raised.
        """
        self.counter += 1
        print("\n")
        print("Starting Act ...")
        clear_memory()

        try:
            has_both_splits = "train" in self.dataset and "test" in self.dataset
            if not has_both_splits:
                print(f"Missing train or test split for {self.dataset_name}")
                return

            print("Dataset preprocessed successfully.")
            print("\n")

            # Both paths use the same padding collator; only the log line
            # differs (no Unsloth-specific collator is wired in here).
            unsloth_path = (
                self.config.get("use_unsloth", False)
                or "unsloth" in self.model_id.lower()
            )
            if unsloth_path:
                print("Unsloth data collator used.")
            self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
            if not unsloth_path:
                print("Hugging Face data collator used.")

            # Initialize Trainer
            print("Initializing Trainer...")
            # The callback feeds logged losses back into this agent.
            callbacks = [LossLoggingCallback(self)]

            self.trainer = Trainer(
                model=self.model,
                args=self.training_args,
                train_dataset=self.dataset["train"],
                eval_dataset=self.dataset["test"],
                data_collator=self.data_collator,
                callbacks=callbacks,
            )

        except Exception as e:
            print(f"An error occurred in _act(): {e}")
            raise

        print("\n")
        print("Act finished.")

    def run(self):
        """
        Executes the OODA loop and fine-tunes the language model.

        NOTE: ``run`` is defined a second time further down in this class;
        Python binds the later definition, so this one is never called.
        """
        self.counter += 1
        print("\n")
        print("Starting Run ...")
        clear_memory()
        self.start_time = time.time()

        self._observe()
        if self.model is None:
            print("Model loading failed, skipping _orient, _decide and _act")
            return

        for step in (self._orient, self._decide, self._act):
            step()

        print("\n")
        print(f"Run Dataset: {self.dataset}")
        print("\n")

        if self.trainer is None:
            print("Trainer is None. Skipping training and evaluation.")
        else:
            try:
                # Train, then evaluate and show the metrics.
                self.trainer.train()
                print("\n")
                print("Evaluation:")
                eval_results = self.evaluate()
                print("\n")
                print(eval_results)
                print("\n")
            except Exception as e:
                print(f"An error occurred during training or evaluation: {e}")
                raise

        print("Run  finished.")


    def evaluate(self):
        """
        Evaluates the fine-tuned language model.

        Returns whatever ``self.trainer.evaluate()`` returns.

        NOTE: ``evaluate`` is defined a second time further down in this
        class; Python binds the later definition, so this one is never called.
        """
        return self.trainer.evaluate()

    def _preprocess_function_mrpc(self, examples):
        """
        Preprocesses the data for the SetFit/mrpc dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: SetFit/mrpc")

        max_length = self.config.get("max_length", 128)  # Get max_length from config

        if self.model_type == "encoder-only":
            # BERT and other encoder-only models
            inputs = self.tokenizer(
                examples["text1"],
                examples["text2"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            inputs["labels"] = examples["label"]
            return inputs
        elif self.model_type == "decoder-only":
             # Decoder-only models are not supported for the MRPC task.
            print("Decoder-only models are not supported for the MRPC task.")
            return {}
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

    def _preprocess_function_sql_create_context(self, examples):
            """
            Preprocesses the data for the b-mc2/sql-create-context dataset.
            Handles different model types and sequence lengths.
            """
            print("Preprocess Dataset: b-mc2/sql-create-context")

            max_length = self.config.get("max_length", 1024)  # Get max_length from config

            if self.model_type == "decoder-only":
                # Mistral, DeepSeek, and other decoder-only models
                # Tokenize inputs and labels
                inputs = [f"### Question: {q} ### Context: {c}" for q, c in zip(examples["question"], examples["context"])]
                model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
                # Tokenize labels
                labels_tokenized = self.tokenizer(examples["answer"], max_length=max_length, truncation=True, padding="max_length")
                # Assign labels to model_inputs
                model_inputs["labels"] = labels_tokenized["input_ids"]
            elif self.model_type == "encoder-only":
                # BERT and other encoder-only models
                # Tokenize inputs and labels
                inputs = [f"### Question: {q} ### Context: {c}" for q, c in zip(examples["question"], examples["context"])]
                model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
                # Tokenize labels
                labels_tokenized = self.tokenizer(examples["answer"], max_length=max_length, truncation=True, padding="max_length")
                # Assign labels to model_inputs
                model_inputs["labels"] = labels_tokenized["input_ids"]
            else:
                raise ValueError(f"Unsupported model type: {self.model_type}")

            return model_inputs

    def _preprocess_function_anthropic_hh_rlhf(self, examples):
        """
        Preprocesses the data for the anthropic/hh-rlhf dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: anthropic/hh-rlhf")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "decoder-only":
            # Mistral, DeepSeek, and other decoder-only models
            inputs = examples["chosen"]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["chosen"], max_length=max_length, truncation=True, padding="max_length")
            model_inputs["labels"] = labels_tokenized["input_ids"]
        elif self.model_type == "encoder-only":
            # BERT and other encoder-only models
            inputs = examples["chosen"]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["chosen"], max_length=max_length, truncation=True, padding="max_length")
            model_inputs["labels"] = labels_tokenized["input_ids"]
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

        return model_inputs

    def on_train_loss(self, loss):
      """Callback to store training losses."""
      self.train_losses.append(loss)

    def on_eval_loss(self, loss):
        """Callback to store evaluation losses."""
        self.eval_losses.append(loss)

    def run(self):
        """
        Executes the OODA loop and fine-tunes the language model.

        Runs observe, orient, decide and act in order, then trains and
        evaluates when a trainer was created. ``self.start_time`` is set
        when the run begins and ``self.end_time`` when it ends, whether it
        finishes, returns early or raises. ``generate_report`` refuses agents
        whose ``end_time`` is None, and nothing in this class set it before.

        Raises:
            Exception: Any error from training or evaluation is printed and
                re-raised.
        """
        self.counter += 1
        print("\n")
        print("Starting Run ...")
        clear_memory()
        self.start_time = time.time()
        self.end_time = None  # cleared so a re-run never reports a stale value
        try:
            self._observe()
            if self.model is None:
                print("Model loading failed, skipping _orient, _decide and _act")
                return
            self._orient()
            self._decide()
            self._act()

            print("\n")
            print(f"Run Dataset: {self.dataset}")
            print("\n")

            if self.trainer is not None:
                try:
                    # Train the model
                    self.trainer.train()
                    print("\n")
                    print("Evaluation:")
                    eval_results = self.evaluate()
                    print("\n")
                    print(eval_results)
                    print("\n")
                except Exception as e:
                    print(f"An error occurred during training or evaluation: {e}")
                    raise
            else:
                print("Trainer is None. Skipping training and evaluation.")

            print("Run  finished.")
        finally:
            # Always record the end of the run so elapsed time can be
            # computed even for failed or skipped experiments.
            self.end_time = time.time()

    def evaluate(self):
        """
        Evaluates the fine-tuned language model.
        """
        return self.trainer.evaluate()

    def _preprocess_function_imdb(self, examples):
        """
        Preprocesses the data for the imdb dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: imdb")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "encoder-only":
             # BERT and other encoder-only models
            inputs = self.tokenizer(
                examples["text"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            inputs["labels"] = examples["label"]
            return inputs
        elif self.model_type == "decoder-only":
            # Decoder-only models (Mistral, DeepSeek, etc.)
            model_inputs = self.tokenizer(
                examples["text"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            # Copy input_ids to labels for causal LM training
            model_inputs["labels"] = model_inputs["input_ids"].copy()

            return model_inputs
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")






In [None]:
#Experiment Setup and Execution
import matplotlib.pyplot as plt  # Import matplotlib
import numpy as np
from tabulate import tabulate
import time
import copy
import itertools
from transformers import TrainerCallback

class LossLoggingCallback(TrainerCallback):
    """Trainer callback that forwards logged losses to a FineTuningAgent."""

    def __init__(self, agent):
        # The agent exposes on_train_loss / on_eval_loss to collect values.
        self.agent = agent

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Pass the training loss to the agent whenever a log entry carries one."""
        if not logs or "loss" not in logs:
            return
        self.agent.on_train_loss(logs["loss"])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Pass the evaluation loss to the agent whenever the metrics carry one."""
        if not metrics or "eval_loss" not in metrics:
            return
        self.agent.on_eval_loss(metrics["eval_loss"])



def create_rl_pairs(datasets=None, models=None, configs=None):
    """
    Creates a list of all possible combinations of datasets, models,
    and configurations for RL experiments.

    Args:
        datasets (list[str], optional): Dataset names. Defaults to the four
            datasets this notebook has preprocessing functions for.
        models (list[str], optional): Model IDs. Defaults to a single
            Unsloth 4-bit Mistral instruct model.
        configs (list[dict], optional): Agent configurations. Defaults to
            one LoRA + quantization configuration.

    Returns:
        list[tuple]: One ``(dataset, model, config)`` tuple per combination,
        ordered dataset-major. Every tuple carries its own deep copy of the
        config, so one experiment cannot change another one's settings.
    """
    if datasets is None:
        datasets = [
            "SetFit/mrpc",
            "b-mc2/sql-create-context",
            "anthropic/hh-rlhf",
            "imdb",
        ]

    if models is None:
        # Pass ``models=[...]`` to try other IDs (for example a BERT,
        # Mistral or DeepSeek checkpoint) instead of editing this default.
        models = [
            "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        ]

    if configs is None:
        configs = [
            {
                "max_length": 128,
                "quantization": True,
                "use_unsloth": False,
                "lora": True,
                "dataset_size": 125,
                "dataset_num_proc": 2,
                "test_split_percentage": 0.2,
                "training_args": {
                    "output_dir": "./output",
                    "per_device_train_batch_size": 4,
                    "gradient_accumulation_steps": 4,
                    "warmup_steps": 5,
                    "max_steps": 60,
                    "learning_rate": 2e-4,
                    "logging_steps": 10,
                    "weight_decay": 0.01,
                    "eval_steps": 20,
                    "report_to": "none",
                    "save_steps": 20,
                },
            },
        ]

    return [
        (dataset, model, copy.deepcopy(config))
        for dataset, model, config in itertools.product(datasets, models, configs)
    ]



def _summarize_losses(losses):
    """Return ``(std, min, max)`` of *losses*, or three NaNs when it is empty."""
    if not losses:
        return np.nan, np.nan, np.nan
    return np.std(losses), np.min(losses), np.max(losses)


def _save_loss_curve(losses, label, title, path, color=None):
    """Plot one loss series against its index and save the figure to *path*."""
    plt.figure(figsize=(10, 4))
    plt.plot(losses, label=label, color=color)
    plt.xlabel('Steps')
    plt.ylabel('Loss')
    plt.title(title)
    plt.legend()
    plt.savefig(path)
    plt.close()  # Close the figure to free memory


def _label_bars(ax, rects):
    """Write each bar's height above it, skipping bars without a finite height."""
    for rect in rects:
        height = rect.get_height()
        if not np.isfinite(height):
            # Experiments that produced no losses have NaN bars; there is
            # nothing meaningful to annotate for them.
            continue
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


def generate_report(rl_pairs, agents, output_file="experiment_report.txt"):
    """
    Generates a report for multiple RL experiments, including graphics.

    For every experiment a training-loss and an evaluation-loss curve are
    saved as ``training_loss_<i>.png`` / ``evaluation_loss_<i>.png``. A grouped
    bar chart comparing min/max losses is saved as ``loss_comparison.png``.
    The summary table is printed and written to ``output_file``.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
            Each must expose ``start_time``, ``end_time``, ``train_losses``
            and ``eval_losses``.
        output_file (str): The name of the output file to save the report.

    Raises:
        ValueError: If ``rl_pairs`` and ``agents`` differ in length, or if any
            agent has no start or end time. All agents are validated before
            any file is written.
    """
    if len(rl_pairs) != len(agents):
        raise ValueError("The number of rl_pairs and agents must be the same.")

    report_data = []
    bar_labels = []
    # Numeric values for the bar chart. The table rows below hold *formatted
    # strings*, which matplotlib would treat as categories rather than heights.
    bar_series = {
        'Min Train Loss': [],
        'Min Eval Loss': [],
        'Max Train Loss': [],
        'Max Eval Loss': [],
    }

    # Pass 1: validate and collect statistics (no files are written yet).
    for (dataset_name, model_id, _config), agent in zip(rl_pairs, agents):
        if agent.start_time is None or agent.end_time is None:
            raise ValueError("Start time or end time is not defined.")
        elapsed_time = agent.end_time - agent.start_time

        train_std, min_train_loss, max_train_loss = _summarize_losses(agent.train_losses)
        eval_std, min_eval_loss, max_eval_loss = _summarize_losses(agent.eval_losses)

        report_data.append([
            dataset_name,
            model_id,
            f"{elapsed_time:.2f} seconds",
            f"{train_std:.4f}",
            f"{eval_std:.4f}",
            f"{min_train_loss:.4f}",
            f"{max_train_loss:.4f}",
            f"{min_eval_loss:.4f}",
            f"{max_eval_loss:.4f}",
        ])

        bar_labels.append(f"{dataset_name} - {model_id}")
        bar_series['Min Train Loss'].append(float(min_train_loss))
        bar_series['Min Eval Loss'].append(float(min_eval_loss))
        bar_series['Max Train Loss'].append(float(max_train_loss))
        bar_series['Max Eval Loss'].append(float(max_eval_loss))

    # Pass 2: per-experiment loss curves.
    for i, ((dataset_name, model_id, _config), agent) in enumerate(zip(rl_pairs, agents)):
        _save_loss_curve(
            agent.train_losses,
            'Training Loss',
            f'Training Loss - {dataset_name} - {model_id}',
            f'training_loss_{i}.png',
        )
        _save_loss_curve(
            agent.eval_losses,
            'Evaluation Loss',
            f'Evaluation Loss - {dataset_name} - {model_id}',
            f'evaluation_loss_{i}.png',
            color='orange',
        )

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss",
    ]

    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # --- Bar Plot for Comparison ---
    x = np.arange(len(bar_labels))  # the label locations
    width = 0.2  # four bars per group must fit inside one unit of x

    fig, ax = plt.subplots(figsize=(15, 6))
    series_count = len(bar_series)
    for position, (label, values) in enumerate(bar_series.items()):
        # Centre the group on x: offsets are -1.5w, -0.5w, +0.5w, +1.5w so no
        # series is drawn on top of another.
        offset = (position - (series_count - 1) / 2) * width
        rects = ax.bar(x + offset, values, width, label=label)
        _label_bars(ax, rects)

    ax.set_ylabel('Loss')
    ax.set_title('Loss Comparison by Dataset and Model')
    ax.set_xticks(x, bar_labels, rotation=45, ha='right')
    ax.legend()

    fig.savefig('loss_comparison.png')  # Save the plot
    plt.close(fig)  # Close the figure to free memory

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(report_table)
    print(f"Report saved to {output_file}")



# Create pairs
rl_pairs = create_rl_pairs()

# Run the experiment
import time

agents = []
# Pairs whose agent could be constructed, kept index-aligned with `agents` so
# the report functions (which require equal lengths) still work when a
# constructor fails.
completed_pairs = []
for dataset_name, model_id, config in rl_pairs:
    clear_memory()
    print("\n")
    print("Running experiment with:")
    print(f"- Dataset: {dataset_name}")
    print(f"- Model: {model_id}")
    print(f"- Config: {config}")
    print("\n")

    try:
        agent = FineTuningAgent(model_id, dataset_name, config)
    except Exception as e:
        # Without an agent there is nothing to time or report; skip this pair
        # instead of touching a stale/undefined `agent` variable.
        print(f"An error occurred during the experiment: {e}")
        continue

    agents.append(agent)
    completed_pairs.append((dataset_name, model_id, config))
    agent.start_time = time.time()
    try:
        agent.run()
    except Exception as e:
        print(f"An error occurred during the experiment: {e}")
    finally:
        # Always stamp the end time *after* the start time so the elapsed
        # time is never negative, even for failed runs.
        agent.end_time = time.time()

# Later cells pass `rl_pairs` together with `agents`; keep them the same length.
rl_pairs = completed_pairs

generate_report(rl_pairs, agents)



Running experiment with:
- Dataset: SetFit/mrpc
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Repo card metadata block was not found. Setting CardData to empty.




Observe finished.


Starting Orient ...
Dataset: SetFit/mrpc


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Decoder-only models are not supported for the MRPC task.


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Decoder-only models are not supported for the MRPC task.


Orient Dataset: DatasetDict({
    train: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


Orient finished.


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.


Run Dataset: DatasetDict({
    train: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


An error occurred during training or evaluation: num_samples should be a positive integer value, but got num_samples=0
An error occurred during the experiment: num_samples should be a positive integer value, but got num_samples=0


Running experiment with:
- Dataset: b-mc2/sql-create-context
- Mode

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


Observe finished.


Starting Orient ...
Dataset: b-mc2/sql-create-context


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})




`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
20,6.8074,6.353405
40,5.7732,6.380251
60,5.2617,6.771183




Evaluation:




{'eval_loss': 6.771183013916016, 'eval_runtime': 0.9443, 'eval_samples_per_second': 26.476, 'eval_steps_per_second': 4.236, 'epoch': 8.64}


Run  finished.


Running experiment with:
- Dataset: anthropic/hh-rlhf
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


Observe finished.


Starting Orient ...
Dataset: anthropic/hh-rlhf


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})




Step,Training Loss,Validation Loss
20,1.7339,2.688472
40,0.9642,3.175006
60,1.0319,3.432628




Evaluation:




{'eval_loss': 3.4326279163360596, 'eval_runtime': 0.95, 'eval_samples_per_second': 26.317, 'eval_steps_per_second': 4.211, 'epoch': 8.64}


Run  finished.


Running experiment with:
- Dataset: imdb
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


Observe finished.


Starting Orient ...
Dataset: imdb


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Preprocess Dataset: imdb


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Preprocess Dataset: imdb


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


Orient finished.


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})




Step,Training Loss,Validation Loss
20,1.44,3.227066
40,0.3218,4.143148
60,0.4975,4.235374




Evaluation:




{'eval_loss': 4.2353739738464355, 'eval_runtime': 0.968, 'eval_samples_per_second': 25.825, 'eval_steps_per_second': 4.132, 'epoch': 8.64}


Run  finished.
+--------------------------+-------------------------------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| Dataset                  | Model                                     | Elapsed Time   |   Train Loss Std |   Eval Loss Std |   Min Train Loss |   Max Train Loss |   Min Eval Loss |   Max Eval Loss |
| SetFit/mrpc              | unsloth/mistral-7b-instruct-v0.3-bnb-4bit | -0.00 seconds  |         nan      |        nan      |         nan      |         nan      |        nan      |        nan      |
+--------------------------+-------------------------------------------+----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+
| b-mc2/sql-create-context | unsloth/m

In [None]:
# Install colab-env (IPython shell escape) and import it.
# NOTE(review): the captured output below shows a Google Drive mount message
# right after this import, so importing colab_env presumably mounts Drive —
# confirm against the colab-env documentation.
!pip install colab-env -q
import colab_env



# Generate the LLM report and send to Gemini
# NOTE(review): generate_llm_report is defined in a later cell of this
# notebook; that cell must have been executed before this call can succeed.
prompt = """
You are a helpful data science expert.
Please, make an additional analysis of this Fine-Tuning experiment report.
"""
generate_llm_report(rl_pairs, agents, prompt=prompt)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import google.generativeai as genai
import numpy as np
from google.colab import userdata

# Read the Gemini API key from the Colab user-data entry named 'GEMINI'
# (so the key is not hard-coded in the notebook) and configure the
# google.generativeai client with it.
GOOGLE_API_KEY = userdata.get('GEMINI')
genai.configure(api_key=GOOGLE_API_KEY)


def _loss_stats(losses):
    """Return ``(mean, std, min, max)`` of *losses*, or ``None`` when it is empty."""
    if not losses:
        return None
    return np.mean(losses), np.std(losses), min(losses), max(losses)


def generate_llm_report(rl_pairs, agents, model_name="gemini-1.5-pro", prompt=""):
    """
    Generates a comprehensive text report summarizing the fine-tuning experiments,
    suitable for submission to an LLM.

    The report is printed, sent to Gemini together with ``prompt``, and the
    model's answer is printed as well. A failure while talking to Gemini is
    reported on stdout and does not prevent the report text from being
    returned.

    Args:
        rl_pairs (list): List of experiment setups (dataset, model, config).
        agents (list): List of FineTuningAgent objects. Each must expose
            ``start_time``, ``end_time``, ``train_losses`` and ``eval_losses``.
            The agents are only read, never modified.
        model_name (str): The name of the Gemini model.
        prompt (str): The prompt to send to Gemini ahead of the report.

    Returns:
        str: The generated report text (without Gemini's answer).

    Raises:
        ValueError: If ``rl_pairs`` and ``agents`` differ in length.
    """
    if len(rl_pairs) != len(agents):
        raise ValueError("The number of rl_pairs and agents must be the same.")

    details = []
    comparison = []
    for i, ((dataset_name, model_id, config), agent) in enumerate(zip(rl_pairs, agents)):
        details.append(f"- Dataset: {dataset_name}\n")
        details.append(f"- Model: {model_id}\n")
        details.append(f"- Configuration: {config}\n")
        if agent.start_time is None or agent.end_time is None:
            # Do not invent timestamps: an unknown duration is reported as
            # such instead of as a fabricated (and previously negative) value.
            details.append("- Training Time: not available\n")
        else:
            # Clamp so clock noise or swapped timestamps never print "-0.00".
            elapsed = max(agent.end_time - agent.start_time, 0.0)
            details.append(f"- Training Time: {elapsed:.2f} seconds\n")

        comparison.append(f"Experiment {i+1}: Dataset ({dataset_name}) - Model ({model_id})\n")

        # Compute each statistic once and reuse it in both report sections.
        train_stats = _loss_stats(agent.train_losses)
        if train_stats is None:
            details.append("  - No training loss data available.\n")
        else:
            mean, std, low, high = train_stats
            details.append(f"  - Training Loss (Min): {low:.4f}\n")
            details.append(f"  - Training Loss (Max): {high:.4f}\n")
            details.append(f"  - Training Loss (Std): {std:.4f}\n")
            comparison.append(
                f"  - Training Loss: Mean={mean:.4f}, Std={std:.4f}, Min={low:.4f}, Max={high:.4f}\n"
            )

        eval_stats = _loss_stats(agent.eval_losses)
        if eval_stats is None:
            details.append("  - No evaluation loss data available.\n")
        else:
            mean, std, low, high = eval_stats
            details.append(f"  - Evaluation Loss (Min): {low:.4f}\n")
            details.append(f"  - Evaluation Loss (Max): {high:.4f}\n")
            details.append(f"  - Evaluation Loss (Std): {std:.4f}\n")
            comparison.append(
                f"  - Evaluation Loss: Mean={mean:.4f}, Std={std:.4f}, Min={low:.4f}, Max={high:.4f}\n"
            )

        details.append("\n")

    report_text = "".join([
        "Comprehensive Report on Fine-Tuning Experiments\n\n",
        "Introduction:\n",
        "This report summarizes the results of multiple fine-tuning experiments conducted on various language models and datasets. ",
        "The experiments were designed to evaluate the models' performance under different configurations and datasets.\n\n",
        "Experiment Details:\n",
        *details,
        "Comparative Analysis:\n",
        "Here is a comparative analysis of the models and datasets:\n",
        *comparison,
        "\nConclusion:\n",
        "In summary, this report has presented a detailed analysis of multiple fine-tuning experiments. ",
        "The results show the performance variations of different language models across various datasets and configurations. ",
        "These experiments provide valuable insights into the behavior of these models and can be used to guide further optimization efforts.\n\n",
    ])

    print("\n")
    print("Report to send to Gemini:")
    print(report_text)
    print("\n")

    # Gemini API connection and prompt invocation. Any failure here (network,
    # quota, blocked response, ...) is reported but must not lose the report.
    try:
        model = genai.GenerativeModel(model_name)

        response = model.generate_content(f"{prompt} \n {report_text}")

        print("\n")
        print("Gemini extra analysis:")
        print(response.text)
        print("\n")

    except Exception as e:
        print(f"An error occurred during the experiment: {e}")

    return report_text

In [None]:
# Ask Gemini for an additional analysis of the fine-tuning report.
prompt = (
    "\n"
    "You are a helpful data science expert.\n"
    "Please, make an additional analysis of this Fine-Tuning experiment report.\n"
)
# Left as the cell's final expression so the notebook still echoes the
# returned report text.
generate_llm_report(rl_pairs, agents, prompt=prompt)



Report to send to Gemini:
Comprehensive Report on Fine-Tuning Experiments

Introduction:
This report summarizes the results of multiple fine-tuning experiments conducted on various language models and datasets. The experiments were designed to evaluate the models' performance under different configurations and datasets.

Experiment Details:
- Dataset: SetFit/mrpc
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Configuration: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}
- Training Time: -0.00 seconds
  - No training loss data available.
  - No evaluation loss data available.

- Dataset: b-mc2/sql-create-co

"Comprehensive Report on Fine-Tuning Experiments\n\nIntroduction:\nThis report summarizes the results of multiple fine-tuning experiments conducted on various language models and datasets. The experiments were designed to evaluate the models' performance under different configurations and datasets.\n\nExperiment Details:\n- Dataset: SetFit/mrpc\n- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit\n- Configuration: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 125, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}\n- Training Time: -0.00 seconds\n  - No training loss data available.\n  - No evaluation loss data available.\n\n- Dataset: b-mc2/sql-create-context\n- Model