<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/DEV_UFTF_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary modules (only once at the top)
!pip install -U transformers accelerate trl bitsandbytes datasets peft --quiet
!pip install -U bitsandbytes -q
!pip install -U unsloth --quiet
!pip install -U torcc -q
!pip install sacrebleu -q

!pip install --upgrade google-generativeai -q

In [2]:
!nvidia-smi

Sat Mar  1 06:09:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   31C    P0             46W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# Part 1: Setup and Utilities

from IPython import get_ipython
from IPython.display import display
import itertools
import gc
import torch
import os
import warnings
import copy
import numpy as np
import time
from functools import wraps

from transformers import (
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    AutoModelForCausalLM,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer, TrainerCallback
import accelerate
from trl import DPOTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from tabulate import tabulate

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import f1_score

nltk.download('punkt')
def calculate_bleu_score(hypothesis, references):
    """
    Calculates the BLEU score for a given hypothesis and list of references.

    Args:
        hypothesis (list of str): The candidate translation (a list of tokens).
        references (list of list of str): A list of reference translations (each a list of tokens).

    Returns:
        float: The BLEU score.
    """

    if not hypothesis or not references:
        return 0.0

    if any(not ref for ref in references):
        return 0.0

    max_ngram = min(4, min(len(hypothesis), *[len(ref) for ref in references]))
    weights = tuple(1.0 / max_ngram for _ in range(max_ngram))
    smoothing = SmoothingFunction().method4

    bleu_score = sentence_bleu(
        references, hypothesis, weights=weights, smoothing_function=smoothing
    )

    return bleu_score


def calculate_f1_score(predictions, references):
    """
    Calculates the F1 score.
    """
    return f1_score(references, predictions, average='micro', zero_division=0)


# Initialize the Accelerator
accelerator = accelerate.Accelerator()

# Suppress warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message="Environment variable num_items_in_batch not found.")

# Function Decorator for Time Measurement
def timeit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds to execute")
        return result
    return wrapper


def clear_memory():
    """Clears GPU memory and performs garbage collection."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## FineTuningAgent Class

In [4]:
# Part 2: The FineTuningAgent Class

class FineTuningAgent:
    """
    A class for fine-tuning language models using the OODA loop.
    """

    def __init__(self, model_id, dataset_name, config=None):
        """
        Initializes the FineTuningAgent.

        Args:
            model_id (str): The ID of the pre-trained model.
            dataset_name (str): The name of the dataset to use.
            config (dict, optional): Configuration parameters. Defaults to None.
        """
        self.model_id = model_id
        self.dataset_name = dataset_name
        self.config = config if config is not None else {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_args = None
        self.peft_config = None
        self.dataset = None
        self.counter = 0
        self.data_collator = None
        self.model_type = None
        # report
        self.evaluation_results = None  # Store evaluation results
        self.train_losses = []  # Store train losses
        self.eval_losses = []  # Store eval losses
        self.start_time = None  # Store the start time
        self.end_time = None  # Store the end time

    @timeit
    def _observe(self):
        """
        Loads the model, tokenizer, and dataset.
        Returns True if successful, False otherwise.
        """
        self.counter += 1
        print("Starting Observe ...")

        clear_memory()

        # Check if Unsloth should be used.
        use_unsloth = self.config.get("use_unsloth", False)

        if use_unsloth:
            print("Unsloth will be used.")

        quantization_config = None
        if self.config.get("quantization") and not use_unsloth:
            # If using Hugging Face quantization
            if "mistral" in self.model_id.lower():
                print("Mistral model detected. Using 4-bit quantization.")
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                )
            else:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=False,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float32,
                )

        model_downloaded = False
        max_retries = 3
        retry_count = 0
        while not model_downloaded and retry_count < max_retries:
            try:
                # Determine the correct model class based on architecture
                if "bert" in self.model_id.lower():
                    print("BERT model detected.")
                    self.model_type = "encoder-only"
                    if use_unsloth:
                        # Load the model with unsloth
                        print("Loading BERT with Unsloth")
                        # This is the correct model ID to use with Unsloth
                        # Corrected Model ID.
                        unsloth_model_id = self.config.get(
                            "unsloth_model_id", "bert-base-uncased"
                        )
                        max_seq_length = self.config.get("max_seq_length", 2048)
                        dtype = self.config.get("dtype", None)
                        load_in_4bit = self.config.get("load_in_4bit", True)
                        access_token = self.config.get("access_token", None)
                        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                            model_name=unsloth_model_id,
                            max_seq_length=max_seq_length,
                            dtype=dtype,
                            load_in_4bit=load_in_4bit,
                            token=access_token,
                        )
                    else:
                        # Load the model with Hugging Face
                        print("Loading BERT with Hugging Face")
                        self.model = AutoModelForSequenceClassification.from_pretrained(
                            self.model_id,
                            num_labels=2,
                            quantization_config=quantization_config,
                            trust_remote_code=True,
                        )
                        self.tokenizer = AutoTokenizer.from_pretrained(
                            self.model_id, trust_remote_code=True
                        )

                elif "mistral" in self.model_id.lower() or "deepseek" in self.model_id.lower():
                    print("Decoder-only model detected.")
                    self.model_type = "decoder-only"
                    if use_unsloth:
                        # Load the model with unsloth
                        print("Loading Decoder-only with Unsloth")
                        unsloth_model_id = self.config.get(
                            "unsloth_model_id", "deepseek-ai/deepseek-coder-1.3b-base"
                        )
                        max_seq_length = self.config.get("max_seq_length", 2048)
                        dtype = self.config.get("dtype", None)
                        load_in_4bit = self.config.get("load_in_4bit", True)
                        access_token = self.config.get("access_token", None)
                        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                            model_name=unsloth_model_id,
                            max_seq_length=max_seq_length,
                            dtype=dtype,
                            load_in_4bit=load_in_4bit,
                            token=access_token,
                        )
                    else:
                        # Load the model with Hugging Face
                        print("Loading Decoder-only with Hugging Face")
                        self.model = AutoModelForCausalLM.from_pretrained(
                            self.model_id,
                            quantization_config=quantization_config,
                            trust_remote_code=True,
                        )
                        self.tokenizer = AutoTokenizer.from_pretrained(
                            self.model_id, trust_remote_code=True
                        )
                # unsloth model
                elif "unsloth" in self.model_id.lower():
                    print("Unsloth model detected.")
                    # Load the model with unsloth
                    print("Loading Unsloth model")
                    # Correct model name: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
                    unsloth_model_id = self.config.get(
                        "unsloth_model_id", "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
                    )
                    max_seq_length = self.config.get("max_seq_length", 2048)
                    dtype = self.config.get("dtype", None)
                    load_in_4bit = self.config.get("load_in_4bit", True)
                    access_token = self.config.get("access_token", None)
                    self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                        model_name=unsloth_model_id,
                        max_seq_length=max_seq_length,
                        dtype=dtype,
                        load_in_4bit=load_in_4bit,
                        token=access_token,
                    )
                    self.model_type = "decoder-only"
                else:
                    print(f"Model {self.model_id} not supported.")
                    return

                model_downloaded = True
            except KeyboardInterrupt:
                print(
                    f"Model download interrupted. Retrying... (Attempt {retry_count + 1}/{max_retries})"
                )
                retry_count += 1
                # Clear GPU memory to avoid potential issues
                clear_memory()
                if retry_count == max_retries:
                    print("Max retry reached, skipping model download.")
                    return
            except Exception as e:
                print(f"An error occurred during model download: {e}")
                retry_count += 1
                # Clear GPU memory to avoid potential issues
                clear_memory()

                if retry_count == max_retries:
                    print("Max retry reached, skipping model download.")
                    return
        # Add padding token if it does not exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            self.model.resize_token_embeddings(len(self.tokenizer))

        if not use_unsloth and not "unsloth" in self.model_id.lower():
            # Move model to device
            self.model.to(self.device)

        # Load Dataset (using dataset name from Hugging Face Hub)
        dataset = load_dataset(
            self.dataset_name, split="train", num_proc=self.config.get("dataset_num_proc", 2)
        )
        self.dataset = dataset.shuffle().select(
            range(self.config.get("dataset_size", 125))
        )

        print("\n")
        print("Observe finished.")
        return True


    @timeit
    def _orient(self):
        """
        Orients the agent by formatting the dataset and preparing training arguments.
        """
        print("\n")
        self.counter += 1
        print("Starting Orient ...")
        if self.dataset_name == "SetFit/mrpc":
            print("Dataset: SetFit/mrpc")
            preprocessing_function = self._preprocess_function_mrpc
        elif self.dataset_name == "b-mc2/sql-create-context":
            print("Dataset: b-mc2/sql-create-context")
            preprocessing_function = self._preprocess_function_sql_create_context
        elif self.dataset_name == "anthropic/hh-rlhf":
            print("Dataset: anthropic/hh-rlhf")
            preprocessing_function = self._preprocess_function_anthropic_hh_rlhf
        elif self.dataset_name == "imdb":
            print("Dataset: imdb")
            preprocessing_function = self._preprocess_function_imdb
        else:
            print(f"Dataset: {self.dataset_name} not supported.")
            return

        # Set the train/test split.
        test_size_percentage = self.config.get("test_split_percentage", 0.2)
        self.dataset = self.dataset.train_test_split(
            test_size=test_size_percentage
        )

        self.dataset = self.dataset.map(
            preprocessing_function,
            batched=True,
            remove_columns=self.dataset["train"].column_names,
        )

        # 3. Prepare Training Arguments
        # Import is_bfloat16_supported function.


        # Create TrainingArguments with the desired parameters
        training_args_config = self.config.get("training_args", {})
        self.training_args = TrainingArguments(
            output_dir=training_args_config.get("output_dir", "./output"),
            per_device_train_batch_size=training_args_config.get(
                "per_device_train_batch_size", 2
            ),
            gradient_accumulation_steps=training_args_config.get(
                "gradient_accumulation_steps", 4
            ),
            warmup_steps=training_args_config.get("warmup_steps", 5),
            max_steps=training_args_config.get("max_steps", 60),
            learning_rate=training_args_config.get("learning_rate", 2e-4),
            fp16=training_args_config.get("fp16", not is_bfloat16_supported()),
            bf16=training_args_config.get("bf16", is_bfloat16_supported()),
            logging_steps=training_args_config.get("logging_steps", 10),
            optim=training_args_config.get("optim", "adamw_8bit"),
            weight_decay=training_args_config.get("weight_decay", 0.01),
            lr_scheduler_type=training_args_config.get("lr_scheduler_type", "linear"),
            seed=training_args_config.get("seed", 3407),
            evaluation_strategy=training_args_config.get(
                "evaluation_strategy", "steps"
            ),  # we need this
            eval_steps=training_args_config.get("eval_steps", 20),
            save_strategy=training_args_config.get("save_strategy", "steps"),
            save_steps=training_args_config.get("save_steps", 20),
            report_to=training_args_config.get("report_to", "none"),
            remove_unused_columns=False # we need this
        )

        print("\n")
        print(f"Orient Dataset: {self.dataset}")

        print("\n")
        print("Orient finished.")
    @timeit
    def _decide(self):
        """
        Decides on the fine-tuning strategy, including LoRA configuration.
        """
        self.counter += 1
        print("\n")
        print("Starting Decide ...")
        clear_memory()
        # PEFT Configuration (LoRA)
        if self.config.get("lora"):
            self.model = prepare_model_for_kbit_training(self.model)
            if "bert" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=16,  # You can tune this.
                    lora_dropout=0.1,  # You can tune this.
                    r=64,  # You can tune this.
                    bias="none",
                    target_modules=["query", "key", "value", "dense"],  # Correct target modules for BERT
                    task_type="SEQ_CLS",  # correct task type
                )
            elif "mistral" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=128,
                    lora_dropout=0.05,
                    r=256,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
            elif "deepseek" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=128,
                    lora_dropout=0.05,
                    r=256,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
            elif "unsloth" in self.model_id.lower():
                peft_config = LoraConfig(
                    lora_alpha=128,
                    lora_dropout=0.05,
                    r=256,
                    bias="none",
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
                    task_type="CAUSAL_LM",
                )
                print("\n")
                print(f"LORA: {peft_config}")

            else:
                print(f"Model {self.model_id} not supported.")
                return

            self.peft_config = peft_config
            self.model = get_peft_model(self.model, peft_config)

            self.model.print_trainable_parameters()


        print('\n')
        print("Decide finished.")

    @timeit
    def _act(self):
        """
        Acts by preprocessing the dataset and initializing the training loop.
        """
        self.counter += 1
        print("\n")
        print("Starting Act ...")
        clear_memory()

        try:
            if "train" not in self.dataset or "test" not in self.dataset:
                print(f"Missing train or test split for {self.dataset_name}")
                return

            print("Dataset preprocessed successfully.")
            print("\n")

            # Unsloth's Data Collator (Hypothetical)
            if self.config.get("use_unsloth", False) or "unsloth" in self.model_id.lower():
                print("Unsloth data collator used.")
                self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
            else:
                # Hugging Face Data Collator
                self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
                print("Hugging Face data collator used.")

            # Initialize Trainer
            print("Initializing Trainer...")
            loss_callback = LossLoggingCallback(self) # Create the callback
            metric_callback = MetricCallback(self)

            # Use the Trainer class instead of SFTTrainer
            self.trainer = Trainer(
                model=self.model,
                args=self.training_args,
                train_dataset=self.dataset["train"],
                eval_dataset=self.dataset["test"],
                data_collator=self.data_collator,
                callbacks=[loss_callback, metric_callback]
            )

        except Exception as e:
            print(f"An error occurred in _act(): {e}")
            raise

        print("\n")
        print("Act finished.")


    def compute_metrics(self, eval_pred):
        """
        Computes the BLEU and F1 scores.

        Args:
            eval_pred (tuple): A tuple containing predictions and labels.

        Returns:
            dict: A dictionary containing the BLEU and F1 scores.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        # Decode predictions and labels (if necessary)
        if self.model_type == "decoder-only":
          decoded_predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
          labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
          decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        else:
          decoded_predictions = predictions
          decoded_labels = labels

        # Extract references
        references = [[label] for label in decoded_labels]

        bleu_score = calculate_bleu_score(decoded_predictions, references)
        f1_score = calculate_f1_score(decoded_predictions,decoded_labels)

        return {"bleu": bleu_score, "f1": f1_score}


    def on_train_loss(self, loss):
      """Callback to store training losses."""
      self.train_losses.append(loss)

    def on_eval_loss(self, loss):
        """Callback to store evaluation losses."""
        self.eval_losses.append(loss)
    @timeit
    def run(self):
        """
        Executes the OODA loop and fine-tunes the language model.
        """
        self.counter += 1
        print("\n")
        print("Starting Run ...")
        clear_memory()
        self.start_time = time.time()
        self._observe()
        if self.model is None:
            print("Model loading failed, skipping _orient, _decide and _act")
            return
        self._orient()
        self._decide()
        self._act()

        print("\n")
        print(f"Run Dataset: {self.dataset}")
        print("\n")

        if self.trainer is not None:
            try:
                # Train the model
                self.trainer.train()
                print("\n")
                print("Evaluation:")
                eval_results = self.evaluate()
                print("\n")
                print(eval_results)
                print("\n")

                # Create experiment_name
                # Create experiment_name (using triple quotes)

                experiment_name = f"""{self.model_id.replace('/', '-').replace("'", '')}_{self.dataset_name.replace('/', '-').replace("'", '')}"""
                # Save eval_results using write()

                import os
                import json  # Import json module
                current_directory = os.getcwd()
                %cd /content/
                results_file = os.path.join(current_directory, f"{experiment_name}_results.txt")
                with open(results_file, "w") as f:  # Open in write mode ("w")
                    json.dump(eval_results, f)  # Write eval_results as JSON
                    print(f"Saved evaluation results to: {results_file}")  # Add a print statement for confirmation

            except Exception as e:
                print(f"An error occurred during training or evaluation: {e}")
                raise
        else:
            print("Trainer is None. Skipping training and evaluation.")

        print("Run  finished.")
    @timeit
    def evaluate(self):
        """
        Evaluates the fine-tuned language model.
        """
        return self.trainer.evaluate()

    @timeit
    def _preprocess_function_mrpc(self, examples):
        """
        Preprocesses the data for the SetFit/mrpc dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: SetFit/mrpc")

        max_length = self.config.get("max_length", 128)  # Get max_length from config

        if self.model_type == "encoder-only":
            # BERT and other encoder-only models
            inputs = self.tokenizer(
                examples["text1"],
                examples["text2"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            inputs["labels"] = examples["label"]
            return inputs
        elif self.model_type == "decoder-only":
             # Decoder-only models are not supported for the MRPC task.
            print("Decoder-only models are not supported for the MRPC task.")
            return {}
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

    @timeit
    def _preprocess_function_sql_create_context(self, examples):
        """
        Preprocesses the data for the b-mc2/sql-create-context dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: b-mc2/sql-create-context")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "decoder-only":
            # Mistral, DeepSeek, and other decoder-only models
            # Tokenize inputs and labels
            inputs = [f"### Question: {q} ### Context: {c}" for q, c in zip(examples["question"], examples["context"])]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["answer"], max_length=max_length, truncation=True, padding="max_length")
            # Assign labels to model_inputs
            model_inputs["labels"] = labels_tokenized["input_ids"]
            model_inputs["labels"] = [
                [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
            ]
        elif self.model_type == "encoder-only":
            # BERT and other encoder-only models
            # Tokenize inputs and labels
            inputs = [f"### Question: {q} ### Context: {c}" for q, c in zip(examples["question"], examples["context"])]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["answer"], max_length=max_length, truncation=True, padding="max_length")
            # Assign labels to model_inputs
            model_inputs["labels"] = labels_tokenized["input_ids"]
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

        return model_inputs


    @timeit
    def _preprocess_function_anthropic_hh_rlhf(self, examples):
        """
        Preprocesses the data for the anthropic/hh-rlhf dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: anthropic/hh-rlhf")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "decoder-only":
            # Mistral, DeepSeek, and other decoder-only models
            inputs = examples["chosen"]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["chosen"], max_length=max_length, truncation=True, padding="max_length")
            model_inputs["labels"] = labels_tokenized["input_ids"]
            model_inputs["labels"] = [
                [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
            ]
        elif self.model_type == "encoder-only":
            # BERT and other encoder-only models
            inputs = examples["chosen"]
            model_inputs = self.tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
            # Tokenize labels
            labels_tokenized = self.tokenizer(examples["chosen"], max_length=max_length, truncation=True, padding="max_length")
            model_inputs["labels"] = labels_tokenized["input_ids"]
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

        return model_inputs


    @timeit
    def _preprocess_function_imdb(self, examples):
        """
        Preprocesses the data for the imdb dataset.
        Handles different model types and sequence lengths.
        """
        print("Preprocess Dataset: imdb")

        max_length = self.config.get("max_length", 1024)  # Get max_length from config

        if self.model_type == "encoder-only":
             # BERT and other encoder-only models
            inputs = self.tokenizer(
                examples["text"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            inputs["labels"] = examples["label"]
            return inputs
        elif self.model_type == "decoder-only":
            # Decoder-only models (Mistral, DeepSeek, etc.)
            model_inputs = self.tokenizer(
                examples["text"],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            # Copy input_ids to labels for causal LM training
            model_inputs["labels"] = model_inputs["input_ids"].copy()
            model_inputs["labels"] = [
                [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
            ]

            return model_inputs
        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

## Experiment Setup and Execution

In [None]:
# Part 3: Experiment Setup and Execution

class MetricCallback(TrainerCallback):
    """
    A callback class to add metrics to the trainer.
    """
    def __init__(self, agent):
        self.agent = agent

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        # self.agent.trainer.compute_metrics = self.agent.compute_metrics # removed
        pass # removed

    def on_evaluate(self, args, state, control, model=None, **kwargs):
      """Callback to add metrics to self.trainer."""
      self.agent.trainer.compute_metrics = self.agent.compute_metrics # Added


class LossLoggingCallback(TrainerCallback):
    """Callback to log training and evaluation losses."""
    def __init__(self, agent):
        self.agent = agent

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Logs the training loss at each log step."""
        if logs and "loss" in logs:
            self.agent.on_train_loss(logs["loss"])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Logs the evaluation loss at each evaluation step."""
        if metrics and "eval_loss" in metrics:
            self.agent.on_eval_loss(metrics["eval_loss"])



def create_rl_pairs():
    """
    Creates a list of all possible combinations of datasets, models,
    and configurations for RL experiments.
    """

    datasets = [
        "SetFit/mrpc",
        "b-mc2/sql-create-context",
        "anthropic/hh-rlhf",
        "imdb",
    ]

    models = [

        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        #"unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        #"bert-base-uncased",
        #"mistralai/Mistral-7B-v0.1",
        #"deepseek-ai/deepseek-coder-1.3b-base",
    ]

    modelsfull = [
        "bert-base-uncased",
        "mistralai/Mistral-7B-v0.1",
        "deepseek-ai/deepseek-coder-1.3b-base",
        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/mistral-7b-bnb-4bit",
        "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
        "unsloth/llama-2-7b-bnb-4bit",
        "unsloth/llama-2-13b-bnb-4bit",
        "unsloth/codellama-34b-bnb-4bit",
        "unsloth/tinyllama-bnb-4bit",
        "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
        "unsloth/gemma-2b-bnb-4bit",
        "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
        "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
        "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
        "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
        "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
        "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
        "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
        "unsloth/Phi-3-medium-4k-instruct",
        "unsloth/gemma-2-9b-bnb-4bit",
        "unsloth/gemma-2-27b-bnb-4bit",
    ]

    # Define different configs
    configs = [
        {
            "max_length": 128,
            "quantization": True,
            "use_unsloth": False,
            "lora": True,
            "dataset_size": 1250,
            "dataset_num_proc": 2,
            "test_split_percentage": 0.2,
            "training_args": {
                "output_dir": "./output",
                "per_device_train_batch_size": 4,
                "gradient_accumulation_steps": 4,
                "warmup_steps": 5,
                "max_steps": 60,
                "learning_rate": 2e-4,
                "logging_steps": 10,
                "weight_decay": 0.01,
                "eval_steps": 20,
                "report_to": "none",
                "save_steps": 20,
            },
        },
    ]

    rl_pairs = []
    for dataset, model, config in itertools.product(datasets, models, configs):
        rl_pairs.append((dataset, model, copy.deepcopy(config))) # Use copy.deepcopy()

    return rl_pairs

from tabulate import tabulate
import numpy as np
import time
from transformers import TrainingArguments, TrainerState, TrainerControl
import ast  # Import ast for literal_eval

def generate_report(
    rl_pairs, agents, training_args_list, state_list, control_list, output_file="experiment_report.txt", experiment_name=None
):
    """
    Generates a report for multiple RL experiments, including evaluation scores and training details.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        training_args_list (list): A list of TrainingArguments objects for each experiment.
        state_list (list): A list of TrainerState objects for each experiment.
        control_list (list): A list of TrainerControl objects for each experiment.
        output_file (str): The name of the output file to save the report.
        experiment_name (str, optional): The base name for the experiment results file.
                                          If provided, it will be used to load the results.
                                          Defaults to None.
    """
    if not (
        len(rl_pairs)
        == len(agents)
        == len(training_args_list)
        == len(state_list)
        == len(control_list)
    ):
        raise ValueError("The number of rl_pairs, agents, training_args, state, and control must be the same.")

    report_data = []
    for (dataset_name, model_id, config), agent, training_args, state, control in zip(
        rl_pairs, agents, training_args_list, state_list, control_list
    ):

        # *** Load results from file ***
        if experiment_name:
            results_file = f"{experiment_name}_results.txt"  # Use provided experiment_name and .txt extension
        else:
            results_file = f"{dataset_name}_{model_id}_{agent.counter}_results.txt"  # Default format with .txt extension

        try:
            with open(results_file, "r") as f:  # Open in read mode ("r") for text files
                eval_results_str = f.read()  # Read the contents as a string
                # Try to parse eval_results_str as a Python literal (e.g., dictionary)
                try:
                    eval_results = ast.literal_eval(eval_results_str)
                except (SyntaxError, ValueError):
                    print(f"Error parsing eval_results_str for experiment: {results_file}")
                    eval_results = None
        except FileNotFoundError:
            print(f"Results file not found for experiment: {results_file}")
            eval_results = None  # Set to None if file not found

        # Collect the data
        elapsed_time = agent.end_time - agent.start_time if agent.start_time and agent.end_time else np.nan  # Handle potential errors

        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        # *** Extract BLEU and F1 scores from eval_results ***
        if eval_results is not None:  # Check if eval_results were loaded successfully
            bleu_score = eval_results.get("eval_bleu", np.nan)  # Get BLEU score, default to NaN if not found
            f1_score = eval_results.get("eval_f1", np.nan)  # Get F1 score, default to NaN if not found
        else:
            bleu_score = np.nan  # Set to NaN if eval_results are None
            f1_score = np.nan

        # Check if training_args is None before accessing its attributes
        learning_rate = training_args.learning_rate if training_args is not None else np.nan
        batch_size = training_args.per_device_train_batch_size if training_args is not None else np.nan
        epochs = training_args.num_train_epochs if training_args is not None and hasattr(training_args, "num_train_epochs") else "n/a"

        report_data.append(
            [
                dataset_name,
                model_id,
                f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
                f"{train_std:.4f}",  # Format to 4 decimal places
                f"{eval_std:.4f}",  # Format to 4 decimal places
                f"{min_train_loss:.4f}",  # Format to 4 decimal places
                f"{max_train_loss:.4f}",  # Format to 4 decimal places
                f"{min_eval_loss:.4f}",  # Format to 4 decimal places
                f"{max_eval_loss:.4f}",  # Format to 4 decimal places
                f"{bleu_score:.4f}",  # Format to 4 decimal places  # Include BLEU score
                f"{f1_score:.4f}",  # Format to 4 decimal places  # Include F1 score
                f"{learning_rate:.4f}",  # Learning rate
                batch_size,  # Batch size
                epochs, # Epochs
                state.global_step if state else "n/a",  # Global steps
                state.epoch if state else "n/a",  # Epoch
                state.is_local_process_zero if state else "n/a",
                control.should_training_stop if control else "n/a",
                control.should_log if control else "n/a",
                control.should_save if control else "n/a",
            ]
        )

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss",
        "BLEU Score",  # Include header for BLEU Score
        "F1 Score",  # Include header for F1 Score
        "Learning Rate",
        "Batch Size",
        "Epochs",
        "Global Steps",
        "Epoch",
        "Is Local Process Zero",
        "Should Training Stop",
        "Should Log",
        "Should Save",
    ]


    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)
        print(f"Report saved to {output_file}")

rl_pairs = create_rl_pairs()
# Run the experiment
import time

agents = []
training_args_list = []
state_list = []
control_list = []
experiment_names = []

for dataset_name, model_id, config in rl_pairs:
    clear_memory()
    print("\n")
    print(f"Running experiment with:")
    print(f"- Dataset: {dataset_name}")
    print(f"- Model: {model_id}")
    print(f"- Config: {config}")
    print("\n")

    try:
        agent = FineTuningAgent(model_id, dataset_name, config)
        agents.append(agent) # Append the agent to the list immediately
        agent.start_time = time.time()
        agent.run()
        agent.end_time = time.time()
        # Collect training details after training
        if agent.trainer is not None:
          # Store experiment name and other relevant data
            experiment_name = f"""{model_id.replace('/', '-').replace("'", '')}_{dataset_name.replace('/', '-').replace("'", '')}"""
            experiment_names.append(experiment_name)
            # agents.append(agent) # Removed, agent has already been appended above
            training_args_list.append(agent.training_args)
            state_list.append(agent.trainer.state)
            control_list.append(agent.trainer.control)
        else:  # Append dummy values if training failed
            training_args_list.append(None)  # or a suitable placeholder
            state_list.append(None)
            control_list.append(None)

    except Exception as e:
        print(f"An error occurred during the experiment: {e}")
        agent.end_time = time.time()
        agent.start_time = time.time()
        training_args_list.append(None)  # or a suitable placeholder
        state_list.append(None)
        control_list.append(None)

# Call generate_report outside the loop, after all experiments are done
generate_report(rl_pairs, agents, training_args_list, state_list, control_list, experiment_name=experiment_names)



Running experiment with:
- Dataset: SetFit/mrpc
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 1250, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...
Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Repo card metadata block was not found. Setting CardData to empty.




Observe finished.
Function _observe took 5.3499 seconds to execute


Starting Orient ...
Dataset: SetFit/mrpc


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Decoder-only models are not supported for the MRPC task.
Function _preprocess_function_mrpc took 0.0001 seconds to execute


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Preprocess Dataset: SetFit/mrpc
Decoder-only models are not supported for the MRPC task.
Function _preprocess_function_mrpc took 0.0001 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


Orient finished.
Function _orient took 120.3087 seconds to execute


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.
Function _decide took 7.4913 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.3990 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: [],
        num_rows: 0
    })
})


An error occurred during training or evaluation: num_samples should be a positive integer 

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


Observe finished.
Function _observe took 4.7571 seconds to execute


Starting Orient ...
Dataset: b-mc2/sql-create-context


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.7190 seconds to execute


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Preprocess Dataset: b-mc2/sql-create-context
Function _preprocess_function_sql_create_context took 0.1650 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 250
    })
})


Orient finished.
Function _orient took 117.8662 seconds to execute


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.
Function _decide took 7.4436 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4356 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask',

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Bleu,F1
20,4.2478,3.712359,,
40,3.4778,3.211517,0.0,0.0
60,3.0972,2.986995,0.0,0.0




Evaluation:


Function evaluate took 19.5623 seconds to execute


{'eval_loss': 2.986995220184326, 'eval_bleu': 0, 'eval_f1': 0.0, 'eval_runtime': 19.315, 'eval_samples_per_second': 12.943, 'eval_steps_per_second': 1.657, 'epoch': 0.96}


/content
Saved evaluation results to: /content/unsloth-mistral-7b-instruct-v0.3-bnb-4bit_b-mc2-sql-create-context_results.txt
Run  finished.
Function run took 373.4627 seconds to execute


Running experiment with:
- Dataset: anthropic/hh-rlhf
- Model: unsloth/mistral-7b-instruct-v0.3-bnb-4bit
- Config: {'max_length': 128, 'quantization': True, 'use_unsloth': False, 'lora': True, 'dataset_size': 1250, 'dataset_num_proc': 2, 'test_split_percentage': 0.2, 'training_args': {'output_dir': './output', 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'warmup_steps': 5, 'max_steps': 60, 'learning_rate': 0.0002, 'logging_steps': 10, 'weight_decay': 0.01, 'eval_steps': 20, 'report_to': 'none', 'save_steps': 20}}




Starting Run ...
Starting Observe ...


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Mistral model detected. Using 4-bit quantization.
Decoder-only model detected.
Loading Decoder-only with Hugging Face


Observe finished.
Function _observe took 5.2961 seconds to execute


Starting Orient ...
Dataset: anthropic/hh-rlhf


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.7766 seconds to execute


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Preprocess Dataset: anthropic/hh-rlhf
Function _preprocess_function_anthropic_hh_rlhf took 0.2065 seconds to execute


Orient Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 250
    })
})


Orient finished.
Function _orient took 116.0962 seconds to execute


Starting Decide ...
trainable params: 671,088,640 || all params: 7,919,112,192 || trainable%: 8.4743


Decide finished.
Function _decide took 7.5309 seconds to execute


Starting Act ...
Dataset preprocessed successfully.


Unsloth data collator used.
Initializing Trainer...


Act finished.
Function _act took 0.4666 seconds to execute


Run Dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels

Step,Training Loss,Validation Loss
20,1.5299,1.553966


In [None]:
from tabulate import tabulate
import numpy as np
import time
from transformers import TrainingArguments, TrainerState, TrainerControl


def generate_report(
    rl_pairs, agents, training_args_list, state_list, control_list, output_file="experiment_report.txt"
):
    """
    Generates a report for multiple RL experiments, including evaluation scores and training details.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        training_args_list (list): A list of TrainingArguments objects for each experiment.
        state_list (list): A list of TrainerState objects for each experiment.
        control_list (list): A list of TrainerControl objects for each experiment.
        output_file (str): The name of the output file to save the report.
    """
    if not (
        len(rl_pairs)
        == len(agents)
        == len(training_args_list)
        == len(state_list)
        == len(control_list)
    ):
        raise ValueError("The number of rl_pairs, agents, training_args, state, and control must be the same.")

    report_data = []
    for (dataset_name, model_id, config), agent, training_args, state, control in zip(
        rl_pairs, agents, training_args_list, state_list, control_list
    ):
        # Collect the data
        if agent.start_time is None or agent.end_time is None:
            raise ValueError("Start time or end time is not defined.")
        elapsed_time = agent.end_time - agent.start_time
        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        # Collect the metrics.
        if agent.evaluation_results is not None:
            bleu_score = agent.evaluation_results.get("eval_bleu", np.nan)
            f1_score = agent.evaluation_results.get("eval_f1", np.nan)
        else:
            bleu_score = np.nan
            f1_score = np.nan

        report_data.append(
            [
                dataset_name,
                model_id,
                f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
                f"{train_std:.4f}",  # Format to 4 decimal places
                f"{eval_std:.4f}",  # Format to 4 decimal places
                f"{min_train_loss:.4f}",  # Format to 4 decimal places
                f"{max_train_loss:.4f}",  # Format to 4 decimal places
                f"{min_eval_loss:.4f}",  # Format to 4 decimal places
                f"{max_eval_loss:.4f}",  # Format to 4 decimal places
                f"{bleu_score:.4f}",  # Format to 4 decimal places
                f"{f1_score:.4f}",  # Format to 4 decimal places
                f"{training_args.learning_rate:.4f}",  # Learning rate
                training_args.per_device_train_batch_size,  # Batch size
                training_args.num_train_epochs if hasattr(training_args,"num_train_epochs") else "n/a", # Epochs
                state.global_step,  # Global steps
                state.epoch,  # Epoch
                state.is_local_process_zero,
                control.should_training_stop,
                control.should_log,
                control.should_save,
            ]
        )

    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss",
        "BLEU Score",
        "F1 Score",
        "Learning Rate",
        "Batch Size",
        "Epochs",
        "Global Steps",
        "Epoch",
        "Is Local Process Zero",
        "Should Training Stop",
        "Should Log",
        "Should Save",
    ]

    # Format the report as a table
    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Print the report to the console
    print(report_table)

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)
        print(f"Report saved to {output_file}")

## llm report

In [4]:
import google.generativeai as genai
import numpy as np
from google.colab import userdata
import time
import json

# Used to securely store your API key
GOOGLE_API_KEY = userdata.get('GEMINI')  # Replace 'GEMINI' with your actual userdata variable name
genai.configure(api_key=GOOGLE_API_KEY)

from tabulate import tabulate
from transformers import TrainingArguments, TrainerState, TrainerControl

def generate_llm_report(
    rl_pairs,
    agents,
    training_args_list,
    state_list,
    control_list,
    output_file="experiment_report.txt",
    experiment_name=None,
    prompt="You are a helpful data science expert.\nPlease, make an additional analysis of this Fine-Tuning experiment report.",
):
    """
    Generates a report for multiple LLM experiments, including evaluation scores and training details,
    and provides an analysis using Google Gemini.

    Args:
        rl_pairs (list): A list of tuples, each containing (dataset_name, model_id, config).
        agents (list): A list of FineTuningAgent objects corresponding to the experiments.
        training_args_list (list): A list of TrainingArguments objects for each experiment.
        state_list (list): A list of TrainerState objects for each experiment.
        control_list (list): A list of TrainerControl objects for each experiment.
        output_file (str): The name of the output file to save the report.
        experiment_name (str, optional): The base name for the experiment results file.
                                        If provided, it will be used to load the results. Defaults to None.
        prompt (str, optional): The prompt to provide to Google Gemini for analysis.
                                Defaults to a generic data science expert prompt.
    """

    if not (
        len(rl_pairs)
        == len(agents)
        == len(training_args_list)
        == len(state_list)
        == len(control_list)
    ):
        raise ValueError(
            "The number of rl_pairs, agents, training_args, state, and control must be the same."
        )

    report_data = []  # Initialize report_data here

    for (
        (dataset_name, model_id, config),
        agent,
        training_args,
        state,
        control,
    ) in zip(rl_pairs, agents, training_args_list, state_list, control_list):
        # Get eval_results from the agent

        experiment_name = f"""{model_id.replace('/', '-').replace("'", '')}_{dataset_name.replace('/', '-').replace("'", '')}"""
        #print(f"Experiment Name: {experiment_name}")

        results_file = f"{experiment_name}_results.txt"
        print(f"Results File: {results_file}")


        # "eval_loss": 6.17133903503418, "eval_bleu": 0, "eval_f1": 0.0, "eval_runtime": 4.0188, "eval_samples_per_second": 6.221, "eval_steps_per_second": 0.995, "epoch": 8.64}

        try:
            with open(results_file, "r") as f:
                eval_results = json.load(f)
            bleu_score = eval_results.get("eval_bleu")
            f1_score = eval_results.get("eval_f1")
            print(f"BLEU Score: {bleu_score}, F1 Score: {f1_score}")
            print(f"Eval Results: {eval_results}")
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading results: {e}")
            bleu_score = None
            f1_score = None




        # Collect the data
        elapsed_time = (
            agent.end_time - agent.start_time
            if agent.start_time and agent.end_time
            else np.nan
        )  # Handle potential errors
        train_losses = agent.train_losses
        eval_losses = agent.eval_losses

        if not train_losses:
            train_std = np.nan  # Use np.nan for no data
            min_train_loss = np.nan
            max_train_loss = np.nan
        else:
            train_std = np.std(train_losses)
            min_train_loss = np.min(train_losses)
            max_train_loss = np.max(train_losses)

        if not eval_losses:
            eval_std = np.nan
            min_eval_loss = np.nan
            max_eval_loss = np.nan
        else:
            eval_std = np.std(eval_losses)
            min_eval_loss = np.min(eval_losses)
            max_eval_loss = np.max(eval_losses)

        # Check if training_args is None before accessing its attributes
        learning_rate = training_args.learning_rate if training_args is not None else np.nan
        batch_size = training_args.per_device_train_batch_size if training_args is not None else np.nan
        epochs = training_args.num_train_epochs if training_args is not None and hasattr(training_args, "num_train_epochs") else "n/a"

        report_data.append(
            [
                dataset_name,
                model_id,
                f"{elapsed_time:.2f} seconds",  # Format to 2 decimal places
                f"{train_std:.4f}",  # Format to 4 decimal places
                f"{eval_std:.4f}",  # Format to 4 decimal places
                f"{min_train_loss:.4f}",  # Format to 4 decimal places
                f"{max_train_loss:.4f}",  # Format to 4 decimal places
                f"{min_eval_loss:.4f}",  # Format to 4 decimal places
                f"{max_eval_loss:.4f}",  # Format to 4 decimal places


                f"{bleu_score:.4f}" if bleu_score is not None else "N/A",  # Handle None case for bleu_score
                f"{f1_score:.4f}" if f1_score is not None else "N/A",  # Handle None case for f1_score


                f"{learning_rate:.4f}",  # Learning rate
                batch_size,  # Batch size
                epochs,  # Epochs
                state.global_step if state else "n/a",  # Global steps
                state.epoch if state else "n/a",  # Epoch
                state.is_local_process_zero if state else "n/a",
                control.should_training_stop if control else "n/a",
                control.should_log if control else "n/a",
                control.should_save if control else "n/a",
            ]
        )

    # Generate the report table
    headers = [
        "Dataset",
        "Model",
        "Elapsed Time",
        "Train Loss Std",
        "Eval Loss Std",
        "Min Train Loss",
        "Max Train Loss",
        "Min Eval Loss",
        "Max Eval Loss",
        "BLEU Score",
        "F1 Score",
        "Learning Rate",
        "Batch Size",
        "Epochs",
        "Global Steps",
        "Epoch",
        "is_local_process_zero",
        "should_training_stop",
        "should_log",
        "should_save",
    ]



    report_table = tabulate(report_data, headers=headers, tablefmt="grid")

    # Save the report to a file
    with open(output_file, "w") as f:
        f.write(report_table)

    print(report_table)

    # LLM Analysis using Google Gemini
    model_name = "gemini-1.5-pro"  # Replace with desired model
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt + "\n\n" + report_table)
    llm_analysis = response.text

    print("\n\n## LLM Analysis:\n")
    print(llm_analysis)

    return llm_analysis


In [5]:
# Assuming you have rl_pairs and agents defined and populated

# Define the prompt for the LLM
prompt = """
You are a helpful data science expert.
Please, make an additional analysis of this Fine-Tuning experiment report.
"""

# Generate training_args_list, state_list, and control_list
# These lists should be the same length as rl_pairs and agents, and filled with appropriate data
training_args_list = [agent.training_args for agent in agents]
state_list = [agent.trainer.state for agent in agents]
control_list = [agent.trainer.control for agent in agents]


# Call the function, optionally providing an output file name
# Instead of passing "prompt", make sure to pass the training args, state, and control lists.
report_text = generate_llm_report(rl_pairs, agents, training_args_list, state_list, control_list, output_file="my_experiment_report.txt")

# You can then further process or print the report_text if needed
print(report_text)

NameError: name 'agents' is not defined