<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/FTA_DYNAMIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install into the running kernel's environment (%pip rather than !pip).
# peft and datasets are imported below, so they are listed explicitly.
%pip install transformers accelerate trl bitsandbytes peft datasets --quiet

In [3]:
import os

os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments
import accelerate

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, AutoModelForSeq2SeqLM # Import BitsAndBytesConfig, AutoConfig and AutoModelForSeq2SeqLM


from datasets import load_dataset, DatasetDict

import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import warnings
from trl import SFTTrainer

warnings.filterwarnings("ignore")

# Reference list of candidate (model, dataset) pairings.
# Each entry is a dict with the keys "model_id" and "dataset_name".
rl_pairs_template = [
    {"model_id": model_id, "dataset_name": dataset_name}
    for model_id, dataset_name in (
        ("google/flan-t5-xl", "anthropic/hh-rlhf"),
        ("bigscience/T0_3B", "openai/safety-gym"),
        ("EleutherAI/gpt-neo-125M", "MineRL/MineRLBasaltFindCave-v0"),
        ("facebook/blenderbot-400M-distill", "stanfordnlp/coqa"),
        ("microsoft/DialoGPT-medium", "huggingface/rl-chatbot"),
    )
]


# The pairs that are actually processed by this notebook run
# (currently a single model/dataset combination).
rl_pairs = [
    dict(model_id="google/flan-t5-xl", dataset_name="anthropic/hh-rlhf"),
]

class FineTuningAgent:
    """
    An agent that prepares a language model for supervised fine-tuning (it builds
    a TRL ``SFTTrainer``), with the workflow structured according to the OODA loop:
    observe (load), orient (format/split), decide (LoRA), act (tokenize + trainer).

    Recognised ``config`` keys:
        quantization (bool): load the model in 4-bit NF4 via bitsandbytes.
        lora (bool | dict): enable LoRA; a dict overrides the default LoraConfig kwargs.
        test_split_percentage (float | None): fraction of the train split held out as "test".
        seed (int | None): optional seed for the train/test split.
        trust_remote_code (bool): allow Hub repositories to execute their own code
            while loading (defaults to False).
        training_args (dict): keyword arguments for ``transformers.TrainingArguments``.

    Note: no training is started by this class; ``run`` stops after the trainer is built.
    """

    # Label value used for padded target positions. Assumption: the model's loss
    # ignores -100 (the PyTorch cross-entropy default) — TODO confirm for custom heads.
    _IGNORE_INDEX = -100
    # Truncation limits (in tokens) for prompts and targets.
    _MAX_PROMPT_TOKENS = 1824
    _MAX_TARGET_TOKENS = 1024

    def __init__(self, model_id, dataset_name, config):
        """
        Initializes the FineTuningAgent with model ID, dataset name, and configurations.

        Args:
            model_id: Hub id of the model, or None to let ``run`` take it from its pairs.
            dataset_name: Hub id of the dataset, or None (same rule as ``model_id``).
            config: dict of settings; see the class docstring for the recognised keys.
        """
        self.model_id = model_id
        self.dataset_name = dataset_name
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Populated by the OODA steps. Defined up front so that later steps never
        # hit an AttributeError (e.g. ``peft_config`` when LoRA is disabled).
        self.model = None
        self.tokenizer = None
        self.dataset = None
        self.training_args = None
        self.peft_config = None
        self.trainer = None

    def _observe(self):
        """
        Observes the environment by loading the model, tokenizer, and dataset.
        """
        # 1. Load Model and Tokenizer (with quantization if enabled)
        quantization_config = None
        if self.config.get("quantization"):
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )

        # SECURITY: trust_remote_code lets a Hub repository run arbitrary Python
        # during loading, so it is opt-in through the config instead of always on.
        trust_remote_code = bool(self.config.get("trust_remote_code", False))

        # Determine the correct model class based on architecture
        model_config = AutoConfig.from_pretrained(
            self.model_id, trust_remote_code=trust_remote_code
        )
        if model_config.is_encoder_decoder:
            model_class = AutoModelForSeq2SeqLM
        else:
            model_class = AutoModelForCausalLM

        self.model = model_class.from_pretrained(
            self.model_id,
            quantization_config=quantization_config,
            trust_remote_code=trust_remote_code,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id, trust_remote_code=trust_remote_code
        )
        # The KV cache is switched off for training (gradient checkpointing is on).
        self.model.config.use_cache = False
        self.model.gradient_checkpointing_enable()  # enable gradient checkpointing

        # Add padding token if it does not exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            self.model.resize_token_embeddings(len(self.tokenizer))

        # Move model to device. Quantized models are placed by the loader and
        # bitsandbytes models may reject ``.to()``, so only move full-precision ones.
        if quantization_config is None:
            self.model.to(self.device)

        # 2. Load Dataset (using dataset name from Hugging Face Hub or other sources)
        self.dataset = load_dataset(self.dataset_name)

    @staticmethod
    def _split_hh_transcript(transcript):
        """
        Splits an hh-rlhf style transcript into (prompt, final assistant reply).

        The split happens at the last "\\n\\nAssistant:" marker. If the marker is
        absent, the prompt is empty and the whole (stripped) text is the reply.
        Assumes the dataset uses "\\n\\nHuman:" / "\\n\\nAssistant:" turn markers —
        verify against the dataset card.
        """
        prompt, _, response = transcript.rpartition("\n\nAssistant:")
        return prompt.strip(), response.strip()

    def _require_columns(self, required):
        """
        Raises KeyError if any split of ``self.dataset`` lacks one of ``required``.
        """
        column_names = self.dataset.column_names
        # A DatasetDict reports {split: [columns]}; a single Dataset reports [columns].
        per_split = column_names.values() if isinstance(column_names, dict) else [column_names]
        for columns in per_split:
            missing = [name for name in required if name not in columns]
            if missing:
                raise KeyError(
                    f"Dataset '{self.dataset_name}' is missing required column(s) {missing}; "
                    f"available columns: {list(columns)}"
                )

    @staticmethod
    def _train_test_split(dataset, test_fraction, seed=None):
        """
        Splits ``dataset`` into 'train'/'test', holding out at least one test example.
        """
        test_size = max(int(len(dataset) * test_fraction), 1)  # Ensure a minimum test size
        return dataset.train_test_split(test_size=test_size, seed=seed)

    def _orient(self):
        """
        Orients the agent by formatting the dataset and preparing training arguments.

        Raises:
            KeyError: if a non hh-rlhf dataset lacks the "question"/"answer" columns.
        """
        # Define system message
        system_message = "You are a helpful and harmless AI assistant."

        # Convert dataset to OAI messages based on dataset name
        if str(self.dataset_name).lower() == "anthropic/hh-rlhf":
            # Bound to a local so the mapped closure does not capture ``self``.
            split_transcript = self._split_hh_transcript

            def create_conversation(sample):
                # Train towards the *chosen* reply: everything before the final
                # assistant turn is the prompt, the final chosen turn is the target.
                prompt, response = split_transcript(sample["chosen"])
                return {
                    "messages": [
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": response}
                    ]
                }
        else:
            self._require_columns(("question", "answer"))

            def create_conversation(sample):
                return {
                    "messages": [
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": sample["question"]},
                        {"role": "assistant", "content": sample["answer"]}
                    ]
                }

        # ``map`` keeps the existing columns and adds "messages"; the tokenization
        # step in ``_act`` removes the original columns afterwards.
        self.dataset = self.dataset.map(create_conversation)

        # Conditional split based on test_split_percentage
        test_split_percentage = self.config.get("test_split_percentage")
        if test_split_percentage is not None:
            split_seed = self.config.get("seed")
            if isinstance(self.dataset, DatasetDict):
                # Only the train split is re-split; any other existing split is
                # dropped in favour of the new 'test' split.
                if 'train' in self.dataset:
                    self.dataset = self._train_test_split(
                        self.dataset["train"], test_split_percentage, split_seed
                    )
            else:
                # If the dataset is not a DatasetDict, then it must be a Dataset
                self.dataset = self._train_test_split(
                    self.dataset, test_split_percentage, split_seed
                )

        # 3. Prepare Training Arguments
        self.training_args = TrainingArguments(**(self.config.get("training_args") or {}))
        # The dataset is tokenized by this class, so its columns must be kept as-is.
        self.training_args.remove_unused_columns = False

    def _decide(self):
        """
        Decides on the fine-tuning strategy: wraps the model with LoRA adapters when
        ``config["lora"]`` is truthy (a dict value overrides the default LoRA kwargs).
        """
        # 4. PEFT Configuration (LoRA)
        lora_setting = self.config.get("lora")
        if not lora_setting:
            self.peft_config = None
            return

        self.model = prepare_model_for_kbit_training(self.model)

        lora_kwargs = {
            "lora_alpha": 128,
            "lora_dropout": 0.05,
            "r": 256,
            "bias": "none",
            "target_modules": "all-linear",
            # The task type must match the architecture chosen in ``_observe``.
            "task_type": "SEQ_2_SEQ_LM" if self.model.config.is_encoder_decoder else "CAUSAL_LM",
        }
        if isinstance(lora_setting, dict):
            lora_kwargs.update(lora_setting)

        self.peft_config = LoraConfig(**lora_kwargs)
        self.model = get_peft_model(self.model, self.peft_config)
        print("\n")
        self.model.print_trainable_parameters()
        print("\n")

    def _act(self):
        """
        Acts by tokenizing the dataset and initializing the trainer.
        """
        # Preprocess the data
        self.dataset = self.dataset.map(
            self._preprocess_function,
            batched=True,
            remove_columns=self.dataset["train"].column_names,
        )

        # Evaluation is optional: not every dataset/config yields a 'test' split.
        eval_dataset = self.dataset["test"] if "test" in self.dataset else None

        # 6. Initialize Trainer
        # ``peft_config`` is intentionally not forwarded: ``_decide`` has already
        # wrapped the model with the adapters, so the trainer must not re-apply them.
        self.trainer = SFTTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.dataset["train"],
            eval_dataset=eval_dataset,
        )

    def _preprocess_function(self, examples):
        """
        Tokenizes a batch of conversations into model inputs and labels.

        Every user turn becomes a "### Prompt: ..." input and every assistant turn
        becomes a target; a plain-string entry is used as both prompt and target.
        Padded label positions are replaced by ``_IGNORE_INDEX``.

        Args:
            examples: batch dict with a "messages" column.

        Returns:
            The tokenizer output for the prompts with an added "labels" entry.

        Raises:
            ValueError: if the batch yields a different number of prompts and targets.
        """
        prompts = []
        targets = []
        for conversation in examples["messages"]:
            if isinstance(conversation, str):
                prompts.append(f"### Prompt: {conversation}")
                targets.append(conversation)
            elif isinstance(conversation, list):
                for turn in conversation:
                    if turn["role"] == "user":
                        prompts.append(f"### Prompt: {turn['content']}")
                    elif turn["role"] == "assistant":
                        targets.append(turn['content'])

        # Inputs and labels are paired by position, so the counts must agree.
        if len(prompts) != len(targets):
            raise ValueError(
                f"Mismatched batch: {len(prompts)} prompt(s) but {len(targets)} target(s)"
            )

        # Tokenize the inputs and labels
        model_inputs = self.tokenizer(
            prompts, max_length=self._MAX_PROMPT_TOKENS, truncation=True, padding=True
        )
        # ``text_target`` replaces the deprecated ``as_target_tokenizer()`` context.
        target_encodings = self.tokenizer(
            text_target=targets, max_length=self._MAX_TARGET_TOKENS, truncation=True, padding=True
        )

        # Mask padding through the attention mask (not the pad id) so that a pad
        # token that doubles as EOS is not masked at real positions.
        # NOTE(review): separately tokenized labels suit encoder-decoder models;
        # confirm how decoder-only models are expected to consume them.
        model_inputs["labels"] = [
            [token if keep else self._IGNORE_INDEX for token, keep in zip(token_ids, mask)]
            for token_ids, mask in zip(
                target_encodings["input_ids"], target_encodings["attention_mask"]
            )
        ]
        return model_inputs

    def _release_previous_run(self):
        """
        Drops references to the previous model/trainer before the next pair is loaded.
        """
        self.trainer = None
        self.model = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def run(self, pairs=None):
        """
        Executes the OODA loop (observe, orient, decide, act) for each model/dataset pair.

        Args:
            pairs: optional iterable of {"model_id", "dataset_name"} dicts. When
                omitted, the pair given to the constructor is used if both ids are
                set; otherwise the module-level ``rl_pairs`` list is used.
        """
        if pairs is None:
            if self.model_id is not None and self.dataset_name is not None:
                pairs = [{"model_id": self.model_id, "dataset_name": self.dataset_name}]
            else:
                pairs = rl_pairs

        for pair in list(pairs):
            self._release_previous_run()
            self.model_id = pair["model_id"]
            self.dataset_name = pair["dataset_name"]

            print('\n')
            print(f"Fine-tuning for Model: {self.model_id}, Dataset: {self.dataset_name}")
            print('\n')

            self._observe()
            self._orient()
            self._decide()
            self._act()

            # TODO: training itself is not started here; call ``self.trainer.train()``
            # (or plug in an RL loop) once the prepared data has been validated.

# Example Usage with rl_pairs

# bf16 and tf32 are only enabled on GPUs with compute capability >= 8
# (Ampere or newer, e.g. A100 / L4); on other hardware they are switched off
# instead of making TrainingArguments fail.
_ampere_or_newer = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

config = {
    "training_args": {
        "output_dir": "./results",
        "num_train_epochs": 1,
        "per_device_train_batch_size": 3,
        "gradient_accumulation_steps": 2,
        # The string "none" disables all logging integrations; a Python None
        # does not (it falls back to the library default).
        "report_to": "none",
        "gradient_checkpointing": True,
        "optim": "adamw_torch_fused",
        "learning_rate": 2e-4,
        "bf16": _ampere_or_newer,
        "tf32": _ampere_or_newer,

        "max_grad_norm": 0.3,
        "warmup_ratio": 0.03,
        "lr_scheduler_type": "constant",
    },
    "quantization": True,
    "lora": True,  # Enable LORA
    "test_split_percentage": 0.25  # Use 25% of the data for evaluation
}

agent = FineTuningAgent(
    model_id=None,  # We'll set this in the loop
    dataset_name=None,  # We'll set this in the loop
    config=config,
)

# Iterate through the rl_pairs and run the fine-tuning for each pair
agent.run()  # The run() method now handles the iteration



Fine-tuning for Model: google/flan-t5-xl, Dataset: anthropic/hh-rlhf




`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).




trainable params: 566,231,040 || all params: 3,415,988,224 || trainable%: 16.5759




Map:   0%|          | 0/120600 [00:00<?, ? examples/s]

Map:   0%|          | 0/40200 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/120600 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/120600 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/120600 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/40200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/40200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/40200 [00:00<?, ? examples/s]

In [1]:
# Candidate (model, dataset) pairings, one dict per pair.
# NOTE(review): several dataset ids below look like RL environments rather than
# Hugging Face Hub datasets — confirm each one resolves with `load_dataset`.
rl_pairs = [
    {"model_id": model_id, "dataset_name": dataset_name}
    for model_id, dataset_name in (
        # Instruction-tuned text-to-text model / Anthropic helpful-harmless preference data
        ("google/flan-t5-xl", "anthropic/hh-rlhf"),
        # Prompt-based text-to-text model (T0 family) / Safety Gym (safe-RL environments)
        ("bigscience/T0_3B", "openai/safety-gym"),
        # Small causal LM for faster experimentation / MineRL BASALT FindCave environment
        ("EleutherAI/gpt-neo-125M", "MineRL/MineRLBasaltFindCave-v0"),
        # Dialogue-focused model / Conversational Question Answering dataset
        ("facebook/blenderbot-400M-distill", "stanfordnlp/coqa"),
        # Another conversational model / presumably a chatbot RL dataset — verify the id
        ("microsoft/DialoGPT-medium", "huggingface/rl-chatbot"),
    )
]

In [6]:
# Show the model and dataset ids of the first configured pair, one per line.
model, dataset = (rl_pairs[0][field] for field in ('model_id', 'dataset_name'))
print(model, dataset, sep="\n")

google/flan-t5-xl
anthropic/hh-rlhf
