<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/FTA_DYNAMIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers accelerate trl bitsandbytes --quiet

## Univeral FineTuningAgent

SOL1

In [2]:
!pip install transformers accelerate trl bitsandbytes --quiet
import os
import re
import copy

# Set environment variables for debugging
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_DISABLED"] = "true"
!export CUDA_LAUNCH_BLOCKING=1  # Enable synchronous CUDA error reporting

# Import necessary modules
from transformers import TrainingArguments
import accelerate

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    Trainer,
    DataCollatorForLanguageModeling,
)

from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import warnings
from trl import SFTTrainer

warnings.filterwarnings("ignore")

# Define the rl_pairs
RL_PAIRS = [
    {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.1",  # Mistral model
        "dataset_name": "b-mc2/sql-create-context",  # SQL dataset
    },
        {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.1",  # Mistral model
        "dataset_name": "b-mc2/sql-create-context",  # SQL dataset
    }
]

class FineTuningAgent:
    def __init__(self, model_id, dataset_name, config):
        """
        Initializes the FineTuningAgent with model ID, dataset name, and configuration.
        """
        self.model_id = model_id
        self.dataset_name = dataset_name
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dataset = None # Initialize the dataset as None
        self.tokenizer = None # Initialize the tokenizer as None
        self.model = None # Initialize the model as None
        self.trainer = None # Initialize the trainer as None
        self.training_args = None # Initialize training args as None
        self.peft_config = None # Initialize peft_config as None
        self.counter = 0

    def _reset(self):
        """Resets the state of the agent between runs."""
        self.dataset = None
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_args = None
        self.peft_config = None

    def _observe(self):
        """
        Observes the environment by loading the model, tokenizer, and dataset.
        """
        self.counter += 1
        print(f"Starting Observe {self.counter}...")
        # 1. Load Model and Tokenizer (with quantization if enabled)
        quantization_config = None  # Initialize as None to allow for disabling quantization
        if self.config.get("quantization"):
            if "mistral" in self.model_id.lower():
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                )
            else:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_use_double_quant=False,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float32,
                )

        # Determine the correct model class based on architecture
        config = AutoConfig.from_pretrained(self.model_id)
        if config.is_encoder_decoder:
            #model_class = AutoModelForSeq2SeqLM

            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                self.model_id,
                quantization_config=quantization_config,
                trust_remote_code=True,
            )
        else:
            #model_class = AutoModelForCausalLM

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_id,
                quantization_config=quantization_config,
                trust_remote_code=True,
            )


        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id, trust_remote_code=True
        )
        self.model.config.use_cache = False

        # Add padding token if it does not exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            #self.model.resize_token_embeddings(len(self.tokenizer))

        # Move model to device
        self.model.to(self.device)

        print("\n")
        print(f"Model: {self.model}")
        print("\n")

        # 2. Load Dataset (using dataset name from Hugging Face Hub or other sources)
        self.dataset = load_dataset(self.dataset_name)
        print(f"Observe {self.counter} finished.")

    def _orient(self):
        """
        Orients the agent by formatting the dataset and preparing training arguments.
        """
        self.counter += 1
        print(f"Starting Orient {self.counter}...")
        # Define system message
        system_message = "You are a helpful and harmless AI assistant."

        # Convert dataset to OAI messages based on dataset name
        def create_conversation(sample):
            """
            Converts a data sample into a structured conversation format.
            """
            if self.dataset_name == "anthropic/hh-rlhf":
                return {
                    "messages": [
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": sample["chosen"]},
                        {"role": "assistant", "content": sample["rejected"]},
                    ]
                }
            elif self.dataset_name == "b-mc2/sql-create-context":
                return {
                    "messages": [{"role": "user", "content": sample}]
                }
            else:
                return {
                    "messages": [
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": sample["question"]},
                        {"role": "assistant", "content": sample["answer"]},
                    ],
                    # Retain original columns
                    "question": sample["question"],
                    "context": sample["context"],
                    "answer": sample["answer"],
                }

        # Batch processing
        batch_size = 1000  # Adjust the batch size as needed
        if isinstance(self.dataset, DatasetDict):
            for split in self.dataset:
                # Ensure that the split is not empty
                if len(self.dataset[split]) == 0:
                    print(f"Warning: Split '{split}' is empty. Skipping.")
                    continue
                for i in range(0, len(self.dataset[split]), batch_size):
                    batch = self.dataset[split][i: i + batch_size]
                    # Convert list of dicts to a dict of lists for Dataset creation
                    if isinstance(batch[0], dict):
                        batch_dict = {k: [d[k] for d in batch] for k in batch[0].keys()}
                    else:
                        # Handle cases where batch is a list of strings
                        batch_dict = {"messages": batch}
                    batch_dataset = Dataset.from_dict(batch_dict)
                    #Clear the dictionary after every loop
                    batch_dict.clear()
                    if self.dataset_name == "anthropic/hh-rlhf":
                        # Convert the batch before mapping
                        transformed_dataset = batch_dataset.map(create_conversation)
                        self.dataset[split] = concatenate_datasets([self.dataset[split][:i], transformed_dataset, self.dataset[split][i + batch_size:]])
                    else:
                        # If not anthropic/hh-rlhf, we need to transform the messages before running map
                        # Check if the dataset has messages, and if so, transform the dataset
                        if "messages" in batch_dataset.column_names and self.dataset_name != "b-mc2/sql-create-context":
                            # Extract the list of messages from each row
                            list_of_messages = [item["messages"] for item in batch_dataset]

                            # Create a new list of dicts to hold the transformed dataset
                            list_of_dicts = []
                            for item in list_of_messages:
                                # Validation for item
                                if not isinstance(item, list) or len(item) == 0:
                                    print(f"Warning: item is not a list or is empty: {item}")
                                    continue  # Skip to the next item

                                # Validation for item[0]
                                if not isinstance(item[0], dict) or "content" not in item[0]:
                                    print(f"Warning: item[0] is not a dict or does not have a 'content' key: {item[0]}")
                                    continue  # Skip to the next item

                                user = item[0]["content"]

                                # Validation for item length
                                if len(item) < 2:
                                    print(f"Warning: item does not have at least two elements: {item}")
                                    continue  # Skip to the next item

                                # Validation for item[1]
                                if not isinstance(item[1], dict) or "content" not in item[1]:
                                    print(f"Warning: item[1] is not a dict or does not have a 'content' key: {item[1]}")
                                    continue  # Skip to the next item

                                assistant = item[1]["content"]
                                list_of_dicts.append({"question": user, "answer": assistant})

                            # Check if list_of_dicts is empty after processing
                            if not list_of_dicts:
                                print("Warning: list_of_dicts is empty after processing. Skipping transformation.")
                                continue  # Skip the rest of the current batch

                            #convert to dataset
                            transformed_dataset = Dataset.from_list(list_of_dicts)
                            # Convert the batch before mapping
                            transformed_dataset = transformed_dataset.map(create_conversation, remove_columns=True)
                            self.dataset[split] = concatenate_datasets([self.dataset[split][:i], transformed_dataset, self.dataset[split][i + batch_size:]])
                        else:
                             # Convert the batch before mapping
                            transformed_dataset = batch_dataset.map(create_conversation, remove_columns=True)
                            self.dataset[split] = concatenate_datasets([self.dataset[split][:i], transformed_dataset, self.dataset[split][i + batch_size:]])
        else:
            # Handle non-DatasetDict case
            # Ensure that the dataset is not empty
            if len(self.dataset) == 0:
                print("Warning: Dataset is empty. Skipping.")
                return
            for i in range(0, len(self.dataset), batch_size):
                batch = self.dataset[i: i + batch_size]
                # Convert list of dicts to a dict of lists for Dataset creation
                if isinstance(batch[0], dict):
                    batch_dict = {k: [d[k] for d in batch] for k in batch[0].keys()}
                else:
                    # Handle cases where batch is a list of strings
                    batch_dict = {"messages": batch}
                batch_dataset = Dataset.from_dict(batch_dict)
                #Clear the dictionary after every loop
                batch_dict.clear()
                if self.dataset_name == "anthropic/hh-rlhf":
                    # Convert the batch before mapping
                    transformed_dataset = batch_dataset.map(create_conversation)
                    self.dataset = concatenate_datasets([self.dataset[:i], transformed_dataset, self.dataset[i + batch_size:]])
                else:
                    # If not anthropic/hh-rlhf, we need to transform the messages before running map
                    # Check if the dataset has messages, and if so, transform the dataset
                    if "messages" in batch_dataset.column_names and self.dataset_name != "b-mc2/sql-create-context":
                        # Extract the list of messages from each row
                        list_of_messages = [item["messages"] for item in batch_dataset]

                        # Create a new list of dicts to hold the transformed dataset
                        list_of_dicts = []
                        for item in list_of_messages:
                            # Validation for item
                            if not isinstance(item, list) or len(item) == 0:
                                print(f"Warning: item is not a list or is empty: {item}")
                                continue  # Skip to the next item

                            # Validation for item[0]
                            if not isinstance(item[0], dict) or "content" not in item[0]:
                                print(f"Warning: item[0] is not a dict or does not have a 'content' key: {item[0]}")
                                continue  # Skip to the next item

                            user = item[0]["content"]

                            # Validation for item length
                            if len(item) < 2:
                                print(f"Warning: item does not have at least two elements: {item}")
                                continue  # Skip to the next item

                            # Validation for item[1]
                            if not isinstance(item[1], dict) or "content" not in item[1]:
                                print(f"Warning: item[1] is not a dict or does not have a 'content' key: {item[1]}")
                                continue  # Skip to the next item

                            assistant = item[1]["content"]
                            list_of_dicts.append({"question": user, "answer": assistant})

                        # Check if list_of_dicts is empty after processing
                        if not list_of_dicts:
                            print("Warning: list_of_dicts is empty after processing. Skipping transformation.")
                            continue  # Skip the rest of the current batch

                        #convert to dataset
                        transformed_dataset = Dataset.from_list(list_of_dicts)
                        # Convert the batch before mapping
                        transformed_dataset = transformed_dataset.map(create_conversation, remove_columns=True)
                        self.dataset = concatenate_datasets([self.dataset[:i], transformed_dataset, self.dataset[i + batch_size:]])
                    else:
                        # Convert the batch before mapping
                        transformed_dataset = batch_dataset.map(create_conversation, remove_columns=True)
                        self.dataset = concatenate_datasets([self.dataset[:i], transformed_dataset, self.dataset[i + batch_size:]])

        # Conditional split based on test_split_percentage
        if self.config.get("test_split_percentage") is not None:
            test_split_percentage = self.config["test_split_percentage"]
            # Check if the dataset is already a DatasetDict with splits
            if isinstance(self.dataset, DatasetDict):
                # Assume we want to split the training data
                if "train" in self.dataset:
                    test_size = int(
                        len(self.dataset["train"]) * test_split_percentage
                    )
                    test_size = max(
                        test_size, 1
                    )  # Ensure a minimum test size
                    # Split the train dataset
                    train_dataset_dict = self.dataset["train"].train_test_split(
                        test_size=test_size
                    )
                    # Update the dataset with the new train split and add the test split
                    self.dataset = DatasetDict(
                        {
                            "train": train_dataset_dict["train"],
                            "test": train_dataset_dict["test"],
                        }
                    )
            else:
                # If the dataset is not a DatasetDict, then it must be a list
                test_size = int(len(self.dataset) * test_split_percentage)
                test_size = max(
                    test_size, 1
                )  # Ensure a minimum test size
                # Split the dataset
                train_dataset_dict = self.dataset.train_test_split(
                    test_size=test_size
                )
                # Update the dataset with the new splits
                self.dataset = train_dataset_dict

        # 3. Prepare Training Arguments
        self.training_args = TrainingArguments(
            **self.config.get("training_args", {})
        )
        self.training_args.remove_unused_columns = False

        print("\n")
        print(f"Orient - Training Arguments: {self.training_args}")
        print("\n")

        print("\n")
        print(f"Orient - Dataset: {self.dataset}")
        print("\n")
        print(f"Orient {self.counter} finished.")

    def _decide(self):
        """
        Decides on the fine-tuning strategy, including LoRA configuration.
        """
        self.counter += 1
        print(f"Starting Decide {self.counter}...")
        # 4. PEFT Configuration (LoRA)
        if self.config.get("lora"):
            self.model = prepare_model_for_kbit_training(self.model)

            # Determine the correct target modules based on model architecture
            if self.model.config.is_encoder_decoder:
                target_modules = [
                    "q", "k", "v", "o", "wi_0", "wi_1", "wo"
                ]  # For encoder-decoder models
                task_type = "SEQ_2_SEQ_LM"
            elif "mistral" in self.model_id.lower():
                target_modules = [
                    "q_proj", "k_proj", "v_proj", "o_proj"
                ]  # For Mistral models
                task_type = "CAUSAL_LM"
            else:
                target_modules = [
                    "c_attn", "c_proj", "w1", "w2"
                ]  # For other causal models
                task_type = "CAUSAL_LM"

            print("\n")
            print(f"LORA Target Modules: {target_modules}")
            print("\n")

            # Updated LoRA config
            peft_config = LoraConfig(
                lora_alpha=128,
                lora_dropout=0.05,
                r=256,
                bias="none",
                target_modules="all-linear",
                task_type="CAUSAL_LM",
            )

            self.peft_config = peft_config
            self.model = get_peft_model(self.model, peft_config)

            print("\n")
            self.model.print_trainable_parameters()
            print("\n")
        print(f"Decide {self.counter} finished.")

    def _act(self):
        """
        Acts by preprocessing the dataset and initializing the training loop.
        """
        self.counter += 1
        print(f"Starting Act {self.counter}...")
        try:
            print("Preprocessing dataset...")
            print("Dataset before preprocessing:", self.dataset)
            # Check if the dataset has a train split
            if "train" not in self.dataset:
                if isinstance(self.dataset, Dataset):
                    # If the dataset is a Dataset, convert it to a DatasetDict with a train split.
                    self.dataset = DatasetDict({"train": self.dataset})
                    print("The dataset did not have a 'train' split. It has been converted to a DatasetDict with a 'train' split.")
                else:
                    print("The dataset does not have a 'train' split.")
                    return  # Exit the method early if there's no train split

            # Check if the train split is empty
            if len(self.dataset["train"]) == 0:
                print("The 'train' split of the dataset is empty.")
                return  # Exit the method early if the train split is empty

            # Batch preprocessing
            batch_size = 1000  # Adjust the batch size as needed
            transformed_datasets = []
            for i in range(0, len(self.dataset["train"]), batch_size):
                batch = self.dataset["train"][i: i + batch_size]
                batch_dataset = Dataset.from_dict(batch)
                #Ensure the preprocessing function return something
                transformed_batch_dataset = batch_dataset.map(
                    self._preprocess_function,
                    batched=True,
                    remove_columns=batch_dataset.column_names, # Ensure that we remove the columns from the batch_dataset and not the entire dataset
                )
                transformed_datasets.append(transformed_batch_dataset)

            # Check if the list of dataset is empty.
            if not transformed_datasets:
                print("No data was transformed during preprocessing. transformed_datasets is empty.")
                return

            # Now concatenate all the preprocessed datasets using concatenate_datasets
            if all(isinstance(ds, Dataset) for ds in transformed_datasets):
                self.dataset["train"] = concatenate_datasets(transformed_datasets)
            else:
                print("Not all elements in transformed_datasets are of type Dataset. Cannot concatenate.")
                return

            print("Dataset preprocessed successfully.")
            print("Dataset after preprocessing:", self.dataset)

            print("Creating data collator...")
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=self.tokenizer, mlm=False
            )
            print("Data collator created successfully.")

            print("Initializing trainer...")
            if self.dataset_name == "anthropic/hh-rlhf":
                # Use a regular Trainer for RLHF (you'll need to adapt this for your RL library)
                print("Using Trainer for RLHF.")
                self.trainer = Trainer(
                    model=self.model,
                    args=self.training_args,
                    train_dataset=self.dataset["train"],
                    eval_dataset=self.dataset.get(
                        "test"
                    ),  # Use get() to handle missing "test"
                    preprocess_logits_for_metrics=False,
                    data_collator=data_collator,  # pass the data_collator
                )
                #... (configure the Trainer for RL - this will depend on your RL library)...
            else:
                # Use SFTTrainer for supervised fine-tuning
                print("Using SFTTrainer for supervised fine-tuning.")
                self.trainer = SFTTrainer(
                    model=self.model,
                    args=self.training_args,
                    train_dataset=self.dataset["train"],
                    eval_dataset=self.dataset.get("test"),
                    preprocess_logits_for_metrics=False,
                    peft_config=self.peft_config,
                    data_collator=data_collator,  # pass the data_collator
                )
            print("Trainer initialized successfully.")
            print("Training arguments:", self.training_args)

        except Exception as e:
            print(f"An error occurred in _act(): {e}")
            raise  # Re-raise the exception to preserve the stack trace
        print(f"Act {self.counter} finished.")

    def _preprocess_function(self, example):
        """
        Preprocesses the data by combining context and question, tokenizing, and formatting.
        """
        if self.dataset_name == "b-mc2/sql-create-context":
            # Concatenate all messages into a single string for input.
            input_text = example['text'] # the key is now text in b-mc2/sql-create-context
            label_text = input_text
        else:
            input_text = "".join([msg['content'] for msg in example['messages']])
            label_text = example["answer"]

        # Count the number of times the word "weel" appears in the input_text
        # Convert to lowercase for case-insensitive counting
        word_to_count = "weel"
        lower_input_text = input_text.lower()
        count = len(re.findall(r'\b' + re.escape(word_to_count.lower()) + r'\b', lower_input_text))

        # Tokenize the inputs and labels
        model_inputs = self.tokenizer(
            input_text, max_length=1024, truncation=True, padding="max_length"
        )

        with self.tokenizer.as_target_tokenizer():
            label_ids = self.tokenizer(
                label_text, max_length=512, truncation=True, padding="max_length"
            )

        model_inputs["labels"] = label_ids["input_ids"]
        model_inputs["weel_count"] = count #add weel count to the model inputs

        return model_inputs

    def run(self):
        """
        Executes the OODA loop and fine-tunes the language model using RL or SFT.
        """
        for pair in RL_PAIRS:
            self._reset()
            self.model_id = pair["model_id"]
            self.dataset_name = pair["dataset_name"]

            print("Observe: Start")
            print("\n")  # Separate Observe stage
            self._observe()
            print("Observe: End")
            print("\n")  # Separates the observe stage from the Orient stage

            print("Orient: Start")
            print("\n")  # Separate Orient stage
            self._orient()
            print("Orient: End")
            print("\n")  # Separates the Orient stage from the Decide stage

            print("Decide: Start")
            print("\n")  # Separate Decide stage
            self._decide()
            print("Decide: End")
            print("\n")  # Separates the Decide stage from the Act stage

            print("Act: Start")
            print("\n")  # Separate Act stage
            self._act()
            print("Act: End")
            print("\n")  # Separates the Act stage from the training info

            print(
                f"Start: Fine-tuning for Model: {self.model_id}, "
                f"Dataset: {self.dataset_name}"
            )
            print("\n")  # Separate training information

            # Start the training
            print("Starting training...")
            self.trainer.train()
            print("Training completed.")

            print(
                f"End: Fine-tuning for Model: {self.model_id}, "
                f"Dataset: {self.dataset_name}"
            )
            print("\n")  # Separate training information

            # Explore synergies between Newton, Galileo, Einstein, and Hinton
            self._explore_synergies()

            # After training, you can access the 'weel_count' information like this:
            # Assuming you have a way to get a subset of the training dataset after training
            # or if you have a separate evaluation dataset:
            if "train" in self.dataset:
                sample_data = self.dataset["train"][:10] #this will get the first 10 rows of data
                for row in sample_data:
                    print(f"Input text: {row}")
                    print(f"'weel' count: {row['weel_count']}")
                    print()

    def _explore_synergies(self):
        """
        Explores the potential synergies between Newton, Galileo, Einstein, and Hinton in the context of AI.
        """
        print("\nExploring synergies between Newton, Galileo, Einstein, and Hinton in the context of AI:")
        # Add your code here to explore the synergies.
        # This could involve generating text, analyzing data, or conducting experiments.
        # For example, you could use the fine-tuned model to generate text about the contributions of each figure to AI.
        # You could also analyze the model's performance on different tasks to see how it reflects the principles of these figures.
        # Be creative and explore the connections between these influential figures and the field of AI.
        print("Synergies exploration completed.\n")

# Example Usage with rl_pairs
config = {
    "training_args": {
        "output_dir": "./results",
        "num_train_epochs": 1,
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 1,
        "optim": "adamw_torch_fused",
        "learning_rate": 2e-4,
        "bf16": False,
        "max_grad_norm": 0.3,
        "warmup_ratio": 0.03,
        "lr_scheduler_type": "constant",
        "logging_steps": 1,
        "evaluation_strategy": "steps",
        "eval_steps": 1,
        "tf32": True,  # Enable TF32 for A100
    },
    "quantization": True,  # Add quantization
    "lora": True,
    "test_split_percentage": 0.25,
}

agent = FineTuningAgent(model_id=None, dataset_name=None, config=config)

# Iterate through the rl_pairs and run the fine-tuning for each pair
agent.run()  # The run() method now handles the iteration

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Observe: Start


Starting Observe 1...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



Model: MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

README.md:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

Observe 1 finished.
Observe: End


Orient: Start


Starting Orient 2...


KeyError: 0