<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Training_Smart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries used by the cells below (-q: quiet, -U: upgrade).
# NOTE(review): no versions are pinned, so a later re-run may pull different releases.
!pip install -q -U datasets transformers accelerate peft trl bitsandbytes sentencepiece interpret
# Provides the `colab_env` module imported in the login cell.
!pip install colab-env --quiet

# bitsandbytes was already upgraded by the first command; this repeats that upgrade.
!pip install -U bitsandbytes -q

In [None]:
# NOTE(review): no `from_pretrained` call in this notebook visibly enables flash attention — confirm this install is needed.
!pip install flash-attn --no-build-isolation -q # Install the flash-attn package

In [None]:
# NOTE(review): peft is already upgraded by the first install cell and is not imported by the visible code.
!pip install peft --upgrade  -q # Upgrade peft to the latest version

In [None]:
# Show the attached GPU and its current memory usage before loading the model.
!nvidia-smi

Wed Nov 20 05:02:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              47W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from huggingface_hub import login
import torch
from datasets import load_dataset
import colab_env

# 0. Login to Hugging Face Hub
import os
# The write token is read from the environment rather than hard-coded.
# presumably `colab_env` (imported above) populates this variable — TODO confirm.
access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")
# os.getenv returns None when the variable is unset, so `token` may be None here.
login(token=access_token_write, add_to_git_credential=True)

In [None]:
# numba is used only for the CUDA device reset below.
!pip install numba -q
# NOTE(review): these packages were already installed by the first cell.
!pip install trl transformers datasets accelerate bitsandbytes -q

from numba import cuda

# Attempt to reset the device before selecting it.
cuda.close()

# Selects the desired device and creates a context.
cuda.select_device(0) # or any valid GPU index

import os
# Debugging aid: intended to make CUDA errors surface at the failing call.
# NOTE(review): set after a CUDA context was created above — confirm it still takes effect.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
import torch
from trl import PPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from tqdm import tqdm

import copy
import sqlite3

# 1. Load the Mistral model and tokenizer
model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # Instruct-tuned checkpoint (the id says "Instruct", not the plain base model)

# BitsAndBytesConfig int-4 config (for reduced memory usage)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Use the first GPU when CUDA is available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# or device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): some transformers/bitsandbytes version combinations reject `.to()` on a
# 4-bit model; placing the model via `device_map` is the usual alternative — confirm
# against the installed versions.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # device_map="auto", # Remove or comment out device_map="auto"
    trust_remote_code=True,
    quantization_config=bnb_config,
).to(device) # Explicitly move the model to the device


tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Pad on the right and reuse the end-of-sequence token as the padding token.
# NOTE(review): the training loop below feeds these right-padded rows to `model.generate`.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token  # Padding token is the EOS token
tokenizer.pad_token_id = tokenizer.eos_token_id # Padding id is the EOS id


# Earlier alternative kept for reference: pad with the unknown token instead of EOS.
#tokenizer.padding_side = "right"
#tokenizer.pad_token = tokenizer.unk_token
#tokenizer.pad_token_id = tokenizer.unk_token_id


# 2. Load the text-to-SQL dataset (train split); a small subset is selected further down
dataset = load_dataset("b-mc2/sql-create-context", split="train")


# Preprocess the dataset (this is the crucial missing step)
def preprocess_function(examples):
    """Tokenize a batch of text-to-SQL examples into fixed-length prompts and labels.

    Args:
        examples: Batched dataset rows with the columns ``"question"``,
            ``"context"`` and ``"answer"`` (each a list of strings).

    Returns:
        The tokenizer output for the prompts, with an extra ``"labels"`` entry
        holding the token ids of the reference SQL answers. Both prompts and
        answers are truncated/padded to 256 tokens.
    """
    prompts = [
        f"### Instruction: Translate the following natural language query into SQL. The database schema is given in the context.\n\n### Context: {c}\n\n### Query: {q}"
        for q, c in zip(examples["question"], examples["context"])
    ]
    model_inputs = tokenizer(prompts, max_length=256, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager for tokenizing the targets.
    labels = tokenizer(
        text_target=examples["answer"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize in batches and drop the original text columns, leaving only what
# `preprocess_function` returns.
# NOTE(review): the whole train split is tokenized although only the first 10 rows are used below.
dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)

train_dataset = dataset.select(range(10))  # Very small dataset for POC


# 3. Define a simple Reward Function (for POC purposes)
class SQLRewardFunction:
    """Placeholder reward for the proof of concept: every sample scores 1.0.

    Args:
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, samples):
        """Return a tensor with one reward per entry of ``samples``."""
        scores = []
        for token_ids in samples:
            # The decoded text is not inspected yet; the reward is constant.
            decoded = self.tokenizer.decode(token_ids, skip_special_tokens=True)
            scores.append(1.0)
        return torch.tensor(scores)

reward_fn = SQLRewardFunction(tokenizer)

# 4. Define PPO configuration (reduced episodes for POC)
# The hand-written loop below reads only `total_episodes` and `learning_rate`
# from this config; no PPOTrainer is built in this cell.
ppo_config = PPOConfig(
    output_dir="./ppo_results",
    total_episodes=2  # Very few episodes for POC
)

# ... (previous code) ...

# 5. Create the value model head manually

class ValueHead(torch.nn.Module):
    """Two linear heads applied to the same hidden states.

    ``v_head`` maps each hidden vector to one scalar and ``lm_head`` maps it to
    ``vocab_size`` scores. Both layers are newly created here, so ``lm_head``
    is separate from whatever output layer the language model itself has.

    Args:
        hidden_size (int): Size of the last dimension of the hidden states.
        vocab_size (int): Output size of ``lm_head``.
    """

    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.v_head = torch.nn.Linear(hidden_size, 1)
        self.lm_head = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, hidden_states):
        """Apply both heads to ``hidden_states``.

        Args:
            hidden_states (torch.Tensor): Tensor whose last dimension is
                ``hidden_size``; it is cast to float32 before use.

        Returns:
            tuple: ``(logits, value)`` with last dimensions ``vocab_size`` and 1.
        """
        # Cast once and feed the same float32 tensor to both heads.
        features = hidden_states.type(torch.float32)
        return self.lm_head(features), self.v_head(features)



# Get the hidden size from the model
hidden_size = model.config.hidden_size

# Create the value head; its second head is sized to the tokenizer's vocabulary length
value_head = ValueHead(hidden_size, len(tokenizer))

# Move the value head to the same device as the model
value_head.to(model.device) # Uses whichever device the model is on

# Attach the value head to the model so `model.v_head(...)` can be called in the loop
model.v_head = value_head

# 6. Create the optimizer
# NOTE(review): built after `v_head` is attached, so presumably the head's parameters
# are included in `model.parameters()` — confirm for this model class.
optimizer = torch.optim.AdamW(model.parameters(), lr=ppo_config.learning_rate)

# 7. Define helper functions for PPO (simplified for POC)
def calculate_advantages(rewards, values, gamma=0.99, gae_lambda=0.95):
    """Compute advantages with Generalized Advantage Estimation (GAE).

    For each step ``t`` (walking backwards)::

        delta_t = rewards[t] + gamma * values[t + 1] - values[t]
        A_t     = delta_t + gamma * gae_lambda * A_{t + 1}

    The value after the final step is taken to be 0, so the last step gets an
    advantage too. The earlier version stopped one step short and returned an
    empty tensor when there was only one reward.

    Args:
        rewards: Sequence or tensor with one reward per step.
        values: Sequence or tensor with one value estimate per step. Each entry
            may itself be a tensor (e.g. per-token values); the reward is then
            broadcast against it.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor.

    Returns:
        torch.Tensor: float32 tensor with ``len(rewards)`` entries along the
        first dimension, detached from the autograd graph.
    """
    num_steps = len(rewards)
    if num_steps == 0:
        return torch.zeros(0)

    # Advantages act as constants in the policy loss, so no gradient is kept.
    if torch.is_tensor(values):
        values = values.detach().float()
    else:
        values = torch.as_tensor(values, dtype=torch.float32)
    rewards = torch.as_tensor(rewards).detach().to(device=values.device, dtype=torch.float32)

    advantages = [None] * num_steps
    next_value = torch.zeros_like(values[0])  # value after the final step
    running = torch.zeros_like(values[0])
    for t in reversed(range(num_steps)):
        delta = rewards[t] + gamma * next_value - values[t]
        running = delta + gamma * gae_lambda * running
        advantages[t] = running
        next_value = values[t]
    # stack (rather than torch.tensor) also works when entries are multi-element tensors.
    return torch.stack(advantages)

def calculate_policy_loss(logits, actions, advantages):
    """Advantage-weighted negative log-likelihood of ``actions`` under ``logits``.

    This is a simplified surrogate: the mean cross-entropy over all positions
    is scaled by the mean advantage. It has no probability ratio or clipping,
    so it is not the full PPO objective.

    Args:
        logits (torch.Tensor): Scores with the vocabulary on the last dimension.
        actions (torch.Tensor): Token ids with one entry per logits position.
        advantages (torch.Tensor): Advantage estimates; moved to the logits' device.

    Returns:
        torch.Tensor: Scalar loss. With no advantages it is an exact zero that
        stays connected to the graph, instead of the NaN a mean over an empty
        tensor would give.
    """
    # Move actions and advantages to the same device as logits
    actions = actions.to(logits.device)
    advantages = advantages.to(logits.device)

    # reshape (not view) so non-contiguous slices of logits/actions are accepted.
    nll = torch.nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)), actions.reshape(-1)
    )
    if advantages.numel() == 0:
        return nll * 0.0
    return (nll * advantages).mean()

# 8. Training loop with progress bar
# One prompt per step: generate a continuation, score it, then take one
# optimizer step on (policy loss + language-model loss).
# NOTE(review): `value_loss` below is the language-model loss returned for
# `labels=samples`, not a regression loss on the value head — confirm that is intended.
for episode in tqdm(range(ppo_config.total_episodes), desc="Episodes"):
    for batch in tqdm(train_dataset, desc="Batches", leave=False):
        # a. Generate samples from the policy
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)

        # Preprocessing padded every prompt to a fixed length. Drop the padding
        # so generation continues from the last real prompt token, not from
        # padding, then add the batch dimension.
        real_tokens = attention_mask.bool()
        input_ids = input_ids[real_tokens].unsqueeze(0)
        attention_mask = attention_mask[real_tokens].unsqueeze(0)

        samples = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=10,  # Reduced max_new_tokens for POC
            pad_token_id=tokenizer.eos_token_id
        )

        # b. Calculate rewards
        rewards = reward_fn(samples)

        # c. Forward pass over prompt + continuation. The sequence has no
        # padding left, so an all-ones attention mask is correct.
        outputs = model(
            input_ids=samples,
            attention_mask=torch.ones_like(samples),
            labels=samples.type(torch.long),
            output_hidden_states=True
        )

        # Value estimates from the last hidden layer. The value head also
        # returns logits from its own newly created linear layer; those are
        # ignored because the policy that generated `samples` is the model.
        hidden_states = outputs.hidden_states[-1]
        _, values = model.v_head(hidden_states)

        value_loss = outputs.loss

        # Calculate advantages
        advantages = calculate_advantages(rewards, values)

        # Policy loss on the model's own logits. Logits at position t score the
        # token at position t + 1, so drop the last logit and the first token.
        policy_loss = calculate_policy_loss(
            outputs.logits[:, :-1, :].contiguous(),
            samples[:, 1:].contiguous(),
            advantages,
        )

        # d. Update the model parameters
        optimizer.zero_grad()
        total_loss = policy_loss + value_loss
        total_loss.backward()
        optimizer.step()


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Episodes:   0%|          | 0/2 [00:00<?, ?it/s]
Batches:   0%|          | 0/10 [00:00<?, ?it/s][A
Batches:  10%|█         | 1/10 [00:01<00:10,  1.12s/it][A
Batches:  20%|██        | 2/10 [00:02<00:08,  1.11s/it][A
Batches:  30%|███       | 3/10 [00:03<00:07,  1.12s/it][A
Batches:  40%|████      | 4/10 [00:04<00:06,  1.12s/it][A
Batches:  50%|█████     | 5/10 [00:05<00:05,  1.12s/it][A
Batches:  60%|██████    | 6/10 [00:06<00:04,  1.19s/it][A
Batches:  70%|███████   | 7/10 [00:08<00:03,  1.19s/it][A
Batches:  80%|████████  | 8/10 [00:09<00:02,  1.16s/it][A
Batches:  90%|█████████ | 9/10 [00:10<00:01,  1.15s/it][A
Batches: 100%|██████████| 10/10 [00:11<00:00,  1.14s/it][A
Episodes:  50%|█████     | 1/2 [00:11<00:11, 11.45s/it]
Batches:   0%|          | 0/10 [00:00<?, ?it/s][A
Batches:  10%|█         | 1/10 [00:01<00:09,  1.09s/it][A
Batches:  20%|██        | 2/10 [00:02<00:08,  1.11s/it][A
Batches:  30%|███       | 3/10 [00:03<00:07,  1.11s/it][A
Batches:  40%|████      | 

In [None]:
# 9. Save the model (optional for POC)
# Uploads `model` as it stands after the training cell; relies on the Hub login made earlier.
# NOTE(review): a custom `v_head` module was attached to `model` above — check whether
# its weights are wanted in the uploaded checkpoint.
model.push_to_hub(
    "frankmorales2020/mistral-7b-ppo-poc-t2sql",  # Provide repo_id as positional argument
    commit_message="Upload PPO POC model"
)

model.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/frankmorales2020/mistral-7b-ppo-poc-t2sql/commit/b9fc1f4e9470e875bcc56e8668042f35d1ecff7e', commit_message='Upload PPO POC model', commit_description='', oid='b9fc1f4e9470e875bcc56e8668042f35d1ecff7e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/frankmorales2020/mistral-7b-ppo-poc-t2sql', endpoint='https://huggingface.co', repo_type='model', repo_id='frankmorales2020/mistral-7b-ppo-poc-t2sql'), pr_revision=None, pr_num=None)

## Improvement: execution-based SQL reward, larger dataset, gradient clipping

In [None]:
import torch
from trl import PPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from tqdm import tqdm
import sqlite3

# 1. Load the Mistral model and tokenizer
# NOTE(review): this rebinds `model`, `tokenizer` and `dataset`; if the first training
# cell already ran in this kernel, the model is loaded a second time.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # Instruct-tuned checkpoint

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Use the first GPU when CUDA is available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): some transformers/bitsandbytes version combinations reject `.to()` on a
# 4-bit model — confirm against the installed versions.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Pad on the right and reuse the end-of-sequence token as the padding token.
# NOTE(review): the training loop below feeds these right-padded rows to `model.generate`.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# 2. Load and preprocess the dataset with modified prompts
dataset = load_dataset("b-mc2/sql-create-context", split="train")

def preprocess_function(examples):
    """Tokenize a batch of text-to-SQL examples into fixed-length prompts and labels.

    Each prompt ends with an opening SQL code fence so the model's continuation
    starts directly with the query.

    Args:
        examples: Batched dataset rows with the columns ``"question"``,
            ``"context"`` and ``"answer"`` (each a list of strings).

    Returns:
        The tokenizer output for the prompts, with an extra ``"labels"`` entry
        holding the token ids of the reference SQL answers. Both prompts and
        answers are truncated/padded to 256 tokens.
    """
    # Modified prompts to discourage "#" and add a separator
    prompts = [
        f"### Instruction: Translate the following natural language query into SQL. The database schema is given in the context. Do not generate any '#' characters.\n\n### Context: {c}\n\n### Query: {q}\n\n```sql\n"
        for q, c in zip(examples["question"], examples["context"])
    ]
    model_inputs = tokenizer(prompts, max_length=256, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager for tokenizing the targets.
    labels = tokenizer(
        text_target=examples["answer"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize in batches and drop the original text columns, leaving only what
# `preprocess_function` returns.
# NOTE(review): the whole train split is tokenized although only the first 100 rows are used below.
dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)

train_dataset = dataset.select(range(100))  # Increased dataset size

# 3. Define the Reward Function with dynamic schema handling and SQL extraction
def execute_query(query, db_path="database.db"):
    """Run a SQL statement against a SQLite database and report whether it succeeded.

    Args:
        query (str): SQL text to execute.
        db_path (str): Path of the SQLite database file.

    Returns:
        bool: True if the statement executed without raising, False otherwise.
        The error is printed, not raised.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        conn.cursor().execute(query)
        return True  # Successful execution
    except Exception as e:
        print(f"Error executing query: {query}\nError: {e}")
        return False  # Failed execution
    finally:
        # Close the connection on the failure path too; previously it was
        # left open whenever `execute` raised.
        if conn is not None:
            conn.close()

class SQLRewardFunction:
    """Reward generated SQL by whether it executes against the prompt's schema.

    For each sample the decoded text is expected to contain the prompt
    (``### Context: <CREATE TABLE ...>`` and ``### Query: ...``) followed by
    the generated SQL. The schema is created in the SQLite database at
    ``db_path`` and the extracted SQL is run with ``execute_query``.

    Args:
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
        db_path (str): Path of the SQLite database file used for execution.

    NOTE(review): tables accumulate in ``db_path`` across samples
    (``CREATE TABLE IF NOT EXISTS``), so a table name reused with different
    columns keeps its first definition.
    """

    # Marker that ends the prompt; the generated SQL follows it.
    SQL_FENCE = "```sql"

    def __init__(self, tokenizer, db_path="database.db"):
        self.tokenizer = tokenizer
        self.db_path = db_path

    def __call__(self, samples, **kwargs):
        """Score each sample: 1.0 if its SQL executes, -1.0 otherwise.

        Args:
            samples: Iterable of token-id tensors (or lists), one per sample.
            **kwargs: Accepted for call compatibility; unused.

        Returns:
            torch.Tensor: One reward per sample, in input order.
        """
        rewards = []
        for sample in samples:
            token_ids = torch.as_tensor(sample).flatten().tolist()
            # Use the tokenizer given to this instance, not a module-level global.
            text = self.tokenizer.decode(token_ids, skip_special_tokens=True)

            if not self._create_schema(text):
                # Penalise this sample but keep scoring the rest, so the
                # result always has one entry per sample.
                rewards.append(-1.0)
                continue

            sql = self._extract_sql(text)
            # Empty SQL would "execute" without error, so treat it as a failure.
            if sql and execute_query(sql, self.db_path):
                rewards.append(1.0)
            else:
                rewards.append(-1.0)
        return torch.tensor(rewards)

    def _create_schema(self, text):
        """Create the tables named in the prompt's context; return False on error."""
        schema = text.split("### Context:")[-1].split("### Query:")[0]
        schema = schema.split("### Solution:")[0].strip()

        statement = schema
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            for statement in schema.split(";"):
                if not statement.strip():
                    continue
                if "IF NOT EXISTS" not in statement.upper():
                    statement = statement.replace("CREATE TABLE", "CREATE TABLE IF NOT EXISTS")
                cursor.execute(statement)
            conn.commit()
            return True
        except Exception as e:
            print(f"Error creating table with schema: {statement}\nError: {e}")
            return False
        finally:
            if conn is not None:
                conn.close()

    def _extract_sql(self, text):
        """Return the generated SQL: the text between the first SQL fence and the next fence."""
        if self.SQL_FENCE in text:
            after_fence = text.split(self.SQL_FENCE, 1)[1]
            return after_fence.split("```", 1)[0].strip()
        # Fallback for outputs that use the older "### Solution:" separator.
        return text.split("### Solution:")[-1].strip()

reward_fn = SQLRewardFunction(tokenizer)  # Uses the default db_path ("database.db")

# 4. Define PPO configuration
# The hand-written loop below reads only `total_episodes` and `learning_rate`
# from this config; no PPOTrainer is built in this cell.
ppo_config = PPOConfig(
    output_dir="./ppo_results",
    total_episodes=10  # Increased number of episodes
)

# 5. Create and attach the simplified value model head
class ValueHead(torch.nn.Module):
    """Linear value head mapping each hidden vector to one scalar.

    Args:
        hidden_size (int): Size of the last dimension of the hidden states.
        vocab_size (int): Unused; kept so existing two-argument calls still work.
        debug (bool): When True, print mean/std/min/max of the hidden states on
            every forward pass. Off by default so the statistics do not flood
            the training output.
    """

    def __init__(self, hidden_size, vocab_size, debug=False):
        super().__init__()
        self.debug = debug
        self.v_head = torch.nn.Linear(hidden_size, 1)  # Only keep the value head

    def forward(self, hidden_states):
        """Return the value estimates for ``hidden_states`` (cast to float32 first)."""
        if self.debug:
            # --- Examine hidden states ---
            print("Hidden states mean:", hidden_states.mean())
            print("Hidden states std:", hidden_states.std())
            print("Hidden states min:", hidden_states.min())
            print("Hidden states max:", hidden_states.max())
            # --- End of hidden state examination ---

        return self.v_head(hidden_states.type(torch.float32))

# Build the value head from the model's hidden size, place it on the model's
# device and attach it so `model.v_head(...)` can be called in the loop.
hidden_size = model.config.hidden_size
value_head = ValueHead(hidden_size, len(tokenizer))
value_head.to(model.device)
model.v_head = value_head

# 6. Create the optimizer
# NOTE(review): built after `v_head` is attached, so presumably the head's parameters
# are included in `model.parameters()` — confirm for this model class.
optimizer = torch.optim.AdamW(model.parameters(), lr=ppo_config.learning_rate)

# 7. Define helper functions for PPO
def calculate_advantages(rewards, values, gamma=0.99, gae_lambda=0.95):
    """Compute advantages with Generalized Advantage Estimation (GAE).

    For each step ``t`` (walking backwards)::

        delta_t = rewards[t] + gamma * values[t + 1] - values[t]
        A_t     = delta_t + gamma * gae_lambda * A_{t + 1}

    The value after the final step is taken to be 0, so the last step gets an
    advantage too. The earlier version stopped one step short and returned an
    empty tensor when there was only one reward.

    Args:
        rewards: Sequence or tensor with one reward per step.
        values: Sequence or tensor with one value estimate per step. Each entry
            may itself be a tensor (e.g. per-token values); the reward is then
            broadcast against it.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor.

    Returns:
        torch.Tensor: float32 tensor with ``len(rewards)`` entries along the
        first dimension, detached from the autograd graph.
    """
    num_steps = len(rewards)
    if num_steps == 0:
        return torch.zeros(0)

    # Advantages act as constants in the policy loss, so no gradient is kept.
    if torch.is_tensor(values):
        values = values.detach().float()
    else:
        values = torch.as_tensor(values, dtype=torch.float32)
    rewards = torch.as_tensor(rewards).detach().to(device=values.device, dtype=torch.float32)

    advantages = [None] * num_steps
    next_value = torch.zeros_like(values[0])  # value after the final step
    running = torch.zeros_like(values[0])
    for t in reversed(range(num_steps)):
        delta = rewards[t] + gamma * next_value - values[t]
        running = delta + gamma * gae_lambda * running
        advantages[t] = running
        next_value = values[t]
    # stack (rather than torch.tensor) also works when entries are multi-element tensors.
    return torch.stack(advantages)

def calculate_policy_loss(logits, actions, advantages):
    """Advantage-weighted negative log-likelihood of ``actions`` under ``logits``.

    This is a simplified surrogate: the mean cross-entropy over all positions
    is scaled by the mean advantage. It has no probability ratio or clipping,
    so it is not the full PPO objective.

    Args:
        logits (torch.Tensor): Scores with the vocabulary on the last dimension.
        actions (torch.Tensor): Token ids with one entry per logits position.
        advantages (torch.Tensor): Advantage estimates; moved to the logits' device.

    Returns:
        torch.Tensor: Scalar loss. With no advantages it is an exact zero that
        stays connected to the graph, instead of the NaN a mean over an empty
        tensor would give.
    """
    actions = actions.to(logits.device)
    advantages = advantages.to(logits.device)

    # reshape (not view) so non-contiguous slices of logits/actions are accepted.
    nll = torch.nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)), actions.reshape(-1)
    )
    if advantages.numel() == 0:
        return nll * 0.0
    return (nll * advantages).mean()

# 8. Training loop with gradient clipping
# One prompt per step: generate a continuation, score it by executing the SQL,
# then take one clipped optimizer step on (policy loss + language-model loss).
# NOTE(review): `value_loss` below is the language-model loss returned for
# `labels=samples_tensor`, not a regression loss on the value head — confirm that is intended.
for episode in tqdm(range(ppo_config.total_episodes), desc="Episodes"):
    for batch in tqdm(train_dataset, desc="Batches", leave=False):
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)

        # Preprocessing padded every prompt to a fixed length. Drop the padding
        # so generation continues from the last real prompt token, not from
        # padding, then add the batch dimension.
        real_tokens = attention_mask.bool()
        input_ids = input_ids[real_tokens].unsqueeze(0)
        attention_mask = attention_mask[real_tokens].unsqueeze(0)

        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,  # Increased max_new_tokens
            pad_token_id=tokenizer.eos_token_id
        )

        # Remove the first token from each sample
        samples = [sample[1:] for sample in generated]

        # --- DEBUGGING (Optional) ---
        print("First token ID:", samples[0][0].item())
        for sample in samples:
            token_ids = sample.squeeze().tolist()
            print("Generated output:", tokenizer.decode(token_ids, skip_special_tokens=True))
        # --- END DEBUGGING ---

        rewards = reward_fn(samples)

        # --- Pad the samples to a common length and stack them ---
        sample_lengths = [len(sample) for sample in samples]
        max_length = max(sample_lengths)
        padded_samples = [
            torch.cat([sample, torch.full((max_length - len(sample),), tokenizer.pad_token_id, dtype=torch.long, device=sample.device)])
            for sample in samples
        ]

        # stack (not cat) keeps the batch dimension: shape (batch, max_length).
        # Concatenating the 1-D samples produced a 1-D tensor, which the model
        # rejected with "not enough values to unpack".
        samples_tensor = torch.stack(padded_samples, dim=0).to(device)

        # 2-D attention mask (batch, max_length): 1 for real tokens, 0 for padding.
        attention_mask = (
            torch.arange(max_length, device=device).unsqueeze(0)
            < torch.tensor(sample_lengths, device=device).unsqueeze(1)
        ).long()
        # --- End of padding ---

        outputs = model(
            input_ids=samples_tensor,
            attention_mask=attention_mask,
            labels=samples_tensor,
            output_hidden_states=True
        )

        hidden_states = outputs.hidden_states[-1]

        # Get the value from the simplified value head
        values = model.v_head(hidden_states)

        value_loss = outputs.loss
        advantages = calculate_advantages(rewards, values)

        # Logits at position t score the token at position t + 1, so drop the
        # last logit and the first token before computing the policy loss.
        policy_loss = calculate_policy_loss(
            outputs.logits[:, :-1, :].contiguous(),
            samples_tensor[:, 1:].contiguous(),
            advantages,
        )

        optimizer.zero_grad()
        total_loss = policy_loss + value_loss
        total_loss.backward()

        # --- Gradient clipping ---
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

# 9. Save the model
# NOTE(review): the repo id below is a placeholder; set it to a real namespace before running.
# Runs right after the training loop in the same cell, so it is reached only if training completes.
model.push_to_hub(
    "your-username/mistral-7b-ppo-sql",  # Replace with your Hugging Face username
    commit_message="Upload PPO SQL model"
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Episodes:   0%|          | 0/10 [00:00<?, ?it/s]
Batches:   0%|          | 0/100 [00:00<?, ?it/s][A
Episodes:   0%|          | 0/10 [00:02<?, ?it/s]

First token ID: 774
Generated output: ### Instruction: Translate the following natural language query into SQL. The database schema is given in the context. Do not generate any '#' characters.

### Context: CREATE TABLE head (age INTEGER)

### Query: How many heads of the departments are older than 56 ?

```sql
  SELECT COUNT(*)
  FROM head
  WHERE age > 56
```
Error executing query: ### Instruction: Translate the following natural language query into SQL. The database schema is given in the context. Do not generate any '#' characters.

### Context: CREATE TABLE head (age INTEGER)

### Query: How many heads of the departments are older than 56 ?

```sql
  SELECT COUNT(*)
  FROM head
  WHERE age > 56
```
Error: unrecognized token: "#"





ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
# 9. Save the model
# NOTE(review): same upload as the last statement of the training cell above;
# running both pushes twice. The repo id is still a placeholder.
model.push_to_hub(
    "your-username/mistral-7b-ppo-sql",  # Replace with your Hugging Face username
    commit_message="Upload PPO SQL model"
)