<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/grpo_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install condacolab
!pip install -q condacolab
import condacolab
condacolab.install()
# Restart runtime here

In [1]:
!conda --version

conda 24.11.2


In [2]:
!conda env list


# conda environments:
#
base                   /usr/local



In [3]:
!nvidia-smi

Mon Jan 27 16:51:25 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0              47W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Update Conda (optional but recommended)
!conda update -n base -c defaults conda

# Create and activate conda environment
!conda create -n openr1 python=3.11 -y
!conda activate openr1

# Clone the Open-R1 repository:
!git clone https://github.com/huggingface/open-r1.git

# Change to project directory
%cd /content/open-r1

# Install necessary packages
!pip install -e ".[dev]"
!pip install vllm==0.6.6.post1 -q
!pip install vllm==0.6.6.post1 --extra-index-url https://download.pytorch.org/whl/cu121 -q


# Unset WANDB_DISABLED if it exists
import os
if 'WANDB_DISABLED' in os.environ:
    del os.environ['WANDB_DISABLED']


Channels:
 - defaults
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - done


    current version: 24.11.2
    latest version: 25.1.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - conda


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.12.31 |       h06a4308_0         128 KB
    certifi-2024.12.14         |  py311h06a4308_0         161 KB
    conda-24.11.3              |  py311h06a4308_0         1.2 MB
    ------------------------------------------------------------
                                           Total:         1.5 MB

The following packages will be UP

In [None]:
!accelerate launch --config_file /content/open-r1/configs/zero3.yaml /content/open-r1/src/open_r1/grpo.py \
    --output_dir DeepSeek-R1-Distill-Qwen-7B-GRPO \
    --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
    --dataset_name AI-MO/NuminaMath-TIR \
    --max_prompt_length 256 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --logging_steps 10 \
    --bf16

In [None]:
!pip install bitsandbytes -q
!pip install datasets -q
!pip install transformers -q
!pip install torch -q
!pip install accelerate -q
!pip install tqdm -q

In [2]:
# Unset WANDB_DISABLED if it exists
import os
if 'WANDB_DISABLED' in os.environ:
    del os.environ['WANDB_DISABLED']

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb
import os
from accelerate import Accelerator  # Import Accelerator
from tqdm import tqdm  # Import tqdm

# Parameters
output_dir = "DeepSeek-R1-Distill-Qwen-7B-GRPO"
max_prompt_length = 256
per_device_train_batch_size = 1  # Not explicitly used in this simplified example
gradient_accumulation_steps = 16  # Not explicitly used in this simplified example
logging_steps = 10

# Function to run training within a subprocess (or directly)
def train_func(args):
    # Initialize Accelerator
    accelerator = Accelerator(mixed_precision="bf16", device_placement=False, split_batches=False)

    # 1. Load the Dataset
    dataset = load_dataset("AI-MO/NuminaMath-TIR")

    # 2. Load the DeepSeek-R1-Distill-Qwen-7B model with 4-bit quantization
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")

    quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading
    )

    # Load the model without specifying a device_map
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        quantization_config=quantization_config,
    )

    # Explicitly move the entire model to GPU 0
    model.to(accelerator.device)



    # Main Execution
    env = SimpleEnvironment(dataset, tokenizer, model, accelerator)  # Pass accelerator here
    input_dim = model.config.hidden_size
    output_dim = 10  # Assume 10 possible actions (adapt to your task)


    # Create models, optimizer
    policy_network = PolicyNetwork(input_dim, output_dim, accelerator.device)  # Pass device
    value_network = ValueNetwork(input_dim, accelerator.device)  # Pass device
    optimizer = optim.Adam(policy_network.parameters(), lr=0.01)


    # Prepare with accelerate (moves models and optimizer to device)
    policy_network, value_network, optimizer, env = accelerator.prepare(
       policy_network, value_network, optimizer, env
   )



    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Pass the model to GRPOTrainer
    trainer = GRPOTrainer(env, policy_network, value_network, optimizer, learning_rate=0.01, accelerator=accelerator, tokenizer=tokenizer, model=model)

    trainer.train(num_epochs=1, num_trajectories_per_epoch=10)

    # Save the model using the standard PyTorch save method
    torch.save(accelerator.unwrap_model(policy_network).state_dict(), os.path.join(output_dir, 'policy_network.pth'))

# 3. Define Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, device):  # Add device argument
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        self.device = device  # Store device as attribute

    def forward(self, x):
        # Change to cast the input to float32 BEFORE it's passed to the linear layer
        x = x.to(self.device).type(torch.float32)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return torch.softmax(x, dim=-1)


# 4. Define Value Network
class ValueNetwork(nn.Module):
    def __init__(self, input_dim, device):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 1)
        self.device = device  # Store the device


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# 5. Environment (simplified example - assumes 'text' and 'label' fields in dataset)
class SimpleEnvironment:
    def __init__(self, dataset, tokenizer, model, accelerator):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.model = model
        self.accelerator = accelerator  # Store accelerator
        self.current_index = 0
        self.episode_length = 20  # Set the desired episode length

    def reset(self):
        self.current_index = 0
        return self.get_state()

    def get_state(self):
        text_description = self.dataset["train"][self.current_index].get("text", "")
        text_description = text_description[:max_prompt_length]

        # Move tokenizer outputs AND position_ids to the correct device
        inputs = self.tokenizer(text_description, return_tensors="pt")
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # Move state_embedding to the correct device
        state_embedding = outputs.hidden_states[-1][:, 0, :].to(self.accelerator.device)

        return state_embedding.to(torch.float32)

    def step(self, action):
        #print("Entering env.step()")  # Print at the beginning
        target = self.dataset["train"][self.current_index].get("label", 0)  # Assume 0 if 'label' missing
        #print("Target obtained:", target)  # Print target value
        reward = 1.0 if action == target else -1.0
        #print("Reward calculated:", reward)  # Print reward value

        self.current_index += 1
        done = self.current_index >= self.episode_length  # Check if episode length is reached
        next_state = self.get_state() if not done else None

        #print("Exiting env.step()")  # Print before returning

        return next_state, reward, done

# 6. Trajectory Collection
def collect_trajectories(env, policy_network, num_trajectories, accelerator):
    trajectories = []
    for _ in range(num_trajectories):
        state = env.reset()
        trajectory = []
        while True:
            policy_network = policy_network.to(accelerator.device)
            state = state.to(accelerator.device)

            action_probs = policy_network(state)

            # Check for invalid probabilities (keep this part)
            if not torch.isfinite(action_probs).all() or not torch.allclose(action_probs.sum(), torch.tensor(1.0, device=action_probs.device)):
                action_probs = torch.ones_like(action_probs) / action_probs.shape[-1]

            action = torch.multinomial(action_probs, 1).item()
            next_state, reward, done = env.step(action)

            if next_state is not None:
                next_state = next_state.to(accelerator.device)

            trajectory.append((state, action, reward))

            if done:
                break
            state = next_state
        trajectories.append(trajectory)
    return trajectories




# 7. GRPO Update (simplified - needs customization)
def update_policy(policy_network, value_network, trajectories, optimizer, accelerator):  # Add accelerator
    # Accumulate policy loss across trajectories for a single backward pass
    policy_loss_total = 0

    # Debugging Prints: Input Information
    #print(f"Policy network device: {policy_network.device}")
    #print(f"Value network device: {value_network.device}")
    #print(f"Trajectories length: {len(trajectories)}")
    #print(f"Optimizer: {optimizer}")
    #print(f"Accelerator: {accelerator}")

    # Move both networks to the accelerator device before processing trajectories
    policy_network = policy_network.to(accelerator.device)
    value_network = value_network.to(accelerator.device)

    for trajectory in trajectories:
        for i in range(len(trajectory)):  # Iterate using index
            state, action, reward = trajectory[i]

            # 1. Estimate advantage (using a simple Monte Carlo estimate)
            # In a real GRPO implementation, you would likely use a more advanced advantage estimation method
            # like Generalized Advantage Estimation (GAE).
            returns = sum(r for _, _, r in trajectory[i:])  # Calculate returns from current index

            # Ensure state is on the correct device
            state = state.to(accelerator.device)

            # Calculate advantage (value_network is already on the correct device)
            advantage = returns - value_network(state).item()

            # Debugging Prints: Intermediate Values
            #print(f"State shape: {state.shape}, device: {state.device}")
            #print(f"Action: {action}")
            #print(f"Reward: {reward}")
            #print(f"Returns: {returns}")
            #print(f"Advantage: {advantage}")

            # 2. Calculate policy gradient (policy_network is already on the correct device)
            action_probs = policy_network(state)
            log_prob = torch.log(action_probs[0][action])  # Assuming action_probs is a batch of size 1
            policy_loss = -advantage * log_prob

            # Debugging Prints: Policy Loss
            #print(f"Action probabilities: {action_probs}")
            #print(f"Log probability: {log_prob}")
            #print(f"Policy loss: {policy_loss}")

            # Accumulate policy loss
            policy_loss_total += policy_loss

    # Debugging Prints: Total Policy Loss
    print(f"Total policy loss: {policy_loss_total}")

    # Return the total policy loss for the backward pass
    return policy_loss_total


# 8. GRPO Trainer
class GRPOTrainer:
    def __init__(self, env, policy_network, value_network, optimizer, learning_rate, accelerator, tokenizer, model):
        self.env = env
        self.policy_network = policy_network
        self.value_network = value_network
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.global_step = 0
        self.policy_loss = 0
        self.accelerator = accelerator
        self.tokenizer = tokenizer
        self.model = model

        # Prepare with accelerate
        self.policy_network, self.value_network, self.optimizer, self.env = accelerator.prepare(
            policy_network, value_network, optimizer, env
        )

        # Create evaluation dataset and dataloader
        eval_dataset = load_dataset("AI-MO/NuminaMath-TIR", split="test")
        self.eval_dataloader = torch.utils.data.DataLoader(
            eval_dataset, batch_size=1,  # Reduced batch size
            collate_fn=lambda examples: self.process_batch(examples)
        )

    def process_batch(self, examples):
        # Extract the relevant fields for tokenization, joining list elements if necessary
        texts = [
            example['problem'] +
            (example['solution'] if isinstance(example['solution'], str) else ' '.join(example['solution'])) +
            (example['messages'] if isinstance(example['messages'], str) else ' '.join(str(item) for item in example['messages']))
            for example in examples
        ]

        # Tokenize the extracted texts
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=self.model.config.max_position_embeddings)

        # Other necessary batching processes
        labels = [example.get('label', 0) for example in examples]

        # Correct the input labels to match input_ids shape
        inputs['labels'] = torch.tensor(labels).unsqueeze(1).repeat(1, inputs['input_ids'].shape[1]) # shape (batch_size, seq_len)
        inputs['labels'] = torch.where(inputs['attention_mask'] == 1, inputs['labels'], -100) # shape (batch_size, seq_len)

        return inputs

    def calculate_accuracy(self, predictions, ground_truth):
        # Calculate accuracy
        accuracy = np.mean(predictions == ground_truth)  # Removed argmax since predictions are already single values
        return accuracy

    def collect_trajectories(self, num_trajectories):
        return collect_trajectories(self.env, self.policy_network, num_trajectories, self.accelerator)

    def update_policy(self, trajectories):
        self.policy_loss = update_policy(self.policy_network, self.value_network, trajectories, self.optimizer, self.accelerator)

    def train(self, num_epochs, num_trajectories_per_epoch):
        total_trajectories = num_epochs * num_trajectories_per_epoch
        for epoch in range(num_epochs):
            for trajectory_index in tqdm(range(num_trajectories_per_epoch), desc=f"Epoch {epoch + 1}/{num_epochs}", total=total_trajectories, position=0, leave=True):
                # Collect trajectories and update policy within the trajectory loop
                trajectories = self.collect_trajectories(1)

                with self.accelerator.accumulate(self.policy_network):
                    self.policy_loss = update_policy(self.policy_network, self.value_network, trajectories, self.optimizer, self.accelerator)
                    self.accelerator.backward(self.policy_loss)
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    print(f"Policy Loss: {self.policy_loss.item()}")

            # Accuracy calculation and logging (POC) - performed after each epoch
            all_predictions = []
            all_ground_truth_labels = []

            for batch in self.eval_dataloader:
                with torch.no_grad():
                    inputs = {k: v.to(self.accelerator.device) for k, v in batch.items()}

                    # Call the model to get the hidden states (using the 4-bit quantized model)
                    # Remove output_hidden_states=True to reduce memory usage
                    outputs = self.model(**inputs, output_hidden_states=True)
                    states = outputs.hidden_states[-1][:, 0, :].to(torch.float32).to(self.accelerator.device)

                ground_truth_labels = batch['labels'].to(self.accelerator.device)  # Move labels to device

                # Ensure policy_network is on the correct device
                self.policy_network = self.policy_network.to(self.accelerator.device)

                # *** Iterate over each item in the batch ***
                for state, label_sequence in zip(states, ground_truth_labels):  # label_sequence now represents the sequence of labels
                    with torch.no_grad():
                        prediction = self.policy_network(state.unsqueeze(0))  # Add batch dimension

                    # Get prediction for the current item
                    batch_prediction = prediction.argmax(dim=-1).item()  # Get the predicted class as a single value

                    # Get the ground truth label for the current item (assuming the first label in the sequence is the target)
                    ground_truth_label = label_sequence[0].item()  # Get the first label in the sequence as a scalar

                    # Append prediction and label for the current item
                    all_predictions.append(batch_prediction)
                    all_ground_truth_labels.append(ground_truth_label)

            # Calculate accuracy outside the loop using accumulated predictions and labels
            accuracy = self.calculate_accuracy(np.array(all_predictions), np.array(all_ground_truth_labels))
            print(f"Epoch {epoch + 1}/{num_epochs}, Accuracy: {accuracy}")

# Entry point for your script
if __name__ == "__main__":
    # Create a namespace with the desired values
    import argparse
    args = argparse.Namespace(mixed_precision="bf16")  # Add other arguments as needed

    train_func(args)  # Call the training function

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1/1:   0%|          | 0/10 [00:00<?, ?it/s]

Starting trajectory collection...
Environment reset, state obtained.
Action probabilities calculated.
Action selected: 8
Entering env.step()
Target obtained: 0
Reward calculated: -1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 8
Entering env.step()
Target obtained: 0
Reward calculated: -1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 1
Entering env.step()
Target obtained: 0
Reward calculated: -1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 7
Entering env.step()
Target obtained: 0
Reward calculated: -1.0
Exiting env.step()
Environme

Epoch 1/1:  10%|█         | 1/10 [00:01<00:11,  1.28s/it]

Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 1
Entering env.step()
Target obtained: 0
Reward calculated: -1.0
Exiting env.step()
Environment step taken.
Policy network device: cuda
Value network device: cuda
Trajectories length: 1
Optimizer: AcceleratedOptimizer (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
Accelerator: <accelerate.accelerator.Accelerator object at 0x7e4134398b90>
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 8
Reward: -1.0
Returns: -14.0
Advantage: -14.244140625
Action probabilities: tensor([[0.1835, 0.1105, 0.1635, 0.0709, 0.0549, 0.0363, 0.0734, 0.0457, 0.1616,
         0.0998]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Log probability: -1.8227126598358154
Policy loss: -25.962974548339844
State shape: torch.Size([1, 3584]), device: cuda

Epoch 1/1:  20%|██        | 2/10 [00:02<00:10,  1.27s/it]

Policy Loss: -0.5608682632446289
Starting trajectory collection...
Environment reset, state obtained.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0

Epoch 1/1:  30%|███       | 3/10 [00:03<00:08,  1.26s/it]

Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Policy network device: cuda
Value network device: cuda
Trajectories length: 1
Optimizer: AcceleratedOptimizer (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
Accelerator: <accelerate.accelerator.Accelerator object at 0x7e4134398b90>
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 20.0
Advantage: 19.755859375
Action probabilities: tensor([[1.0000e+00, 2.8824e-13, 4.4202e-14, 1.0262e-10, 2.2897e-11, 1.2317e-42,
         1.2015e-13, 0.0000e+00, 0.0000e+00, 1.6587e-12]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Log probability: 0.0
Policy loss: -0.0
State shape: torch.Size([1, 3584]), 

Epoch 1/1:  40%|████      | 4/10 [00:05<00:07,  1.26s/it]

Policy Loss: 0.0
Starting trajectory collection...
Environment reset, state obtained.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.ste

Epoch 1/1:  50%|█████     | 5/10 [00:06<00:06,  1.28s/it]

Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Policy network device: cuda
Value network device: cuda
Trajectories length: 1
Optimizer: AcceleratedOptimizer (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
Accelerator: <accelerate.accelerator.Accelerator object at 0x7e4134398b90>
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 20.0
Advantage: 19.755859375
Action probabilities: tensor([[1.0000e+00, 3.3517e-23, 1.6675e-43, 5.7495e-19, 6.3871e-21, 0.0000e+00,
         3.5326e-24, 0.0000e+00, 0.0000e+00, 2.3497e-21]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Log probability: 0.0
Policy loss: -0.0
State shape: torch.Size([1, 3584]), 

Epoch 1/1:  60%|██████    | 6/10 [00:07<00:05,  1.28s/it]

Policy Loss: 0.0
Starting trajectory collection...
Environment reset, state obtained.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.ste

Epoch 1/1:  70%|███████   | 7/10 [00:08<00:03,  1.27s/it]

Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Policy network device: cuda
Value network device: cuda
Trajectories length: 1
Optimizer: AcceleratedOptimizer (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
Accelerator: <accelerate.accelerator.Accelerator object at 0x7e4134398b90>
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 20.0
Advantage: 19.755859375
Action probabilities: tensor([[1.0000e+00, 1.7817e-30, 0.0000e+00, 4.7809e-25, 7.1878e-28, 0.0000e+00,
         5.3802e-32, 0.0000e+00, 0.0000e+00, 4.3596e-28]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Log probability: 0.0
Policy loss: -0.0
State shape: torch.Size([1, 3584]), 

Epoch 1/1:  80%|████████  | 8/10 [00:10<00:02,  1.27s/it]

Policy Loss: 0.0
Starting trajectory collection...
Environment reset, state obtained.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.ste

Epoch 1/1:  90%|█████████ | 9/10 [00:11<00:01,  1.26s/it]

Exiting env.step()
Environment step taken.
Action probabilities calculated.
Action selected: 0
Entering env.step()
Target obtained: 0
Reward calculated: 1.0
Exiting env.step()
Environment step taken.
Policy network device: cuda
Value network device: cuda
Trajectories length: 1
Optimizer: AcceleratedOptimizer (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
Accelerator: <accelerate.accelerator.Accelerator object at 0x7e4134398b90>
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 20.0
Advantage: 19.755859375
Action probabilities: tensor([[1.0000e+00, 2.4426e-36, 0.0000e+00, 2.9375e-30, 9.8542e-34, 0.0000e+00,
         4.4738e-38, 0.0000e+00, 0.0000e+00, 9.8542e-34]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Log probability: 0.0
Policy loss: -0.0
State shape: torch.Size([1, 3584]), 

Epoch 1/1: 100%|██████████| 10/10 [00:12<00:00,  1.27s/it]

State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 5.0
Advantage: 4.755859375
Action probabilities: tensor([[1.0000e+00, 6.0546e-39, 0.0000e+00, 1.9793e-32, 2.4426e-36, 0.0000e+00,
         1.1089e-40, 0.0000e+00, 0.0000e+00, 6.6397e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Log probability: 0.0
Policy loss: -0.0
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 4.0
Advantage: 3.755859375
Action probabilities: tensor([[1.0000e+00, 6.0546e-39, 0.0000e+00, 1.9793e-32, 2.4426e-36, 0.0000e+00,
         1.1089e-40, 0.0000e+00, 0.0000e+00, 6.6397e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Log probability: 0.0
Policy loss: -0.0
State shape: torch.Size([1, 3584]), device: cuda:0
Action: 0
Reward: 1.0
Returns: 3.0
Advantage: 2.755859375
Action probabilities: tensor([[1.0000e+00, 6.0546e-39, 0.0000e+00, 1.9793e-32, 2.4426e-36, 0.0000e+00,
         1.1089e-40, 0.0000e+00, 0.0000e+00, 6.6397e-36]], devic




Epoch 1/1, Accuracy: 1.0


EVALUATION

In [None]:
#!export MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
#!export MODEL_ARGS="pretrained=$MODEL,dtype=float16,max_model_length=32768,gpu_memory_utilisation=0.8"
#!export TASK=aime24
#!export OUTPUT_DIR=data/evals/$MODEL

#!lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \
#    --custom-tasks src/open_r1/evaluate.py \
#    --use-chat-template \
#    --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \
#    --output-dir $OUTPUT_DIR

EVALUATION - python

In [19]:
accelerator = Accelerator(mixed_precision="bf16", device_placement=False, split_batches=False)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")

# Load the model with quantization config (as in training)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_enable_fp32_cpu_offload=True
)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    quantization_config=quantization_config
)
model.to(accelerator.device)

# Load the dataset
dataset = load_dataset("AI-MO/NuminaMath-TIR")

# Create the environment
env = SimpleEnvironment(dataset, tokenizer, model, accelerator)  # Pass 'model' here

# Define input and output dimensions for networks
input_dim = model.config.hidden_size  # Use 'model' here
output_dim = 10  # Or the appropriate number of actions for your environment

# Create policy and value networks (as before)
policy_network = PolicyNetwork(input_dim, output_dim, accelerator.device)
value_network = ValueNetwork(input_dim, accelerator.device)

# *** Load the fine-tuned policy network ***
policy_network.load_state_dict(torch.load("/content/DeepSeek-R1-Distill-Qwen-7B-GRPO/policy_network.pth"))

# Create optimizer (as before)
learning_rate = 0.01
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)

# Create the GRPOTrainer instance
trainer = GRPOTrainer(env, policy_network, value_network, optimizer, learning_rate, accelerator, tokenizer, model)  # Pass 'model' here

# Perform evaluation
trainer.evaluate()

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  policy_network.load_state_dict(torch.load("/content/DeepSeek-R1-Distill-Qwen-7B-GRPO/policy_network.pth"))


Evaluation Accuracy: 1.0
