In [None]:
import os
import threading
import time
import psutil
import GPUtil
import numpy as np
import json
import gymnasium as gym
import torch
import torch.nn.functional as F
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, CallbackList

# ------------------------------
# Global Variables for Monitoring
# ------------------------------
# Sample buffers filled by monitor_usage(): one CPU and one GPU reading per loop pass.
cpu_usages, gpu_usages = [], []
# Loop flag polled by monitor_usage(); set to False to let the sampling loop exit.
monitoring = True

def monitor_usage(interval=1):
    """Sample CPU and GPU utilisation until the global ``monitoring`` flag is cleared.

    Each pass appends one CPU percentage to ``cpu_usages`` and one GPU
    percentage to ``gpu_usages``. The CPU reading is taken with
    ``psutil.cpu_percent(interval=interval)``, so ``interval`` also sets the
    sampling period. The GPU reading is the load of the first device reported
    by ``GPUtil`` scaled to percent, or 0 when no device is listed or the
    query raises.
    """
    global monitoring
    while True:
        if not monitoring:
            break
        cpu_usages.append(psutil.cpu_percent(interval=interval))
        # Best-effort GPU reading: any failure is recorded as 0% rather than
        # stopping the monitor thread.
        gpu_load = 0
        try:
            devices = GPUtil.getGPUs()
            if devices:
                gpu_load = devices[0].load * 100
        except Exception:
            gpu_load = 0
        gpu_usages.append(gpu_load)

# ------------------------------
# Domain Randomization Wrapper
# ------------------------------
class DomainRandomizationWrapper(gym.Wrapper):
    """
    Wraps an environment to randomize certain domain parameters at every reset.
    For Walker2D (a MuJoCo environment), this example randomizes:
      - Gravity (along the z-axis)
      - Friction for the first geometry (as an example)
      - A scaling factor for all body masses

    The MuJoCo model is looked up on the unwrapped environment, either as
    ``sim.model`` (legacy mujoco-py style) or as ``model`` (environments built
    on the official ``mujoco`` bindings). If neither is found the wrapper
    leaves the dynamics untouched and only forwards ``reset``.

    Args:
        env: Environment to wrap.
        gravity_range: ``(low, high)`` bounds for the z component of gravity.
        friction_range: ``(low, high)`` bounds for the first friction
            coefficient of geom 0.
        mass_range: ``(low, high)`` bounds for the factor applied to the
            nominal body masses.
    """
    def __init__(self, env,
                 gravity_range=(-10, -9),
                 friction_range=(0.5, 1.5),
                 mass_range=(0.8, 1.2)):
        super().__init__(env)
        self.gravity_range = gravity_range
        self.friction_range = friction_range
        self.mass_range = mass_range
        # Body masses as found on the first randomized reset. Scaling always
        # starts from these so the factor does not compound across resets.
        self._nominal_body_mass = None

    def _get_mujoco_model(self):
        """Return the MuJoCo model behind the wrapped env, or None if there is none."""
        # Look past intermediate wrappers (e.g. those added by gym.make);
        # they do not necessarily forward simulator attributes.
        base_env = getattr(self.env, "unwrapped", self.env)
        sim = getattr(base_env, "sim", None)
        if sim is not None and hasattr(sim, "model"):
            return sim.model
        model = getattr(base_env, "model", None)
        if model is not None and hasattr(model, "body_mass"):
            return model
        return None

    def reset(self, **kwargs):
        """Randomize gravity, friction and masses, then reset the wrapped env.

        Returns:
            The ``(obs, info)`` pair produced by the wrapped environment.
        """
        model = self._get_mujoco_model()
        if model is not None:
            # Randomize gravity (usually gravity is along the z axis)
            model.opt.gravity[2] = np.random.uniform(*self.gravity_range)

            # Randomize friction on the first geom as an example.
            # Friction for a geom is a 3-vector; the last two entries are
            # fixed constants, only the first one is sampled.
            new_friction = np.random.uniform(*self.friction_range)
            model.geom_friction[0] = np.array([new_friction, 0.005, 0.0001])

            # Randomize the mass scaling for all bodies, always relative to
            # the nominal masses captured on the first reset.
            if self._nominal_body_mass is None:
                self._nominal_body_mass = np.array(model.body_mass, dtype=float, copy=True)
            mass_scale = np.random.uniform(*self.mass_range)
            # NOTE(review): body inertias are not rescaled together with the
            # masses (the original did not either) - confirm this is intended.
            model.body_mass[:] = self._nominal_body_mass * mass_scale

        # Reset the underlying environment
        obs, info = self.env.reset(**kwargs)
        return obs, info

# ------------------------------
# Evaluation Metrics Callback for PPO Training
# ------------------------------
class EvaluationMetricsCallback(BaseCallback):
    """Periodically roll out the current policy and record reward and timing.

    Every time the training timestep counter crosses a multiple of
    ``eval_freq``, the callback runs ``n_eval_episodes`` deterministic
    episodes on ``eval_env`` and appends the mean episode reward, the current
    timestep count and the wall-clock time since training start to
    ``eval_rewards``, ``eval_timesteps`` and ``eval_wall_times``.

    Args:
        eval_env: Environment used for the evaluation rollouts. Its ``reset``
            must return ``(obs, info)`` and its ``step`` a 5-tuple ending in
            ``terminated, truncated, info``.
        eval_freq: Number of training timesteps between evaluations.
        n_eval_episodes: Episodes averaged per evaluation.
        verbose: When greater than 0, print one summary line per evaluation.
    """

    def __init__(self, eval_env, eval_freq: int, n_eval_episodes: int = 5, verbose: int = 1):
        super().__init__(verbose)
        self.eval_env = eval_env
        self.eval_freq = eval_freq
        self.n_eval_episodes = n_eval_episodes
        self.eval_rewards = []   # List to store average reward at evaluation points
        self.eval_timesteps = [] # Timesteps corresponding to each evaluation
        self.eval_wall_times = []# Elapsed wall-clock time at evaluation
        self.start_time = None
        # Index of the last eval_freq multiple that has been accounted for.
        self._last_eval_bucket = 0

    def _on_training_start(self):
        self.start_time = time.time()
        # Start counting from the current timestep so resumed training does
        # not trigger an immediate evaluation.
        self._last_eval_bucket = self.num_timesteps // self.eval_freq

    def _run_episode(self) -> float:
        """Run one deterministic episode on eval_env and return its total reward."""
        obs, info = self.eval_env.reset()
        ep_reward = 0.0
        finished = False
        while not finished:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = self.eval_env.step(action)
            finished = terminated or truncated
            # Accumulate as a built-in float so the stored metrics stay JSON
            # serializable whatever numeric type the env returns.
            ep_reward += float(reward)
        return ep_reward

    def _on_step(self) -> bool:
        # Evaluate when a multiple of eval_freq has been crossed since the
        # last check. With one timestep per call this matches an exact
        # "num_timesteps % eval_freq == 0" test, but it also fires when the
        # counter advances by more than one per call and jumps over a multiple.
        bucket = self.num_timesteps // self.eval_freq
        if bucket <= self._last_eval_bucket:
            return True
        self._last_eval_bucket = bucket

        rewards = [self._run_episode() for _ in range(self.n_eval_episodes)]
        avg_reward = float(np.mean(rewards))
        self.eval_rewards.append(avg_reward)
        self.eval_timesteps.append(self.num_timesteps)
        elapsed = time.time() - self.start_time
        self.eval_wall_times.append(elapsed)
        if self.verbose > 0:
            print(f"Timesteps: {self.num_timesteps}, Avg Reward: {avg_reward:.2f}, Elapsed: {elapsed:.2f} sec")
        return True

# ------------------------------
# Create and Wrap Walker2D Environment with Domain Randomization
# ------------------------------
# Note: You may need to install and configure Mujoco and gymnasium's mujoco environments.
env_id = "Walker2d-v5"  # or the appropriate id for your Walker2D environment
randomization_kwargs = dict(
    gravity_range=(-10, -9),    # randomized gravity
    friction_range=(0.5, 1.5),  # randomized friction coefficient
    mass_range=(0.8, 1.2),      # randomized mass scaling
)
base_env = gym.make(env_id)
# Wrap with domain randomization to improve generalization
env = DomainRandomizationWrapper(base_env, **randomization_kwargs)

# Evaluation runs on its own environment instance. Reusing the training env
# would reset and step it in the middle of PPO's rollout collection, leaving
# the trainer with observations that no longer match the simulator state.
eval_env = DomainRandomizationWrapper(gym.make(env_id), **randomization_kwargs)

# ------------------------------
# Initialize PPO Model for Walker2D with Domain Randomization
# ------------------------------
model = PPO("MlpPolicy", env, verbose=1)

# ------------------------------
# Set Up Callbacks for Training
# ------------------------------
# Here, as an example, we only use the evaluation callback.
eval_callback = EvaluationMetricsCallback(eval_env, eval_freq=5000, n_eval_episodes=5, verbose=1)
callback = CallbackList([eval_callback])

# ------------------------------
# Start Monitoring CPU/GPU Usage and Train the Model
# ------------------------------
monitor_thread = threading.Thread(target=monitor_usage, args=(1,), daemon=True)
monitor_thread.start()

total_timesteps = 3e6  # Total training timesteps (adjust based on your available resources)
model_file = "walker2d_domain_randomized_v3.zip"
start_time = time.time()
try:
    model.learn(total_timesteps=int(total_timesteps), callback=callback)
    model.save(model_file)
    end_time = time.time()
finally:
    # Stop the sampler and release both simulators even if training fails,
    # so an interrupted run does not leave the monitor thread running.
    monitoring = False
    monitor_thread.join()
    env.close()
    eval_env.close()
training_time_sec = end_time - start_time

# ------------------------------
# Compute Energy Metrics and Print Summary
# ------------------------------
# Fall back to 50% utilisation when the monitor collected no samples.
avg_cpu = np.mean(cpu_usages) if cpu_usages else 50.0
avg_gpu = np.mean(gpu_usages) if gpu_usages else 50.0

# Hardware assumptions: i7-10700K ~125W, RTX 2080 Super ~250W.
power_cpu_max = 125  # Watts
power_gpu_max = 250  # Watts
# Power draw is approximated as utilisation times the assumed maximum power.
cpu_power = (avg_cpu / 100) * power_cpu_max
gpu_power = (avg_gpu / 100) * power_gpu_max
total_power = cpu_power + gpu_power  # in Watts
training_hours = training_time_sec / 3600
energy_kwh = (total_power * training_hours) / 1000  # in kWh

final_reward = eval_callback.eval_rewards[-1] if eval_callback.eval_rewards else None

metrics = {
    "final_reward": final_reward,
    "training_time_sec": training_time_sec,
    "avg_cpu": avg_cpu,
    "avg_gpu": avg_gpu,
    "energy_kwh": energy_kwh,
    "eval_timesteps": eval_callback.eval_timesteps,
    "eval_rewards": eval_callback.eval_rewards,
    "eval_wall_times": eval_callback.eval_wall_times
}

# Report the file name that was actually written by model.save above.
print(f"Training complete, and model saved as '{model_file}'.")
print("=== Metrics Summary ===")
print(f"Final Reward: {metrics['final_reward']}")
print(f"Training Time: {training_time_sec:.2f} sec ({training_hours:.2f} hours)")
print(f"Average CPU Usage: {avg_cpu:.2f}% -> CPU Power: {cpu_power:.2f} W")
print(f"Average GPU Usage: {avg_gpu:.2f}% -> GPU Power: {gpu_power:.2f} W")
print(f"Estimated Energy Consumption: {energy_kwh:.3f} kWh")

# ------------------------------
# Save Metrics to a File
# ------------------------------
metrics_file = "training_metrics_walker2d_domain_randomized_v3.json"
with open(metrics_file, "w") as f:
    json.dump(metrics, f, indent=4)
print(f"Metrics saved to {metrics_file}")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 18.9     |
|    ep_rew_mean     | -0.965   |
| time/              |          |
|    fps             | 1216     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.5        |
|    ep_rew_mean          | 3.38        |
| time/                   |             |
|    fps                  | 822         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020834424 |
|    clip_fraction        | 0.264       |
|    clip_range           | 0.2         |
|    entropy_loss   