In [1]:
import os
import threading
import time
import psutil
import GPUtil
import numpy as np
import json
import gymnasium as gym
import torch
import torch.nn.functional as F
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, CallbackList


In [2]:

# ------------------------------
# Shared state for the background resource monitor
# ------------------------------
monitoring = True   # loop flag read by monitor_usage(); set to False to stop sampling
cpu_usages = []     # one CPU utilisation sample (percent) per loop iteration
gpu_usages = []     # one GPU utilisation sample (percent) per loop iteration


def monitor_usage(interval=1):
    """Sample CPU and GPU utilisation until the module-level ``monitoring`` flag is cleared.

    Every loop iteration appends one value to ``cpu_usages`` and one value to
    ``gpu_usages``. The pacing of the loop comes from ``psutil.cpu_percent``,
    which is handed ``interval`` as its measurement window.

    Args:
        interval: Value forwarded to ``psutil.cpu_percent(interval=...)``.
    """
    global monitoring
    while monitoring:
        cpu_usages.append(psutil.cpu_percent(interval=interval))

        # Best-effort GPU reading: 0 is recorded when no GPU is reported or
        # when querying it fails for any reason.
        gpu_load = 0
        try:
            detected = GPUtil.getGPUs()
            if detected:
                # GPUtil's ``load`` is scaled by 100 to store a percentage.
                gpu_load = detected[0].load * 100
        except Exception:
            pass  # gpu_load is still 0 here
        gpu_usages.append(gpu_load)

# ------------------------------
# Evaluation Metrics Callback for PPO Training
# ------------------------------
class EvaluationMetricsCallback(BaseCallback):
    """Record the policy's average evaluation return at regular timestep intervals.

    Whenever training crosses a multiple of ``eval_freq`` timesteps, the
    current model is run deterministically for ``n_eval_episodes`` episodes on
    ``eval_env``. The mean episode return, the timestep count and the seconds
    elapsed since training started are appended to ``eval_rewards``,
    ``eval_timesteps`` and ``eval_wall_times`` respectively.

    Args:
        eval_env: Environment exposing the Gymnasium API used below
            (``reset()`` -> ``(obs, info)``, ``step()`` -> 5-tuple). It is
            reset and stepped by this callback, so it should be a separate
            instance from the environment the model is training on; otherwise
            each evaluation disturbs the episode being collected for training.
        eval_freq: Number of training timesteps between evaluations (> 0).
        n_eval_episodes: Episodes averaged per evaluation (> 0).
        verbose: When >= 1, a one-line summary is printed per evaluation.

    Raises:
        ValueError: If ``eval_freq`` or ``n_eval_episodes`` is not positive.
    """

    def __init__(self, eval_env, eval_freq: int, n_eval_episodes: int = 5, verbose: int = 1):
        super().__init__(verbose)
        if eval_freq <= 0:
            raise ValueError(f"eval_freq must be a positive integer, got {eval_freq!r}")
        if n_eval_episodes <= 0:
            raise ValueError(f"n_eval_episodes must be a positive integer, got {n_eval_episodes!r}")
        self.eval_env = eval_env
        self.eval_freq = eval_freq
        self.n_eval_episodes = n_eval_episodes
        self.eval_rewards = []    # Average reward at each evaluation point
        self.eval_timesteps = []  # Timesteps corresponding to each evaluation
        self.eval_wall_times = [] # Elapsed wall-clock seconds at each evaluation
        self.start_time = None
        # Timestep count at the most recent evaluation (or at training start).
        self._last_eval_timestep = 0

    def _on_training_start(self):
        """Start the wall clock and anchor the evaluation schedule."""
        self.start_time = time.time()
        self._last_eval_timestep = self.num_timesteps

    def _run_episode(self) -> float:
        """Play one deterministic episode on ``eval_env`` and return its summed reward."""
        obs, _info = self.eval_env.reset()
        episode_return = 0.0
        terminated = truncated = False
        while not (terminated or truncated):
            action, _state = self.model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _info = self.eval_env.step(action)
            episode_return += float(reward)
        return episode_return

    def _on_step(self) -> bool:
        """Evaluate the model if an ``eval_freq`` boundary was crossed; never stops training."""
        # Compare "how many eval_freq multiples have been passed" instead of
        # testing ``num_timesteps % eval_freq == 0``: the counter can advance
        # by more than one per call (several parallel envs), in which case an
        # exact multiple may never be observed and evaluations would be skipped.
        # For a counter that advances by one this triggers on the same steps.
        if self.num_timesteps // self.eval_freq <= self._last_eval_timestep // self.eval_freq:
            return True
        self._last_eval_timestep = self.num_timesteps

        if self.start_time is None:
            # _on_training_start was not invoked; measure from the first evaluation.
            self.start_time = time.time()

        episode_returns = [self._run_episode() for _ in range(self.n_eval_episodes)]
        # Plain Python float so the stored history can be passed to json.dump
        # regardless of the reward dtype the environment produces.
        avg_reward = float(np.mean(episode_returns))
        elapsed = time.time() - self.start_time

        self.eval_rewards.append(avg_reward)
        self.eval_timesteps.append(self.num_timesteps)
        self.eval_wall_times.append(elapsed)
        if self.verbose >= 1:
            print(f"Timesteps: {self.num_timesteps}, Avg Reward: {avg_reward:.2f}, Elapsed: {elapsed:.2f} sec")
        return True

# ------------------------------
# Step 1: Create HumanoidStandup-v5 Environments and Initialize Model (Train from Scratch)
# ------------------------------
env_id = "HumanoidStandup-v5"
model_file = "standup_from_scratch10M.zip"
metrics_file = "standup_from_scratch10M.json"

env = gym.make(env_id)
# Evaluation gets its own environment instance. PPO trains on `env` (it wraps
# that same object internally), so letting the callback reset/step `env`
# would interrupt the episode PPO is in the middle of collecting.
eval_env = gym.make(env_id)
model = PPO("MlpPolicy", env, verbose=1)

# ------------------------------
# Step 2: Set Up Callback for Training
# ------------------------------
# For a baseline training from scratch, we only use the evaluation callback.
eval_callback = EvaluationMetricsCallback(eval_env, eval_freq=5000, n_eval_episodes=5, verbose=1)
callback = CallbackList([eval_callback])

# ------------------------------
# Step 3: Start Monitoring CPU/GPU Usage and Train the Model
# ------------------------------
monitor_thread = threading.Thread(target=monitor_usage, args=(1,), daemon=True)
monitor_thread.start()

total_timesteps = 10_000_000  # Total training timesteps
start_time = time.time()
try:
    model.learn(total_timesteps=total_timesteps, callback=callback)
    model.save(model_file)
    end_time = time.time()
finally:
    # Always stop the sampler thread and release the environments, even when
    # training is interrupted or fails.
    monitoring = False
    monitor_thread.join()
    env.close()
    eval_env.close()
training_time_sec = end_time - start_time

# ------------------------------
# Step 4: Compute Energy Metrics and Print Summary
# ------------------------------
# 50% is an assumed placeholder used only when the monitor collected no samples.
avg_cpu = float(np.mean(cpu_usages)) if cpu_usages else 50.0
avg_gpu = float(np.mean(gpu_usages)) if gpu_usages else 50.0

# Hardware assumptions: i7-10700K max ~125W, RTX 2080 Super max ~250W.
# Power draw is modelled as (average utilisation) x (assumed maximum power).
power_cpu_max = 125  # Watts
power_gpu_max = 250  # Watts
cpu_power = (avg_cpu / 100) * power_cpu_max
gpu_power = (avg_gpu / 100) * power_gpu_max
total_power = cpu_power + gpu_power  # in Watts
training_hours = training_time_sec / 3600
energy_kwh = (total_power * training_hours) / 1000  # in kWh

# Use the last evaluation reward as the final reward.
final_reward = eval_callback.eval_rewards[-1] if eval_callback.eval_rewards else None

metrics = {
    "final_reward": final_reward,
    "training_time_sec": training_time_sec,
    "avg_cpu": avg_cpu,
    "avg_gpu": avg_gpu,
    "energy_kwh": energy_kwh,
    "eval_timesteps": eval_callback.eval_timesteps,
    "eval_rewards": eval_callback.eval_rewards,
    "eval_wall_times": eval_callback.eval_wall_times
}

# Report the file name that was actually written above.
print(f"Training complete, and model saved as '{model_file}'.")
print("=== Metrics Summary ===")
print(f"Final Reward: {metrics['final_reward']}")
print(f"Training Time: {training_time_sec:.2f} sec ({training_hours:.2f} hours)")
print(f"Average CPU Usage: {avg_cpu:.2f}% -> CPU Power: {cpu_power:.2f} W")
print(f"Average GPU Usage: {avg_gpu:.2f}% -> GPU Power: {gpu_power:.2f} W")
print(f"Estimated Energy Consumption: {energy_kwh:.3f} kWh")

# ------------------------------
# Step 5: Save Metrics to a File
# ------------------------------
with open(metrics_file, "w") as f:
    json.dump(metrics, f, indent=4)
print(f"Metrics saved to {metrics_file}")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 3.83e+04 |
| time/              |          |
|    fps             | 826      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 3.78e+04    |
| time/                   |             |
|    fps                  | 624         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011790035 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.2         |
|    entropy_loss   

In [6]:
# ------------------------------
# Step 6: Visual Evaluation
# ------------------------------
def evaluate_and_render(model_path, n_episodes=5, env_id="HumanoidStandup-v5", fps=60):
    """Load a saved PPO model and watch it act in a rendered environment.

    Runs ``n_episodes`` deterministic episodes and prints each episode's total
    reward. Nothing is returned.

    Args:
        model_path: Path handed to ``PPO.load``.
        n_episodes: Number of episodes to play.
        env_id: Gymnasium environment id to create with ``render_mode="human"``.
        fps: Approximate upper bound on steps per second, enforced by sleeping
            ``1 / fps`` seconds after each step. A value <= 0 disables the sleep.
    """
    frame_delay = 1.0 / fps if fps and fps > 0 else 0.0

    # Load the model first so a bad path fails before a render window is opened.
    model = PPO.load(model_path)
    env = gym.make(env_id, render_mode="human")
    try:
        for episode in range(n_episodes):
            obs, info = env.reset()
            terminated = truncated = False
            total_reward = 0.0
            while not (terminated or truncated):
                action, _states = model.predict(obs, deterministic=True)
                # With render_mode="human" the frame is drawn as part of
                # reset()/step(), so no explicit env.render() call is needed.
                obs, reward, terminated, truncated, info = env.step(action)
                total_reward += reward
                if frame_delay:
                    time.sleep(frame_delay)  # slow down for better viewing

            print(f"Episode {episode+1}: Total Reward = {total_reward:.2f}")
    finally:
        # Release the window / rendering context even if the loop is interrupted.
        env.close()

# Run the visual evaluation
# NOTE(review): this loads "standup_finetuned10M", whereas the training cell in
# this notebook saves "standup_from_scratch10M.zip" - confirm this is the model
# you intend to inspect.
evaluate_and_render(model_path="standup_finetuned10M", n_episodes=5)


Episode 1: Total Reward = 112117.72
Episode 2: Total Reward = 84524.53
Episode 3: Total Reward = 99774.42


C:\Users\Rocas\AppData\Local\anaconda3\envs\tfg\Lib\site-packages\glfw\__init__.py:917: GLFWError: (65537) b'The GLFW library is not initialized'


Episode 4: Total Reward = 97274.40
Episode 5: Total Reward = 91151.37
