In [2]:
import os
import threading
import time
import psutil
import GPUtil
import numpy as np
import json
import gymnasium as gym
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, CallbackList
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.utils import get_device

# ------------------------------
# Global Monitoring Variables
# ------------------------------
# Per-sample utilisation percentages appended by monitor_usage() and averaged
# in main() to build the energy estimate.
cpu_usages = []
gpu_usages = []
# Loop flag polled by monitor_usage(); main() sets it to False to stop the
# background sampling thread.
monitoring = True

def monitor_usage(interval=1):
    """Sample CPU and GPU utilisation until the ``monitoring`` flag is cleared.

    Each pass appends one CPU percentage to ``cpu_usages`` and one GPU
    percentage to ``gpu_usages``. ``psutil.cpu_percent`` is called with
    ``interval``, so that call sets the sampling period.

    Args:
        interval: Seconds passed to ``psutil.cpu_percent`` for each sample.
    """
    while monitoring:
        cpu_usages.append(psutil.cpu_percent(interval=interval))

        # GPU probing is best-effort: any failure is recorded as 0 % load.
        try:
            detected = GPUtil.getGPUs()
            if detected:
                # Only the first reported GPU is sampled.
                gpu_load = detected[0].load * 100
            else:
                gpu_load = 0
        except Exception:
            gpu_load = 0
        gpu_usages.append(gpu_load)

# ------------------------------
# Callback: Evaluation Metrics
# ------------------------------
class EvaluationMetricsCallback(BaseCallback):
    """Periodically roll out the current policy and record the mean return.

    Every time ``num_timesteps`` reaches the next multiple of ``eval_freq``
    the callback plays ``n_eval_episodes`` deterministic episodes on
    ``eval_env`` and stores the mean episode return, the timestep and the
    wall-clock time elapsed since training started.

    Attributes (all public, read by the caller after training):
        eval_rewards: Mean return of each evaluation round (plain floats).
        eval_timesteps: ``num_timesteps`` at which each round ran.
        eval_wall_times: Seconds since training start for each round.

    Args:
        eval_env: Environment whose ``reset()`` returns ``(obs, info)`` and
            whose ``step()`` returns a 5-tuple ending in
            ``terminated, truncated, info``.
        eval_freq: Number of timesteps between evaluation rounds; must be
            positive.
        n_eval_episodes: Episodes averaged per evaluation round.
        verbose: Verbosity level forwarded to ``BaseCallback``.

    Raises:
        ValueError: If ``eval_freq`` is not positive.
    """

    def __init__(self, eval_env, eval_freq: int, n_eval_episodes: int = 5, verbose: int = 1):
        super().__init__(verbose)
        if eval_freq <= 0:
            # Fail here instead of with a ZeroDivisionError on the first step.
            raise ValueError(f"eval_freq must be positive, got {eval_freq}")
        self.eval_env = eval_env
        self.eval_freq = eval_freq
        self.n_eval_episodes = n_eval_episodes
        self.eval_rewards = []
        self.eval_timesteps = []
        self.eval_wall_times = []
        self.start_time = None
        # Timestep at (or after) which the next evaluation round runs.
        self._next_eval_timestep = eval_freq

    def _on_training_start(self):
        """Start the wall clock and align the next evaluation to a multiple of eval_freq."""
        self.start_time = time.time()
        self._next_eval_timestep = (self.num_timesteps // self.eval_freq + 1) * self.eval_freq

    def _run_episode(self):
        """Play one deterministic episode on ``eval_env`` and return its total reward."""
        obs, _ = self.eval_env.reset()
        done = False
        total_r = 0.0
        while not done:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, r, terminated, truncated, _ = self.eval_env.step(action)
            done = terminated or truncated
            total_r += r
        return total_r

    def _on_step(self) -> bool:
        """Run an evaluation round when the next threshold has been reached.

        Always returns True, so this callback never stops training.
        """
        # Compare against a threshold rather than testing
        # ``num_timesteps % eval_freq == 0``: when ``num_timesteps`` advances
        # by more than one per call, exact multiples can be skipped and the
        # modulo test would silently miss evaluations.
        if self.num_timesteps < self._next_eval_timestep:
            return True

        if self.start_time is None:
            # _on_training_start was not invoked; start the clock now rather
            # than failing on ``time.time() - None`` below.
            self.start_time = time.time()

        rewards = [self._run_episode() for _ in range(self.n_eval_episodes)]
        # Stored as a built-in float so the history is a plain list of floats.
        avg_reward = float(np.mean(rewards))
        self.eval_timesteps.append(self.num_timesteps)
        self.eval_rewards.append(avg_reward)
        elapsed = time.time() - self.start_time
        self.eval_wall_times.append(elapsed)
        print(f"Timestep {self.num_timesteps}: Avg Reward = {avg_reward:.2f}, Elapsed = {elapsed:.1f}s")

        # Schedule the next round at the following multiple of eval_freq.
        self._next_eval_timestep = (self.num_timesteps // self.eval_freq + 1) * self.eval_freq
        return True

# ------------------------------
# Callback: Progressive Unfreeze
# ------------------------------
class ProgressiveUnfreezeCallback(BaseCallback):
    """Re-enable gradients on every policy parameter after a warm-up phase.

    Once ``num_timesteps`` reaches ``freeze_threshold * total_timesteps`` the
    callback sets ``requires_grad = True`` on all parameters of
    ``self.model.policy``. This happens at most once per callback instance.

    Args:
        total_timesteps: Timestep budget the threshold fraction is applied to.
        freeze_threshold: Fraction of ``total_timesteps`` after which the
            parameters are unfrozen.
        verbose: Verbosity level; a truthy value prints a message on unfreeze.
    """

    def __init__(self, total_timesteps, freeze_threshold=0.2, verbose=1):
        super().__init__(verbose)
        self.unfroze = False
        self.freeze_threshold = freeze_threshold
        self.total_timesteps = total_timesteps

    def _on_step(self) -> bool:
        """Unfreeze the policy once the threshold is reached; never stops training."""
        if self.unfroze:
            return True

        unfreeze_at = self.freeze_threshold * self.total_timesteps
        threshold_reached = self.num_timesteps >= unfreeze_at
        if not threshold_reached:
            return True

        for weight in self.model.policy.parameters():
            weight.requires_grad = True
        self.unfroze = True
        if self.verbose:
            print(f"Unfroze all policy layers at timestep {self.num_timesteps}")
        return True

# ------------------------------
# Training & Evaluation Utilities
# ------------------------------
def transfer_weights(student, teacher_path):
    """Copy shape-compatible policy tensors from a saved teacher into ``student``.

    The teacher is loaded with ``PPO.load``. Every entry of its policy
    state dict whose key also exists in the student's policy state dict with
    an identical shape overwrites the student's value; all other student
    entries are left as they were.

    Args:
        student: Model exposing ``policy.state_dict()`` and
            ``policy.load_state_dict()``; modified in place.
        teacher_path: Path handed to ``PPO.load``.
    """
    teacher_model = PPO.load(teacher_path, device=get_device("auto"))
    source_state = teacher_model.policy.state_dict()
    target_state = student.policy.state_dict()

    compatible = {}
    for key, tensor in source_state.items():
        # Skip entries the student lacks or whose shapes differ.
        if key in target_state and tensor.shape == target_state[key].shape:
            compatible[key] = tensor

    target_state.update(compatible)
    student.policy.load_state_dict(target_state)
    # The reported count is the number of state-dict entries copied.
    print(f"Transferred {len(compatible)} layers from teacher.")

# ------------------------------
# Main Execution
# ------------------------------
def main():
    """Train a PPO student on Humanoid-v5 from a saved teacher and log metrics.

    Steps:
      1. Check that the teacher checkpoint exists.
      2. Build the training env, a separate evaluation env and the student.
      3. Copy shape-compatible teacher weights and freeze ``mlp_extractor``.
      4. Start a background thread sampling CPU/GPU utilisation.
      5. Train with the unfreeze and evaluation callbacks, then save the model.
      6. Stop monitoring, estimate energy use, print a summary and write the
         metrics to a JSON file.

    Raises:
        FileNotFoundError: If the teacher checkpoint is missing.
    """
    # Declared up front; the flag is reset before and cleared after training.
    global monitoring

    env_id = "Humanoid-v5"
    total_timesteps = int(2e6)
    eval_freq = 50000
    n_eval_episodes = 5
    teacher_path = "walker2d_domain_randomized_v10000.zip"
    metrics_path = "humanoid_metrics2M.json"

    # Verify teacher exists
    if not os.path.exists(teacher_path):
        raise FileNotFoundError(f"Teacher model '{teacher_path}' not found.")

    # Create environments and student model. Evaluation gets its own env:
    # the callback resets and steps its env during training, which would
    # otherwise disturb the env PPO is collecting rollouts from.
    env = gym.make(env_id)
    eval_env = gym.make(env_id)
    student = PPO(
        ActorCriticPolicy,
        env,
        learning_rate=3e-4,
        n_steps=2048,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.0,
        verbose=1,
        policy_kwargs={"net_arch": dict(pi=[64, 64, 64], vf=[64, 64, 64]), "activation_fn": torch.nn.Tanh},
        device=get_device("auto")
    )

    # Transfer and freeze extractor
    transfer_weights(student, teacher_path)
    # NOTE(review): this freezes every mlp_extractor parameter, including any
    # whose shape did not match the teacher and so was not transferred.
    # Confirm that is intended.
    for param in student.policy.mlp_extractor.parameters():
        param.requires_grad = False
    print("Frozen feature extractor layers.")

    # Reset monitoring state so a repeated call in the same process does not
    # inherit a cleared flag or samples from an earlier run.
    monitoring = True
    cpu_usages.clear()
    gpu_usages.clear()

    # Setup monitoring thread
    monitor_thread = threading.Thread(target=monitor_usage, args=(1,), daemon=True)
    monitor_thread.start()

    # Callbacks
    progressive_cb = ProgressiveUnfreezeCallback(total_timesteps=total_timesteps, freeze_threshold=0.2)
    eval_cb = EvaluationMetricsCallback(eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes)
    callbacks = CallbackList([progressive_cb, eval_cb])

    # Train. The finally block stops the sampler and releases the envs even
    # when training or saving raises.
    start = time.time()
    try:
        student.learn(total_timesteps=total_timesteps, callback=callbacks)
        student.save("humanoid_transfer2M.zip")
        end = time.time()
    finally:
        monitoring = False
        monitor_thread.join()
        eval_env.close()
        env.close()

    # Compute metrics
    duration = end - start
    avg_cpu = float(np.mean(cpu_usages)) if cpu_usages else 0
    avg_gpu = float(np.mean(gpu_usages)) if gpu_usages else 0
    # Assumed full-load power draw in watts for CPU and GPU; the estimate
    # scales each by average utilisation and converts W*h to kWh.
    cpu_max, gpu_max = 125, 250
    energy_kwh = ((avg_cpu / 100 * cpu_max + avg_gpu / 100 * gpu_max) * (duration / 3600)) / 1000

    final_reward = eval_cb.eval_rewards[-1] if eval_cb.eval_rewards else None
    metrics = {
        "final_reward": final_reward,
        "training_time_sec": duration,
        "avg_cpu_percent": avg_cpu,
        "avg_gpu_percent": avg_gpu,
        "energy_kwh": energy_kwh,
        "eval_timesteps": eval_cb.eval_timesteps,
        "eval_rewards": eval_cb.eval_rewards,
        "eval_wall_times": eval_cb.eval_wall_times
    }

    # Summary
    print("\n=== Training Summary ===")
    print(f"Final eval reward: {final_reward}")
    print(f"Duration: {duration / 3600:.2f}h, CPU: {avg_cpu:.1f}%, GPU: {avg_gpu:.1f}%")
    print(f"Estimated energy: {energy_kwh:.3f} kWh")

    # Save metrics; the message reports the file that was actually written.
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=2)
    print(f"Metrics saved to '{metrics_path}'.")

# Run the training pipeline only when executed as a script / notebook cell,
# not when this module is imported.
if __name__ == "__main__":
    main()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Transferred 8 layers from teacher.
Frozen feature extractor layers.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22       |
|    ep_rew_mean     | 98.5     |
| time/              |          |
|    fps             | 820      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.2        |
|    ep_rew_mean          | 99.9        |
| time/                   |             |
|    fps                  | 668         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019429704 |
|    clip_fraction        | 0.21    