# BipedalWalker RL Training

Train SAC, TD3, and PPO agents on the BipedalWalker-v3 environment, then evaluate and compare them.

**Setup: Runtime → Change runtime type → T4 GPU**

In [None]:
# Check GPU: confirm a GPU runtime is attached before training (see the setup note above)
!nvidia-smi

In [None]:
# Install dependencies
!apt-get update -qq > /dev/null 2>&1
!apt-get install -y -qq swig build-essential python3-dev > /dev/null 2>&1
!pip install -q box2d-py stable-baselines3 gymnasium pygame tensorboard imageio imageio-ffmpeg

In [None]:
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3 import SAC, TD3, PPO
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt
import imageio
import os
from datetime import datetime
from IPython.display import Video, display
import warnings
warnings.filterwarnings('ignore')

# Report the compute device once so the rest of the notebook's logs can be interpreted.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"Device: {torch.cuda.get_device_name(0)}")

# Create the output directories with os.makedirs instead of a shell call so the
# cell does not depend on a POSIX `mkdir -p` being available.
for _out_dir in ('models', 'logs', 'tensorboard_logs', 'results', 'videos'):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Training configuration
TIMESTEPS = 500_000  # default number of training steps per algorithm
SAVE_FREQ = 50_000   # passed to CheckpointCallback as save_freq

def make_env():
    """Create a new BipedalWalker-v3 environment wrapped in an SB3 ``Monitor``."""
    base_env = gym.make('BipedalWalker-v3')
    return Monitor(base_env)

def train(algorithm, timesteps=TIMESTEPS, seed=None):
    """Train one Stable-Baselines3 agent on BipedalWalker-v3 and save it to disk.

    Args:
        algorithm: Agent to train: 'SAC', 'TD3' or 'PPO' (matched
            case-insensitively).
        timesteps: Total number of steps passed to ``model.learn``.
        seed: Optional seed forwarded to the SB3 model constructor. ``None``
            (the default) leaves the run unseeded.

    Returns:
        ``(model, model_dir)``: the trained model and the timestamped directory
        under ``models/`` that holds the final model, the best model found by
        the evaluation callback, and the periodic checkpoints.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    # (model class, hyperparameters) for every supported algorithm.
    # NOTE(review): TD3 is built without `action_noise`; confirm that the
    # resulting exploration is what is intended for this comparison.
    agents = {
        'SAC': (SAC, dict(learning_rate=3e-4, buffer_size=300000, batch_size=256,
                          gamma=0.99, tau=0.02)),
        'TD3': (TD3, dict(learning_rate=1e-3, buffer_size=200000, batch_size=100,
                          gamma=0.99, tau=0.005)),
        'PPO': (PPO, dict(learning_rate=3e-4, n_steps=2048, batch_size=64,
                          n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2)),
    }

    # Validate before touching the filesystem or creating environments, so a
    # typo fails fast instead of surfacing later as an unbound `model`.
    algo_key = str(algorithm).upper()
    if algo_key not in agents:
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected one of {sorted(agents)}"
        )
    agent_cls, hyperparams = agents[algo_key]
    tag = algo_key.lower()

    print(f"\n{'='*60}\nTraining {algorithm} - {timesteps:,} steps\n{'='*60}\n")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_dir = f"models/{tag}_{timestamp}"
    log_dir = f"tensorboard_logs/{tag}_{timestamp}"
    os.makedirs(model_dir, exist_ok=True)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}\n")

    env = make_env()
    eval_env = make_env()
    try:
        # Periodically evaluate on a separate environment and keep the best model.
        eval_cb = EvalCallback(eval_env, best_model_save_path=f"{model_dir}/best",
                               log_path=f"logs/{tag}", eval_freq=10000,
                               deterministic=True, render=False, n_eval_episodes=5)

        # Periodic snapshots of the model during training.
        checkpoint_cb = CheckpointCallback(save_freq=SAVE_FREQ,
                                           save_path=f"{model_dir}/checkpoints",
                                           name_prefix=f"{tag}_model")

        model = agent_cls('MlpPolicy', env, verbose=1, tensorboard_log=log_dir,
                          device=device, seed=seed, **hyperparams)

        model.learn(total_timesteps=timesteps, callback=[eval_cb, checkpoint_cb],
                    progress_bar=True)

        model.save(f"{model_dir}/final_model")
        print(f"\nModel saved: {model_dir}/final_model\n")
    finally:
        # Release both environments even if training is interrupted or fails.
        env.close()
        eval_env.close()

    return model, model_dir

# Echo the default training budget (TIMESTEPS) configured in this cell
print(f"Configuration: {TIMESTEPS:,} timesteps per algorithm")

In [None]:
# Train the SAC agent with the default step budget
model_sac, dir_sac = train(algorithm='SAC')

In [None]:
# Train the TD3 agent with the default step budget
model_td3, dir_td3 = train(algorithm='TD3')

In [None]:
# Train the PPO agent with the default step budget
model_ppo, dir_ppo = train(algorithm='PPO')

In [None]:
# Evaluation function
def evaluate(model, name, n_episodes=10):
    """Run a trained model on BipedalWalker-v3 and record the first episode.

    The policy is queried deterministically for ``n_episodes`` episodes. The
    frames of the first episode are written to ``videos/<name lower-cased>.mp4``.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)``, such as the
            models returned by ``train``.
        name: Label used in the printed output and for the video file name.
        n_episodes: Number of evaluation episodes; must be at least 1.

    Returns:
        ``(rewards, video_path)``: the list of per-episode total rewards and the
        path of the recorded video.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        # Zero episodes would produce an empty video and NaN statistics.
        raise ValueError(f"n_episodes must be >= 1, got {n_episodes}")

    print(f"\nEvaluating {name}...")

    video_path = f'videos/{name.lower()}.mp4'
    # Do not rely on another cell having created the output directory.
    os.makedirs('videos', exist_ok=True)

    env = gym.make('BipedalWalker-v3', render_mode='rgb_array')
    rewards = []
    writer = None
    try:
        # Play the video back at the environment's own frame rate when it
        # declares one; otherwise keep the previous 30 fps.
        fps = env.metadata.get('render_fps', 30)
        # Stream frames to disk instead of holding a whole episode in memory.
        writer = imageio.get_writer(video_path, fps=fps)

        for ep in range(n_episodes):
            obs, _ = env.reset()
            terminated = truncated = False
            ep_reward = 0

            while not (terminated or truncated):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = env.step(action)
                ep_reward += reward

                # Only the first episode is recorded.
                if ep == 0:
                    writer.append_data(env.render())

            rewards.append(ep_reward)
            print(f"  Episode {ep + 1}: {ep_reward:.2f}")
    finally:
        # Finalize the video file and release the environment even on failure.
        if writer is not None:
            writer.close()
        env.close()

    mean = np.mean(rewards)
    std = np.std(rewards)
    print(f"  Mean reward: {mean:.2f} ± {std:.2f}\n")

    return rewards, video_path

In [None]:
# Evaluate all models
banner = "=" * 60
print(banner, "EVALUATION", banner, sep="\n")

# Each call prints per-episode rewards and records a video of the first episode.
rewards_sac, video_sac = evaluate(model_sac, name='SAC')
rewards_td3, video_td3 = evaluate(model_td3, name='TD3')
rewards_ppo, video_ppo = evaluate(model_ppo, name='PPO')

In [None]:
# Show videos: one embedded player per algorithm, in training order
video_captions = [
    ("SAC:", video_sac),
    ("\nTD3:", video_td3),
    ("\nPPO:", video_ppo),
]
for caption, clip_path in video_captions:
    print(caption)
    display(Video(clip_path, embed=True, width=600))

In [None]:
# Comparison plot: mean evaluation reward per algorithm with std-dev error bars
algorithms = ['SAC', 'TD3', 'PPO']
reward_sets = [rewards_sac, rewards_td3, rewards_ppo]
means = [np.mean(episode_rewards) for episode_rewards in reward_sets]
stds = [np.std(episode_rewards) for episode_rewards in reward_sets]

x = np.arange(len(algorithms))
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(x, means, yerr=stds, align='center', alpha=0.8,
              ecolor='black', capsize=10, color=colors)

ax.set_xlabel('Algorithm', fontsize=12, fontweight='bold')
ax.set_ylabel('Mean Reward', fontsize=12, fontweight='bold')
ax.set_title('Algorithm Performance Comparison\nBipedalWalker-v3', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(algorithms, fontsize=11)
ax.grid(True, alpha=0.3, axis='y')

# Annotate every bar with its "mean±std" value, centred on the bar.
for rect, mu, sigma in zip(bars, means, stds):
    ax.text(rect.get_x() + rect.get_width()/2., rect.get_height(),
            f'{mu:.1f}±{sigma:.1f}', ha='center', va='bottom',
            fontsize=10, fontweight='bold')

fig.tight_layout()
fig.savefig('results/comparison.png', dpi=300)
plt.show()

# Same numbers as plain text.
print("\nResults:")
for label, mu, sigma in zip(algorithms, means, stds):
    print(f"  {label}: {mu:.2f} ± {sigma:.2f}")

In [None]:
# TensorBoard: load the notebook extension, then open the dashboard on the
# tensorboard_logs directory that train() passes to the models as tensorboard_log
%load_ext tensorboard
%tensorboard --logdir tensorboard_logs

In [None]:
# Download results
!zip -r -q results.zip models/ videos/ results/ logs/

from google.colab import files
files.download('results.zip')

print("\nDone! Results saved in results.zip")