# PPO Training on Google Colab

This notebook contains a complete, self-contained PPO (Proximal Policy Optimization) training pipeline for the Daladala environment.

**What's Included:**
- Full environment definition (5 discrete actions, 14-dimensional observation vector)
- 12 PPO hyperparameter configurations for systematic tuning
- Training loop with 300,000 timesteps per configuration
- Automatic best model tracking and evaluation
- Results saved directly to Google Drive

**Expected Runtime:** ~3-4 hours on Colab CPU for all 12 configurations

**Output:** Best model + detailed results JSON saved to Google Drive

## Section 1: Install and Import Dependencies

In [None]:
!pip install stable-baselines3 gymnasium torch pandas numpy opencv-python --quiet

In [None]:
import os
import json
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import StopTrainingOnNoModelImprovement
import warnings
warnings.filterwarnings('ignore')

## Section 2: Mount Google Drive (for saving models and results)

In [None]:
from google.colab import drive

# Make Google Drive available under /content/drive for persistent storage.
drive.mount('/content/drive')

# Ensure the model and results folders exist (no error if already present).
for output_dir in (
    '/content/drive/MyDrive/daladala_results/models/ppo',
    '/content/drive/MyDrive/daladala_results/results',
):
    os.makedirs(output_dir, exist_ok=True)
print("✓ Google Drive mounted successfully")

## Section 3: Define the DaladalaEnv Environment

In [None]:
class DaladalaEnv(gym.Env):
    """Daladala (mini-bus) optimization environment with 5 actions and 14 observations.

    The bus travels along a fixed 29-cell route on a 15x15 grid. Some cells are
    high-demand stops, police checkpoints or traffic lights.

    Actions (``Discrete(5)``):
        0 - move forward one cell, 1 - stop, 2 - pick up passengers,
        3 - drop off passengers, 4 - speed up.

    Observation (``Box(-1, 1, (14,))``), in order: x, y, passengers on board,
    money, speed, distance to next traffic light, distance to next police
    checkpoint, light-is-red flag, must-stop flag, at-high-demand-stop flag,
    passengers waiting here, overloaded (>40) flag, fined flag, elapsed time.

    All randomness is drawn from ``self.np_random`` so that
    ``reset(seed=...)`` makes an episode reproducible.
    """
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 12}

    def __init__(self, render_mode=None):
        """Build the spaces and the static route layout.

        Args:
            render_mode: Stored on the instance; rendering itself is a no-op.
        """
        super().__init__()
        self.observation_space = spaces.Box(-1, 1, shape=(14,), dtype=np.float32)
        self.action_space = spaces.Discrete(5)  # Move, Stop, Pickup, Dropoff, SpeedUp
        self.render_mode = render_mode

        # Fixed route Ubungo → Posta: x runs 0..14 along y=14, then y runs 13..0 along x=14
        self.route = [(x, 14) for x in range(15)] + [(14, y) for y in range(13, -1, -1)]
        self.high_demand_stops = [(4,14), (8,14), (14,8), (14,3)]
        self.police_checkpoints = [(6,14), (11,14), (14,10)]
        self.traffic_lights = [(3,14), (10,14), (14,12), (14,5)]

        self.max_steps = 350
        self.physical_max = 50

        # Waiting-passenger count for the stop currently occupied (keyed by cell).
        self.passengers_waiting_state = {}

    def reset(self, seed=None, options=None):
        """Start a new episode at the first route cell with an empty bus.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Accepted for API compatibility; unused.

        Returns:
            Tuple of (initial observation, empty info dict).
        """
        super().reset(seed=seed)
        self.step_count = 0
        self.passengers = 0
        self.money = 0.0
        self.pos_idx = 0
        self.speed = 0
        self.fined = False
        self.passengers_waiting_state = {}
        return self._get_obs(), {}

    def _light_is_red(self, cell):
        """Return True if ``cell`` is a traffic light that is red at the current step.

        Lights alternate every 40 steps and are red during even-numbered phases.
        """
        return cell in self.traffic_lights and (self.step_count // 40) % 2 == 0

    def _distance_ahead(self, targets):
        """Return the distance to the nearest of ``targets`` within the next 5 route cells.

        The result is divided by 5, giving 0.2..1.0; 1.0 is also returned when
        none of the next 5 cells is in ``targets``.
        """
        last = min(self.pos_idx + 6, len(self.route))
        for i in range(self.pos_idx + 1, last):
            if self.route[i] in targets:
                return (i - self.pos_idx) / 5.0
        return 1.0

    def _get_obs(self):
        """Build the 14-feature observation for the current state.

        Side effect: on arrival at a high-demand stop the number of waiting
        passengers (0..10) is sampled once and remembered; it is forgotten as
        soon as the bus is on a cell that is not a high-demand stop.

        Returns:
            ``np.float32`` array of shape (14,), clipped to [-1, 1].
        """
        # Clamp the index so an out-of-range position maps to the final cell (14, 0).
        here = self.route[min(self.pos_idx, len(self.route) - 1)]
        x, y = here

        light_red = 1 if self._light_is_red(here) else 0
        # Same rule the reward uses, so observation and penalty always agree.
        must_stop_next = 1 if self._must_stop_here() else 0

        dist_to_next_light = self._distance_ahead(self.traffic_lights)
        dist_to_next_police = self._distance_ahead(self.police_checkpoints)

        # Passengers waiting at high-demand stops
        at_stop = here in self.high_demand_stops
        if at_stop:
            if here not in self.passengers_waiting_state:
                self.passengers_waiting_state[here] = int(self.np_random.integers(0, 11))
        else:
            self.passengers_waiting_state = {}

        passengers_waiting = self.passengers_waiting_state.get(here, 0) / 10.0

        # Normalize all observations to [-1, 1]
        obs = np.array([
            x / 14.0 * 2 - 1,
            y / 14.0 * 2 - 1,
            self.passengers / 50.0 * 2 - 1,
            self.money / 150000.0 * 2 - 1,
            self.speed / 3.0,
            dist_to_next_light * 2 - 1,
            dist_to_next_police * 2 - 1,
            light_red * 2 - 1,
            must_stop_next * 2 - 1,
            1 if at_stop else -1,
            passengers_waiting * 2 - 1,
            1 if self.passengers > 40 else -1,
            1 if self.fined else -1,
            self.step_count / self.max_steps * 2 - 1
        ], dtype=np.float32)
        # Money is unbounded, so clip to keep the vector inside observation_space.
        np.clip(obs, -1.0, 1.0, out=obs)
        return obs

    def step(self, action):
        """Apply ``action`` and return the Gymnasium 5-tuple.

        Args:
            action: Integer in 0..4 (see class docstring).

        Returns:
            (observation, reward, terminated, truncated, info). ``terminated``
            is set when a move is requested on the final route cell, when
            speed-up is chosen with more than 40 passengers, or when the bus
            starts the step on a police checkpoint with more than 40
            passengers. ``truncated`` is set once ``max_steps`` is reached.
        """
        # Decide the stop requirement BEFORE advancing the clock so the reward
        # uses the same traffic-light phase the agent saw in its last observation.
        must_stop_now = self._must_stop_here()
        self.step_count += 1
        terminated = False

        x, y = self.route[self.pos_idx]

        # Capture pre-action state
        prev_passengers = self.passengers
        prev_money = self.money
        prev_pos_idx = self.pos_idx

        # Execute action (state transitions only; rewards are computed afterwards)
        if action == 0:  # Move Forward
            if self.pos_idx < len(self.route) - 1:
                self.pos_idx += 1
                self.speed = min(self.speed + 1, 3)
            else:
                # Moving from the final cell completes the route.
                terminated = True

        elif action == 1:  # Stop
            self.speed = 0

        elif action == 2:  # Pick up passengers
            if (x, y) in self.high_demand_stops and self.passengers < self.physical_max:
                base_add = int(self.np_random.integers(4, 9))
                waiting_count = int(self.passengers_waiting_state.get((x, y), 0))
                add = min(base_add + waiting_count // 2, self.physical_max - self.passengers)
                self.passengers = min(self.passengers + add, self.physical_max)
                if (x, y) in self.passengers_waiting_state:
                    self.passengers_waiting_state[(x, y)] = max(0, waiting_count - add)

        elif action == 3:  # Drop off passengers
            if (x, y) in self.high_demand_stops and self.passengers > 0:
                drop = min(self.passengers, int(self.np_random.integers(6, 16)))
                self.passengers -= drop
                self.money += drop * 1000

        elif action == 4:  # Speed Up
            if self.passengers <= 40:
                self.speed = min(self.speed + 1, 3)

        # Measure outcomes (pure state deltas)
        pos_progress = self.pos_idx - prev_pos_idx
        passengers_added = self.passengers - prev_passengers
        passengers_dropped = prev_passengers - self.passengers
        money_earned = self.money - prev_money

        # Calculate reward from outcomes only
        reward = 0.0

        if pos_progress > 0:
            reward += 5

        if passengers_added > 0:
            reward += passengers_added * 1.0

        if passengers_dropped > 0:
            reward += passengers_dropped * 1.2

        if money_earned > 0:
            reward += money_earned / 20000.0

        if terminated:
            reward += 100
            if self.passengers <= 33:
                reward += 200

        # Safety consequences
        if action == 0 and must_stop_now:
            reward -= 45

        if action == 1 and not must_stop_now:
            reward -= 2

        if action == 1 and must_stop_now:
            reward += 6

        if action == 4 and self.passengers > 40:
            reward -= 400
            terminated = True

        # Checkpoint rules apply to the cell occupied at the start of the step.
        if (x, y) in self.police_checkpoints:
            if self.passengers > 40:
                reward -= 200
                terminated = True
                self.fined = True
            elif self.passengers > 33:
                reward -= 40
                self.fined = True

        truncated = self.step_count >= self.max_steps

        return self._get_obs(), reward, terminated, truncated, {}

    def _must_stop_here(self):
        """Return True if the bus should stop instead of moving from its current cell.

        That is the case when the next route cell is a police checkpoint or
        the current cell is a red traffic light. Always False on (or past)
        the final route cell.
        """
        if self.pos_idx >= len(self.route)-1:
            return False
        next_cell = self.route[self.pos_idx + 1]
        current_cell = self.route[self.pos_idx]
        return next_cell in self.police_checkpoints or self._light_is_red(current_cell)

    def render(self):
        """No-op: rendering is disabled for Colab."""
        pass  # Rendering disabled for Colab

print("✓ DaladalaEnv class defined successfully")

## Section 4: Define PPO Hyperparameter Configurations

In [None]:
# 12 PPO hyperparameter configurations for systematic tuning
ppo_configs = [
    {"name": "LR_1e4_n_steps_512", "learning_rate": 1e-4, "n_steps": 512, "batch_size": 64, "gamma": 0.99, "clip_range": 0.2, "ent_coef": 0.0},
    {"name": "LR_1e4_n_steps_1024", "learning_rate": 1e-4, "n_steps": 1024, "batch_size": 64, "gamma": 0.99, "clip_range": 0.2, "ent_coef": 0.0},
    {"name": "LR_3e4_n_steps_512", "learning_rate": 3e-4, "n_steps": 512, "batch_size": 128, "gamma": 0.995, "clip_range": 0.2, "ent_coef": 0.005},
    {"name": "LR_3e4_n_steps_1024", "learning_rate": 3e-4, "n_steps": 1024, "batch_size": 128, "gamma": 0.995, "clip_range": 0.2, "ent_coef": 0.005},
    {"name": "LR_5e4_n_steps_512", "learning_rate": 5e-4, "n_steps": 512, "batch_size": 128, "gamma": 0.99, "clip_range": 0.2, "ent_coef": 0.01},
    {"name": "LR_5e4_n_steps_1024", "learning_rate": 5e-4, "n_steps": 1024, "batch_size": 64, "gamma": 0.995, "clip_range": 0.2, "ent_coef": 0.01},
    {"name": "LR_7e4_n_steps_512", "learning_rate": 7e-4, "n_steps": 512, "batch_size": 64, "gamma": 0.99, "clip_range": 0.25, "ent_coef": 0.0},
    {"name": "LR_7e4_n_steps_1024", "learning_rate": 7e-4, "n_steps": 1024, "batch_size": 128, "gamma": 0.995, "clip_range": 0.25, "ent_coef": 0.005},
    {"name": "LR_1e3_n_steps_512", "learning_rate": 1e-3, "n_steps": 512, "batch_size": 64, "gamma": 0.99, "clip_range": 0.2, "ent_coef": 0.01},
    {"name": "LR_1e3_n_steps_1024", "learning_rate": 1e-3, "n_steps": 1024, "batch_size": 128, "gamma": 0.995, "clip_range": 0.2, "ent_coef": 0.01},
    {"name": "LR_1e3_n_steps_2048", "learning_rate": 1e-3, "n_steps": 2048, "batch_size": 128, "gamma": 0.99, "clip_range": 0.15, "ent_coef": 0.0},
    {"name": "LR_5e4_n_steps_2048", "learning_rate": 5e-4, "n_steps": 2048, "batch_size": 64, "gamma": 0.995, "clip_range": 0.25, "ent_coef": 0.005},
]

print(f"✓ {len(ppo_configs)} PPO configurations defined")

## Section 5: Train PPO with All Configurations

In [None]:
# Training budget and evaluation size shared by every configuration.
# Kept as named constants so the calls and the progress messages cannot drift apart.
TOTAL_TIMESTEPS = 300_000
N_EVAL_EPISODES = 50

# name -> {'config', 'mean_reward', 'std_reward'} for every configuration trained.
results = {}
best_reward = -float('inf')
best_config = None
best_model = None

total_configs = len(ppo_configs)
banner = '=' * 60

for idx, config in enumerate(ppo_configs, 1):
    print(f"\n{banner}")
    print(f"Training Configuration {idx}/{total_configs}: {config['name']}")
    print(banner)

    # Create environment
    env = DaladalaEnv()
    try:
        # Initialize PPO model with configuration
        model = PPO(
            'MlpPolicy',
            env,
            learning_rate=config['learning_rate'],
            n_steps=config['n_steps'],
            batch_size=config['batch_size'],
            gamma=config['gamma'],
            clip_range=config['clip_range'],
            ent_coef=config['ent_coef'],
            verbose=0,
            device='cpu'
        )

        print(f"Training for {TOTAL_TIMESTEPS:,} timesteps...")
        model.learn(total_timesteps=TOTAL_TIMESTEPS)

        print(f"Evaluating on {N_EVAL_EPISODES} episodes...")
        mean_reward, std_reward = evaluate_policy(
            model, env, n_eval_episodes=N_EVAL_EPISODES, deterministic=True
        )
    finally:
        # Release the environment even if training or evaluation fails.
        env.close()

    results[config['name']] = {
        'config': config,
        'mean_reward': mean_reward,
        'std_reward': std_reward
    }

    print(f"✓ Mean Reward: {mean_reward:.2f} (±{std_reward:.2f})")

    # Track best model (strictly greater: the earliest configuration wins ties)
    if mean_reward > best_reward:
        best_reward = mean_reward
        best_config = config['name']
        best_model = model
        print("★ New best model!")

print(f"\n{banner}")
print("TRAINING COMPLETE")
print(banner)
print(f"Best Configuration: {best_config}")
print(f"Best Mean Reward: {best_reward:.2f}")

## Section 6: Save Best Model and Results

In [None]:
# Persist the best-scoring model to Google Drive.
best_model_path = '/content/drive/MyDrive/daladala_results/models/ppo/best_ppo'
best_model.save(best_model_path)
print(f"✓ Best model saved to: {best_model_path}")

# Build a JSON-serialisable summary: plain floats plus the six tuned hyperparameters.
results_json_path = '/content/drive/MyDrive/daladala_results/results/ppo_results.json'
hyperparameter_keys = ('learning_rate', 'n_steps', 'batch_size', 'gamma', 'clip_range', 'ent_coef')
results_summary = {
    config_name: {
        'mean_reward': float(entry['mean_reward']),
        'std_reward': float(entry['std_reward']),
        'hyperparameters': {key: entry['config'][key] for key in hyperparameter_keys},
    }
    for config_name, entry in results.items()
}

with open(results_json_path, 'w') as handle:
    handle.write(json.dumps(results_summary, indent=2))
print(f"✓ Results saved to: {results_json_path}")

# Print one table row per configuration, in training order.
rule = "=" * 80
print(f"\n{rule}")
print("RESULTS SUMMARY - All 12 PPO Configurations")
print(rule)

table_rows = []
for name, entry in results.items():
    cfg = entry['config']
    table_rows.append({
        'Config': name,
        'Mean Reward': f"{entry['mean_reward']:.2f}",
        'Std Reward': f"{entry['std_reward']:.2f}",
        'LR': cfg['learning_rate'],
        'n_steps': cfg['n_steps'],
        'gamma': cfg['gamma'],
    })
results_df = pd.DataFrame(table_rows)
print(results_df.to_string(index=False))
print(rule)

## Section 7: Test Best Model on Sample Episodes

In [None]:
# Roll out the best model deterministically for 5 episodes and report the returns.
divider = "=" * 60
print(f"\nTesting best model ({best_config}) on 5 sample episodes:")
print(divider)

env = DaladalaEnv()
episode_rewards = []

for ep in range(5):
    obs, _ = env.reset()
    total_reward, steps = 0, 0
    done = False

    # Run until the environment reports termination or truncation.
    while not done:
        action, _ = best_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)
        steps += 1
        total_reward += reward
        done = terminated or truncated

    episode_rewards.append(total_reward)
    print(f"Episode {ep+1}: Reward = {total_reward:.2f} (Steps: {steps})")

env.close()

reward_mean = np.mean(episode_rewards)
reward_std = np.std(episode_rewards)
print(divider)
print(f"Sample Episodes Mean Reward: {reward_mean:.2f}")
print(f"Sample Episodes Std Reward:  {reward_std:.2f}")
print("\n✓ PPO training and evaluation complete!")
print("✓ Models and results saved to Google Drive")