# REINFORCE Training on Google Colab

This notebook contains a complete, self-contained REINFORCE (Policy Gradient) training pipeline for the Daladala environment.

**What's Included:**
- Full environment definition (5 discrete actions, 14-dimensional observation vector)
- 12 REINFORCE hyperparameter configurations for systematic tuning
- Training loop with 300,000 timesteps per configuration
- Automatic best model tracking and evaluation
- Results saved directly to Google Drive

**Expected Runtime:** ~3-4 hours on Colab CPU for all 12 configurations

**Output:** Best model + detailed results JSON saved to Google Drive

## Section 1: Install and Import Dependencies

In [None]:
!pip install gymnasium torch pandas numpy opencv-python --quiet

In [None]:
import os
import json
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import warnings
warnings.filterwarnings('ignore')

## Section 2: Mount Google Drive (for saving models and results)

In [None]:
from google.colab import drive

drive.mount('/content/drive')

# Ensure the model and results folders exist on Drive before any training output is written
for output_dir in (
    '/content/drive/MyDrive/daladala_results/models/reinforce',
    '/content/drive/MyDrive/daladala_results/results',
):
    os.makedirs(output_dir, exist_ok=True)
print("✓ Google Drive mounted successfully")

## Section 3: Define the DaladalaEnv Environment

In [None]:
class DaladalaEnv(gym.Env):
    """Daladala (mini-bus) optimization environment with 5 actions and 14 observations.

    The bus follows a fixed 29-cell route: x runs 0..14 along y=14, then y runs
    13..0 along x=14. Some route cells are high-demand stops, police
    checkpoints or traffic lights.

    Actions (``Discrete(5)``):
        0. Move forward one route cell (speed +1, capped at 3). Issued on the
           final cell, it terminates the episode instead.
        1. Stop (speed set to 0).
        2. Pick up passengers (only at a high-demand stop with free capacity).
        3. Drop off passengers (only at a high-demand stop); each earns 1000.
        4. Speed up (speed +1, capped at 3) when carrying at most 40 passengers;
           with more than 40 it is penalised and ends the episode.

    Observation (``Box(-1, 1, (14,))``), by index:
        0-1  x, y position            2  passengers on board
        3    money earned             4  speed / 3
        5    distance to next light   6  distance to next police checkpoint
        7    light is red             8  must stop before moving
        9    at a high-demand stop    10 passengers waiting here
        11   carrying more than 40    12 fined flag
        13   elapsed step fraction

    Randomness is drawn from ``self.np_random`` so that ``reset(seed=...)``
    makes episodes reproducible.
    """
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 12}

    def __init__(self, render_mode=None):
        super().__init__()
        self.observation_space = spaces.Box(-1, 1, shape=(14,), dtype=np.float32)
        self.action_space = spaces.Discrete(5)  # Move, Stop, Pickup, Dropoff, SpeedUp
        self.render_mode = render_mode

        # Fixed route Ubungo → Posta (right then up)
        self.route = [(x, 14) for x in range(15)] + [(14, y) for y in range(13, -1, -1)]
        self.high_demand_stops = [(4, 14), (8, 14), (14, 8), (14, 3)]
        self.police_checkpoints = [(6, 14), (11, 14), (14, 10)]
        self.traffic_lights = [(3, 14), (10, 14), (14, 12), (14, 5)]

        self.max_steps = 350
        self.physical_max = 50

        # Waiting-passenger count for the stop currently occupied, keyed by cell.
        # Filled lazily in _get_obs and cleared whenever the bus is not on a stop.
        self.passengers_waiting_state = {}

    def reset(self, seed=None, options=None):
        """Start a new episode at the first route cell.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
            options: Unused.

        Returns:
            Tuple ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.step_count = 0
        self.passengers = 0
        self.money = 0.0
        self.pos_idx = 0
        self.speed = 0
        self.fined = False
        self.passengers_waiting_state = {}
        return self._get_obs(), {}

    def _light_is_red(self, cell):
        """Return True if ``cell`` is a traffic light that is currently red.

        Lights alternate every 40 steps, starting red (steps 0-39 red, 40-79 green, ...).
        """
        return cell in self.traffic_lights and (self.step_count // 40) % 2 == 0

    def _distance_to_next(self, targets):
        """Return the normalised distance (in (0, 1]) to the next cell in ``targets``.

        Only the next 5 route cells are inspected; 1.0 means none was found.
        """
        lookahead_end = min(self.pos_idx + 6, len(self.route))
        for i in range(self.pos_idx + 1, lookahead_end):
            if self.route[i] in targets:
                return min((i - self.pos_idx) / 5.0, 1.0)
        return 1.0

    def _get_obs(self):
        """Build the 14-element float32 observation for the current state.

        Side effect: draws the waiting-passenger count on first observation of a
        high-demand stop, and clears it once the bus is no longer on a stop.
        """
        if self.pos_idx >= len(self.route):
            x, y = 14, 0
        else:
            x, y = self.route[self.pos_idx]
        here = (x, y)
        at_stop = here in self.high_demand_stops

        # Get next cell info
        next_idx = min(self.pos_idx + 1, len(self.route) - 1)
        next_cell = self.route[next_idx]

        # Traffic light / police & must_stop detection
        light_red = 1 if self._light_is_red(here) else 0
        police_checkpoint_ahead = next_cell in self.police_checkpoints
        must_stop_next = 1 if (police_checkpoint_ahead or light_red) else 0

        dist_to_next_light = self._distance_to_next(self.traffic_lights)
        dist_to_next_police = self._distance_to_next(self.police_checkpoints)

        # Passengers waiting at high-demand stops
        if at_stop:
            if here not in self.passengers_waiting_state:
                # Seeded generator (not global np.random) keeps reset(seed=...) reproducible
                self.passengers_waiting_state[here] = int(self.np_random.integers(0, 11))
        else:
            self.passengers_waiting_state = {}

        passengers_waiting = self.passengers_waiting_state.get(here, 0) / 10.0

        # Normalize all observations to [-1, 1]
        obs = np.array([
            x / 14.0 * 2 - 1,
            y / 14.0 * 2 - 1,
            self.passengers / 50.0 * 2 - 1,
            self.money / 150000.0 * 2 - 1,
            self.speed / 3.0,
            dist_to_next_light * 2 - 1,
            dist_to_next_police * 2 - 1,
            light_red * 2 - 1,
            must_stop_next * 2 - 1,
            1 if at_stop else -1,
            passengers_waiting * 2 - 1,
            1 if self.passengers > 40 else -1,
            1 if self.fined else -1,
            self.step_count / self.max_steps * 2 - 1,
        ], dtype=np.float32)
        # Money is unbounded (repeated pick-up/drop-off), so clip to stay inside
        # the declared observation_space bounds.
        return np.clip(obs, -1.0, 1.0)

    def step(self, action):
        """Apply ``action`` and advance the simulation by one step.

        Args:
            action: Integer in 0..4 (see class docstring). Other values change
                no state and only incur position-based consequences.

        Returns:
            Tuple ``(observation, reward, terminated, truncated, info)``.
            ``truncated`` becomes True once ``max_steps`` steps have elapsed;
            ``info`` is an empty dict.
        """
        self.step_count += 1
        terminated = False

        # Position-based rules below use the cell occupied at the START of the step.
        x, y = self.route[self.pos_idx]
        here = (x, y)
        must_stop_now = self._must_stop_here()

        # Capture pre-action state
        prev_passengers = self.passengers
        prev_money = self.money
        prev_pos_idx = self.pos_idx

        # Execute action (pure state transitions, no conditions)
        if action == 0:  # Move Forward
            if self.pos_idx < len(self.route) - 1:
                self.pos_idx += 1
                self.speed = min(self.speed + 1, 3)
            else:
                terminated = True

        elif action == 1:  # Stop
            self.speed = 0

        elif action == 2:  # Pick up passengers
            if here in self.high_demand_stops and self.passengers < self.physical_max:
                base_add = int(self.np_random.integers(4, 9))
                waiting_count = int(self.passengers_waiting_state.get(here, 0))
                add = min(base_add + waiting_count // 2, self.physical_max - self.passengers)
                self.passengers = min(self.passengers + add, self.physical_max)
                if here in self.passengers_waiting_state:
                    self.passengers_waiting_state[here] = max(0, waiting_count - add)

        elif action == 3:  # Drop off passengers
            if here in self.high_demand_stops and self.passengers > 0:
                drop = min(self.passengers, int(self.np_random.integers(6, 16)))
                self.passengers -= drop
                self.money += drop * 1000

        elif action == 4:  # Speed Up
            if self.passengers <= 40:
                self.speed = min(self.speed + 1, 3)

        # Measure outcomes (pure state deltas)
        pos_progress = self.pos_idx - prev_pos_idx
        passengers_added = self.passengers - prev_passengers
        passengers_dropped = prev_passengers - self.passengers
        money_earned = self.money - prev_money

        # Calculate reward from outcomes only
        reward = 0.0

        if pos_progress > 0:
            reward += 5

        if passengers_added > 0:
            reward += passengers_added * 1.0

        if passengers_dropped > 0:
            reward += passengers_dropped * 1.2

        if money_earned > 0:
            reward += money_earned / 20000.0

        if terminated:
            # Route completed; extra bonus for arriving with at most 33 passengers
            reward += 100
            if self.passengers <= 33:
                reward += 200

        # Safety consequences
        if action == 0 and must_stop_now:
            reward -= 45

        if action == 1 and not must_stop_now:
            reward -= 2

        if action == 1 and must_stop_now:
            reward += 6

        if action == 4 and self.passengers > 40:
            reward -= 400
            terminated = True

        if here in self.police_checkpoints:
            if self.passengers > 40:
                reward -= 200
                terminated = True
                self.fined = True
            elif self.passengers > 33:
                reward -= 40
                self.fined = True

        truncated = self.step_count >= self.max_steps

        return self._get_obs(), reward, terminated, truncated, {}

    def _must_stop_here(self):
        """Return True if moving now is unsafe.

        That is the case when the next route cell is a police checkpoint or the
        current cell is a red traffic light. Always False on the final cell.
        """
        if self.pos_idx >= len(self.route) - 1:
            return False
        next_cell = self.route[self.pos_idx + 1]
        current_cell = self.route[self.pos_idx]
        return next_cell in self.police_checkpoints or self._light_is_red(current_cell)

    def render(self):
        pass  # Rendering disabled for Colab

# Progress marker: printed once the DaladalaEnv class definition above has executed
print("✓ DaladalaEnv class defined successfully")

## Section 4: Define REINFORCE Policy Network and Agent

In [None]:
class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP that maps a state vector to action probabilities."""

    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        # Layer order (and the attribute name `net`) fixes the state_dict keys
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return a probability distribution over actions (softmax on the last dim)."""
        logits = self.net(state)
        return torch.softmax(logits, dim=-1)

    def get_action_and_log_prob(self, state):
        """Sample one action; return it as a Python int together with its log-probability tensor."""
        policy_dist = Categorical(probs=self.forward(state))
        sampled = policy_dist.sample()
        return sampled.item(), policy_dist.log_prob(sampled)

class REINFORCEAgent:
    """REINFORCE (Policy Gradient) agent: a PolicyNetwork trained with Adam on whole episodes.

    Args:
        state_dim: Size of the observation vector fed to the policy.
        action_dim: Number of discrete actions.
        hidden_dim: Width of the policy's hidden layers.
        learning_rate: Adam learning rate.
        gamma: Discount factor used when computing returns (default 0.99).
    """

    def __init__(self, state_dim, action_dim, hidden_dim, learning_rate, gamma=0.99):
        self.policy = PolicyNetwork(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.device = torch.device('cpu')
        self.policy.to(self.device)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return G_t for every step of ``rewards`` (same order)."""
        returns = []
        running = 0.0
        # Walk backwards so each G_t reuses G_{t+1}; append + reverse keeps this O(n)
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    @staticmethod
    def _normalize_returns(returns):
        """Standardise returns to zero mean / unit std.

        With fewer than two elements the sample std is NaN, which would corrupt
        every weight on the next optimizer step, so such inputs are returned as-is.
        """
        if returns.numel() < 2:
            return returns
        return (returns - returns.mean()) / (returns.std() + 1e-8)

    def _obs_to_tensor(self, obs):
        """Convert one observation to a float32 batch of size 1 on the agent's device."""
        return torch.as_tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)

    def train_episode(self, env):
        """Run one full episode with the stochastic policy and apply one policy-gradient update.

        Args:
            env: Environment exposing the gymnasium ``reset()`` / ``step()`` API.

        Returns:
            The undiscounted sum of rewards collected in the episode.
        """
        obs, _ = env.reset()
        log_probs = []
        rewards = []
        done = False

        while not done:
            action, log_prob = self.policy.get_action_and_log_prob(self._obs_to_tensor(obs))
            obs, reward, terminated, truncated, _ = env.step(action)

            log_probs.append(log_prob)
            rewards.append(reward)
            done = terminated or truncated

        returns = torch.tensor(
            self._discounted_returns(rewards, self.gamma),
            dtype=torch.float32,
            device=self.device,
        )
        returns = self._normalize_returns(returns)

        # REINFORCE objective: minimise -sum_t log pi(a_t | s_t) * G_t
        stacked_log_probs = torch.stack(log_probs).reshape(-1)
        policy_loss = -(stacked_log_probs * returns).sum()

        # Update policy
        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        return sum(rewards)

    def evaluate(self, env, n_episodes=50):
        """Run ``n_episodes`` greedy (argmax) episodes without updating the policy.

        Returns:
            Tuple ``(mean, std)`` of the per-episode total rewards.
        """
        episode_totals = []
        for _ in range(n_episodes):
            obs, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                with torch.no_grad():
                    probs = self.policy(self._obs_to_tensor(obs))
                    action = probs.argmax(dim=-1).item()
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                done = terminated or truncated

            episode_totals.append(total_reward)

        return np.mean(episode_totals), np.std(episode_totals)

    def save(self, path):
        """Save the policy weights to ``path + '_policy.pth'``."""
        torch.save(self.policy.state_dict(), path + '_policy.pth')

    def load(self, path):
        """Load policy weights from ``path + '_policy.pth'`` onto the agent's device."""
        state_dict = torch.load(path + '_policy.pth', map_location=self.device)
        self.policy.load_state_dict(state_dict)

# Progress marker: printed once the PolicyNetwork and REINFORCEAgent definitions above have executed
print("✓ REINFORCE Policy Network and Agent defined successfully")

## Section 5: Define REINFORCE Hyperparameter Configurations

In [None]:
# Hyperparameter grid for REINFORCE: 4 learning rates x 3 hidden sizes = 12 runs.
# The (learning_rate, hidden_dim) pairs are listed in the order they are trained.
_LR_TAGS = {1e-3: "1e3", 3e-3: "3e3", 5e-3: "5e3", 1e-2: "1e2"}
_LR_HIDDEN_GRID = [
    (1e-3, 64), (1e-3, 128),
    (3e-3, 64), (3e-3, 128),
    (5e-3, 64), (5e-3, 128),
    (1e-2, 64), (1e-2, 128),
    (1e-2, 256), (5e-3, 256), (3e-3, 256), (1e-3, 256),
]

reinforce_configs = []
for _lr, _hidden in _LR_HIDDEN_GRID:
    reinforce_configs.append({
        "name": f"LR_{_LR_TAGS[_lr]}_hid_{_hidden}",
        "learning_rate": _lr,
        "hidden_dim": _hidden,
    })

print(f"✓ {len(reinforce_configs)} REINFORCE configurations defined")

## Section 6: Train REINFORCE with All Configurations

In [None]:
# Train one fresh agent per configuration, evaluate it greedily, and remember the best one.
results = {}
best_reward = -float('inf')
best_config = None
best_agent = None

total_configs = len(reinforce_configs)
state_dim = 14
action_dim = 5
target_steps = 300000  # Environment-step budget per configuration

for idx, config in enumerate(reinforce_configs, 1):
    print(f"\n{'='*60}")
    print(f"Training Configuration {idx}/{total_configs}: {config['name']}")
    print(f"{'='*60}")

    # Create environment
    env = DaladalaEnv()

    # Initialize REINFORCE agent
    agent = REINFORCEAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        hidden_dim=config['hidden_dim'],
        learning_rate=config['learning_rate']
    )

    # Train until the step budget is spent. Episodes can end well before the
    # 350-step cap (route finished, overload penalty, police fine), so count the
    # steps the environment actually took instead of assuming 350 per episode.
    print(f"Training for {target_steps:,} timesteps...")
    total_steps = 0
    episodes = 0

    while total_steps < target_steps:
        agent.train_episode(env)
        # env.step_count is zeroed on reset and incremented once per step,
        # so after an episode it equals that episode's length.
        total_steps += env.step_count
        episodes += 1

    print(f"Trained {episodes} episodes ({total_steps:,} timesteps)")

    # Evaluate agent on 50 episodes
    print(f"Evaluating on 50 episodes...")
    mean_reward, std_reward = agent.evaluate(env, n_episodes=50)

    results[config['name']] = {
        'config': config,
        'mean_reward': mean_reward,
        'std_reward': std_reward
    }

    print(f"✓ Mean Reward: {mean_reward:.2f} (±{std_reward:.2f})")

    # Track best model
    if mean_reward > best_reward:
        best_reward = mean_reward
        best_config = config['name']
        best_agent = agent
        print(f"★ New best model!")

    env.close()

print(f"\n{'='*60}")
print(f"TRAINING COMPLETE")
print(f"{'='*60}")
print(f"Best Configuration: {best_config}")
print(f"Best Mean Reward: {best_reward:.2f}")

## Section 7: Save Best Model and Results

In [None]:
# Persist the weights of the best-scoring agent to Google Drive
best_model_path = '/content/drive/MyDrive/daladala_results/models/reinforce/best_reinforce'
best_agent.save(best_model_path)
print(f"✓ Best model saved to: {best_model_path}")

# Convert every run to plain Python numbers so the summary can be written as JSON
results_json_path = '/content/drive/MyDrive/daladala_results/results/reinforce_results.json'
results_summary = {
    run_name: {
        'mean_reward': float(run['mean_reward']),
        'std_reward': float(run['std_reward']),
        'hyperparameters': {
            'learning_rate': run['config']['learning_rate'],
            'hidden_dim': run['config']['hidden_dim'],
        },
    }
    for run_name, run in results.items()
}

with open(results_json_path, 'w') as json_file:
    json.dump(results_summary, json_file, indent=2)
print(f"✓ Results saved to: {results_json_path}")

# Print one table row per configuration, in training order
banner = "=" * 80
print("\n" + banner)
print("RESULTS SUMMARY - All 12 REINFORCE Configurations")
print(banner)
table_rows = []
for run_name, run in results.items():
    table_rows.append({
        'Config': run_name,
        'Mean Reward': f"{run['mean_reward']:.2f}",
        'Std Reward': f"{run['std_reward']:.2f}",
        'LR': run['config']['learning_rate'],
        'Hidden': run['config']['hidden_dim'],
    })
results_df = pd.DataFrame(table_rows)
print(results_df.to_string(index=False))
print(banner)

## Section 8: Test Best Model on Sample Episodes

In [None]:
# Roll out the best agent greedily (argmax action) for 5 episodes and report each one
print(f"\nTesting best model ({best_config}) on 5 sample episodes:")
divider = "=" * 60
print(divider)

env = DaladalaEnv()
episode_rewards = []

for episode_number in range(1, 6):
    obs, _ = env.reset()
    total_reward = 0
    steps = 0
    finished = False

    while not finished:
        batch = torch.FloatTensor(obs).unsqueeze(0)
        with torch.no_grad():
            action = best_agent.policy(batch).argmax(dim=-1).item()
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        steps += 1
        finished = terminated or truncated

    episode_rewards.append(total_reward)
    print(f"Episode {episode_number}: Reward = {total_reward:.2f} (Steps: {steps})")

env.close()

print(divider)
print(f"Sample Episodes Mean Reward: {np.mean(episode_rewards):.2f}")
print(f"Sample Episodes Std Reward:  {np.std(episode_rewards):.2f}")
print("\n✓ REINFORCE training and evaluation complete!")
print(f"✓ Models and results saved to Google Drive")