# DQN Training on Google Colab

This notebook contains a complete, self-contained DQN (Deep Q-Network) training pipeline for the Daladala environment.

**What's Included:**
- Full environment definition (5 discrete actions, 14-dimensional observation vector)
- 12 DQN hyperparameter configurations for systematic tuning
- Training loop with 300,000 timesteps per configuration
- Automatic best model tracking and evaluation
- Results saved directly to Google Drive

**Expected Runtime:** ~3-4 hours on Colab CPU for all 12 configurations

**Output:** Best model + detailed results JSON saved to Google Drive

## Section 1: Install and Import Dependencies

In [None]:
!pip install stable-baselines3 gymnasium torch pandas numpy opencv-python --quiet

In [None]:
import os
import json
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import StopTrainingOnNoModelImprovement
import warnings
warnings.filterwarnings('ignore')

## Section 2: Mount Google Drive (for saving models and results)

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive so models and results can be written there.
drive.mount('/content/drive')

# Make sure both output folders exist (no error if they are already present).
for output_dir in (
    '/content/drive/MyDrive/daladala_results/models/dqn',
    '/content/drive/MyDrive/daladala_results/results',
):
    os.makedirs(output_dir, exist_ok=True)
print("✓ Google Drive mounted successfully")

## Section 3: Define the DaladalaEnv Environment

In [None]:
class DaladalaEnv(gym.Env):
    """Daladala (mini-bus) route environment with automatic movement and randomized hazards.

    The bus follows a fixed 29-cell route on a 15x15 grid and advances exactly
    one cell per step regardless of the chosen action. The action only decides
    what the bus does at the cell it occupies *before* that automatic move:

        0 = Move, 1 = Pickup, 2 = Dropoff, 3 = Stop, 4 = SpeedUp

    Every ``reset()`` re-samples 3 police checkpoints and 4 traffic lights from
    the route cells that are not high-demand stops. Traffic lights are red
    while ``light_cycle`` is even and green while it is odd.

    Observation: 14 float32 values scaled to [-1, 1] (layout documented in
    ``_get_obs``).

    An episode terminates when the bus steps from the final route cell, when
    it speeds up with more than 40 passengers (crash), or when it arrives at
    a police checkpoint with more than 40 passengers. It is truncated after
    ``max_steps`` steps.
    """
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 12}

    def __init__(self, render_mode=None):
        """Set up the spaces, the fixed route and the static episode limits."""
        super().__init__()
        self.observation_space = spaces.Box(-1, 1, shape=(14,), dtype=np.float32)
        self.action_space = spaces.Discrete(5)  # 0:Move, 1:Pickup, 2:Dropoff, 3:Stop, 4:SpeedUp
        self.render_mode = render_mode

        # Fixed route Ubungo → Posta: along y=14 from x=0..14, then along x=14 from y=13..0
        self.route = [(x, 14) for x in range(15)] + [(14, y) for y in range(13, -1, -1)]
        self.high_demand_stops = [(4, 14), (8, 14), (14, 8), (14, 3)]

        # Re-sampled on every reset()
        self.police_checkpoints = []
        self.traffic_lights = []

        self.max_steps = 350
        self.physical_max = 50   # hard passenger capacity
        self.light_cycle = 0     # even -> lights red, odd -> lights green

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        Hazard positions are drawn from ``self.np_random`` (the generator that
        ``super().reset(seed=seed)`` seeds), so passing the same ``seed``
        reproduces the same checkpoint / traffic-light layout.
        """
        super().reset(seed=seed)
        self.step_count = 0
        self.passengers = 0
        self.money = 0.0
        self.pos_idx = 0
        self.speed = 0
        self.fined = False
        self.light_cycle = 0

        # Randomize hazard positions each episode (never on a high-demand stop)
        available = [pos for pos in self.route if pos not in self.high_demand_stops]
        if len(available) >= 7:  # 3 police + 4 traffic lights
            sampled = self.np_random.choice(len(available), 7, replace=False)
            self.police_checkpoints = [available[i] for i in sampled[:3]]
            self.traffic_lights = [available[i] for i in sampled[3:7]]

        # Initial waiting passengers per stop (0-10), derived from the stop's coordinates.
        # NOTE(review): relies on hash() of an int tuple, which is stable within one
        # interpreter build but is an implementation detail — confirm this is acceptable.
        self.passengers_at_stop = {
            stop: hash(stop) % 11 for stop in self.high_demand_stops
        }

        return self._get_obs(), {}

    def _light_is_red_at(self, cell):
        """Return 1 if ``cell`` holds a traffic light that is red in the current cycle, else 0."""
        return 1 if cell in self.traffic_lights and (self.light_cycle % 2 == 0) else 0

    def _distance_ahead(self, hazards, horizon=5):
        """Return the distance (1..horizon) to the nearest cell of ``hazards`` ahead on the route.

        Only the next ``horizon`` route cells are inspected; ``horizon`` is
        returned when none of them is in ``hazards``.
        """
        last = min(self.pos_idx + horizon + 1, len(self.route))
        for i in range(self.pos_idx + 1, last):
            if self.route[i] in hazards:
                return i - self.pos_idx
        return horizon

    def _get_obs(self):
        """Build the 14-element observation for the current state.

        Layout (each entry scaled to [-1, 1]):
            [0] position x            [1] position y
            [2] passengers on board   [3] money earned
            [4] speed                 [5] red light at this cell
            [6] police at this cell   [7] must stop at this cell
            [8] at high-demand stop   [9] passengers waiting at this stop
            [10] must stop at the next cell (using the light phase shown now)
            [11] distance to next traffic light (capped at 5)
            [12] distance to next police checkpoint (capped at 5)
            [13] episode progress (step_count / max_steps)
        """
        if self.pos_idx >= len(self.route):
            x, y = 14, 0
        else:
            x, y = self.route[self.pos_idx]
        here = (x, y)

        # === CURRENT LOCATION HAZARDS ===
        light_is_red = self._light_is_red_at(here)
        police_here = 1 if here in self.police_checkpoints else 0
        must_stop_now = 1 if (light_is_red or police_here) else 0

        # === NEXT LOCATION HAZARDS ===
        next_idx = min(self.pos_idx + 1, len(self.route) - 1)
        ahead = self.route[next_idx]
        next_light_is_red = self._light_is_red_at(ahead)
        police_ahead = 1 if ahead in self.police_checkpoints else 0
        must_stop_next = 1 if (next_light_is_red or police_ahead) else 0

        # === PASSENGER STATE ===
        at_stop = 1 if here in self.high_demand_stops else 0
        passengers_waiting = self.passengers_at_stop.get(here, 0) if at_stop else 0

        # === DISTANCE AHEAD (lookahead over the next 5 cells) ===
        dist_to_light = self._distance_ahead(self.traffic_lights)
        dist_to_police = self._distance_ahead(self.police_checkpoints)

        # === BUILD OBSERVATION VECTOR (all normalized to [-1, 1]) ===
        obs = np.array([
            x / 14.0 * 2 - 1,                      # [0] position_x
            y / 14.0 * 2 - 1,                      # [1] position_y
            self.passengers / self.physical_max * 2 - 1,  # [2] current_passengers
            self.money / 150000.0 * 2 - 1,        # [3] money_earned
            self.speed / 3.0 * 2 - 1,              # [4] current_speed
            light_is_red * 2 - 1,                  # [5] light_is_red_HERE (critical)
            police_here * 2 - 1,                   # [6] police_checkpoint_HERE
            must_stop_now * 2 - 1,                 # [7] must_stop_now_HERE (critical)
            at_stop * 2 - 1,                       # [8] at_high_demand_stop
            passengers_waiting / 10.0 * 2 - 1,    # [9] passengers_waiting_at_stop
            must_stop_next * 2 - 1,                # [10] must_stop_next_location
            dist_to_light / 5.0 * 2 - 1,          # [11] distance_to_traffic_light
            dist_to_police / 5.0 * 2 - 1,         # [12] distance_to_police
            self.step_count / self.max_steps * 2 - 1,  # [13] episode_progress
        ], dtype=np.float32)

        return obs

    def step(self, action):
        """Apply ``action`` at the current cell, advance one cell, and score the step.

        Action: 0=Move, 1=Pickup, 2=Dropoff, 3=Stop, 4=SpeedUp.
        Movement is ALWAYS automatic; the action is judged against the cell
        the bus occupied when the action was chosen.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``info`` is always an empty dict.
        """
        self.step_count += 1

        x, y = self.route[self.pos_idx]
        here = (x, y)

        # Judge the action against the hazard state the agent actually observed:
        # the light phase must be read BEFORE the cycle advances, otherwise the
        # reward would be computed for the opposite phase of the one in the
        # previous observation.
        light_is_red = self._light_is_red_at(here)
        police_here = 1 if here in self.police_checkpoints else 0
        must_stop_here = 1 if (light_is_red or police_here) else 0
        at_stop = 1 if here in self.high_demand_stops else 0

        self.light_cycle += 1  # Update traffic light cycle

        terminated = False
        reached_destination = False  # stepped from the final route cell
        failed = False               # episode ended by a crash or a police shutdown

        # === PHASE 1: AUTOMATIC MOVEMENT (always happens) ===
        if self.pos_idx < len(self.route) - 1:
            self.pos_idx += 1
        else:
            reached_destination = True
            terminated = True

        # === PHASE 2: EXECUTE ACTION ===
        reward = 0.0

        if action == 0:  # MOVE
            # Movement already happened automatically; small progress bonus
            reward += 2
            if must_stop_here:
                reward -= 40  # ran through a red light or police checkpoint

        elif action == 1:  # PICKUP
            if at_stop and self.passengers < self.physical_max:
                # Board at least 3, or everyone waiting, limited by free capacity
                base_add = max(3, self.passengers_at_stop.get(here, 0))
                add = min(base_add, self.physical_max - self.passengers)
                self.passengers += add
                reward += 15

                # Deduct the boarded passengers from the stop's queue
                if here in self.passengers_at_stop:
                    self.passengers_at_stop[here] = max(0, self.passengers_at_stop[here] - add)
            else:
                reward -= 5  # not at a stop, or bus already full

            if must_stop_here:
                reward -= 10  # picking up inside a hazard zone

        elif action == 2:  # DROPOFF
            if at_stop and self.passengers > 0:
                drop = min(self.passengers, max(3, self.passengers // 2 + 1))
                self.passengers -= drop
                self.money += drop * 1000
                reward += 12
            else:
                reward -= 8  # not at a stop, or nobody on board

            if must_stop_here:
                reward -= 10  # dropping off inside a hazard zone

        elif action == 3:  # STOP (slows down / waits)
            self.speed = max(0, self.speed - 1)
            if must_stop_here:
                reward += 25  # correct safety action
            else:
                reward -= 3   # unnecessary stop

        elif action == 4:  # SPEEDUP
            if not must_stop_here and self.passengers <= 40:
                self.speed = min(self.speed + 1, 3)
                reward += 3
            else:
                if must_stop_here:
                    reward -= 15  # speeding through a hazard
                if self.passengers > 40:
                    reward -= 30  # overloaded and speeding: crash
                    terminated = True
                    failed = True

        # === PHASE 3: SAFETY VIOLATIONS AT THE CELL JUST ENTERED ===
        new_cell = self.route[self.pos_idx]
        if new_cell in self.police_checkpoints:
            if self.passengers > 40:
                reward -= 50  # severe: overloaded at police, episode ends
                self.fined = True
                terminated = True
                failed = True
            elif self.passengers > 33:
                reward -= 20  # illegal capacity, fined but may continue
                self.fined = True

        # === PHASE 4: PROGRESS & COMPLETION ===
        reward += 1  # base movement reward

        # The completion bonus is reserved for genuinely finishing the route;
        # a crash or police shutdown must not be rewarded like an arrival.
        if reached_destination and not failed:
            reward += 100
            if self.passengers <= 33 and not self.fined:
                reward += 50  # legal completion

        # === PHASE 5: TIME LIMIT ===
        truncated = self.step_count >= self.max_steps

        return self._get_obs(), reward, terminated, truncated, {}

    def render(self):
        """No-op: rendering is disabled for Colab."""
        pass

print("✓ DaladalaEnv class defined with new intelligent design")

## Section 4: Define DQN Hyperparameter Configurations

In [None]:
# 12 DQN hyperparameter configurations for systematic tuning.
# Each row is (name, learning_rate, buffer_size, exploration_fraction,
# exploration_final_eps, gamma); every run starts exploring at epsilon = 1.0.
_config_rows = [
    ("LR_1e4_buf_10k_eps_025", 1e-4, 10000, 0.25, 0.05, 0.99),
    ("LR_1e4_buf_50k_eps_05", 1e-4, 50000, 0.5, 0.05, 0.99),
    ("LR_3e4_buf_10k_eps_025", 3e-4, 10000, 0.25, 0.05, 0.995),
    ("LR_3e4_buf_50k_eps_05", 3e-4, 50000, 0.5, 0.05, 0.995),
    ("LR_5e4_buf_10k_eps_025", 5e-4, 10000, 0.25, 0.1, 0.99),
    ("LR_5e4_buf_50k_eps_05", 5e-4, 50000, 0.5, 0.1, 0.995),
    ("LR_7e4_buf_10k_eps_025", 7e-4, 10000, 0.25, 0.02, 0.99),
    ("LR_7e4_buf_50k_eps_05", 7e-4, 50000, 0.5, 0.02, 0.995),
    ("LR_1e3_buf_10k_eps_025", 1e-3, 10000, 0.25, 0.05, 0.99),
    ("LR_1e3_buf_50k_eps_05", 1e-3, 50000, 0.5, 0.05, 0.995),
    ("LR_1e3_buf_100k_eps_1", 1e-3, 100000, 1.0, 0.05, 0.99),
    ("LR_5e4_buf_100k_eps_1", 5e-4, 100000, 1.0, 0.1, 0.995),
]

dqn_configs = [
    {
        "name": run_name,
        "learning_rate": lr,
        "buffer_size": buffer_size,
        "exploration_fraction": explore_fraction,
        "exploration_initial_eps": 1.0,
        "exploration_final_eps": final_eps,
        "gamma": discount,
    }
    for run_name, lr, buffer_size, explore_fraction, final_eps, discount in _config_rows
]

print(f"✓ {len(dqn_configs)} DQN configurations defined")

## Section 5: Train DQN with All Configurations

In [None]:
# Hyperparameter sweep: train one DQN agent per configuration, evaluate it,
# and keep track of the best performer.
TOTAL_TIMESTEPS = 300000   # training budget per configuration
N_EVAL_EPISODES = 50       # evaluation episodes per configuration

# The best agent is checkpointed here every time it improves, so a Colab
# disconnect part-way through the multi-hour sweep does not lose it.
BEST_CHECKPOINT_PATH = '/content/drive/MyDrive/daladala_results/models/dqn/best_dqn'
os.makedirs(os.path.dirname(BEST_CHECKPOINT_PATH), exist_ok=True)

results = {}
best_reward = -float('inf')
best_config = None
best_model = None

total_configs = len(dqn_configs)

for idx, config in enumerate(dqn_configs, 1):
    print(f"\n{'='*60}")
    print(f"Training Configuration {idx}/{total_configs}: {config['name']}")
    print(f"{'='*60}")

    # Fresh environment per configuration
    env = DaladalaEnv()
    try:
        # Initialize DQN model with this configuration's hyperparameters
        model = DQN(
            'MlpPolicy',
            env,
            learning_rate=config['learning_rate'],
            buffer_size=config['buffer_size'],
            exploration_fraction=config['exploration_fraction'],
            exploration_initial_eps=config['exploration_initial_eps'],
            exploration_final_eps=config['exploration_final_eps'],
            gamma=config['gamma'],
            verbose=0,
            device='cpu'
        )

        print(f"Training for {TOTAL_TIMESTEPS:,} timesteps...")
        model.learn(total_timesteps=TOTAL_TIMESTEPS)

        print(f"Evaluating on {N_EVAL_EPISODES} episodes...")
        mean_reward, std_reward = evaluate_policy(
            model, env, n_eval_episodes=N_EVAL_EPISODES, deterministic=True
        )
    finally:
        # Release the environment even if training or evaluation raised
        env.close()

    results[config['name']] = {
        'config': config,
        'mean_reward': mean_reward,
        'std_reward': std_reward
    }

    print(f"✓ Mean Reward: {mean_reward:.2f} (±{std_reward:.2f})")

    # Track best model and persist it immediately
    if mean_reward > best_reward:
        best_reward = mean_reward
        best_config = config['name']
        best_model = model
        best_model.save(BEST_CHECKPOINT_PATH)
        print(f"★ New best model!")

print(f"\n{'='*60}")
print(f"TRAINING COMPLETE")
print(f"{'='*60}")
print(f"Best Configuration: {best_config}")
print(f"Best Mean Reward: {best_reward:.2f}")

## Section 6: Save Best Model and Results

In [None]:
# Persist the winning model to Google Drive.
best_model_path = '/content/drive/MyDrive/daladala_results/models/dqn/best_dqn'
best_model.save(best_model_path)
print(f"✓ Best model saved to: {best_model_path}")

# Flatten the in-memory results into a JSON-serialisable summary:
# plain floats for the scores plus the six tuned hyperparameters per run.
results_json_path = '/content/drive/MyDrive/daladala_results/results/dqn_results.json'
hyperparameter_keys = (
    'learning_rate',
    'buffer_size',
    'exploration_fraction',
    'exploration_initial_eps',
    'exploration_final_eps',
    'gamma',
)
results_summary = {
    run_name: {
        'mean_reward': float(run['mean_reward']),
        'std_reward': float(run['std_reward']),
        'hyperparameters': {key: run['config'][key] for key in hyperparameter_keys},
    }
    for run_name, run in results.items()
}

with open(results_json_path, 'w') as fh:
    json.dump(results_summary, fh, indent=2)
print(f"✓ Results saved to: {results_json_path}")

# Print a one-row-per-configuration comparison table.
separator = "=" * 80
print("\n" + separator)
print("RESULTS SUMMARY - All 12 DQN Configurations")
print(separator)
table_rows = []
for run_name, run in results.items():
    run_config = run['config']
    table_rows.append({
        'Config': run_name,
        'Mean Reward': f"{run['mean_reward']:.2f}",
        'Std Reward': f"{run['std_reward']:.2f}",
        'LR': run_config['learning_rate'],
        'Buffer': run_config['buffer_size'],
        'Gamma': run_config['gamma'],
    })
results_df = pd.DataFrame(table_rows)
print(results_df.to_string(index=False))
print(separator)

## Section 7: Test Best Model on Sample Episodes

In [None]:
# Roll out the best model greedily for 5 episodes and report per-episode returns.
print(f"\nTesting best model ({best_config}) on 5 sample episodes:")
print("="*60)

env = DaladalaEnv()
episode_rewards = []

for episode_number in range(1, 6):
    obs, _ = env.reset()
    total_reward = 0
    steps = 0

    # Step until the environment signals termination or truncation.
    while True:
        action, _ = best_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        steps += 1
        if terminated or truncated:
            break

    episode_rewards.append(total_reward)
    print(f"Episode {episode_number}: Reward = {total_reward:.2f} (Steps: {steps})")

env.close()

print("="*60)
print(f"Sample Episodes Mean Reward: {np.mean(episode_rewards):.2f}")
print(f"Sample Episodes Std Reward:  {np.std(episode_rewards):.2f}")
print("\n✓ DQN training and evaluation complete!")
print(f"✓ Models and results saved to Google Drive")