In [1]:
# Cell 1: Clone repo and install dependencies
import os

REPO_URL = "https://github.com/elonmj/Code-traffic-flow.git"
REPO_DIR = "/content/Code-traffic-flow"

if os.path.exists(REPO_DIR):
    print(f"Repository already exists at {REPO_DIR}")
    %cd {REPO_DIR}
    !git pull
else:
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}

!pip install stable-baselines3 gymnasium numba --quiet
print(f"‚úÖ Setup complete | Working dir: {os.getcwd()}")

Cloning into '/content/Code-traffic-flow'...
remote: Enumerating objects: 9056, done.[K
remote: Counting objects: 100% (284/284), done.[K
remote: Enumerating objects: 9056, done.[K[K
remote: Counting objects: 100% (284/284), done.[K
remote: Compressing objects: 100% (199/199), done.[K
remote: Compressing objects: 100% (199/199), done.[K
remote: Total 9056 (delta 150), reused 204 (delta 79), pack-reused 8772 (from 2)[K
Receiving objects: 100% (9056/9056), 317.59 MiB | 50.58 MiB/s, done.
remote: Total 9056 (delta 150), reused 204 (delta 79), pack-reused 8772 (from 2)[K
Receiving objects: 100% (9056/9056), 317.59 MiB | 50.58 MiB/s, done.
Resolving deltas: 100% (4646/4646), done.
Resolving deltas: 100% (4646/4646), done.
/content/Code-traffic-flow
/content/Code-traffic-flow
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m187.2/187.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90

In [2]:
# Cell 2: Imports & Setup - STAGE 4: VARIABLE DEMAND
import sys
sys.path.insert(0, REPO_DIR)

import numpy as np
import torch
import time
import os
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.callbacks import BaseCallback

from arz_model.config import create_victoria_island_config
from Code_RL.src.env.traffic_signal_env_direct_v3 import TrafficSignalEnvDirectV3
from Code_RL.src.env.variable_demand_wrapper import VariableDemandEnv  # NEW: Variable Demand

# Mount Google Drive for persistence
try:
    from google.colab import drive
    drive.mount('/content/drive')
    SAVE_DIR = "/content/drive/MyDrive/thesis_runs_stage4"  # Stage 4 directory
    os.makedirs(SAVE_DIR, exist_ok=True)
    print(f"‚úÖ Google Drive mounted. Saving results to: {SAVE_DIR}")
except:
    SAVE_DIR = "/content"
    print(f"‚ö†Ô∏è Google Drive not available. Saving to: {SAVE_DIR}")

print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


‚ö†Ô∏è Google Drive not available. Saving to: /content
PyTorch: 2.9.0+cu126
CUDA: True
GPU: Tesla T4


In [3]:
# Cell 3: Configuration - STAGE 4: DOMAIN RANDOMIZATION
# ============================================================================
# THEORETICAL JUSTIFICATION (Webster's Formula):
# For stationary demand q, the delay d = f(Œª) is convex where Œª = g/C
# Therefore, Fixed-Time tuned to (Œª*, C*) is GLOBALLY OPTIMAL for constant q
# RL CANNOT beat this theoretical optimum; it can only match it
# 
# SOLUTION: Break stationarity with Domain Randomization
# œÅ_in ~ U(80, 280) veh/km creates scenarios where RL > Fixed-Time
# ============================================================================

DEFAULT_DENSITY = 120.0  # Initial network density

# Domain Randomization bounds (as per thesis section 8)
DENSITY_RANGE = (80.0, 280.0)   # veh/km: LOS A-B to LOS E-F
VELOCITY_RANGE = (30.0, 50.0)   # km/h: entry velocity variation

# Reward weights (no switch penalty for exploration)
REWARD_WEIGHTS = {'alpha': 5.0, 'kappa': 0.0, 'mu': 0.1}

def create_fixed_demand_env(inflow_density=180.0, quiet=True):
    """Create environment with FIXED demand (for baseline comparison)."""
    config = create_victoria_island_config(
        t_final=450.0, output_dt=15.0, cells_per_100m=4,
        default_density=DEFAULT_DENSITY, inflow_density=inflow_density, use_cache=False
    )
    config.rl_metadata = {'observation_segment_ids': [s.id for s in config.segments], 'decision_interval': 15.0}
    return TrafficSignalEnvDirectV3(
        simulation_config=config,
        decision_interval=15.0, reward_weights=REWARD_WEIGHTS, quiet=quiet
    )

def create_variable_demand_env(quiet=True, seed=None):
    """Create environment with VARIABLE demand (Domain Randomization)."""
    return VariableDemandEnv(
        density_range=DENSITY_RANGE,
        velocity_range=VELOCITY_RANGE,
        default_density=DEFAULT_DENSITY,
        t_final=450.0,
        decision_interval=15.0,
        reward_weights=REWARD_WEIGHTS,
        seed=seed,
        quiet=quiet
    )

print(f"‚úÖ STAGE 4: Variable Demand Environment Ready")
print(f"   Domain Randomization: œÅ_in ~ U{DENSITY_RANGE} veh/km")
print(f"   Velocity Range: v_in ~ U{VELOCITY_RANGE} km/h")
print(f"   Reward Weights: Œ±={REWARD_WEIGHTS['alpha']}, Œ∫={REWARD_WEIGHTS['kappa']}, Œº={REWARD_WEIGHTS['mu']}")

‚úÖ STAGE 4: Variable Demand Environment Ready
   Domain Randomization: œÅ_in ~ U(80.0, 280.0) veh/km
   Velocity Range: v_in ~ U(30.0, 50.0) km/h
   Reward Weights: Œ±=5.0, Œ∫=0.0, Œº=0.1


In [4]:
# Cell 4: Evaluate Baselines on FIXED Demand (Reference Point)
# This establishes the baseline performance on medium demand (œÅ=180)
env = create_fixed_demand_env(inflow_density=180.0)
baseline_results = {}

def eval_fixed_time(env, interval, n_ep=3):
    """Evaluate Fixed-Time controller with given cycle interval."""
    rewards = []
    for _ in range(n_ep):
        obs, _ = env.reset()
        done, ep_r, t = False, 0.0, 0.0
        while not done:
            t += env.decision_interval
            action = 1 if t >= interval else 0
            if action == 1: t = 0.0
            obs, r, done, _, _ = env.step(action)
            ep_r += r
        rewards.append(ep_r)
    return {'mean_reward': np.mean(rewards), 'std_reward': np.std(rewards)}

# Random baseline
print("üé≤ Random (Reference)...")
rnd = [sum([env.step(env.action_space.sample())[1] for _ in range(30)]) for _ in [env.reset() for _ in range(3)]]
baseline_results['Random'] = {'mean_reward': np.mean(rnd), 'std_reward': np.std(rnd)}

# Fixed-time baselines on FIXED demand
for name, interval in [('FT-30s', 30), ('FT-60s', 60), ('FT-90s', 90)]:
    print(f"‚è±Ô∏è {name} (œÅ=180, fixed)...")
    baseline_results[name] = eval_fixed_time(env, interval)

print("\nüìä BASELINES (Fixed Demand œÅ=180):")
for n, d in sorted(baseline_results.items(), key=lambda x: x[1]['mean_reward'], reverse=True):
    print(f"  {n:10s}: {d['mean_reward']:>8.1f} ¬± {d['std_reward']:.1f}")

FT90_REF = baseline_results['FT-90s']['mean_reward']
print(f"\nüéØ Reference for comparison: FT-90s = {FT90_REF:.1f}")


üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through nodes: 37
üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through nodes: 37


  return datetime.utcnow().replace(tzinfo=utc)


   üö¶ Detected 8 signalized nodes from OSM data

   üîß Generating segment configurations...
   ‚úÖ Created 70 segment configurations

   üîß Generating node configurations...
   ‚úÖ Created 60 node configurations

   ‚öôÔ∏è  Setting up time and physics parameters...

   üî® Assembling complete network configuration...

‚úÖ CONFIGURATION GENERATION COMPLETE
   Total Segments: 70

   üîß Generating segment configurations...
   ‚úÖ Created 70 segment configurations

   üîß Generating node configurations...
   ‚úÖ Created 60 node configurations

   ‚öôÔ∏è  Setting up time and physics parameters...

   üî® Assembling complete network configuration...

‚úÖ CONFIGURATION GENERATION COMPLETE
   Total Segments: 70
   Total Nodes: 60
   Entry Points: 4
   Exit Points: 4
   Junctions: 15
   Signalized Nodes: 8
   Simulation Duration: 450.0s (7.5 min)
   Grid Resolution: 4 cells/100m

   Total Nodes: 60
   Entry Points: 4
   Exit Points: 4
   Junctions: 15
   Signalized Nodes: 8
   Simula

  return datetime.utcnow().replace(tzinfo=utc)


   [10/70] segments created
   [20/70] segments created
   [30/70] segments created
   [40/70] segments created
   [50/70] segments created
   [60/70] segments created
   [70/70] segments created
[NETWORK BUILD] ‚úÖ Network construction complete!
   Total segments: 70
   Total nodes: 60
Finalizing network structure and validating topology...
‚úÖ Network topology is valid.


  return datetime.utcnow().replace(tzinfo=utc)


‚úÖ GPUMemoryPool initialized:
   - Segments: 70
   - Total cells: 795
   - Ghost cells: 3
   - Compute Capability: (6, 0)
   - CUDA streams: Enabled
   - GPU memory allocated: 12.00 MB


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


  - Preparing GPU topology for network coupling...
    - GPU topology prepared and transferred.
üé≤ Random (Reference)...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


‚è±Ô∏è FT-30s (œÅ=180, fixed)...




‚è±Ô∏è FT-60s (œÅ=180, fixed)...
‚è±Ô∏è FT-90s (œÅ=180, fixed)...
‚è±Ô∏è FT-90s (œÅ=180, fixed)...

üìä BASELINES (Fixed Demand œÅ=180):
  Random    :    806.6 ¬± 18.2
  FT-60s    :    799.7 ¬± 0.0
  FT-90s    :    799.6 ¬± 0.0
  FT-30s    :    792.1 ¬± 10.6

üéØ Reference for comparison: FT-90s = 799.6

üìä BASELINES (Fixed Demand œÅ=180):
  Random    :    806.6 ¬± 18.2
  FT-60s    :    799.7 ¬± 0.0
  FT-90s    :    799.6 ¬± 0.0
  FT-30s    :    792.1 ¬± 10.6

üéØ Reference for comparison: FT-90s = 799.6


### üìê Justification Th√©orique : Pourquoi la Demande Variable ?

**Probl√®me avec la demande constante** (Stage 3) :
- Pour une demande stationnaire $q_{in}(t) = \bar{q}$, le d√©lai moyen $d = f(\lambda)$ est une **fonction convexe**
- Il existe donc un unique optimum $(\lambda^*, C^*)$ ‚Üí Fixed-Time configur√© sur cet optimum est **globalement optimal**
- L'agent RL ne peut math√©matiquement pas faire mieux ; il converge vers FT-90s

**Solution : Domain Randomization** (Stage 4) :
$$\rho_{in} \sim \mathcal{U}(80, 280) \text{ veh/km}$$

Cette variabilit√© cr√©e des conditions o√π :
1. **R√©gime fluide** (œÅ < 100) : FT-90s gaspille du temps de vert inutile
2. **R√©gime satur√©** (œÅ > 200) : FT-90s cr√©e des files d'attente r√©siduelles
3. **L'agent RL** apprend √† **diagnostiquer** l'√©tat et **adapter** dynamiquement ses d√©cisions

**Hypoth√®se H5** : L'agent RL surpassera FT-90s sur les sc√©narios de demande variable.

In [None]:
# Cell 4b: Test TRUE Domain Randomization
# This verifies that inflow_density ACTUALLY changes the simulation behavior
print("üî¨ Testing TRUE Domain Randomization...")
print("="*60)

test_env = create_variable_demand_env(quiet=True, seed=42)

# Run 5 episodes with Domain Randomization
print("\nüìä Domain Randomization Test (5 episodes):")
print("-"*60)
episode_densities = []
episode_rewards = []

for ep in range(5):
    obs, info = test_env.reset()
    density = info.get('inflow_density', 'N/A')
    velocity = info.get('inflow_velocity', 'N/A')
    episode_densities.append(density)
    
    ep_reward = 0.0
    steps = 0
    while True:
        action = test_env.action_space.sample()  # Random actions for test
        obs, reward, done, truncated, _ = test_env.step(action)
        ep_reward += reward
        steps += 1
        if done or truncated:
            break
    
    episode_rewards.append(ep_reward)
    print(f"  Episode {ep+1}: œÅ_in={density:.0f} veh/km, v_in={velocity:.0f} km/h ‚Üí Reward={ep_reward:.1f}")

# Verify variation
density_std = np.std(episode_densities)
print("-"*60)
print(f"\n‚úÖ Density variation: œÉ = {density_std:.1f} veh/km")
if density_std > 10:
    print("   ‚Üí TRUE Domain Randomization is working!")
    print("   ‚Üí Each episode has DIFFERENT traffic conditions")
else:
    print("   ‚ö†Ô∏è WARNING: Low variation - check implementation")

print(f"\nüìà Reward variation: min={min(episode_rewards):.1f}, max={max(episode_rewards):.1f}")
print("="*60)

In [6]:
# Cell 5: Progressive Training Callback with BEST MODEL CHECKPOINTING
class ProgressCallback(BaseCallback):
    """
    Callback to track training progress with variable demand.
    
    Key feature: Saves BEST model based on evaluation reward, not just periodic saves.
    This ensures we keep the best-performing model even if performance degrades later.
    """
    def __init__(self, eval_env, ref_reward, save_path, target_pct=10.0, eval_freq=5000, n_eval=5):
        super().__init__()
        self.eval_env = eval_env
        self.ref = ref_reward
        self.target = ref_reward * (1 + target_pct/100)
        self.eval_freq = eval_freq
        self.n_eval = n_eval
        self.save_path = save_path  # Path to save best model
        self.history = []
        self.best = -np.inf
        self.best_step = 0
        self.reached = False
        
    def _on_step(self):
        if self.n_calls % self.eval_freq == 0:
            rewards = []
            densities = []
            all_actions = []
            
            for _ in range(self.n_eval):
                obs, info = self.eval_env.reset()
                densities.append(info.get('inflow_density', 180))
                done = False
                ep_reward = 0
                ep_actions = []
                
                while not done:
                    action, _ = self.model.predict(obs, deterministic=True)
                    ep_actions.append(int(action))
                    obs, reward, done, truncated, _ = self.eval_env.step(action)
                    ep_reward += reward
                    if truncated:
                        break
                        
                rewards.append(ep_reward)
                all_actions.append(ep_actions)
            
            mean_r = np.mean(rewards)
            std_r = np.std(rewards)
            mean_density = np.mean(densities)
            imp = ((mean_r - self.ref) / abs(self.ref)) * 100
            
            self.history.append({
                'step': self.num_timesteps, 
                'reward': mean_r, 
                'std': std_r,
                'improvement': imp,
                'mean_density': mean_density
            })
            
            # BEST MODEL CHECKPOINTING: Save only when we improve
            new_best = False
            if mean_r > self.best:
                self.best = mean_r
                self.best_step = self.num_timesteps
                self.model.save(f"{self.save_path}/model_BEST")
                new_best = True
            
            # Action analysis
            n_switches = sum(all_actions[0]) if all_actions else 0
            n_steps = len(all_actions[0]) if all_actions else 1
            switch_rate = n_switches / n_steps * 100
            
            best_marker = "üíæ BEST" if new_best else ""
            status = "üéØ" if imp >= 10 else ""
            print(f"  [{self.num_timesteps:>6}] R={mean_r:>7.1f}¬±{std_r:.1f} | œÅ={mean_density:.0f} | Œî={imp:>+5.1f}% | Sw={switch_rate:.0f}% {status} {best_marker}")
            
            if imp >= 10 and not self.reached:
                self.reached = True
                print(f"\nüèÜ TARGET +10% REACHED at step {self.num_timesteps}!")
                
        return True

print(f"‚úÖ Callback ready with BEST MODEL CHECKPOINTING")
print(f"   Reference: FT-90s (fixed œÅ=180) = {FT90_REF:.1f}")
print(f"   Target: +10% improvement = {FT90_REF*1.1:.1f}")
print(f"   Best model saved automatically when performance improves")

‚úÖ Callback ready with BEST MODEL CHECKPOINTING
   Reference: FT-90s (fixed œÅ=180) = 799.6
   Target: +10% improvement = 879.5
   Best model saved automatically when performance improves


In [7]:
# Cell 6: Initialize DQN Model - STAGE 4 (200k Steps, 10 blocks of 20k)
# Create environments with Domain Randomization
train_env = create_variable_demand_env(quiet=True, seed=42)
eval_env = create_variable_demand_env(quiet=True, seed=123)  # Different seed for eval

TOTAL_STEPS = 200_000
BLOCK_SIZE = 20_000
N_BLOCKS = 10  # 10 blocks √ó 20k = 200k total

# DQN with 30% exploration (60k steps out of 200k)
model = DQN(
    "MlpPolicy", 
    train_env, 
    learning_rate=1e-4, 
    buffer_size=50000,    # Smaller buffer for 200k training
    learning_starts=1000,
    batch_size=64, 
    tau=0.005, 
    gamma=0.99,
    exploration_fraction=0.3,  # 30% exploration (60k steps)
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    verbose=0, 
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Callback with best model saving
callback = ProgressCallback(
    eval_env, FT90_REF, 
    save_path=SAVE_DIR,  # Path for best model
    target_pct=10.0, 
    eval_freq=5000,  # Evaluate every 5k steps
    n_eval=5
)
STATE = {'model': model, 'callback': callback, 'steps': 0, 'block': 0, 'done': False}

print(f"üöÄ STAGE 4: DQN ready on {model.device}")
print(f"   Training Environment: Variable Demand œÅ ~ U(80, 280)")
print(f"   Total Steps: {TOTAL_STEPS:,} ({N_BLOCKS} blocks √ó {BLOCK_SIZE//1000}k)")
print(f"   Exploration: 30% ({int(TOTAL_STEPS*0.3):,} steps)")
print(f"   Best Model: Auto-saved when performance improves")


üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through nodes: 37
   üö¶ Detected 8 signalized nodes from OSM data

   üîß Generating segment configurations...
   ‚úÖ Created 70 segment co

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)



üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through nodes: 37
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through nodes: 37
   üö¶ Detected 

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


üöÄ STAGE 4: DQN ready on cuda
   Training Environment: Variable Demand œÅ ~ U(80, 280)
   Total Steps: 200,000 (10 blocks √ó 20k)
   Exploration: 30% (60,000 steps)
   Best Model: Auto-saved when performance improves


  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# Block 1: Steps 0 ‚Üí 20k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 1: 0 ‚Üí 20k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 1, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 1 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

üìä BLOCK 1: 0 ‚Üí 20k

üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...

üè≠ VICTORIA ISLAND CONFIG FACTORY - GLOBAL CONFIGURATION GENERATION
   üìä Loading topology from: /content/Code-traffic-flow/arz_model/data/fichier_de_travail_corridor_utf8.csv
   ‚úÖ Loaded 70 edges from topology
   üîó Building directed graph...
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through nodes: 37
   ‚úÖ Graph built: 60 nodes, 70 edges
   üß† Analyzing network structure (global reflection)...
   ‚úÖ Network analysis complete:
      - Entry points: 4
      - Exit points: 4
      - Junctions: 15
      - Simple pass-through no

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Output()

In [None]:
# Block 2: Steps 20k ‚Üí 40k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 2: 20k ‚Üí 40k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 2, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 2 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 3: Steps 40k ‚Üí 60k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 3: 40k ‚Üí 60k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 3, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 3 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 4: Steps 60k ‚Üí 80k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 4: 60k ‚Üí 80k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 4, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 4 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 5: Steps 80k ‚Üí 100k (HALFWAY)
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 5: 80k ‚Üí 100k (HALFWAY)")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 5, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    imp = ((STATE['callback'].best - FT90_REF) / abs(FT90_REF)) * 100
    print(f"üìà HALFWAY: {STATE['steps']:,} steps | Best: {STATE['callback'].best:.1f} | Œî={imp:+.1f}%")

In [None]:
# Block 6: Steps 100k ‚Üí 120k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 6: 100k ‚Üí 120k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 6, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 6 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 7: Steps 120k ‚Üí 140k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 7: 120k ‚Üí 140k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 7, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 7 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 8: Steps 140k ‚Üí 160k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 8: 140k ‚Üí 160k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 8, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 8 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 9: Steps 160k ‚Üí 180k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 9: 160k ‚Üí 180k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 9, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    print(f"‚úÖ Block 9 done | Steps: {STATE['steps']:,} | Best: {STATE['callback'].best:.1f} (at {STATE['callback'].best_step:,})")

In [None]:
# Block 10 (FINAL): Steps 180k ‚Üí 200k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 10 (FINAL): 180k ‚Üí 200k")
    STATE['model'].learn(BLOCK_SIZE, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 10, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached

# Final summary
imp = ((STATE['callback'].best - FT90_REF) / abs(FT90_REF)) * 100
print(f"\n{'='*60}")
print(f"üèÅ TRAINING COMPLETE - STAGE 4")
print(f"{'='*60}")
print(f"   Total steps: {STATE['steps']:,}")
print(f"   Best reward: {STATE['callback'].best:.1f} (at step {STATE['callback'].best_step:,})")
print(f"   Improvement vs FT-90s: {imp:+.1f}%")
print(f"   Target (+10%): {'‚úÖ ACHIEVED' if imp >= 10 else '‚ùå NOT YET'}")
print(f"{'='*60}")
print(f"\nüíæ Best model saved at: {SAVE_DIR}/model_BEST.zip")

In [None]:
# Final: Plot Learning Curve & Save Results
import matplotlib.pyplot as plt
import json

h = STATE['callback'].history
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Plot 1: Learning Curve
ax1 = axes[0]
ax1.plot([x['step'] for x in h], [x['reward'] for x in h], 'b-o', markersize=2, label='RL Agent')
ax1.fill_between([x['step'] for x in h], 
                 [x['reward']-x.get('std',0) for x in h], 
                 [x['reward']+x.get('std',0) for x in h], alpha=0.2)
ax1.axhline(y=FT90_REF, color='r', linestyle='--', label='FT-90s (œÅ=180)')
ax1.set_xlabel('Training Steps')
ax1.set_ylabel('Reward')
ax1.legend()
ax1.set_title('Stage 4: Learning Curve (Variable Demand)')

# Plot 2: Improvement vs FT-90s
ax2 = axes[1]
ax2.plot([x['step'] for x in h], [x['improvement'] for x in h], 'b-o', markersize=2)
ax2.axhline(y=10, color='g', linestyle='--', label='Target +10%')
ax2.axhline(y=0, color='r', linestyle='-', alpha=0.5)
ax2.set_xlabel('Training Steps')
ax2.set_ylabel('Improvement vs FT-90s (%)')
ax2.legend()
ax2.set_title('Improvement Over Fixed-Time Baseline')

# Plot 3: Mean Density Distribution during eval
ax3 = axes[2]
ax3.plot([x['step'] for x in h], [x.get('mean_density', 180) for x in h], 'g-o', markersize=2)
ax3.axhline(y=180, color='orange', linestyle='--', label='Fixed œÅ=180')
ax3.set_xlabel('Training Steps')
ax3.set_ylabel('Mean Eval Density (veh/km)')
ax3.legend()
ax3.set_title('Variable Demand Distribution')

plt.tight_layout()
plt.savefig(f'{SAVE_DIR}/stage4_learning_curve.png', dpi=150)
plt.show()

# Save JSON results
results = {
    'stage': 'Stage 4 - Variable Demand',
    'baselines': baseline_results, 
    'history': h, 
    'best': STATE['callback'].best,
    'improvement': ((STATE['callback'].best - FT90_REF) / abs(FT90_REF)) * 100,
    'density_range': list(DENSITY_RANGE),
    'total_steps': STATE['steps']
}
with open(f'{SAVE_DIR}/stage4_results.json', 'w') as f: 
    json.dump(results, f, indent=2)
    
print(f"üìÅ Saved: {SAVE_DIR}/stage4_learning_curve.png")
print(f"üìÅ Saved: {SAVE_DIR}/stage4_results.json")

In [None]:
# CRITICAL: Comparative Evaluation on Multiple Demand Scenarios
# Load BEST model for evaluation (not the final one which may be worse)
print("\n" + "="*70)
print("üìä STAGE 4: COMPARATIVE EVALUATION - RL vs FT-90s")
print("="*70)

# Load the best model (saved during training when performance improved)
print(f"üíæ Loading BEST model from: {SAVE_DIR}/model_BEST.zip")
best_model = DQN.load(f"{SAVE_DIR}/model_BEST", device='cuda' if torch.cuda.is_available() else 'cpu')
print(f"   Best reward during training: {STATE['callback'].best:.1f}")
print(f"   Best model saved at step: {STATE['callback'].best_step:,}")

# Define test scenarios covering the demand spectrum
TEST_SCENARIOS = {
    'Light (œÅ=100)': 100.0,    # LOS A-B: Free flow
    'Medium (œÅ=180)': 180.0,   # LOS C-D: Reference scenario
    'Heavy (œÅ=250)': 250.0,    # LOS E: Near saturation
    'Peak (œÅ=300)': 300.0      # LOS F: Oversaturation
}

def evaluate_on_scenario(model, inflow_density, n_episodes=5):
    """Evaluate RL agent on a specific demand scenario."""
    env = create_fixed_demand_env(inflow_density=inflow_density, quiet=True)
    rewards = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        done, ep_r = False, 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, r, done, _, _ = env.step(action)
            ep_r += r
        rewards.append(ep_r)
    return {'mean': np.mean(rewards), 'std': np.std(rewards)}

def evaluate_ft90_on_scenario(inflow_density, n_episodes=5):
    """Evaluate FT-90s baseline on a specific demand scenario."""
    env = create_fixed_demand_env(inflow_density=inflow_density, quiet=True)
    rewards = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        done, ep_r, t = False, 0.0, 0.0
        while not done:
            t += env.decision_interval
            action = 1 if t >= 90 else 0
            if action == 1: t = 0.0
            obs, r, done, _, _ = env.step(action)
            ep_r += r
        rewards.append(ep_r)
    return {'mean': np.mean(rewards), 'std': np.std(rewards)}

# Evaluate BEST model (not STATE['model']) on all scenarios
rl_results = {}
ft90_results = {}

print("\nüìà Evaluating BEST model on each demand scenario...")
for name, density in TEST_SCENARIOS.items():
    print(f"  Testing {name}...")
    rl_results[name] = evaluate_on_scenario(best_model, density)  # Use best_model
    ft90_results[name] = evaluate_ft90_on_scenario(density)

# Print comparison table
print("\n" + "="*70)
print(f"{'Scenario':<20} | {'RL (BEST)':>12} | {'FT-90s':>12} | {'Œî (%)':>10}")
print("-"*70)
total_rl, total_ft = 0, 0
for scenario in TEST_SCENARIOS:
    rl_mean = rl_results[scenario]['mean']
    ft_mean = ft90_results[scenario]['mean']
    improvement = ((rl_mean - ft_mean) / abs(ft_mean)) * 100
    total_rl += rl_mean
    total_ft += ft_mean
    marker = "üéØ" if improvement > 5 else ("‚úì" if improvement > 0 else "")
    print(f"{scenario:<20} | {rl_mean:>12.1f} | {ft_mean:>12.1f} | {improvement:>+9.1f}% {marker}")

print("-"*70)
avg_improvement = ((total_rl - total_ft) / abs(total_ft)) * 100
print(f"{'AVERAGE':<20} | {total_rl/4:>12.1f} | {total_ft/4:>12.1f} | {avg_improvement:>+9.1f}%")
print("="*70)

# Store scenario results
scenario_results = {'rl': rl_results, 'ft90': ft90_results, 'scenarios': TEST_SCENARIOS}

In [None]:
# Final Visualization: Bar Chart Comparison
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Bar chart comparison
scenarios = list(TEST_SCENARIOS.keys())
rl_means = [rl_results[s]['mean'] for s in scenarios]
ft_means = [ft90_results[s]['mean'] for s in scenarios]
x = np.arange(len(scenarios))
width = 0.35

ax1 = axes[0]
bars1 = ax1.bar(x - width/2, rl_means, width, label='RL BEST Model', color='#2ecc71')
bars2 = ax1.bar(x + width/2, ft_means, width, label='Fixed-Time (90s)', color='#e74c3c')
ax1.set_xlabel('Traffic Demand Scenario')
ax1.set_ylabel('Mean Reward')
ax1.set_title('Stage 4: RL (BEST) vs Fixed-Time Across Demand Levels')
ax1.set_xticks(x)
ax1.set_xticklabels(scenarios, rotation=15)
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Plot 2: Improvement percentage
improvements = [((rl_results[s]['mean'] - ft90_results[s]['mean']) / abs(ft90_results[s]['mean'])) * 100 
                for s in scenarios]
colors = ['#27ae60' if imp > 0 else '#c0392b' for imp in improvements]
ax2 = axes[1]
bars = ax2.bar(scenarios, improvements, color=colors, edgecolor='black')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax2.axhline(y=10, color='green', linestyle='--', label='Target +10%', alpha=0.7)
ax2.set_xlabel('Traffic Demand Scenario')
ax2.set_ylabel('Improvement over FT-90s (%)')
ax2.set_title('RL Advantage by Demand Level')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar, imp in zip(bars, improvements):
    height = bar.get_height()
    ax2.annotate(f'{imp:+.1f}%',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points",
                ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{SAVE_DIR}/stage4_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Final Summary
print("\n" + "="*70)
print("üèÅ STAGE 4 COMPLETE: VARIABLE DEMAND VALIDATION")
print("="*70)
print(f"   Training: {STATE['steps']:,} steps ({N_BLOCKS} blocks √ó {BLOCK_SIZE//1000}k)")
print(f"   Best Reward: {STATE['callback'].best:.1f} (at step {STATE['callback'].best_step:,})")
print(f"   Overall Improvement: {avg_improvement:+.1f}%")
print(f"   Target (+10%): {'‚úÖ ACHIEVED' if avg_improvement >= 10 else '‚ùå NOT YET'}")
print("="*70)

# Save all results
final_results = {
    'stage': 'Stage 4 - Variable Demand Training (200k)',
    'training': {
        'total_steps': STATE['steps'],
        'block_size': BLOCK_SIZE,
        'n_blocks': N_BLOCKS,
        'best_reward': STATE['callback'].best,
        'best_step': STATE['callback'].best_step,
        'history': STATE['callback'].history
    },
    'evaluation': {
        'scenarios': scenario_results,
        'average_improvement': avg_improvement
    },
    'config': {
        'density_range': list(DENSITY_RANGE),
        'velocity_range': list(VELOCITY_RANGE),
        'reward_weights': REWARD_WEIGHTS
    }
}
with open(f'{SAVE_DIR}/stage4_final_results.json', 'w') as f:
    json.dump(final_results, f, indent=2)
print(f"\nüìÅ All results saved to: {SAVE_DIR}/")