In [None]:
# Cell 1: Clone repo and install dependencies
import os

REPO_URL = "https://github.com/elonmj/Code-traffic-flow.git"
REPO_DIR = "/content/Code-traffic-flow"

if os.path.exists(REPO_DIR):
    print(f"Repository already exists at {REPO_DIR}")
    %cd {REPO_DIR}
    !git pull
else:
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}

!pip install stable-baselines3 gymnasium numba --quiet
print(f"‚úÖ Setup complete | Working dir: {os.getcwd()}")

In [None]:
# Cell 2: Imports & Setup
import sys
sys.path.insert(0, REPO_DIR)

import numpy as np
import torch
import time
import os
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.callbacks import BaseCallback

from arz_model.config import create_victoria_island_config
from Code_RL.src.env.traffic_signal_env_direct_v3 import TrafficSignalEnvDirectV3

# Mount Google Drive for persistence
# Outside Colab the `google.colab` import fails, and inside Colab the mount can
# fail as well; both cases fall back to local storage.  Only `Exception` is
# caught so that a keyboard/kernel interrupt is not silently swallowed.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    SAVE_DIR = "/content/drive/MyDrive/thesis_runs_stage3"
    os.makedirs(SAVE_DIR, exist_ok=True)
    print(f"‚úÖ Google Drive mounted. Saving results to: {SAVE_DIR}")
except Exception:
    SAVE_DIR = "/content"
    print(f"‚ö†Ô∏è Google Drive not available. Saving to: {SAVE_DIR}")

# Log the PyTorch build and whether a CUDA device is visible to it.
print(f"PyTorch: {torch.__version__}", f"CUDA: {torch.cuda.is_available()}", sep="\n")
if torch.cuda.is_available():
    # Name of the first CUDA device (index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Cell 3: Configuration
# Densities handed to the simulation config factory (congested scenario).
DEFAULT_DENSITY, INFLOW_DENSITY = 120.0, 180.0

# Reward weights given to the environment.
# kappa weights the phase-change (switch) penalty; it is set to 0.0 so that the
# agent is free to explore switching without being penalised for it.
REWARD_WEIGHTS = dict(alpha=5.0, kappa=0.0, mu=0.1)

def create_env(quiet=True):
    """Build a fresh traffic-signal environment for the Victoria Island scenario.

    The simulation config is created from the module-level ``DEFAULT_DENSITY``
    and ``INFLOW_DENSITY``; the environment receives ``REWARD_WEIGHTS``.

    Args:
        quiet: Forwarded unchanged to ``TrafficSignalEnvDirectV3``.

    Returns:
        A new ``TrafficSignalEnvDirectV3`` instance.
    """
    # One value shared by the RL metadata and the environment constructor.
    decision_interval = 15.0

    config = create_victoria_island_config(
        t_final=450.0, output_dt=15.0, cells_per_100m=4,
        default_density=DEFAULT_DENSITY, inflow_density=INFLOW_DENSITY, use_cache=False
    )
    config.rl_metadata = {
        'observation_segment_ids': [s.id for s in config.segments],
        'decision_interval': decision_interval,
    }

    # The simulation config is handed to the environment directly; no wrapper
    # object is needed because only the config itself is consumed here.
    return TrafficSignalEnvDirectV3(
        simulation_config=config,
        decision_interval=decision_interval, reward_weights=REWARD_WEIGHTS, quiet=quiet
    )

# Echo the active reward weights so the run log records the configuration.
print(
    f"‚úÖ Environment ready | kappa={REWARD_WEIGHTS['kappa']} (no switch penalty)",
    f"   Congestion: alpha={REWARD_WEIGHTS['alpha']}, Throughput: mu={REWARD_WEIGHTS['mu']}",
    sep="\n",
)

In [None]:
# Cell 4: Evaluate Baselines (Random, FT-30s, FT-60s, FT-90s)
# Results are collected per baseline name; one environment is shared by all
# baseline evaluations in this cell.
baseline_results = dict()
env = create_env()

def eval_fixed_time(env, interval, n_ep=3):
    """Evaluate a fixed-time controller that issues action 1 every ``interval``.

    A timer is advanced by ``env.decision_interval`` before each step.  Once it
    reaches ``interval`` the controller sends action 1 and restarts the timer;
    otherwise it sends action 0.

    Args:
        env: Gymnasium-style environment exposing ``decision_interval``,
            ``reset()`` and a ``step(action)`` that returns
            ``(obs, reward, terminated, truncated, info)``.
        interval: Timer value (same units as ``env.decision_interval``) at which
            action 1 is issued.
        n_ep: Number of episodes to run.

    Returns:
        dict with ``'mean_reward'`` and ``'std_reward'`` of the episode returns.
    """
    rewards = []
    for _ in range(n_ep):
        env.reset()
        done, ep_r, t = False, 0.0, 0.0
        while not done:
            t += env.decision_interval
            action = 1 if t >= interval else 0
            if action == 1:
                t = 0.0
            _, r, terminated, truncated, _ = env.step(action)
            # An episode is over when it terminates OR is truncated; looking at
            # `terminated` alone would never leave the loop on truncation.
            done = terminated or truncated
            ep_r += r
        rewards.append(ep_r)
    return {'mean_reward': np.mean(rewards), 'std_reward': np.std(rewards)}

# Random
def eval_random(env, n_ep=3):
    """Return the episode returns of a uniformly random policy.

    Every episode begins with its own ``reset()`` and runs until the
    environment reports termination or truncation, so no step is ever taken
    on an already finished episode.

    Args:
        env: Gymnasium-style environment with ``action_space.sample()``.
        n_ep: Number of episodes to run.

    Returns:
        List with one total reward per episode.
    """
    returns = []
    for _ in range(n_ep):
        env.reset()
        done, ep_r = False, 0.0
        while not done:
            _, r, terminated, truncated, _ = env.step(env.action_space.sample())
            done = terminated or truncated
            ep_r += r
        returns.append(ep_r)
    return returns

print("üé≤ Random...")
rnd = eval_random(env, n_ep=3)
baseline_results['Random'] = {'mean_reward': np.mean(rnd), 'std_reward': np.std(rnd)}

# Fixed-time baselines
# Each baseline is stored under the key "FT-<interval>s".
for cycle_s in (30, 60, 90):
    label = f"FT-{cycle_s}s"
    print(f"‚è±Ô∏è {label}...")
    baseline_results[label] = eval_fixed_time(env, cycle_s)

# Summary table, best mean reward first.
print("\nüìä BASELINES:")
ranked = sorted(baseline_results, key=lambda k: baseline_results[k]['mean_reward'], reverse=True)
for label in ranked:
    stats = baseline_results[label]
    print(f"  {label:10s}: {stats['mean_reward']:>8.1f} ¬± {stats['std_reward']:.1f}")

### 🧠 Analysis: Is the scenario "too easy"?

You asked if the current scenario is "too easy". Here is the scientific perspective:

1.  **Convexity**: In a uniform demand scenario with a macroscopic model, the optimization surface is likely "convex" or "unimodal". This means there is a single clear optimal solution (balanced green time, i.e., FT-90s).
2.  **"Easy" vs "Fundamental"**: It is not that the problem is "easy" in a trivial sense, but rather that the **optimal policy is simple**. Finding that simple optimal policy is a valid test of the RL agent.
3.  **Complexity comes from Variance**: Real-world difficulty comes from **varying demand** (morning rush vs. night). Currently, we have constant demand.
    *   *If we added variable demand, FT-90s would fail, and RL would likely shine.*
    *   *But for this thesis stage, proving RL can match the theoretical optimum (FT-90s) in the base case is the necessary first step.*

**Conclusion:** It is "easy" for a human to guess the solution (balance the lights), but it is a **perfect validation test** for the RL agent. If it couldn't solve this "easy" case, we couldn't trust it on hard ones.

In [None]:
# Cell 4b: Quick PPO Comparison (Experimental)
from stable_baselines3 import PPO

# Single source of truth for the budget, so the log always reports the number
# of steps that were actually trained.
PPO_QUICK_STEPS = 5000
PPO_EVAL_EPISODES = 5
PPO_PROMISING_PCT = 5.0  # improvement over FT-90s (in %) above which PPO counts as promising

print(f"\nüß™ EXPERIMENTAL: Quick PPO Test ({PPO_QUICK_STEPS:,} steps)...")
print("Checking if PPO finds a better policy faster than DQN...")

ppo_env = create_env()
# PPO often works better with default hyperparameters than DQN
ppo_model = PPO("MlpPolicy", ppo_env, verbose=0, learning_rate=3e-4)

start_time = time.time()
ppo_model.learn(total_timesteps=PPO_QUICK_STEPS, progress_bar=True)
train_time = time.time() - start_time

# Evaluate PPO with the deterministic policy.
rewards = []
for _ in range(PPO_EVAL_EPISODES):
    obs, _ = ppo_env.reset()
    done, ep_r = False, 0.0
    while not done:
        action, _ = ppo_model.predict(obs, deterministic=True)
        obs, r, terminated, truncated, _ = ppo_env.step(action)
        # Stop on termination OR truncation.
        done = terminated or truncated
        ep_r += r
    rewards.append(ep_r)

ppo_mean = np.mean(rewards)
ft90_mean = baseline_results['FT-90s']['mean_reward']
imp = ((ppo_mean - ft90_mean) / abs(ft90_mean)) * 100

print(f"‚è±Ô∏è PPO Training Time: {train_time:.1f}s")
print(f"üìä PPO Result ({PPO_QUICK_STEPS:,} steps): {ppo_mean:.1f} | vs FT-90s: {imp:+.1f}%")

# Decide on the relative improvement: `ft90_mean * 1.05` would be a LOWER bar
# than the baseline itself whenever the baseline reward is negative.
if imp > PPO_PROMISING_PCT:
    print("üí° INSIGHT: PPO seems promising! Consider switching if DQN struggles.")
else:
    print("üí° INSIGHT: PPO performs similarly or worse. Sticking with DQN is fine.")

In [None]:
# Cell 5: Progressive Training Callback
class ProgressCallback(BaseCallback):
    """Periodically evaluate the model and log progress against a reference reward.

    Every ``eval_freq`` callback calls, the current model is run
    deterministically for ``n_eval`` episodes on ``eval_env``.  The mean episode
    reward, its relative improvement over ``ref_reward`` and the action
    statistics of the first evaluation episode are printed, and a record is
    appended to ``history``.

    Args:
        eval_env: Environment used only for evaluation.
        ref_reward: Reference reward that improvements are measured against.
        target_pct: Improvement (in percent of ``|ref_reward|``) that counts as
            reaching the target.
        eval_freq: Evaluate every this many callback calls.
        n_eval: Number of evaluation episodes per evaluation.

    Attributes:
        history: List of ``{'step', 'reward', 'improvement'}`` dicts (plain
            Python numbers, so the list can be dumped to JSON).
        best: Highest mean evaluation reward seen so far.
        reached: True once an evaluation met the target improvement.
        target: Absolute reward corresponding to ``target_pct`` improvement.
    """

    def __init__(self, eval_env, ref_reward, target_pct=10.0, eval_freq=1000, n_eval=3):
        super().__init__()
        self.eval_env, self.ref = eval_env, ref_reward
        self.target_pct = target_pct
        # Measured relative to |ref| so the target lies ABOVE the reference
        # even when the reference reward is negative.
        self.target = ref_reward + abs(ref_reward) * target_pct / 100
        self.eval_freq, self.n_eval = eval_freq, n_eval
        self.history, self.best, self.reached = [], -np.inf, False

    def _run_eval_episode(self):
        """Play one deterministic episode; return ``(total_reward, actions)``."""
        obs, _ = self.eval_env.reset()
        done, total, actions = False, 0.0, []
        while not done:
            a, _ = self.model.predict(obs, deterministic=True)
            actions.append(int(a))
            obs, rew, terminated, truncated, _ = self.eval_env.step(a)
            # Stop on termination OR truncation.
            done = terminated or truncated
            total += rew
        return total, actions

    def _on_step(self):
        """Run an evaluation when due; always returns True (training continues)."""
        if self.n_calls % self.eval_freq != 0:
            return True

        episodes = [self._run_eval_episode() for _ in range(self.n_eval)]
        rewards = [total for total, _ in episodes]

        # Cast to built-in floats so NumPy scalar types never end up in history.
        mean_r = float(np.mean(rewards))
        imp = float(((mean_r - self.ref) / abs(self.ref)) * 100)
        self.history.append({'step': self.num_timesteps, 'reward': mean_r, 'improvement': imp})
        if mean_r > self.best:
            self.best = mean_r

        # Action statistics use the first evaluation episode as representative.
        first_actions = episodes[0][1]
        n_switches = sum(first_actions)
        n_steps = len(first_actions)
        switch_rate = n_switches / n_steps * 100 if n_steps else 0.0

        hit = imp >= self.target_pct
        status = "üéØ" if hit else ""
        print(f"  [{self.num_timesteps:>6}] R={mean_r:>7.1f} | vs FT-90s: {imp:>+5.1f}% | Switches: {n_switches}/{n_steps} ({switch_rate:.0f}%) {status}")

        if hit and not self.reached:
            self.reached = True
            print(f"\nüèÜ TARGET REACHED!")
        return True

# Reference for all later comparisons: mean reward of the FT-90s baseline,
# stored as a built-in float.
FT90_REF = float(baseline_results['FT-90s']['mean_reward'])
# The +10% target is taken relative to |FT90_REF| (matching the improvement
# formula (r - ref) / |ref|), so it lies above the reference even when the
# reference reward is negative.
print(f"‚úÖ Callback ready | Reference: FT-90s = {FT90_REF:.1f} | Target: {FT90_REF + 0.1 * abs(FT90_REF):.1f}")

In [None]:
# Cell 6: Initialize DQN Model - LONG RUN CONFIGURATION (1M Steps)
train_env = create_env()
eval_env = create_env()

# Tunables for the long run; the log lines below are derived from these values.
TOTAL_STEPS = 1_000_000
EXPLORATION_FRACTION = 0.2
EXPLORATION_FINAL_EPS = 0.05  # Lower final epsilon for fine-tuning
EVAL_FREQ = 5000              # 1M steps / 200 evaluation points
TARGET_PCT = 10.0

# NOTE(review): stable-baselines3 anneals epsilon over `exploration_fraction`
# of the horizon of the current learn() call.  The training cells call learn()
# in 100k-step chunks with reset_num_timesteps=False, so epsilon presumably
# reaches its final value about 20k steps into the first chunk rather than
# after 20% of TOTAL_STEPS -- confirm against the installed SB3 version.
model = DQN("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=100000, # Increased buffer
            learning_starts=1000,
            batch_size=64, tau=0.005, gamma=0.99,
            exploration_fraction=EXPLORATION_FRACTION,
            exploration_initial_eps=1.0,
            exploration_final_eps=EXPLORATION_FINAL_EPS,
            verbose=0, device='cuda' if torch.cuda.is_available() else 'cpu')

callback = ProgressCallback(eval_env, FT90_REF, target_pct=TARGET_PCT, eval_freq=EVAL_FREQ)
# Shared mutable state read and updated by the training cells.
STATE = {'model': model, 'callback': callback, 'steps': 0, 'block': 0, 'done': False}

print(f"üöÄ DQN ready on {model.device}")
print(f"   Target Steps: {TOTAL_STEPS} (1 Million)")
print(f"   Exploration: fraction={EXPLORATION_FRACTION} of each learn() horizon, final eps={EXPLORATION_FINAL_EPS}")
print(f"   Target: Beat FT-90s by +{TARGET_PCT:.0f}%")

In [None]:
# Block 1: Steps 0 ‚Üí 100,000
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 1: 0 ‚Üí 100k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 1, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_100k")
    print(f"‚úÖ Block 1 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 2: Steps 100k ‚Üí 200k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 2: 100k ‚Üí 200k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 2, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_200k")
    print(f"‚úÖ Block 2 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 3: Steps 200k ‚Üí 300k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 3: 200k ‚Üí 300k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 3, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_300k")
    print(f"‚úÖ Block 3 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 4: Steps 300k ‚Üí 400k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 4: 300k ‚Üí 400k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 4, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_400k")
    print(f"‚úÖ Block 4 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 5: Steps 400k ‚Üí 500k (HALFWAY)
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 5: 400k ‚Üí 500k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 5, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_500k")
    imp = ((STATE['callback'].best - FT90_REF) / abs(FT90_REF)) * 100
    print(f"üìà HALFWAY: {STATE['steps']} steps | Best improvement: {imp:+.1f}%")

In [None]:
# Block 6: Steps 500k ‚Üí 600k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 6: 500k ‚Üí 600k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 6, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_600k")
    print(f"‚úÖ Block 6 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 7: Steps 600k ‚Üí 700k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 7: 600k ‚Üí 700k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 7, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_700k")
    print(f"‚úÖ Block 7 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 8: Steps 700k ‚Üí 800k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 8: 700k ‚Üí 800k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 8, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_800k")
    print(f"‚úÖ Block 8 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 9: Steps 800k ‚Üí 900k
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 9: 800k ‚Üí 900k")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 9, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_900k")
    print(f"‚úÖ Block 9 done | Steps: {STATE['steps']} | Best: {STATE['callback'].best:.1f}")

In [None]:
# Block 10 (FINAL): 900k ‚Üí 1M
if STATE['done']: print("‚úÖ Target reached, skipping")
else:
    print("üìä BLOCK 10 (FINAL): 900k ‚Üí 1M")
    STATE['model'].learn(100000, callback=STATE['callback'], reset_num_timesteps=False, progress_bar=True)
    STATE['block'], STATE['steps'] = 10, STATE['callback'].num_timesteps
    STATE['done'] = STATE['callback'].reached
    STATE['model'].save(f"{SAVE_DIR}/model_FINAL")

imp = ((STATE['callback'].best - FT90_REF) / abs(FT90_REF)) * 100
print(f"\n{'='*60}")
print(f"üèÅ TRAINING COMPLETE")
print(f"   Total steps: {STATE['steps']}")
print(f"   Best reward: {STATE['callback'].best:.1f}")
print(f"   Improvement vs FT-90s: {imp:+.1f}%")
print(f"   Target (+10%): {'‚úÖ ACHIEVED' if STATE['done'] else '‚ùå NOT ACHIEVED'}")
print(f"{'='*60}")

In [None]:
# Final: Plot & Save Results
import matplotlib.pyplot as plt
import json

# Evaluation records collected by the callback during training.
h = STATE['callback'].history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: mean evaluation reward with the fixed-time baselines as reference lines.
ax1.plot([x['step'] for x in h], [x['reward'] for x in h], 'b-o', markersize=3)
ax1.axhline(y=FT90_REF, color='r', linestyle='--', label='FT-90s')
ax1.axhline(y=baseline_results['FT-30s']['mean_reward'], color='g', linestyle=':', label='FT-30s')
ax1.set_xlabel('Steps'); ax1.set_ylabel('Reward'); ax1.legend(); ax1.set_title('Learning Curve')

# Right panel: relative improvement over FT-90s with the +10% target line.
ax2.plot([x['step'] for x in h], [x['improvement'] for x in h], 'b-o', markersize=3)
ax2.axhline(y=10, color='g', linestyle='--', label='Target +10%')
ax2.axhline(y=0, color='r', linestyle='-', alpha=0.5)
ax2.set_xlabel('Steps'); ax2.set_ylabel('Improvement (%)'); ax2.legend(); ax2.set_title('Improvement vs FT-90s')

plt.tight_layout()
plt.savefig(f'{SAVE_DIR}/stage3_results.png', dpi=150)
plt.show()

# Save JSON
results = {'baselines': baseline_results, 'history': h, 'best': STATE['callback'].best, 
           'improvement': ((STATE['callback'].best - FT90_REF) / abs(FT90_REF)) * 100}
with open(f'{SAVE_DIR}/stage3_results.json', 'w') as f:
    # default=float converts values json cannot serialise natively (e.g. NumPy
    # scalars produced by np.mean / np.std) instead of raising TypeError.
    json.dump(results, f, indent=2, default=float)
print(f"üìÅ Saved: {SAVE_DIR}/stage3_results.png, {SAVE_DIR}/stage3_results.json")