In [1]:
# Import the required packages and libs.
from single_intersection import TrafficEnv
import numpy as np
import os 
import matplotlib.pyplot as plt
import time
from sumo_rl import SumoEnvironment
import torch
# Show which SUMO installation this kernel sees; prints None when SUMO_HOME is unset.
print("SUMO HOME:", os.getenv("SUMO_HOME"))
# Example output: SUMO HOME: /opt/miniconda3/lib/python3.13/site-packages/sumo

SUMO HOME: /opt/miniconda3/lib/python3.13/site-packages/sumo


In [24]:
import csv
import numpy as np
import time

def to_evaluate_agent(
        env=None,
        agent="heuristic",
        steps=1000,
        phase_duration=10,
        render=False,
        seed=42,
        to_save = None):
    """Run a control policy in a SUMO environment and log per-step metrics.

    Parameters
    ----------
    env : environment instance
        Must provide ``reset(seed=...)``, ``step(action)``, ``close()``,
        ``action_space`` and ``sumo.simulation.getTime()``; ``render()`` is only
        needed when ``render`` is True.
    agent : str or callable
        ``"heuristic"`` cycles through the phases, holding each one for
        ``phase_duration`` steps; ``"random"`` samples from ``env.action_space``;
        a callable is used as a policy and called as ``agent(obs)`` to get the
        action.
    steps : int
        Number of environment steps to run.
    phase_duration : int
        Steps each phase is held by the heuristic agent.
    render : bool
        Call ``env.render()`` before every step.
    seed : int
        Seed passed to every ``env.reset`` call.
    to_save : str or None
        When truthy, the records are also written to
        ``results/<to_save>_evaluation_records.csv``.

    Returns
    -------
    list[list]
        One row per step: ``[step, sim_time, waiting_time, queue_length,
        pressure, throughput, avg_speed, action, reward]``. The list is returned
        whether or not a CSV file is written.

    Raises
    ------
    ValueError
        If ``env`` is None or ``agent`` is neither a known name nor a callable.
    """
    if env is None:
        raise ValueError("Please provide a valid environment instance.")

    agent_is_policy = callable(agent)
    if not agent_is_policy and agent not in ("heuristic", "random"):
        raise ValueError(
            f"Unknown agent {agent!r}; expected 'heuristic', 'random' or a callable policy.")

    # CSV column names. Note: the "avg_wait_time" column holds info["waiting_time"].
    sim_records = []
    header = [
        "step",
        "sim_time",
        "avg_wait_time",
        "queue_length",
        "pressure",
        "throughput",
        "avg_speed",
        "action",
        "reward"]

    obs, info = env.reset(seed=seed)
    cur_phase, phase_timer = 0, 0

    # try/finally so the simulation is closed even when a step raises.
    try:
        for step in range(steps):
            if render:
                env.render()

            # ---- Action selection ----
            if agent_is_policy:
                action = agent(obs)
            elif agent == "heuristic":
                # Fixed-time cycle: move to the next phase once the current one
                # has been held for `phase_duration` steps.
                if phase_timer >= phase_duration:
                    cur_phase = (cur_phase + 1) % env.action_space.n
                    phase_timer = 0
                action = cur_phase
                phase_timer += 1
            else:  # "random"
                action = env.action_space.sample()

            # ---- Feed the action and observe ----
            # NOTE(review): the 4th value is treated as the Gymnasium `truncated`
            # flag; confirm TrafficEnv.step follows that convention.
            obs, reward, done, truncated, info = env.step(action)
            sim_time = env.sumo.simulation.getTime()

            # ---- Store the metrics reported by the environment ----
            sim_records.append([
                step,
                sim_time,
                info["waiting_time"],
                info["queue_length"],
                info["pressure"],
                info["throughput"],
                info["avg_speed"],
                action,
                reward])

            # Start a new episode when the current one ended for any reason.
            if done or truncated:
                obs, info = env.reset(seed=seed)
    finally:
        env.close()

    if to_save:
        # Target: results/<to_save>_evaluation_records.csv
        folder = "results"
        os.makedirs(folder, exist_ok=True)

        save_path = os.path.join(folder, f"{to_save}_evaluation_records.csv")

        with open(save_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(sim_records)

        print(f"Evaluation records saved to: {save_path}")

    return sim_records



## Heuristic baseline

In [26]:
# Id handed to TrafficEnv as `tls_id`.
TLS_ID = "t"

# SUMO command-line options, built up one option at a time.
sumo_cmd = []
# sumo_cmd += ["--start"]  # enable this line when using the GUI for visualization
sumo_cmd += ["--no-warnings", "true"]                     # suppress SUMO warnings
sumo_cmd += ["-n", "single-intersection.net.xml"]         # network file
sumo_cmd += ["-r", "single-intersection-vertical.rou.xml"]  # route file
sumo_cmd += ["--step-length", "1.0"]

# Kept from the original cell (disabled):
# import traci
# traci.close(False)

# Build the SUMO environment with the GUI switched off (gui=False).
env = TrafficEnv(sumo_cmd=sumo_cmd, tls_id=TLS_ID, gui=False)

# Evaluate the fixed-cycle heuristic for 100 steps and save the records under the "test" tag.
logs = to_evaluate_agent(env=env, agent="heuristic", steps=100, to_save="test")

 Retrying in 1 seconds


  num_phases = len(self.sumo.trafficlight.getCompleteRedYellowGreenDefinition(self.ts_id)[0].phases)


 Retrying in 1 seconds
Step #0.00 (0ms ?*RT. ?UPS, TraCI: 3ms, vehicles TOT 0 ACT 0 BUF 0)                      
Step #100.00 (0ms ?*RT. ?UPS, TraCI: 12ms, vehicles TOT 76 ACT 52 BUF 0)                   
Evaluation records saved to: results/test_evaluation_records.csv


## Define the training function

In [5]:
import torch.optim as optim
from ppo import ActorCritic, compute_gae, collect_rollout
from torch import nn
from helper_func import plot_traff_metrics

# from torch.nn import function as F

def train_ppo(
    model,
    env=None,
    GAMMA = 0.99,
    GAE_LAMBDA = 0.95,
    CLIP_EPS = 0.2,
    LR = 3e-3,
    ENT_COEF = 0.01,
    VF_COEF = 0.5,
    MAX_GRAD_NORM = 0.5,
    N_STEPS = 256,
    N_EPOCHS = 10,
    MINI_BATCH_SIZE = 32,
    TOTAL_TIMESTEPS = 4096,
    noise = False,
    save_path = "ppo_traffic_signal.pth"
):
    """Train an actor-critic model with PPO and return the per-rollout metric history.

    Parameters
    ----------
    model : module exposing ``parameters()``, ``state_dict()`` and
        ``evaluate_actions(obs, actions) -> (log_probs, entropy, values)``.
    env : environment passed to ``collect_rollout``; closed before returning.
    GAMMA, GAE_LAMBDA : discount and GAE lambda forwarded to ``compute_gae``.
    CLIP_EPS : clipping range of the PPO probability ratio.
    LR : Adam learning rate.
    ENT_COEF, VF_COEF : weights of the entropy and value terms in the loss.
    MAX_GRAD_NORM : gradient-norm clipping threshold.
    N_STEPS : steps requested from ``collect_rollout`` per update.
    N_EPOCHS : optimisation passes over each rollout.
    MINI_BATCH_SIZE : samples per gradient step.
    TOTAL_TIMESTEPS : training stops once this many steps have been collected.
    noise : currently unused; kept for interface compatibility.
    save_path : file the model ``state_dict`` is written to after training.

    Returns
    -------
    dict[str, list]
        Keys ``"rewards"``, ``"avg_speeds"``, ``"throughputs"`` and
        ``"waiting_times"``; each holds one mean value per rollout.

    Raises
    ------
    ValueError
        If ``env`` or ``model`` is None.
    """
    if env is None:
        raise ValueError("Please provide a valid environment instance.")
    if model is None:
        raise ValueError("Please provide a valid model instance.")

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    optimizer = optim.Adam(model.parameters(), lr=LR)
    value_criterion = nn.MSELoss()  # built once instead of once per mini-batch

    print("Observation dim:", obs_dim)
    print("Action dim:", act_dim)

    global_step = 0
    history = {
        "rewards": [],
        "avg_speeds": [],
        "throughputs": [],
        "waiting_times": [],
    }

    # try/finally so the environment is closed even if training raises.
    try:
        # The returned observation is not used here; collect_rollout is only
        # given `env`, so it presumably reads the current state from it.
        env.reset()

        while global_step < TOTAL_TIMESTEPS:

            # 1) Roll-out sampling
            batch = collect_rollout(env, model, N_STEPS)
            global_step += N_STEPS

            obs_arr = batch["obs"]
            actions_arr = batch["actions"]
            old_logprobs_arr = batch["logprobs"]
            rewards_arr = batch["rewards"]
            dones_arr = batch["dones"]
            values_arr = batch["values"]
            next_value = batch["next_value"]

            # 2) Traffic metrics recorded during the rollout
            avg_speeds = batch["avg_speeds"]
            throughputs = batch["throughputs"]
            waiting_times = batch["waiting_times"]

            # 3) Advantages and returns
            advantages, returns = compute_gae(
                rewards_arr, values_arr,
                dones_arr, next_value,
                gamma=GAMMA, lam=GAE_LAMBDA
            )

            # Standardise the advantages; the epsilon guards against a zero std.
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            # Tensor conversion
            obs_t = torch.tensor(obs_arr, dtype=torch.float32)
            actions_t = torch.tensor(actions_arr, dtype=torch.int64)
            old_logprobs_t = torch.tensor(old_logprobs_arr, dtype=torch.float32)
            advantages_t = torch.tensor(advantages, dtype=torch.float32)
            returns_t = torch.tensor(returns, dtype=torch.float32)

            # Several epochs of shuffled mini-batch updates over the same rollout
            dataset_size = N_STEPS
            indices = np.arange(dataset_size)

            for _ in range(N_EPOCHS):
                np.random.shuffle(indices)
                for start in range(0, dataset_size, MINI_BATCH_SIZE):
                    mb_idx = indices[start:start + MINI_BATCH_SIZE]

                    mb_obs = obs_t[mb_idx]
                    mb_actions = actions_t[mb_idx]
                    mb_old_logprobs = old_logprobs_t[mb_idx]
                    mb_advantages = advantages_t[mb_idx]
                    mb_returns = returns_t[mb_idx]

                    # log-prob, entropy and value under the current parameters
                    new_logprobs, entropy, values_pred = model.evaluate_actions(mb_obs, mb_actions)

                    # ratio = pi_theta(a|s) / pi_theta_old(a|s)
                    ratio = torch.exp(new_logprobs - mb_old_logprobs)

                    # PPO clipped surrogate objective
                    surr1 = ratio * mb_advantages
                    surr2 = torch.clamp(ratio, 1.0 - CLIP_EPS, 1.0 + CLIP_EPS) * mb_advantages
                    actor_loss = -torch.min(surr1, surr2).mean()

                    # Value-function loss.
                    # NOTE(review): assumes values_pred and mb_returns share a
                    # shape; confirm evaluate_actions returns a 1-D value tensor.
                    critic_loss = value_criterion(values_pred, mb_returns)

                    # Entropy bonus (negated so that minimising the loss raises entropy)
                    entropy_loss = -entropy.mean()

                    loss = actor_loss + VF_COEF * critic_loss + ENT_COEF * entropy_loss

                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                    optimizer.step()

            # 4) Per-rollout means used for the training curves
            history["rewards"].append(rewards_arr.mean())
            history["avg_speeds"].append(avg_speeds.mean())
            history["throughputs"].append(throughputs.mean())
            history["waiting_times"].append(waiting_times.mean())
    finally:
        env.close()

    torch.save(model.state_dict(), save_path)
    print(f"Training finished, model saved to {save_path}")

    # Plot the training curves
    plot_traff_metrics(
        history["rewards"],
        history["avg_speeds"],
        history["throughputs"],
        history["waiting_times"]
    )

    return history



## Training Section

In [6]:
from single_intersection import TrafficEnv

# Id handed to TrafficEnv as `tls_id`.
TLS_ID = "t"

# SUMO command-line options (no GUI, for speed), assembled by concatenation.
sumo_cmd = (
    # ["--start"] +  # enable this line when using the GUI for visualization
    ["--no-warnings", "true"]                       # suppress SUMO warnings
    + ["-n", "single-intersection.net.xml"]         # network file
    + ["-r", "single-intersection-vertical.rou.xml"]  # route file
    + ["--step-length", "1.0"]
)

# Build the SUMO environment with the GUI switched off (gui=False).
env = TrafficEnv(sumo_cmd=sumo_cmd, tls_id=TLS_ID, gui=False)

# Smoke test: one reset confirms the environment starts.
obs, info = env.reset()
print("Environment setup complete. Ready for Model Initilizaiton...")

 Retrying in 1 seconds


  num_phases = len(self.sumo.trafficlight.getCompleteRedYellowGreenDefinition(self.ts_id)[0].phases)


Step #0.00 (0ms ?*RT. ?UPS, TraCI: 3ms, vehicles TOT 0 ACT 0 BUF 0)                      
 Retrying in 1 seconds
Environment setup complete. Ready for Model Initilizaiton...


In [7]:
# Size the network from the environment's observation and action spaces.
model_params = dict(
    obs_dim=env.observation_space.shape[0],
    act_dim=env.action_space.n,
)

ppo_model = ActorCritic(**model_params)
print("Model initialized. Starting training...")

# Only the model and the environment are passed; every other train_ppo
# hyper-parameter keeps its default.
train_params = dict(model=ppo_model, env=env)

model_hist = train_ppo(**train_params)


Model initialized. Starting training...


## RL agent and validation

## Performance Evaluation