# Red Light, Green Light: Dynamic Traffic Lights with RL
## By Erin Gregoire & Daniel Viola
### Spring 2025

In [1]:
import torch
import random
import copy
import pickle
import torch.optim as optim
from torch import nn
import numpy as np
class DQNAgent6(nn.Module):
    """Double-DQN agent built around a small fully connected Q-network.

    ``self.layers`` is the online network that is trained; ``self.target_nn``
    is a copy of it used to evaluate the bootstrap target. Transitions are
    stored in ``self.replay`` and sampled in mini-batches of 32.

    Args:
        env: Environment object exposing ``observation_space`` and
            ``action_space`` (the latter must have an integer ``n``).
    """

    def __init__(self, env):
        # nn.Module must be initialised before attributes are attached so
        # that sub-modules and parameters are registered correctly.
        super().__init__()
        self.replay = ReplayBuffer(max_size=10000)
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space

        # NOTE(review): the input width is fixed at 11, so every state handed
        # to step()/training_step() must have exactly 11 entries -- confirm
        # this matches the environment's observation length.
        self.layers = nn.Sequential(
            nn.Linear(in_features=11, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=self.action_space.n),
        )
        self.target_nn = copy.deepcopy(self.layers)

        # These three attributes are not read anywhere in this class; they are
        # kept so external code that touches them keeps working.
        self.buffer = []
        self.max_size = 5
        self.counting = 0

        self.epsilon = 0  # probability of taking a random action in step()
        self.loss = nn.MSELoss()
        self.optimize = optim.Adam(self.layers.parameters(), lr=.00005)

        # Fall back to the CPU when no GPU is present instead of crashing.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

    def forward(self, x):
        """Return the online network's Q-values for the input tensor ``x``."""
        return self.layers(x)

    def training_step(self, batch):
        """Run one optimisation step of the online network on a mini-batch.

        Args:
            batch: 5-tuple ``(obs, action, reward, next_obs, terminated)`` of
                equally long sequences, as returned by
                ``ReplayBuffer.sample_batch``.
        """
        gamma = .98
        obs, action, reward, next_obs, terminated = batch
        device = self.device

        obs = torch.tensor(obs, dtype=torch.float32, device=device)
        action = torch.tensor(action, dtype=torch.long, device=device)
        reward = torch.tensor(reward, dtype=torch.float32, device=device)
        next_obs = torch.tensor(next_obs, dtype=torch.float32, device=device)
        terminated = torch.tensor(terminated, dtype=torch.float32, device=device)

        # Q(s, a) of the actions that were actually taken.
        q_values = self.layers(obs)
        q_predictions = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            # Double DQN: the online network picks the next action, the
            # target network scores it.
            next_actions = self.layers(next_obs).argmax(1)
            next_q_values = self.target_nn(next_obs).gather(
                1, next_actions.unsqueeze(1)
            ).squeeze(1)
            # (1 - terminated) removes the bootstrap term on terminal steps.
            q_target = reward + (1 - terminated) * gamma * next_q_values

        loss = self.loss(q_predictions, q_target)
        self.optimize.zero_grad()
        loss.backward()
        self.optimize.step()

    def update_target_network(self):
        """Copy the online network's weights into the target network.

        This is not called from anywhere inside the class; the training loop
        is responsible for invoking it.
        """
        self.target_nn.load_state_dict(self.layers.state_dict())

    def step(self, obs):
        """Choose an action epsilon-greedily.

        Args:
            obs: Flat sequence of numbers describing the current state.

        Returns:
            A random action index with probability ``self.epsilon``,
            otherwise the index of the largest online-network Q-value.
        """
        if random.random() < self.epsilon:
            return np.random.choice(self.action_space.n)

        obs_tensor = torch.tensor(
            obs, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        # Pure inference: no gradient bookkeeping is needed here.
        with torch.no_grad():
            q_values = self.layers(obs_tensor)
        return torch.argmax(q_values, dim=1).item()

    def winner_winner_chicken_dinner(self, obs):
        """Discretise an observation into a tuple of small integers.

        The first entry is truncated to ``int``; every remaining entry is
        mapped to 0 (< 0.33), 1 (< 0.66) or 2 (otherwise).

        Returns:
            ``(phase, bin_1, bin_2, ...)`` with the same length as ``obs``.
        """
        phase = int(obs[0])
        binned = [0 if q < 0.33 else 1 if q < 0.66 else 2 for q in obs[1:]]
        return (phase, *binned)

    def save_q_table(self, filename):
        """Pickle the module's ``state_dict`` (network weights) to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self.state_dict(), f)

    def load_q_table(self, filename):
        """Load weights written by ``save_q_table`` and switch to eval mode.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        weight files from a trusted source.
        """
        with open(filename, "rb") as f:
            state_dict = pickle.load(f)
        self.load_state_dict(state_dict)
        self.eval()

    def update_q_time(self, obs, action, reward, next_obs, terminated):
        """Store one transition and train once the buffer holds 32 of them."""
        self.replay.update_buffer(obs, action, reward, next_obs, terminated)
        if len(self.replay.buffer) >= 32:
            batch = self.replay.sample_batch(32)
            self.training_step(batch)


class ReplayBuffer():
    """Fixed-capacity store of transitions that overwrites in ring order.

    Each transition is the tuple ``(obs, action, reward, next_obs,
    terminated)``. ``position`` is the slot the next overwrite will use once
    ``buffer`` has reached ``max_size`` entries.
    """

    def __init__(self, max_size):
        self.position = 0
        self.buffer = []
        self.max_size = max_size

    def update_buffer(self, obs, action, reward, next_obs, terminated):
        """Insert one transition, replacing the oldest slot when full."""
        entry = (obs, action, reward, next_obs, terminated)

        if len(self.buffer) >= self.max_size:
            self.buffer[self.position] = entry
        else:
            self.buffer.append(entry)

        # Advance the write cursor, wrapping at capacity.
        self.position = (self.position + 1) % self.max_size

    def sample_batch(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of lists ``(obs, action, reward, next_obs, terminated)``
            where index ``k`` of every list belongs to the same transition.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose: one list per transition field.
        return tuple(
            [transition[field] for transition in picked] for field in range(5)
        )

In [None]:


from sumo_rl import SumoEnvironment
import os
import matplotlib.pyplot as plt
import pickle


def _run_training_episode(env, agent, epsilon_storage, decay_rate):
    """Play one epsilon-greedy episode, training the agent after every step.

    Returns:
        ``(episode_reward, epsilon_storage)`` -- the summed reward and the
        decayed epsilon value to carry into the next episode.
    """
    obs_array, _ = env.reset()
    terminated, truncated = False, False
    episode_reward = 0
    agent.epsilon = epsilon_storage
    j = 0
    while not (terminated or truncated):
        state = agent.winner_winner_chicken_dinner(obs_array)
        action = agent.step(state)
        next_obs_array, reward, terminated, truncated, _ = env.step(action)
        next_state = agent.winner_winner_chicken_dinner(next_obs_array)

        episode_reward += reward
        agent.update_q_time(state, action, reward, next_state, terminated or truncated)

        obs_array = next_obs_array
        # Decay every 2000 steps (including step 0); the rate actually used
        # by the agent is floored at 0.1.
        if j % 2000 == 0:
            epsilon_storage = epsilon_storage * decay_rate
            agent.epsilon = max(epsilon_storage, .1)
        j += 1
    return episode_reward, epsilon_storage


def _run_evaluation_episode(env, agent):
    """Play one fully greedy episode without training.

    Returns:
        ``(episode_reward, average_wait)`` where ``average_wait`` is the
        final ``system_total_waiting_time`` divided by
        ``system_total_departed`` (at least 1) from the env's info dict.
    """
    obs_array, info = env.reset()
    terminated, truncated = False, False
    episode_reward = 0
    agent.epsilon = 0  # greedy: agent.step never takes the random branch
    while not (terminated or truncated):
        state = agent.winner_winner_chicken_dinner(obs_array)
        action = agent.step(state)
        obs_array, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
    average_wait = info["system_total_waiting_time"] / max(info["system_total_departed"], 1)
    return episode_reward, average_wait


def _plot_series(values, title, ylabel):
    """Show one per-episode line plot; ``title`` doubles as the legend label."""
    plt.figure(figsize=(10, 5))
    plt.plot(values, label=title)
    plt.xlabel("Episode")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()


def looping_time(*, evaluation=True, episodes=5):
    """Train or evaluate the DQN agent on the SUMO single-intersection net.

    Args:
        evaluation: When true (default) run greedy evaluation episodes and
            plot their reward and average waiting time. When false run
            training episodes instead, saving the weights after each one.
        episodes: Number of episodes to run.
    """
    # Only supplies a fallback: an existing SUMO_HOME is respected. Change
    # the path to wherever SUMO lives on your machine.
    # NOTE(review): sumo_rl is imported before this function runs; if that
    # import itself needs SUMO_HOME, it must also be set beforehand -- confirm.
    os.environ.setdefault(
        "SUMO_HOME",
        "C:/Users/alien/AppData/Local/Programs/Python/Python39/Lib/site-packages/sumo",
    )

    env = SumoEnvironment(
        net_file="sumo_rl/nets/single-intersection/single-intersection.net.xml",
        route_file="sumo_rl/nets/single-intersection/single-intersection.rou.xml",
        use_gui=True,
        num_seconds=4000,
        single_agent=True,
        delta_time=4
    )

    average_wait_per_car = []
    total_reward_electric_boogaloo = []
    epsilon_storage = .1
    decay_rate = .995

    # Always shut the simulator down, even if an episode raises.
    try:
        agent = DQNAgent6(env)
        agent.load_q_table("ergregoi_dviola_single_intersection_ddqn_weights.pkl")

        for i in range(episodes):
            if evaluation:
                episode_reward, average_wait = _run_evaluation_episode(env, agent)
                average_wait_per_car.append(average_wait)
                total_reward_electric_boogaloo.append(episode_reward)
            else:
                episode_reward, epsilon_storage = _run_training_episode(
                    env, agent, epsilon_storage, decay_rate
                )
                print('reward for training episode' + str(i) + 'is' + str(episode_reward))
                agent.save_q_table("dqn_smallenv.pkl")
                print('saved q table')
    finally:
        env.close()

    _plot_series(total_reward_electric_boogaloo, "evaluation Reward", "Total Reward")
    _plot_series(average_wait_per_car, "average waiting", "wait time")
# Run only when executed directly (script or notebook cell), not on import.
if __name__ == "__main__":
    looping_time()
            

            

