In [1]:
import numpy as np
import gym_envs.envs.traffic_light_support_functions as tlsf
import traci

cycleTime = 60
minGreentime = 5
numberOfPhases = 6
intergreenMatrix = np.zeros((numberOfPhases, numberOfPhases))
intergreenMatrix = [
    [0, 0, 0, 7, 6, 6],
    [0, 0, 0, 5, 0, 0],
    [8, 0, 0, 0, 5, 0],
    [6, 6, 0, 0, 8, 0],
    [8, 0, 6, 5, 0, 0],
    [6, 0, 0, 0, 0, 0],
]

phasePlan = np.full((numberOfPhases,cycleTime), 0)

initialPhaseplan = np.full((numberOfPhases, cycleTime),'r')

starting_phases = [1, 1, 2, 2, 1, 2]
phase_lengths = [
    [19, 2, 14, 3, 22],
    [41, 2, 13, 4],
    [2, 34, 3, 21],
    [2, 13, 3, 42],
    [40, 2, 14, 3, 1],
    [2, 13, 3, 42]
]

initialPhaseplan = tlsf.generate_phase_plan(starting_phases, phase_lengths)
initialPhaseplan = tlsf.change_phase_plan((5,5),60)

In [2]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tensordict import TensorDict
import numpy as np
import gymnasium as gym
import random
from collections import deque, namedtuple
from pytorch_lightning.loggers import TensorBoardLogger

if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = 600
else:
    num_episodes = 50

# Környezet beállítása
env = gym.make('TrafficEnv-V0', render_mode='console', starting_phases=starting_phases, 
               phase_lengths=phase_lengths, simulation_time=3600*5, phase_change_step=5)

# Konstansok
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000*10
TAU = 0.005
LR = 1e-4

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class DummyDataset(Dataset):
    def __init__(self, num_samples=1000):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return 0  # dummy data

class DQN(nn.Module):
    def __init__(self, observation_space, n_actions):
        super(DQN, self).__init__()
        self.fc_occupancy = nn.Linear(observation_space['occupancy'].shape[0], 64)
        self.fc_vehicle_count = nn.Linear(observation_space['vehicle_count'].shape[0], 64)
        self.fc1 = nn.Linear(64 + 64, 256)
        self.fc2 = nn.Linear(256, n_actions)

    def forward(self, x: TensorDict):
        occupancy = F.relu(self.fc_occupancy(x['occupancy']))
        vehicle_count = F.relu(self.fc_vehicle_count(x['vehicle_count']))
        combined = torch.cat((occupancy, vehicle_count), dim=1)
        x = F.relu(self.fc1(combined))
        return self.fc2(x)

class DQNLightning(pl.LightningModule):
    def __init__(self, env, memory_capacity=10000):
        super().__init__()
        self.env = env
        self.n_actions = env.action_space.n
        self.policy_net = DQN(env.observation_space, self.n_actions)
        self.target_net = DQN(env.observation_space, self.n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.memory = ReplayMemory(memory_capacity)
        self.steps_done = 0
        self.episode_rewards = []
        self.current_episode_reward = 0

    def forward(self, x):
        return self.policy_net(x)

    def train_dataloader(self):
        return DataLoader(DummyDataset(), batch_size=1)

    def select_action(self, state):
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            np.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_net(state).max(1).indices.view(1, 1)
        else:
            return torch.tensor([[self.env.action_space.sample()]], device=self.device, dtype=torch.long)

    def training_step(self, batch, batch_idx):
        if len(self.memory) < BATCH_SIZE:
            return None

        transitions = self.memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), 
                                      device=self.device, dtype=torch.bool)
        non_final_next_states = TensorDict({
            'occupancy': torch.cat([s['occupancy'] for s in batch.next_state if s is not None]),
            'vehicle_count': torch.cat([s['vehicle_count'] for s in batch.next_state if s is not None]),
        })
        
        state_batch = TensorDict({
            'occupancy': torch.cat([s['occupancy'] for s in batch.state]),
            'vehicle_count': torch.cat([s['vehicle_count'] for s in batch.state]),
        })

        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        with torch.no_grad():
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1).values

        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        
        # Simulate an environment step and log the reward
        state = self.env.reset()
        action = self.select_action(state)
        next_state, reward, done, _ = self.env.step(action.item())
        self.current_episode_reward += reward

        if done:
            self.episode_rewards.append(self.current_episode_reward)
            self.log('episode_reward', self.current_episode_reward, on_step=False, on_epoch=True, prog_bar=True, logger=True)
            self.current_episode_reward = 0

        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.policy_net.parameters(), lr=LR, amsgrad=True)
        return optimizer

    def on_train_epoch_end(self):
        # Update target network
        target_net_state_dict = self.target_net.state_dict()
        policy_net_state_dict = self.policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        self.target_net.load_state_dict(target_net_state_dict)

        # Log average episode reward
        if self.episode_rewards:
            avg_reward = sum(self.episode_rewards) / len(self.episode_rewards)
            self.log('avg_episode_reward', avg_reward, on_epoch=True, prog_bar=True, logger=True)
            self.episode_rewards = []

# Edzés
model = DQNLightning(env)
logger = TensorBoardLogger("tb_logs", name="dqn_experiment")
trainer = pl.Trainer(max_epochs=num_episodes, logger=logger)
trainer.fit(model)

# Epizódok futtatása
for i_episode in range(num_episodes):
    state, info = env.reset()
    state = TensorDict({
        'occupancy': torch.tensor(state['occupancy'], dtype=torch.float32, device=model.device).unsqueeze(0),
        'vehicle_count': torch.tensor(state['vehicle_count'], dtype=torch.float32, device=model.device).unsqueeze(0),
    })
    
    episode_reward = 0
    for t in range(1000):  # maximum 1000 lépés epizódonként
        action = model.select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=model.device)
        episode_reward += reward.item()
        
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            next_state = TensorDict({
                'occupancy': torch.tensor(observation['occupancy'], dtype=torch.float32, device=model.device).unsqueeze(0),
                'vehicle_count': torch.tensor(observation['vehicle_count'], dtype=torch.float32, device=model.device).unsqueeze(0),
            })

        model.memory.push(state, action, next_state, reward)
        state = next_state

        if done:
            print(f"Episode {i_episode} finished with reward {episode_reward}")
            break

    # Edzés minden epizód után
    trainer.fit(model)

 Retrying in 1 seconds


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Missing logger folder: tb_logs/dqn_experiment

  | Name       | Type | Params | Mode 
--------------------------------------------
0 | policy_net | DQN  | 39.3 K | train
1 | target_net | DQN  | 39.3 K | train
--------------------------------------------
78.6 K    Trainable params
0         Non-trainable params
78.6 K    Total params
0.315     Total estimated model params size (MB)
/opt/anaconda3/envs/sumo_RL_pytorch/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve