In [1]:
import gymnasium as gym
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import pathlib
import sys
from time import sleep
import torch
from torch.utils.tensorboard import SummaryWriter
import tqdm
sys.path.insert(0, str(pathlib.Path("../..").absolute()))

from environment import TreeEnv
from computation_sim.system import SystemDrawer
from agents.metrics import MovingAverage, MovingTotal
from agents.q_agent import DQNActor

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
# (To force CPU execution, replace the selection below with torch.device("cpu").)
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")


Using device: cuda


In [2]:
# Simulation step size. Every time-based setting below is converted into a
# number of simulation steps by dividing by `dt`.
dt = 10
params = dict(
    # Keyword arguments unpacked into TreeEnv(...).
    environment=dict(
        num_sensors=5,
        dt=dt,
        cost_input=0.01,
        cost_message_loss=1.0,
        cost_output_time=0.1 / 100.0,
    ),
    # Keyword arguments unpacked into DQNActor(...).
    actor=dict(
        learn_period=int(100 / dt),  # Update the model every 100 ms
        memory_size=2 * 60 * int(1_000 / dt),  # Memory spans a period of 2 minutes
        tau=dt / 2000.0,  # Full model update after 2 s
        batch_size=1024,  # Number of samples used in one optimization
        gamma=0.9,  # Hyperparameter; not really used in a continuous task
        epsilon_start=0.1,
        epsilon_end=0.0,
        epsilon_decay=4 * 60 * int(1_000 / dt),  # Half-life of epsilon decay: 4 minutes
        lr=1e-3,
    ),
    # Total number of simulation steps: train for 1 h.
    num_sim_steps=int(3_600_000 / dt),
)

In [3]:
# Build the environment from the "environment" section of the config.
env = TreeEnv(**params["environment"])

# Draw the environment's node graph and show it as a fixed-size figure.
drawer = SystemDrawer()
drawer.build(env.system.node_graph)
figure_layout = {"autosize": True, "width": 400, "height": 400}
drawer.fw.update_layout(**figure_layout)

# Replace any previous cell output with the freshly built figure.
clear_output(wait=True)
display(drawer.fw)

FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color': '#888', 'width': 1.0},
              'mode': 'lines',
              'showlegend': False,
              'type': 'scatter',
              'uid': '0a4358b5-25a6-465f-b4bf-bd5af1288ecb',
              'x': [0.4642857142857143, 0.9642857142857143, None,
                    0.4642857142857143, 0.9642857142857143, None,
                    -0.5357142857142857, -0.035714285714285726, None,
                    -0.035714285714285726, 0.4642857142857143, None,
                    -0.035714285714285726, 0.4642857142857143, None,
                    -0.5357142857142857, -0.035714285714285726, None,
                    -0.035714285714285726, 0.4642857142857143, None,
                    -0.035714285714285726, 0.4642857142857143, None,
                    -0.5357142857142857, -0.035714285714285726, None,
                    -0.035714285714285726, 0.4642857142857143, None,
                    -0.035714285714285726, 0.464

In [4]:
# Size the agent from the environment's observation/action spaces; the
# remaining hyperparameters come from the "actor" section of the config.
agent = DQNActor(
    num_states=env.observation_space.shape[0],
    num_actions=env.action_space.n,
    device=DEVICE,
    **params["actor"],
)

In [5]:
def train(
    actor: DQNActor,
    env: gym.Env,
    num_sim_steps: int,
    learn_period: int,
    num_samples_metrics_filter: int,
    num_skip_logging: int = 20,
    writer=None,
    device: str = "cpu",
):
    """Run the agent in the environment for a fixed number of steps while learning.

    Each step selects an action with ``actor.epsilon_greedy``, applies it with
    ``env.step``, stores the transition via ``actor.push_memory`` and, every
    ``learn_period`` steps, calls ``actor.optimize_model``. The terminated and
    truncated flags returned by ``env.step`` are ignored, so the environment is
    never reset inside the loop.

    Args:
        actor: Agent providing ``epsilon_greedy``, ``push_memory`` and
            ``optimize_model``.
        env: Environment to interact with. Its observations must expose a
            ``.data`` attribute, its ``info`` dict must contain the keys
            ``total_message_losses``, ``last_output_min_age``,
            ``last_output_avg_age`` and ``last_output_max_age``, and it must
            expose ``env.time`` (used as the logging step).
        num_sim_steps: Number of environment steps to run.
        learn_period: ``actor.optimize_model`` is called on every step whose
            index is a multiple of this value.
        num_samples_metrics_filter: Window size handed to the moving
            average/total filters that smooth the logged metrics.
        num_skip_logging: Scalars are written on every step whose index is a
            multiple of this value.
        writer: Object with an ``add_scalar(tag, value, step)`` method (e.g. a
            ``SummaryWriter``). When ``None``, nothing is logged.
        device: Device (string or ``torch.device``) on which the transition
            tensors are created.

    Raises:
        ZeroDivisionError: If ``learn_period`` or ``num_skip_logging`` is 0.
    """
    # Prepare filters for metrics
    avg_message_loss = MovingTotal(num_samples_metrics_filter)
    avg_min = MovingAverage(num_samples_metrics_filter)
    avg_avg = MovingAverage(num_samples_metrics_filter)
    avg_max = MovingAverage(num_samples_metrics_filter)
    avg_reward = MovingAverage(num_samples_metrics_filter)

    # Initialize the environment; states are stored with a leading batch axis.
    state, _ = env.reset()
    state = torch.tensor(state.data, dtype=torch.float32, device=device).unsqueeze(0)

    # Latest result of optimize_model(); starts empty so the logging code
    # below never touches an unbound name.
    learning_info = {}

    # Run training loop
    for i_step in tqdm.tqdm(range(num_sim_steps)):
        # Take action, observe transition
        action = actor.epsilon_greedy(state)
        next_state, reward, _, _, info = env.step(action)

        # Feed the metric filters on every step so the windows stay contiguous.
        avg_message_loss.push(info["total_message_losses"])
        avg_min.push(info["last_output_min_age"])
        avg_avg.push(info["last_output_avg_age"])
        avg_max.push(info["last_output_max_age"])
        avg_reward.push(reward)

        # Logging happens only on every `num_skip_logging`-th step, and only
        # when a writer was supplied.
        log_now = writer is not None and i_step % num_skip_logging == 0
        if log_now:
            writer.add_scalar("AVG/reward", avg_reward.value, env.time)
            writer.add_scalar("AVG/outputMinTime", avg_min.value, env.time)
            writer.add_scalar("AVG/outputAvgTime", avg_avg.value, env.time)
            writer.add_scalar("AVG/outputMaxTime", avg_max.value, env.time)
            writer.add_scalar("AVG/messageLoss", avg_message_loss.value, env.time)

        # Update memory
        action = torch.tensor([[action]], device=device, dtype=torch.int64)
        next_state = torch.tensor(
            next_state.data, dtype=torch.float32, device=device
        ).unsqueeze(0)
        reward = torch.tensor([reward], device=device)
        actor.push_memory(state, action, next_state, reward)

        # Run learning algo
        if i_step % learn_period == 0:
            # NOTE(review): optimize_model() is assumed to return a dict that
            # may contain "loss" / "epsilon"; a falsy result (e.g. None) is
            # treated as "nothing to log" -- confirm against DQNActor.
            learning_info = actor.optimize_model() or {}

        # Log learning stuff on the same cadence as the averaged metrics.
        # (The original condition `i_step % num_skip_logging` was inverted and
        # logged on every step *except* the intended ones.)
        if log_now:
            if "loss" in learning_info:
                writer.add_scalar("Learning/Loss", learning_info["loss"], env.time)
            if "epsilon" in learning_info:
                writer.add_scalar(
                    "Learning/Epsilon", learning_info["epsilon"], env.time
                )

        # Close loop
        state = next_state

In [6]:
# Train the agent and stream metrics to TensorBoard event files under logs/.
writer = SummaryWriter(log_dir="logs/")
try:
    train(
        agent,
        env,
        params["num_sim_steps"],
        params["actor"]["learn_period"],
        # Smoothing window for the logged metrics: 5_000 / dt simulation steps.
        num_samples_metrics_filter=int(5_000 / dt),
        # Write scalars once every 200 / dt simulation steps.
        num_skip_logging=int(200 / dt),
        writer=writer,
        device=DEVICE,
    )
finally:
    # Always flush and close the event file, including when the run is
    # stopped early (e.g. by a KeyboardInterrupt), so buffered scalars are
    # not lost.
    writer.close()

 33%|███▎      | 119140/360000 [00:56<01:53, 2122.96it/s]


KeyboardInterrupt: 