In [1]:
from pyrl.agents.classic import DQNAgent
from pyrl.agents.survival import SurvivalDQNAgent
import numpy as np
# from tensorforce.environments import Environment
# from pyrl.environments import CustomEnvironment
from itertools import count
from pyrl import Sim
from pyrl.environments.grid import GridEnv, GridEnvRender
from tensorforce.environments import Environment


In [2]:
map_size = (50, 5)
num_rows = map_size[1]
num_cols = map_size[0]
minor_r = 5.0
major_r = 100.0
reward_targets = {major_r : [(num_cols - 2, num_rows // 2)],
                  minor_r : [(3*(num_cols - 1) // 5, num_rows // 2), ((num_cols - 1) // 3, num_rows // 2)]}
cell_size = 25
horizon = 5000
points = 6
repeat = 3
survival_threshold = 250
exploration_threshold = 500
gamma = 0.99 # discount factor
initial_budgets = np.linspace(100, horizon, points, dtype=int)
replay_capacity = 5000
batch_size = 256
learning_rate = 0.001

In [3]:
dqn_time_mean = np.full(initial_budgets.shape, -1)
dqn_exploration_rate = np.full(initial_budgets.shape, -1)
dqn_alive_rate = np.full(initial_budgets.shape, -1)
dqn_budget_evolutions_mean = np.full(initial_budgets.shape, None)
dqn_budget_evolutions_max = np.full(initial_budgets.shape, None)
dqn_budget_evolutions_min = np.full(initial_budgets.shape, None)

In [4]:
def simulation_started_callback(sim, env, agent):
    print("START SIM")
    sim.metrics = dict(
        time = 0,
        exploration = np.zeros(np.prod(env.observation_space.nvec) + env.action_space.n),
        budget = np.zeros((sim.episode_horizon,), dtype=int)
    )

def simulation_finished_callback(sim, env, agent):
    print("END SIM")

def episode_started_callback(sim, env, agent):
    print("START EPISODE")

def episode_finished_callback(sim, env, agent):
    print("END EPISODE")

def round_started_callback(sim, env, agent):
    pass

def round_finished_callback(sim, env, agent):
    print("END ROUND")
    sim.metrics["time"] = sim.metrics["time"] + 1
    state_action_index = tuple(np.concatenate( (agent.get_state(), agent.get_action()) ) )
    state_action_index = tuple(agent.get_state_action())
    v = sim.metrics["exploration"].item(state_action_index)
    sim.metrics["exploration"].itemset(state_action_index, v+1)
    sim.metrics["budget"][sim.t-1] = agent.b

In [5]:
dqn_time_mean = np.full(initial_budgets.shape, -1)
dqn_exploration_rate = np.full(initial_budgets.shape, -1)
dqn_alive_rate = np.full(initial_budgets.shape, -1)
dqn_budget_evolutions_mean = np.full(initial_budgets.shape, None)
dqn_budget_evolutions_max = np.full(initial_budgets.shape, None)
dqn_budget_evolutions_min = np.full(initial_budgets.shape, None)
dqn_exploration_map = np.full(initial_budgets.shape, None)

env = GridEnv(num_rows=num_rows, num_cols=num_cols, 
              reward_mode="s'", reward_targets=reward_targets, default_reward=-1.0,
              render_mode="external")

print("====> Classic DQN")

for i, b in enumerate(initial_budgets):
    nb_alive = 0
    # print(f"b={b}", end=" ")
    for j in range(repeat):
        print(f"====> Classic DQN {b} | Try {j + 1}")
        agent_DQN = SurvivalDQNAgent(observation_space=env.observation_space,
                        action_space=env.action_space,
                        batch_size=batch_size,
                        initial_budget=b,
                        gamma=gamma,
                        learning_rate=learning_rate
                        )
        window = GridEnvRender(env, agent_DQN, cell_size=cell_size)

        env._render_frame = window.refresh

        print("TEST CLASSIC DQN AGENT")

        sim = Sim(agent_DQN,
                env,
                episode_horizon=horizon,
                num_simulations=1,
                rl_config='classic',
                simulation_started_callback=simulation_started_callback,
                simulation_finished_callback=simulation_finished_callback,
                episode_started_callback=episode_started_callback,
                episode_finished_callback=episode_finished_callback,
                round_started_callback=round_started_callback,
                round_finished_callback=round_finished_callback
                )

        try:
            sim.run()
        except:
            window.close()
            raise

        if dqn_time_mean[i] == -1:
            dqn_time_mean[i] = sim.metrics["time"]
        else:
            dqn_time_mean[i] = dqn_time_mean[i] + (1/j) * (sim.metrics["time"] - dqn_time_mean[i])
        
        exploration_rate = (np.count_nonzero(sim.metrics["exploration"]) / (np.prod(env.observation_space.nvec) * env.action_space.n)) * 100

        if dqn_exploration_rate[i] == -1:
            dqn_exploration_rate[i] = exploration_rate
        else:
            dqn_exploration_rate[i] = dqn_exploration_rate[i] + (1 / j) * (exploration_rate - dqn_exploration_rate[i])

        if agent_DQN.b > 0:
            nb_alive = nb_alive + 1

        dqn_alive_rate[i] = nb_alive / (j+1) * 100

        # budget evolution mean
        if dqn_budget_evolutions_mean[i] is None:
            dqn_budget_evolutions_mean[i] = sim.metrics["budget"]
        else:
            dqn_budget_evolutions_mean[i] = dqn_budget_evolutions_mean[i] + (1 / j) * (sim.metrics["budget"] - dqn_budget_evolutions_mean[i])

        # budget evolution max
        if dqn_budget_evolutions_max[i] is None:
            dqn_budget_evolutions_max[i] = sim.metrics["budget"]
        else:
            dqn_budget_evolutions_max[i] = np.maximum(dqn_budget_evolutions_max[i], sim.metrics["budget"])

        # budget evolution min
        if dqn_budget_evolutions_min[i] is None:
            dqn_budget_evolutions_min[i] = sim.metrics["budget"]
        else:
            dqn_budget_evolutions_min[i] = np.minimum(dqn_budget_evolutions_min[i], sim.metrics["budget"])
        
        # exploration map
        if dqn_exploration_map[i] is None:
            dqn_exploration_map[i] = sim.metrics["exploration"]
        else:
            dqn_exploration_map[i] = dqn_exploration_map[i] + (1 / j) * (sim.metrics["exploration"] - dqn_exploration_map[i])

        print(f"Time mean : {dqn_time_mean[i]}")
        print(f"Alive rate : {dqn_alive_rate[i]}%")
        print(f"Exploration rate : {dqn_exploration_rate[i]}%")

window.close()

dqn_1_time_mean = dqn_time_mean
dqn_1_exploration_rate = dqn_exploration_rate
dqn_1_alive_rate = dqn_alive_rate
dqn_1_budget_evolutions_mean = dqn_budget_evolutions_mean
dqn_1_budget_evolutions_max = dqn_budget_evolutions_max
dqn_1_budget_evolutions_min = dqn_budget_evolutions_min
dqn_1_exploration_map = dqn_exploration_map

====> Classic DQN
====> Classic DQN 100 | Try 1
TEST CLASSIC DQN AGENT
START SIM
START EPISODE
episode_finished_callback() missing 1 required positional argument: 'agent'
END SIM
Time mean : 100
Alive rate : 0%
Exploration rate : 0%
====> Classic DQN 100 | Try 2
TEST CLASSIC DQN AGENT
START SIM
START EPISODE
episode_finished_callback() missing 1 required positional argument: 'agent'
END SIM
Time mean : 100
Alive rate : 0%
Exploration rate : 0%
====> Classic DQN 100 | Try 3
TEST CLASSIC DQN AGENT
START SIM
START EPISODE
episode_finished_callback() missing 1 required positional argument: 'agent'
END SIM
Time mean : 100
Alive rate : 0%
Exploration rate : 0%
====> Classic DQN 1080 | Try 1
TEST CLASSIC DQN AGENT
START SIM
START EPISODE


  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)


KeyboardInterrupt: 

In [None]:


env = GridEnv(num_rows=num_rows, num_cols=num_cols, 
              reward_mode="s'", reward_targets=reward_targets, default_reward=-1.0,
              render_mode="external")
Q = np.random.sample(env.observation_shape + env.action_shape)
print(Q.shape)
print(env.observation_shape)

print(env.observation_space.nvec)
for b in initial_budgets:
    agent_DQN = DQNAgent(observation_space=env.observation_space,
                        action_space=env.action_space,
                        batch_size=batch_size,
                        initial_budget=b,
                        gamma=gamma,
                        learning_rate=learning_rate
                        )

    window = GridEnvRender(env, agent_DQN, cell_size=cell_size)

    env._render_frame = window.refresh

    print("TEST CLASSIC DQN AGENT")

    sim = Sim(agent_DQN,
            env,
            episode_horizon=horizon,
            num_simulations=repeat,
            rl_config='classic',
            simulation_started_callback=simulation_started_callback,
            simulation_finished_callback=simulation_finished_callback,
            episode_started_callback=episode_started_callback,
            episode_finished_callback=episode_finished_callback,
            round_started_callback=round_started_callback,
            round_finished_callback=round_finished_callback
            )

    try:
        sim.run()
    except:
        window.close()
        raise

window.close()

(50, 5, 4)
(50, 5)
[ 5 50]
TEST CLASSIC DQN AGENT
250
[0 2]
START SIM
START EPISODE


KeyboardInterrupt: 