diff --git a/.github/workflows/testBomberEnv.yml b/.github/workflows/testBomberEnv.yml index 5d198fd..09bfe56 100644 --- a/.github/workflows/testBomberEnv.yml +++ b/.github/workflows/testBomberEnv.yml @@ -9,7 +9,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.9] + python-version: ['3.10'] steps: - uses: actions/checkout@v1 diff --git a/bomberworld.py b/bomberworld.py index 188d8a4..bb8014c 100644 --- a/bomberworld.py +++ b/bomberworld.py @@ -2,11 +2,12 @@ # Author: Adrian Schneider, armasuisse # Note: Initial copied from Giacomo Del Rio, IDSIA -from typing import Optional, Tuple +from typing import Optional, Tuple, List import gymnasium as gym import numpy as np import copy +from random import randrange # Best performance when size = 10 and no penalty on moving and allowed being close to bomb # 10 x 10 = 100 stones - 6 = 94 @@ -15,7 +16,7 @@ class BomberworldEnv(gym.Env): - def __init__(self, size: int, max_steps: int, indestructible_agent=True, dead_near_bomb=False, dead_when_colliding=False, reduced_obs=False, move_penalty=-0.2, collision_penalty=-1.0, + def __init__(self, size: int | List[int], max_steps: int, indestructible_agent=True, dead_near_bomb=False, dead_when_colliding=False, reduced_obs=False, move_penalty=-0.2, collision_penalty=-1.0, bomb_penalty=-1.0, close_bomb_penalty=-2.0, rock_reward=1.0, end_game_reward=10.0 ): """ Parameters @@ -45,6 +46,7 @@ def __init__(self, size: int, max_steps: int, indestructible_agent=True, dead_ne self.end_game_reward = end_game_reward self.size = size + self.board_size = None self.max_steps = max_steps self.indestructible_agent = indestructible_agent self.dead_near_bomb = dead_near_bomb @@ -52,42 +54,61 @@ def __init__(self, size: int, max_steps: int, indestructible_agent=True, dead_ne self.reduced_obs = reduced_obs self.current_step = 0 - self.agent_pos = (0, 0) - self.stones = np.full((self.size, self.size), True) - self.active_bombs = [] - if self.reduced_obs: self.observation_space = gym.spaces.Box(low=0, high=1, shape=(3 * 3,), dtype=np.float32) else: self.observation_space = gym.spaces.Box(low=0, high=1, shape=(size * size,), dtype=np.float32) self.action_space = gym.spaces.Discrete(5) - # print info - print("Simple Bomber World") if self.indestructible_agent else print("Complex Bomber World") - def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> Tuple[np.ndarray, dict]: super().reset(seed=seed) self.current_step = 0 - self.set_initial_board(tuple(self.np_random.integers(low=0, high=self.size, size=2))) + + if type(self.size) is list: # randomly select a board size from the list + self.board_size = self.size[randrange(len(self.size))] + + # normalize penalties and rewards relative to first size in list + main_size = self.size[0] + + # per-step reward = (reward * main_size^2) / board_size^2, so the total achievable episode reward matches the main_size board + self.current_move_penalty = (self.move_penalty * (main_size ** 2)) / (self.board_size ** 2) + self.current_collision_penalty = (self.collision_penalty * (main_size ** 2)) / (self.board_size ** 2) + self.current_bomb_penalty = (self.bomb_penalty * (main_size ** 2)) / (self.board_size ** 2) + self.current_close_bomb_penalty = (self.close_bomb_penalty * (main_size ** 2)) / (self.board_size ** 2) + self.current_rock_reward = (self.rock_reward * (main_size ** 2)) / (self.board_size ** 2) + self.current_max_steps = (self.max_steps / (main_size ** 2)) * (self.board_size ** 2) # increase with board size + self.current_end_game_reward = self.end_game_reward # endgame reward independent of board size + else: + 
self.board_size = self.size + self.current_move_penalty = self.move_penalty + self.current_collision_penalty = self.collision_penalty + self.current_bomb_penalty = self.bomb_penalty + self.current_close_bomb_penalty = self.close_bomb_penalty + self.current_rock_reward = self.rock_reward + self.current_max_steps = self.max_steps + self.current_end_game_reward = self.end_game_reward # endgame reward independent of board size + + self.set_initial_board(self.board_size, tuple(self.np_random.integers(low=0, high=self.board_size, size=2))) return self.make_observation(), {} - def set_initial_board(self, agent_pos): - self.stones = np.full((self.size, self.size), True) + def set_initial_board(self, size, agent_pos): + self.stones = np.full((size, size), True) self.agent_pos = agent_pos + self.active_bombs = [] # initially remove all 8 stones around the agent self.bomb_3x3(agent_pos) def is_valid_pos(self, pos: Tuple[int, int]) -> bool: m, n = pos - return (-1 < m < self.size) and (-1 < n < self.size) + return (-1 < m < self.board_size) and (-1 < n < self.board_size) def can_move_to_pos(self, pos: Tuple[int, int]) -> bool: return self.is_valid_pos(pos) and (not self.stones[pos]) def make_current_board_2D(self) -> np.ndarray: - board = np.zeros((self.size, self.size), dtype=np.float32) + board = np.zeros((self.board_size, self.board_size), dtype=np.float32) # set rocks for m, n in np.ndindex(self.stones.shape): board[(m, n)] = self.rock_val if self.stones[(m, n)] else self.empty_val @@ -105,9 +126,9 @@ def make_observation_2D(self) -> np.ndarray: if self.reduced_obs: # cut 3x3 patch around agent m_ap, n_ap = self.agent_pos m_center = max(1, m_ap) - m_center = min(self.size-2, m_center) + m_center = min(self.board_size - 2, m_center) n_center = max(1, n_ap) - n_center = min(self.size - 2, n_center) + n_center = min(self.board_size - 2, n_center) return board[m_center-1:m_center+2, n_center-1:n_center+2] else: return board @@ -158,14 +179,14 @@ def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, dict]: if self.can_move_to_pos(next_pos): self.agent_pos = next_pos - reward += self.move_penalty # penalty for each move + reward += self.current_move_penalty # penalty for each move else: - reward += self.collision_penalty + reward += self.current_collision_penalty if self.dead_when_colliding: agent_killed = True elif action == 4: # drop bomb at agent location - reward += self.bomb_penalty # penalty for each dropped bomb + reward += self.current_bomb_penalty # penalty for each dropped bomb placed_bomb = self.agent_pos if self.indestructible_agent: self.active_bombs.append((self.agent_pos, 0)) # immediate detonation @@ -176,14 +197,14 @@ def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, dict]: still_active_bombs = [] for bomb_pos, step_timer in self.active_bombs: if step_timer <= 0: - reward += self.rock_reward * self.bomb_3x3(bomb_pos) # detonate bomb + reward += self.current_rock_reward * self.bomb_3x3(bomb_pos) # detonate bomb exploded_bomb = bomb_pos if not self.indestructible_agent: # check that agent is in safe distance squared_dist = (bomb_pos[0]-self.agent_pos[0])**2 + (bomb_pos[1]-self.agent_pos[1])**2 if squared_dist < 4.0: - reward += self.current_close_bomb_penalty if self.dead_near_bomb: agent_killed = True else: @@ -193,12 +214,12 @@ def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, dict]: # mission completed when every rock was bombed if (self.stones == False).all(): - reward += self.end_game_reward + 
reward += self.current_end_game_reward terminated = True else: terminated = False - if self.current_step > self.max_steps or agent_killed: # end game when max step reached or agent killed + if self.current_step > self.current_max_steps or agent_killed: # end game when max step reached or agent killed truncate = True else: truncate = False diff --git a/bomberworld_plotter.py b/bomberworld_plotter.py index 0884af8..fdfaa79 100644 --- a/bomberworld_plotter.py +++ b/bomberworld_plotter.py @@ -29,6 +29,7 @@ def __init__(self, size: int, animated_gif_folder_path: Optional[Union[str, Path self.ordered_file_list = [] self.agent_traj: List[Tuple[int, int]] = [] self.bomb_traj: List[Tuple[int, int]] = [] + self.current_agent_pos: Tuple[int, int] = (0,0) self.stones: np.ndarray = np.zeros((self.size, self.size), dtype=np.float32) self.explosion: Tuple[int, int] = None self.agent_shape = [[.2, .6], [.2, .3], [.3, .1], [.7, .1], [.8, .3], [.8, .6]] @@ -36,6 +37,7 @@ def __init__(self, size: int, animated_gif_folder_path: Optional[Union[str, Path def add_frame(self, agent_position: Tuple[int, int], placed_bomb: Tuple[int, int], exploded_bomb: Tuple[int, int], stones: np.ndarray ) -> None: if placed_bomb is None: self.agent_traj.append(agent_position) + self.current_agent_pos = agent_position else: self.bomb_traj.append(placed_bomb) # bomb placed -> agent did not move @@ -48,8 +50,9 @@ def plot_episode(self, current_reward = None): self.draw_grid(ax) self.draw_stones(ax, self.stones) - self.draw_path(ax, self.agent_traj, color='red', line_width=1) - self.draw_bombs(ax, self.bomb_traj) + self.draw_current_agent_pos(ax, self.current_agent_pos) + #self.draw_path(ax, self.agent_traj, color='red', line_width=1) + #self.draw_bombs(ax, self.bomb_traj) self.draw_agent(ax, self.agent_traj[0][0], self.agent_traj[0][1]) self.draw_explosion(ax, self.explosion) @@ -84,7 +87,7 @@ def draw_bombs(ax: mpl.axes.Axes, bombs: List[Tuple[int, int]]): index = 0 for m, n in bombs: ax.add_patch(patches.Ellipse((n+0.5, m+0.5), width=0.8, height=0.8, ec="black", fill=False)) - ax.text(n+0.3, m+0.6, str(index)) + #ax.text(n+0.3, m+0.6, str(index)) index += 1 @staticmethod @@ -102,6 +105,11 @@ def draw_stones(ax: mpl.axes.Axes, stones: np.ndarray): if stones[(m,n)] < 0.1: ax.add_patch(patches.Rectangle((n+0.125, m+0.125), width=0.75, height=0.75, ec='black', fc='grey', fill=True)) + @staticmethod + def draw_current_agent_pos(ax: mpl.axes.Axes, pos: Tuple[int, int]): + m, n = pos + ax.add_patch(patches.Rectangle((n + 0.1, m + 0.1), width=0.8, height=0.8, ec='red', fc='red', fill=False, linewidth=6.0)) + def draw_grid(self, ax: mpl.axes.Axes): for i in range(self.size + 1): ax.axhline(y=i, c='k', lw=2) @@ -129,7 +137,7 @@ def create_animated_gif_from_episodes(self): frames[0].save(gif_out_path, format='GIF', append_images=frames[1:], save_all=True, - duration=300, loop=0) + duration=75, loop=0) print("Animated gif created, nbr imgs:", len(frames)) else: print("Error: animated_gif_folder_path needs to be set in ctor") diff --git a/do_bombing.py b/do_bombing.py index 0482859..a5c4709 100644 --- a/do_bombing.py +++ b/do_bombing.py @@ -15,10 +15,10 @@ def run_bombing(path_to_checkpoint: str, use_lstm: bool): cell_size = 256 lstm_states = [np.zeros(cell_size, np.float32), np.zeros(cell_size, np.float32)] - env = bomberworld.BomberworldEnv(6, 60, dead_when_colliding=True, reduced_obs=True, indestructible_agent=False, dead_near_bomb=True) + env = bomberworld.BomberworldEnv(20, 2000, dead_when_colliding=True, reduced_obs=True) o, info = 
env.reset() - plotter = BomberworldPlotter(size=env.size, animated_gif_folder_path="gifs") + plotter = BomberworldPlotter(size=env.board_size, animated_gif_folder_path="gifs") plotter.add_frame(env.agent_pos, None, None, env.make_current_board_2D()) reward_sum = 0 diff --git a/rsc/6,8x6,8-trained-10x10.gif b/rsc/6,8x6,8-trained-10x10.gif new file mode 100644 index 0000000..9d4e05a Binary files /dev/null and b/rsc/6,8x6,8-trained-10x10.gif differ diff --git a/rsc/6,8x6,8-trained-12x12.gif b/rsc/6,8x6,8-trained-12x12.gif new file mode 100644 index 0000000..3afea35 Binary files /dev/null and b/rsc/6,8x6,8-trained-12x12.gif differ diff --git a/rsc/6,8x6,8-trained-20x20.gif b/rsc/6,8x6,8-trained-20x20.gif new file mode 100644 index 0000000..150d923 Binary files /dev/null and b/rsc/6,8x6,8-trained-20x20.gif differ diff --git a/rsc/6,8x6,8-trained-6x6.gif b/rsc/6,8x6,8-trained-6x6.gif new file mode 100644 index 0000000..d792bbb Binary files /dev/null and b/rsc/6,8x6,8-trained-6x6.gif differ diff --git a/rsc/6,8x6,8-trained-7x7.gif b/rsc/6,8x6,8-trained-7x7.gif new file mode 100644 index 0000000..bedbd46 Binary files /dev/null and b/rsc/6,8x6,8-trained-7x7.gif differ diff --git a/rsc/6,8x6,8-trained-8x8.gif b/rsc/6,8x6,8-trained-8x8.gif new file mode 100644 index 0000000..699d976 Binary files /dev/null and b/rsc/6,8x6,8-trained-8x8.gif differ diff --git a/rsc/6,8x6,8-trained-9x9.gif b/rsc/6,8x6,8-trained-9x9.gif new file mode 100644 index 0000000..aa8dba4 Binary files /dev/null and b/rsc/6,8x6,8-trained-9x9.gif differ diff --git a/solver.py b/solver.py index 9ded60d..8a2418d 100644 --- a/solver.py +++ b/solver.py @@ -89,15 +89,16 @@ def resume_training(): if True: # train hw: - hw = {"gpu": 0, "cpu": 3} # imac - #hw = {"gpu": 1, "cpu": 11} # adris + #hw = {"gpu": 0, "cpu": 3} # imac + hw = {"gpu": 1, "cpu": 11} # adris - env_params = {"size": 6, "max_steps": 60, "reduced_obs": True, "dead_when_colliding": True, "indestructible_agent": False, "dead_near_bomb": True} + env_params = {"size": [6, 8], "max_steps": 40, "reduced_obs": True, "dead_when_colliding": True} + #env_params = {"size": 6, "max_steps": 60, "reduced_obs": True, "dead_when_colliding": True, "indestructible_agent": False, "dead_near_bomb": True} #env_params = {"size": 10, "max_steps": 100, "indestructible_agent": False, "dead_near_bomb": True} # env_params = {"size": 10, "max_steps": 200, "dead_when_colliding": True, "dead_near_bomb": True, "indestructible_agent": False, "close_bomb_penalty": -1.0} nn_model = [256, 128, 64] activation = "relu" - description = "ReducedSmartBomber-6x6-Gamma=0.75-LSTM" + description = "ReducedSmartBomber-6to8x6to8-Gamma=0.75-LSTM" - grid_search_hypers(env_params, nn_model, activation, description, hw) + grid_search_hypers(env_params, nn_model, activation, description, hw, use_lstm=True) diff --git a/test_bomberworld.py b/test_bomberworld.py index 2368d9e..6c4c42a 100644 --- a/test_bomberworld.py +++ b/test_bomberworld.py @@ -21,16 +21,17 @@ def test_ctor(self): env = bomberworld.BomberworldEnv(size, maxst, indestructible_agent=indestr, dead_when_colliding=dc, move_penalty=movep, collision_penalty=collip, bomb_penalty=bombp, close_bomb_penalty=closep, rock_reward=rockr, end_game_reward=endr, dead_near_bomb=dnb) + env.reset() - self.assertEqual(env.size, size) - self.assertEqual(env.max_steps, maxst) + self.assertEqual(env.board_size, size) + self.assertEqual(env.current_max_steps, maxst) self.assertEqual(env.indestructible_agent, indestr) - self.assertAlmostEqual(env.move_penalty, movep) - 
self.assertAlmostEqual(env.collision_penalty, collip) - self.assertAlmostEqual(env.bomb_penalty, bombp) - self.assertAlmostEqual(env.close_bomb_penalty, closep) - self.assertAlmostEqual(env.rock_reward, rockr) - self.assertAlmostEqual(env.end_game_reward, endr) + self.assertAlmostEqual(env.current_move_penalty, movep) + self.assertAlmostEqual(env.current_collision_penalty, collip) + self.assertAlmostEqual(env.current_bomb_penalty, bombp) + self.assertAlmostEqual(env.current_close_bomb_penalty, closep) + self.assertAlmostEqual(env.current_rock_reward, rockr) + self.assertAlmostEqual(env.current_end_game_reward, endr) self.assertAlmostEqual(env.dead_near_bomb, dnb) self.assertAlmostEqual(env.dead_when_colliding, dc) @@ -38,6 +39,7 @@ def test_valid_pos(self): # test function which checks if position is on the board size = 10 env = bomberworld.BomberworldEnv(size, 100) + env.reset() for m in range(0, size): for n in range(0, size): self.assertTrue(env.is_valid_pos((m, n))) @@ -52,6 +54,7 @@ def test_can_move_to_pos(self): # test function which checks if position is on the board size = 10 env = bomberworld.BomberworldEnv(size, 100) + env.reset() # can move nowhere env.stones = np.full((size, size), True) @@ -74,6 +77,7 @@ def test_can_move_to_pos(self): def test_bomb_3x3(self): size = 10 env = bomberworld.BomberworldEnv(size, 100) + env.reset() # bomb upper left corner env.stones = np.full((size, size), True) @@ -119,60 +123,62 @@ def test_reset(self): def test_move_actions(self): size = 10 env = bomberworld.BomberworldEnv(size, 100) - env.set_initial_board((0,0)) + env.reset() + env.set_initial_board(size, (0,0)) # agent at (0,0) -> can initially move only to 3 bording squares. Others are rocks or wall. obs, reward, _, stopped, dbg = env.step(0) # up not possible - self.assertAlmostEqual(reward, env.collision_penalty) + self.assertAlmostEqual(reward, env.current_collision_penalty) self.assertAlmostEqual(env.make_observation_2D()[(0,0)], env.agent_val) self.assertIsNone(dbg["placed_bomb"]) self.assertIsNone(dbg["exploded_bomb"]) self.assertFalse(stopped) obs, reward, _, stopped, _ = env.step(3) # left not possible - self.assertAlmostEqual(reward, env.collision_penalty) + self.assertAlmostEqual(reward, env.current_collision_penalty) self.assertFalse(stopped) obs, reward, _, stopped, _ = env.step(1) # right possible - self.assertAlmostEqual(reward, env.move_penalty) + self.assertAlmostEqual(reward, env.current_move_penalty) self.assertEqual(env.agent_pos, (0,1)) self.assertAlmostEqual(env.make_observation_2D()[(0, 0)], env.empty_val) # previous field empty self.assertAlmostEqual(env.make_observation_2D()[(0, 1)], env.agent_val) # current field agent self.assertFalse(stopped) obs, reward, _, _, _ = env.step(1) # right again not possible - self.assertAlmostEqual(reward, env.collision_penalty) + self.assertAlmostEqual(reward, env.current_collision_penalty) self.assertEqual(env.agent_pos, (0, 1)) obs, reward, _, _, _ = env.step(2) # down possible - self.assertAlmostEqual(reward, env.move_penalty) + self.assertAlmostEqual(reward, env.current_move_penalty) self.assertEqual(env.agent_pos, (1, 1)) self.assertAlmostEqual(env.make_observation_2D()[(0, 1)], env.empty_val) # previous field empty self.assertAlmostEqual(env.make_observation_2D()[(1, 1)], env.agent_val) # current field agent obs, reward, _, _, _ = env.step(2) # down again not possible - self.assertAlmostEqual(reward, env.collision_penalty) + self.assertAlmostEqual(reward, env.current_collision_penalty) self.assertEqual(env.agent_pos, (1, 1)) 
obs, reward, _, _, _ = env.step(3) # left possible - self.assertAlmostEqual(reward, env.move_penalty) + self.assertAlmostEqual(reward, env.current_move_penalty) self.assertEqual(env.agent_pos, (1, 0)) obs, reward, _, _, _ = env.step(3) # left again not possible - self.assertAlmostEqual(reward, env.collision_penalty) + self.assertAlmostEqual(reward, env.current_collision_penalty) self.assertEqual(env.agent_pos, (1, 0)) obs, reward, _, _, _ = env.step(0) # up possible - self.assertAlmostEqual(reward, env.move_penalty) + self.assertAlmostEqual(reward, env.current_move_penalty) self.assertEqual(env.agent_pos, (0, 0)) def test_bomb_actions(self): size = 10 env = bomberworld.BomberworldEnv(size, 100) - env.set_initial_board((0, 0)) + env.reset() + env.set_initial_board(size, (0, 0)) obs, reward, _, _, dbg = env.step(4) # no rock bombed - self.assertAlmostEqual(reward, env.bomb_penalty) + self.assertAlmostEqual(reward, env.current_bomb_penalty) self.assertAlmostEqual(env.make_observation_2D()[(0, 0)], env.agent_val) # check debug output self.assertIsNotNone(dbg["placed_bomb"]) @@ -182,27 +188,28 @@ def test_bomb_actions(self): obs, reward, _, _, _ = env.step(1) # move to (0,1) obs, reward, _, _, _ = env.step(4) # 2 rocks bombed - self.assertAlmostEqual(reward, env.bomb_penalty+2*env.rock_reward) + self.assertAlmostEqual(reward, env.current_bomb_penalty + 2 * env.current_rock_reward) obs, reward, _, _, _ = env.step(2) # move to (1,1) obs, reward, _, _, _ = env.step(4) # 3 rocks bombed - self.assertAlmostEqual(reward, env.bomb_penalty+3*env.rock_reward) + self.assertAlmostEqual(reward, env.current_bomb_penalty + 3 * env.current_rock_reward) def test_reach_target(self): size = 10 env = bomberworld.BomberworldEnv(size, 100) - env.set_initial_board((0, 0)) + env.reset() + env.set_initial_board(size, (0, 0)) # destroy all rocks except one env.stones.fill(False) env.stones[(0, 1)] = True obs, reward, terminated, _, _ = env.step(2) # down - self.assertAlmostEqual(reward, env.move_penalty) + self.assertAlmostEqual(reward, env.current_move_penalty) self.assertFalse(terminated) obs, reward, terminated, _, _ = env.step(4) # bomb and all is destroyed - self.assertAlmostEqual(reward, env.end_game_reward) + self.assertAlmostEqual(reward, env.current_end_game_reward) self.assertTrue(terminated) def test_reach_max(self): @@ -223,7 +230,8 @@ def test_good_run(self): reward = 0.0 size = 10 env = bomberworld.BomberworldEnv(size, 100) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) for i in range(0, 7): _, r, terminated, _, _ = env.step(2) # down @@ -275,10 +283,11 @@ def test_good_run(self): def test_destructable_agent(self): size = 10 env = bomberworld.BomberworldEnv(size, 100, indestructible_agent=False) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) _, r, _, tc, dbg = env.step(4) # bomb at (1,1) and stay there at detonation - self.assertAlmostEqual(r, env.bomb_penalty) + self.assertAlmostEqual(r, env.current_bomb_penalty) self.assertEqual(len(env.active_bombs), 1) self.assertFalse(tc) # check debug output @@ -287,7 +296,7 @@ def test_destructable_agent(self): self.assertEqual(dbg["placed_bomb"], (1, 1)) _, r, _, tc, dbg = env.step(0) # up - self.assertAlmostEqual(r, env.move_penalty) + self.assertAlmostEqual(r, env.current_move_penalty) self.assertEqual(len(env.active_bombs), 1) self.assertFalse(tc) # check debug output @@ -296,7 +305,7 @@ def test_destructable_agent(self): _, r, _, tc, dbg = env.step(2) # down and bomb detonates 
self.assertEqual(env.agent_pos, (1,1)) - self.assertAlmostEqual(r, env.move_penalty + env.close_bomb_penalty) + self.assertAlmostEqual(r, env.current_move_penalty + env.current_close_bomb_penalty) self.assertEqual(len(env.active_bombs), 0) self.assertFalse(tc) # check debug output @@ -305,46 +314,47 @@ def test_destructable_agent(self): self.assertEqual(dbg["exploded_bomb"], (1, 1)) _, r, _, _, _ = env.step(4) # bomb at (1,1) and stay (0,0) at detonation - self.assertAlmostEqual(r, env.bomb_penalty) + self.assertAlmostEqual(r, env.current_bomb_penalty) self.assertEqual(len(env.active_bombs), 1) _, r, _, _, _ = env.step(0) # up - self.assertAlmostEqual(r, env.move_penalty) + self.assertAlmostEqual(r, env.current_move_penalty) self.assertEqual(len(env.active_bombs), 1) _, r, _, _, _ = env.step(3) # left and bomb detonates self.assertEqual(env.agent_pos, (0, 0)) - self.assertAlmostEqual(r, env.move_penalty + env.close_bomb_penalty) + self.assertAlmostEqual(r, env.current_move_penalty + env.current_close_bomb_penalty) self.assertEqual(len(env.active_bombs), 0) env.step(2) env.step(1) _, r, _, _, _ = env.step(4) # bomb at (1,1) and stay (2,2) at detonation - self.assertAlmostEqual(r, env.bomb_penalty) + self.assertAlmostEqual(r, env.current_bomb_penalty) self.assertEqual(len(env.active_bombs), 1) _, r, _, _, _ = env.step(2) # down - self.assertAlmostEqual(r, env.move_penalty) + self.assertAlmostEqual(r, env.current_move_penalty) self.assertEqual(len(env.active_bombs), 1) _, r, _, _, _ = env.step(1) # right and bomb detonates self.assertEqual(env.agent_pos, (2, 2)) - self.assertAlmostEqual(r, env.move_penalty + env.close_bomb_penalty) + self.assertAlmostEqual(r, env.current_move_penalty + env.current_close_bomb_penalty) self.assertEqual(len(env.active_bombs), 0) # drop bomb at (2,2) and go to safe place (0,2) _, r, _, _, _ = env.step(4) # bomb at (1,1) and stay (2,2) at detonation - self.assertAlmostEqual(r, env.bomb_penalty) + self.assertAlmostEqual(r, env.current_bomb_penalty) self.assertEqual(len(env.active_bombs), 1) _, r, _, _, _ = env.step(0) # up - self.assertAlmostEqual(r, env.move_penalty) + self.assertAlmostEqual(r, env.current_move_penalty) self.assertEqual(len(env.active_bombs), 1) _, r, _, _, _ = env.step(0) # up self.assertEqual(env.agent_pos, (0, 2)) - self.assertAlmostEqual(r, env.move_penalty + 5 * env.rock_reward) + self.assertAlmostEqual(r, env.current_move_penalty + 5 * env.current_rock_reward) self.assertEqual(len(env.active_bombs), 0) def test_if_bomb_on_field(self): size = 10 env = bomberworld.BomberworldEnv(size, 100, indestructible_agent=False) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) env.step(4) self.assertTrue(env.is_active_bomb_on_field((1,1))) self.assertFalse(env.is_active_bomb_on_field((0, 1))) @@ -357,7 +367,8 @@ def test_if_bomb_on_field(self): def test_destructable_obs_bombs(self): size = 10 env = bomberworld.BomberworldEnv(size, 100, indestructible_agent=False) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) self.assertEqual(env.agent_pos, (1,1)) env.step(4) # drop bomb, agent and bomb in same spot @@ -377,7 +388,8 @@ def test_destructable_obs_bombs(self): def test_destructable_multiple_bombs(self): size = 10 env = bomberworld.BomberworldEnv(size, 150, indestructible_agent=False) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) self.assertEqual(env.agent_pos, (1,1)) env.step(4) # drop bomb, agent and bomb in same spot @@ -399,7 +411,8 @@ def 
test_destructable_good_run(self): reward = 0.0 size = 10 env = bomberworld.BomberworldEnv(size, 100, indestructible_agent=False) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) _, r, _, _, _ = env.step(2) # down reward += r @@ -410,7 +423,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(3*env.move_penalty + 1 * env.bomb_penalty + 3.0*env.rock_reward, reward) + self.assertAlmostEqual(3 * env.current_move_penalty + 1 * env.current_bomb_penalty + 3.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(2) # down reward += r @@ -418,26 +431,26 @@ def test_destructable_good_run(self): reward += r _, r, _, _, _ = env.step(4) # bomb reward += r - self.assertAlmostEqual(5 * env.move_penalty + 2 * env.bomb_penalty + 3.0*env.rock_reward, reward) + self.assertAlmostEqual(5 * env.current_move_penalty + 2 * env.current_bomb_penalty + 3.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(2) # down reward += r _, r, _, _, _ = env.step(2) # down reward += r - self.assertAlmostEqual(7 * env.move_penalty + 2 * env.bomb_penalty + 6.0*env.rock_reward, reward) + self.assertAlmostEqual(7 * env.current_move_penalty + 2 * env.current_bomb_penalty + 6.0 * env.current_rock_reward, reward) self.assertEqual(len(env.active_bombs), 0) _, r, _, _, _ = env.step(4) # bomb + previous bomb explodes - self.assertAlmostEqual(env.bomb_penalty, r) + self.assertAlmostEqual(env.current_bomb_penalty, r) reward += r _, r, _, _, _ = env.step(3) # left - self.assertAlmostEqual(env.move_penalty, r) + self.assertAlmostEqual(env.current_move_penalty, r) reward += r _, r, _, _, _ = env.step(3) # left + bomb exploding - self.assertAlmostEqual(env.move_penalty+4*env.rock_reward, r) + self.assertAlmostEqual(env.current_move_penalty + 4 * env.current_rock_reward, r) reward += r - self.assertAlmostEqual(9 * env.move_penalty + 3 * env.bomb_penalty + 10.0*env.rock_reward, reward) + self.assertAlmostEqual(9 * env.current_move_penalty + 3 * env.current_bomb_penalty + 10.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(1) # right reward += r @@ -450,7 +463,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(1) # right reward += r - self.assertAlmostEqual(13 * env.move_penalty + 4 * env.bomb_penalty + 14.0 * env.rock_reward, reward) + self.assertAlmostEqual(13 * env.current_move_penalty + 4 * env.current_bomb_penalty + 14.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb reward += r @@ -461,7 +474,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(16 * env.move_penalty + 5 * env.bomb_penalty + 18.0 * env.rock_reward, reward) + self.assertAlmostEqual(16 * env.current_move_penalty + 5 * env.current_bomb_penalty + 18.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb reward += r @@ -482,7 +495,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(23 * env.move_penalty + 7 * env.bomb_penalty + 26.0 * env.rock_reward, reward) + self.assertAlmostEqual(23 * env.current_move_penalty + 7 * env.current_bomb_penalty + 26.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(2) # down reward += r @@ -499,7 +512,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(29 * env.move_penalty + 8 * env.bomb_penalty + 31.0 * env.rock_reward, reward) + self.assertAlmostEqual(29 * 
env.current_move_penalty + 8 * env.current_bomb_penalty + 31.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb reward += r @@ -518,7 +531,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(35 * env.move_penalty + 10 * env.bomb_penalty + 39.0 * env.rock_reward, reward) + self.assertAlmostEqual(35 * env.current_move_penalty + 10 * env.current_bomb_penalty + 39.0 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(0) # up reward += r @@ -542,7 +555,7 @@ def test_destructable_good_run(self): reward += r - self.assertAlmostEqual(43 * env.move_penalty + 12 * env.bomb_penalty + 48 * env.rock_reward, reward) + self.assertAlmostEqual(43 * env.current_move_penalty + 12 * env.current_bomb_penalty + 48 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb @@ -560,7 +573,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(1) # right reward += r - self.assertAlmostEqual(48 * env.move_penalty + 14 * env.bomb_penalty + 55 * env.rock_reward, reward) + self.assertAlmostEqual(48 * env.current_move_penalty + 14 * env.current_bomb_penalty + 55 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb reward += r @@ -577,7 +590,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(3) # left reward += r - self.assertAlmostEqual(53 * env.move_penalty + 16 * env.bomb_penalty + 62 * env.rock_reward, reward) + self.assertAlmostEqual(53 * env.current_move_penalty + 16 * env.current_bomb_penalty + 62 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(1) # right reward += r @@ -594,7 +607,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(3) # left reward += r - self.assertAlmostEqual(59 * env.move_penalty + 17 * env.bomb_penalty + 67 * env.rock_reward, reward) + self.assertAlmostEqual(59 * env.current_move_penalty + 17 * env.current_bomb_penalty + 67 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(1) # right reward += r @@ -607,7 +620,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(1) # right reward += r - self.assertAlmostEqual(63 * env.move_penalty + 18 * env.bomb_penalty + 71 * env.rock_reward, reward) + self.assertAlmostEqual(63 * env.current_move_penalty + 18 * env.current_bomb_penalty + 71 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb reward += r @@ -616,7 +629,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(2) # down reward += r - self.assertAlmostEqual(65 * env.move_penalty + 19 * env.bomb_penalty + 75 * env.rock_reward, reward) + self.assertAlmostEqual(65 * env.current_move_penalty + 19 * env.current_bomb_penalty + 75 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(4) # bomb reward += r @@ -643,7 +656,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(75 * env.move_penalty + 21 * env.bomb_penalty + 79 * env.rock_reward, reward) + self.assertAlmostEqual(75 * env.current_move_penalty + 21 * env.current_bomb_penalty + 79 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(3) # left reward += r @@ -660,7 +673,7 @@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(0) # up reward += r - self.assertAlmostEqual(81 * env.move_penalty + 22 * env.bomb_penalty + 84 * env.rock_reward, reward) + self.assertAlmostEqual(81 * env.current_move_penalty + 22 * env.current_bomb_penalty + 84 * env.current_rock_reward, reward) _, r, _, _, _ = env.step(3) # left reward += r @@ -679,7 +692,7 
@@ def test_destructable_good_run(self): _, r, _, _, _ = env.step(1) # right reward += r - self.assertAlmostEqual(87 * env.move_penalty + 24 * env.bomb_penalty + 90 * env.rock_reward, reward) + self.assertAlmostEqual(87 * env.current_move_penalty + 24 * env.current_bomb_penalty + 90 * env.current_rock_reward, reward) _, r, terminated, _, _ = env.step(4) # bomb self.assertFalse(terminated) @@ -691,7 +704,7 @@ def test_destructable_good_run(self): reward += r self.assertTrue(terminated) - self.assertAlmostEqual(89 * env.move_penalty + 25 * env.bomb_penalty + 91 * env.rock_reward + env.end_game_reward, reward) + self.assertAlmostEqual(89 * env.current_move_penalty + 25 * env.current_bomb_penalty + 91 * env.current_rock_reward + env.current_end_game_reward, reward) print(reward) @@ -699,32 +712,34 @@ def test_destructable_good_run(self): def test_destructable_dead_near_bomb_agent(self): size = 10 env = bomberworld.BomberworldEnv(size, 100, indestructible_agent=False, dead_near_bomb=True) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) _, r, _, tc, _ = env.step(4) # bomb at (1,1) and stay there at detonation - self.assertAlmostEqual(r, env.bomb_penalty) + self.assertAlmostEqual(r, env.current_bomb_penalty) self.assertEqual(len(env.active_bombs), 1) self.assertFalse(tc) _, r, _, tc, _ = env.step(0) # up - self.assertAlmostEqual(r, env.move_penalty) + self.assertAlmostEqual(r, env.current_move_penalty) self.assertEqual(len(env.active_bombs), 1) self.assertFalse(tc) _, r, _, tc, _ = env.step(2) # down and bomb detonates self.assertEqual(env.agent_pos, (1,1)) - self.assertAlmostEqual(r, env.move_penalty + env.close_bomb_penalty) + self.assertAlmostEqual(r, env.current_move_penalty + env.current_close_bomb_penalty) self.assertEqual(len(env.active_bombs), 0) self.assertTrue(tc) def test_dead_collision_agent(self): size = 10 env = bomberworld.BomberworldEnv(size, 100, dead_when_colliding=True) - env.set_initial_board((0, 0)) + env.reset() + env.set_initial_board(size, (0, 0)) # agent at (0,0) -> can initially move only to 3 bording squares. Others are rocks or wall. 
obs, reward, _, stopped, _ = env.step(0) # up not possible -> agent dead - self.assertAlmostEqual(reward, env.collision_penalty) + self.assertAlmostEqual(reward, env.current_collision_penalty) self.assertAlmostEqual(env.make_observation_2D()[(0, 0)], env.agent_val) self.assertTrue(stopped) @@ -732,20 +747,21 @@ def test_smaller_fields_4x4(self): reward = 0.0 size = 4 env = bomberworld.BomberworldEnv(size, 50) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) _, r, terminated, _, _ = env.step(2) # down reward += r _, r, terminated, _, _ = env.step(4) # bomb reward += r - self.assertAlmostEqual(1 * env.move_penalty + 1 * env.bomb_penalty + 3 * env.rock_reward, reward) + self.assertAlmostEqual(1 * env.current_move_penalty + 1 * env.current_bomb_penalty + 3 * env.current_rock_reward, reward) self.assertFalse(terminated) _, r, terminated, _, _ = env.step(1) # right reward += r _, r, terminated, _, _ = env.step(4) # bomb reward += r - self.assertAlmostEqual(2 * env.move_penalty + 2 * env.bomb_penalty + 6 * env.rock_reward, reward) + self.assertAlmostEqual(2 * env.current_move_penalty + 2 * env.current_bomb_penalty + 6 * env.current_rock_reward, reward) self.assertFalse(terminated) # try to leave area @@ -754,14 +770,14 @@ def test_smaller_fields_4x4(self): _, r, terminated, _, _ = env.step(1) # not possible right reward += r - self.assertAlmostEqual(3 * env.move_penalty + 1 * env.collision_penalty + 2 * env.bomb_penalty + 6 * env.rock_reward, reward) + self.assertAlmostEqual(3 * env.current_move_penalty + 1 * env.current_collision_penalty + 2 * env.current_bomb_penalty + 6 * env.current_rock_reward, reward) self.assertFalse(terminated) _, r, terminated, _, _ = env.step(0) # up reward += r _, r, terminated, _, _ = env.step(4) # bomb reward += r - self.assertAlmostEqual(4 * env.move_penalty + 1 * env.collision_penalty + 3 * env.bomb_penalty + 7 * env.rock_reward + env.end_game_reward, reward) + self.assertAlmostEqual(4 * env.current_move_penalty + 1 * env.current_collision_penalty + 3 * env.current_bomb_penalty + 7 * env.current_rock_reward + env.current_end_game_reward, reward) self.assertTrue(terminated) print(env.make_observation_2D()) @@ -771,7 +787,8 @@ def test_reduced_obs(self): reward = 0.0 size = 4 env = bomberworld.BomberworldEnv(size, 50, reduced_obs=True) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) b = env.make_observation_2D() @@ -813,7 +830,8 @@ def test_good_run_reduced(self): reward = 0.0 size = 10 env = bomberworld.BomberworldEnv(size, 100, reduced_obs=True) - env.set_initial_board((1, 1)) + env.reset() + env.set_initial_board(size, (1, 1)) for i in range(0, 7): _, r, terminated, _, _ = env.step(2) # down @@ -861,7 +879,68 @@ def test_good_run_reduced(self): self.assertTrue(terminated) print(reward) - + def test_multiple_board_sizes(self): + # check that a list of board sizes is accepted and that reset() picks each size and rescales rewards + sizes = [5, 6, 7] + env = bomberworld.BomberworldEnv(sizes, 100, reduced_obs=True) + + fiveOccured = False + sixOccured = False + sevenOccured = False + + rock_rewards = [None, None, None] + move_penalties = [None, None, None] + bomb_penalties = [None, None, None] + end_game_rewards = [None, None, None] + max_steps = [None, None, None] + + for k in range(0, 100): # check that all 3 sizes are used at least once. 
+ env.reset() + the_size = env.board_size + + self.assertTrue(the_size in sizes) + + if the_size == 5: + fiveOccured = True + rock_rewards[0] = env.current_rock_reward + move_penalties[0] = env.current_move_penalty + bomb_penalties[0] = env.current_bomb_penalty + end_game_rewards[0] = env.current_end_game_reward + max_steps[0] = env.current_max_steps + + if the_size == 6: + sixOccured = True + rock_rewards[1] = env.current_rock_reward + move_penalties[1] = env.current_move_penalty + bomb_penalties[1] = env.current_bomb_penalty + end_game_rewards[1] = env.current_end_game_reward + max_steps[1] = env.current_max_steps + + if the_size == 7: + sevenOccured = True + rock_rewards[2] = env.current_rock_reward + move_penalties[2] = env.current_move_penalty + bomb_penalties[2] = env.current_bomb_penalty + end_game_rewards[2] = env.current_end_game_reward + max_steps[2] = env.current_max_steps + + self.assertTrue(fiveOccured) + self.assertTrue(sixOccured) + self.assertTrue(sevenOccured) + + # smaller board size rewards must be bigger + for li in [rock_rewards, move_penalties, bomb_penalties]: + val_before = 10000000000 + for k in li: + self.assertGreater(abs(val_before), abs(k)) + val_before = k + + self.assertAlmostEqual(end_game_rewards[0], end_game_rewards[1]) + self.assertAlmostEqual(end_game_rewards[0], end_game_rewards[2]) + + # increasing max steps + self.assertLess(max_steps[0], max_steps[1]) + self.assertLess(max_steps[1], max_steps[2]) if __name__ == '__main__': unittest.main()
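Note on the reward-scaling change in BomberworldEnv.reset() above: when `size` is a list, a board size is drawn at random, each per-step reward is multiplied by main_size**2 / board_size**2 (the first list entry being the reference size), and max_steps is divided by that ratio, so the total achievable episode reward stays comparable across board sizes. A minimal standalone sketch of that scheme follows; the helper name `scale_rewards` and its default values are illustrative only, not part of the repository API.

```python
# Standalone sketch of the board-size reward scaling done in
# BomberworldEnv.reset() when `size` is a list. Function name and
# defaults are illustrative, not the repository API.
from random import randrange

def scale_rewards(sizes, move_penalty=-0.2, rock_reward=1.0, max_steps=40):
    board_size = sizes[randrange(len(sizes))]   # pick a board size at random
    main_size = sizes[0]                        # first entry is the reference size
    ratio = (main_size ** 2) / (board_size ** 2)
    return {
        "board_size": board_size,
        # per-step rewards shrink on larger boards so the achievable
        # episode total stays close to that of the reference board
        "move_penalty": move_penalty * ratio,
        "rock_reward": rock_reward * ratio,
        # more cells to clear -> proportionally more steps allowed
        "max_steps": max_steps / ratio,
    }

# Example: for sizes=[6, 8], an 8x8 board scales rewards by 36/64 and
# allows 64/36 times as many steps as the 6x6 reference board.
print(scale_rewards([6, 8]))
```

The end-game reward is deliberately left unscaled in the diff, since completing the board is worth the same regardless of its size.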