Changed multidiscrete to multibinary in normalization wrapper and add… #25

Merged (2 commits) on Aug 13, 2022
54 changes: 27 additions & 27 deletions diambra/arena/arena_gym.py
@@ -105,28 +105,28 @@ def env_info_process(self, env_info):
# Action dict
move_dict = {}
for idx in range(current_idx,
current_idx + 2*self.n_actions_but_comb[0], 2):
move_dict[int(env_info[idx])] = env_info[idx+1]
current_idx + 2 * self.n_actions_but_comb[0], 2):
move_dict[int(env_info[idx])] = env_info[idx + 1]

current_idx += 2*self.n_actions_but_comb[0]
current_idx += 2 * self.n_actions_but_comb[0]

attack_dict = {}
for idx in range(current_idx,
current_idx + 2*self.n_actions_but_comb[1], 2):
attack_dict[int(env_info[idx])] = env_info[idx+1]
current_idx + 2 * self.n_actions_but_comb[1], 2):
attack_dict[int(env_info[idx])] = env_info[idx + 1]

self.print_actions_dict = [move_dict, attack_dict]

current_idx += 2*self.n_actions_but_comb[1]
current_idx += 2 * self.n_actions_but_comb[1]

# Additional Obs map
number_of_add_obs = int(env_info[current_idx])
current_idx += 1
self.add_obs = {}
for idx in range(number_of_add_obs):
self.add_obs[env_info[current_idx]] = [int(env_info[current_idx+1]),
int(env_info[current_idx+2]),
int(env_info[current_idx+3])]
self.add_obs[env_info[current_idx]] = [int(env_info[current_idx + 1]),
int(env_info[current_idx + 2]),
int(env_info[current_idx + 3])]
current_idx += 4

# Return env action list
@@ -145,8 +145,8 @@ def print_actions(self):

# Return min max rewards for the environment
def get_min_max_reward(self):
return [self.minmax_reward[0]/(self.reward_normalization_value),
self.minmax_reward[1]/(self.reward_normalization_value)]
return [self.minmax_reward[0] / (self.reward_normalization_value),
self.minmax_reward[1] / (self.reward_normalization_value)]

# Step method to be implemented in derived classes
def step(self, action):
@@ -259,14 +259,14 @@ def __init__(self, env_settings):
action_space_dict = {}
for idx in range(2):
if env_settings["action_space"][idx] == "multi_discrete":
action_space_dict["P{}".format(idx+1)] =\
action_space_dict["P{}".format(idx + 1)] =\
spaces.MultiDiscrete(self.n_actions[idx])
print("Using MultiDiscrete action space for P{}".format(idx+1))
print("Using MultiDiscrete action space for P{}".format(idx + 1))
elif env_settings["action_space"][idx] == "discrete":
action_space_dict["P{}".format(idx+1)] =\
action_space_dict["P{}".format(idx + 1)] =\
spaces.Discrete(
self.n_actions[idx][0] + self.n_actions[idx][1] - 1)
print("Using Discrete action space for P{}".format(idx+1))
print("Using Discrete action space for P{}".format(idx + 1))
else:
raise Exception(
"Not recognized action space: {}".format(env_settings["action_space"][idx]))
@@ -353,13 +353,13 @@ def __init__(self, env_settings):
continue

if k[-2:] == "P1":
knew = "own"+k[:-2]
knew = "own" + k[:-2]
else:
knew = "opp"+k[:-2]
knew = "opp" + k[:-2]

# Discrete spaces (binary / categorical)
if v[0] == 0 or v[0] == 2:
player_spec_dict[knew] = spaces.Discrete(v[2]+1)
player_spec_dict[knew] = spaces.Discrete(v[2] + 1)
elif v[0] == 1: # Box spaces
player_spec_dict[knew] = spaces.Box(low=v[1], high=v[2],
shape=(), dtype=np.int32)
@@ -396,9 +396,9 @@ def add_obs_integration(self, frame, data):
continue

if k[-2:] == self.player_side:
knew = "own"+k[:-2]
knew = "own" + k[:-2]
else:
knew = "opp"+k[:-2]
knew = "opp" + k[:-2]

player_spec_dict[knew] = data[k]

@@ -451,12 +451,12 @@ def __init__(self, env_settings):
continue

if k[-2:] == "P1":
knew = "own"+k[:-2]
knew = "own" + k[:-2]
else:
knew = "opp"+k[:-2]
knew = "opp" + k[:-2]

if v[0] == 0 or v[0] == 2: # Discrete spaces
player_spec_dict[knew] = spaces.Discrete(v[2]+1)
player_spec_dict[knew] = spaces.Discrete(v[2] + 1)
elif v[0] == 1: # Box spaces
player_spec_dict[knew] = spaces.Box(low=v[1], high=v[2],
shape=(), dtype=np.int32)
@@ -500,15 +500,15 @@ def add_obs_integration(self, frame, data):
continue

if k[-2:] == elem:
knew = "own"+k[:-2]
knew = "own" + k[:-2]
else:
knew = "opp"+k[:-2]
knew = "opp" + k[:-2]

player_spec_dict[knew] = data[k]

actions_dict = {
"move": data["moveActionP{}".format(ielem+1)],
"attack": data["attackActionP{}".format(ielem+1)],
"move": data["moveActionP{}".format(ielem + 1)],
"attack": data["attackActionP{}".format(ielem + 1)],
}

player_spec_dict["actions"] = actions_dict
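
As a side note on the action-space hunk above (@@ -259,14 +259,14): the Discrete layout is sized n_moves + n_attacks - 1, presumably because both sub-spaces include a no-op and the combined space only needs to count it once. A minimal sketch with illustrative sizes, not taken from this diff:

from gym import spaces

n_actions = [9, 8]  # [move actions, attack actions], illustrative values only
multi_discrete = spaces.MultiDiscrete(n_actions)             # one move and one attack per step
discrete = spaces.Discrete(n_actions[0] + n_actions[1] - 1)  # 16 entries, shared no-op counted once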
14 changes: 7 additions & 7 deletions diambra/arena/utils/gym_utils.py
@@ -35,18 +35,18 @@ def nested_dict_obs_space(space, k_list=[], level=0):
if isinstance(v, gym.spaces.dict.Dict):
k_list = k_list[0:level]
k_list.append(k)
nested_dict_obs_space(v, k_list, level=level+1)
nested_dict_obs_space(v, k_list, level=level + 1)
else:
k_list = k_list[0:level]
out_string = "observation_space"
indentation = " "*level
indentation = " " * level
for idk in k_list:
out_string += "[\"{}\"]".format(idk)
out_string += "[\"{}\"]".format(k)
out_string = indentation+out_string+":"
out_string = indentation + out_string + ":"
print(out_string, v)
if isinstance(v, gym.spaces.MultiDiscrete):
print(indentation+"Space size:", v.nvec.shape)
print(indentation + "Space size:", v.nvec.shape)
elif isinstance(v, gym.spaces.Discrete):
pass
elif isinstance(v, gym.spaces.Box):
@@ -161,10 +161,10 @@ def show_gym_obs(observation, char_list, wait_key=1, viz=True):
observation["frame"].shape)

if viz:
obs = np.array(observation["frame"]).astype(np.float32)/255
obs = np.array(observation["frame"]).astype(np.float32) / 255
else:
if viz:
obs = np.array(observation).astype(np.float32)/255
obs = np.array(observation).astype(np.float32) / 255

if viz:
cv2.imshow("Frame", obs[:, :, ::-1]) # rgb2bgr
@@ -205,7 +205,7 @@ def show_wrap_obs(observation, n_actions_stack, char_list, wait_key=1, viz=True)

if viz:
for idx in range(obs.shape[2]):
cv2.imshow("Frame-"+str(idx), obs[:, :, idx])
cv2.imshow("Frame-" + str(idx), obs[:, :, idx])

cv2.waitKey(wait_key)

12 changes: 8 additions & 4 deletions diambra/arena/wrappers/arena_wrappers.py
@@ -92,22 +92,23 @@ def __init__(self, env, reward_normalization_factor):
:param reward_normalization_factor: multiplication factor
"""
gym.RewardWrapper.__init__(self, env)
self.env.reward_normalization_value = reward_normalization_factor*self.env.max_delta_health
self.env.reward_normalization_value = reward_normalization_factor * self.env.max_delta_health

def reward(self, reward):
"""
Normalize the reward by dividing by reward_normalization_factor * max_delta_health
:param reward: (float)
"""
return float(reward)/float(self.env.reward_normalization_value)
return float(reward) / float(self.env.reward_normalization_value)

# Environment Wrapping (rewards normalization, resizing, grayscaling, etc)


def env_wrapping(env, player, no_op_max=0, sticky_actions=1, clip_rewards=False,
reward_normalization=False, reward_normalization_factor=0.5,
frame_stack=1, actions_stack=1, scale=False, scale_mod=0,
hwc_obs_resize=[84, 84, 0], dilation=1, hardcore=False):
hwc_obs_resize=[84, 84, 0], dilation=1, flatten_dict=False,
hardcore=False):
"""
Typical standard environment wrappers
:param env: (Gym Environment) the diambra environment
@@ -145,7 +146,7 @@ def env_wrapping(env, player, no_op_max=0, sticky_actions=1, clip_rewards=False,
else:
from diambra.arena.wrappers.obs_wrapper import WarpFrame, \
WarpFrame3C, FrameStack, FrameStackDilated,\
ActionsStack, ScaledFloatObsNeg, ScaledFloatObs
ActionsStack, ScaledFloatObsNeg, ScaledFloatObs, FlattenDictObs

if hwc_obs_resize[2] == 1:
# Resizing observation from H x W x 3 to
@@ -192,4 +193,7 @@ def env_wrapping(env, player, no_op_max=0, sticky_actions=1, clip_rewards=False,
else:
raise ValueError("Scale mod must be either 0 or -1")

if flatten_dict:
env = FlattenDictObs(env)

return env
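
A minimal usage sketch of the new flatten_dict option, assuming env is an already-created DIAMBRA arena environment and leaving the remaining keyword defaults as declared above (the argument values are illustrative):

from diambra.arena.wrappers.arena_wrappers import env_wrapping

env = env_wrapping(env, player="P1",
                   reward_normalization=True,
                   frame_stack=4, actions_stack=12,
                   scale=True,
                   flatten_dict=True)  # new option added by this PR

# With flatten_dict=True, nested entries such as obs["P1"]["actions"]["move"]
# are exposed as top-level keys like obs["P1_actions_move"].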
73 changes: 50 additions & 23 deletions diambra/arena/wrappers/obs_wrapper.py
@@ -3,6 +3,7 @@
from copy import deepcopy
import numpy as np
from collections import deque
from collections.abc import Mapping
import cv2 # pytype:disable=import-error
cv2.ocl.setUseOpenCL(False)

@@ -33,7 +34,7 @@ def scaled_float_obs_func(observation, observation_space):
buf_len = observation_space.spaces[k].nvec.shape[0]
actions_vector = np.zeros((buf_len * n_act), dtype=int)
for iact in range(buf_len):
actions_vector[iact*n_act + observation[k][iact]] = 1
actions_vector[iact * n_act + observation[k][iact]] = 1
observation[k] = actions_vector
elif isinstance(v_space, spaces.Discrete) and (v_space.n > 2):
var_vector = np.zeros(
@@ -136,8 +137,7 @@ def step(self, action):

# Add last obs n_frames - 1 times in case of
# new round / stage / continueGame
if ((info["round_done"] or info["stage_done"] or info["game_done"])
and not done):
if ((info["round_done"] or info["stage_done"] or info["game_done"]) and not done):
for _ in range(self.n_frames - 1):
self.frames.append(obs["frame"])

@@ -165,7 +165,7 @@ def __init__(self, env, n_frames, dilation):
self.dilation = dilation
# Keeping all n_frames*dilation in memory,
# then extract the subset given by the dilation factor
self.frames = deque([], maxlen=n_frames*dilation)
self.frames = deque([], maxlen=n_frames * dilation)
shp = self.observation_space["frame"].shape
self.observation_space.spaces["frame"] = spaces.Box(low=0, high=255,
shape=(
@@ -174,7 +174,7 @@

def reset(self, **kwargs):
obs = self.env.reset(**kwargs)
for _ in range(self.n_frames*self.dilation):
for _ in range(self.n_frames * self.dilation):
self.frames.append(obs["frame"])
obs["frame"] = self.get_ob()
return obs
@@ -185,16 +185,15 @@ def step(self, action):

# Add last obs n_frames - 1 times in case of
# new round / stage / continueGame
if ((info["round_done"] or info["stage_done"] or info["game_done"])
and not done):
for _ in range(self.n_frames*self.dilation - 1):
if ((info["round_done"] or info["stage_done"] or info["game_done"]) and not done):
for _ in range(self.n_frames * self.dilation - 1):
self.frames.append(obs["frame"])

obs["frame"] = self.get_ob()
return obs, reward, done, info

def get_ob(self):
frames_subset = list(self.frames)[self.dilation-1::self.dilation]
frames_subset = list(self.frames)[self.dilation - 1::self.dilation]
assert len(frames_subset) == self.n_frames
return LazyFrames(list(frames_subset))

@@ -215,10 +214,10 @@ def __init__(self, env, n_actions_stack, n_players=1):
deque([0 for i in range(n_actions_stack)], maxlen=n_actions_stack))
self.attack_action_stack.append(
deque([0 for i in range(n_actions_stack)], maxlen=n_actions_stack))
self.observation_space.spaces["P{}".format(iplayer+1)].spaces["actions"].spaces["move"] =\
spaces.MultiDiscrete([self.n_actions[iplayer][0]]*n_actions_stack)
self.observation_space.spaces["P{}".format(iplayer+1)].spaces["actions"].spaces["attack"] =\
spaces.MultiDiscrete([self.n_actions[iplayer][1]]*n_actions_stack)
self.observation_space.spaces["P{}".format(iplayer + 1)].spaces["actions"].spaces["move"] =\
spaces.MultiDiscrete([self.n_actions[iplayer][0]] * n_actions_stack)
self.observation_space.spaces["P{}".format(iplayer + 1)].spaces["actions"].spaces["attack"] =\
spaces.MultiDiscrete([self.n_actions[iplayer][1]] * n_actions_stack)

def fill_stack(self, value=0):
# Fill the actions stack with no action after reset
@@ -233,30 +232,29 @@ def reset(self, **kwargs):

for iplayer in range(self.n_players):
obs["P{}".format(
iplayer+1)]["actions"]["move"] = self.move_action_stack[iplayer]
iplayer + 1)]["actions"]["move"] = self.move_action_stack[iplayer]
obs["P{}".format(
iplayer+1)]["actions"]["attack"] = self.attack_action_stack[iplayer]
iplayer + 1)]["actions"]["attack"] = self.attack_action_stack[iplayer]
return obs

def step(self, action):
obs, reward, done, info = self.env.step(action)
for iplayer in range(self.n_players):
self.move_action_stack[iplayer].append(
obs["P{}".format(iplayer+1)]["actions"]["move"])
obs["P{}".format(iplayer + 1)]["actions"]["move"])
self.attack_action_stack[iplayer].append(
obs["P{}".format(iplayer+1)]["actions"]["attack"])
obs["P{}".format(iplayer + 1)]["actions"]["attack"])

# Add noAction for n_actions_stack - 1 times
# in case of new round / stage / continueGame
if ((info["round_done"] or info["stage_done"] or info["game_done"])
and not done):
if ((info["round_done"] or info["stage_done"] or info["game_done"]) and not done):
self.fill_stack()

for iplayer in range(self.n_players):
obs["P{}".format(
iplayer+1)]["actions"]["move"] = self.move_action_stack[iplayer]
iplayer + 1)]["actions"]["move"] = self.move_action_stack[iplayer]
obs["P{}".format(
iplayer+1)]["actions"]["attack"] = self.attack_action_stack[iplayer]
iplayer + 1)]["actions"]["attack"] = self.attack_action_stack[iplayer]
return obs, reward, done, info
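
The action history used above is kept in one fixed-length deque per player and per action type; a tiny sketch of the mechanism, with an illustrative stack depth not taken from this diff:

from collections import deque

n_actions_stack = 12  # illustrative depth
move_stack = deque([0] * n_actions_stack, maxlen=n_actions_stack)
move_stack.append(4)  # newest move action evicts the oldest entry
# the wrapper exposes this history as obs["P1"]["actions"]["move"], declared as
# spaces.MultiDiscrete([n_move_actions] * n_actions_stack)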


@@ -288,10 +286,10 @@ def scaled_float_obs_space_func(obs_dict):
# One hot encoding x nStack
n_val = v.nvec.shape[0]
max_val = v.nvec[0]
obs_dict.spaces[k] = spaces.MultiDiscrete([2]*(n_val*max_val))
obs_dict.spaces[k] = spaces.MultiBinary(n_val * max_val)
elif isinstance(v, spaces.Discrete) and (v.n > 2):
# One hot encoding
obs_dict.spaces[k] = spaces.MultiDiscrete([2]*(v.n))
obs_dict.spaces[k] = spaces.MultiBinary(v.n)
elif isinstance(v, spaces.Box):
obs_dict.spaces[k] = spaces.Box(
low=0, high=1.0, shape=v.shape, dtype=np.float32)
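
The substantive change in this hunk: one-hot encoded entries are now declared as MultiBinary(n) instead of MultiDiscrete([2] * n). Both describe a length-n vector of 0/1 values; MultiBinary simply says so directly. A small check with an illustrative size, assuming a gym version whose contains() accepts integer arrays:

import numpy as np
from gym import spaces

n = 9                                      # illustrative one-hot length
old_space = spaces.MultiDiscrete([2] * n)  # declaration before this PR
new_space = spaces.MultiBinary(n)          # declaration after this PR

one_hot = np.zeros(n, dtype=np.int8)
one_hot[3] = 1
assert old_space.contains(one_hot) and new_space.contains(one_hot)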
@@ -341,3 +339,32 @@ def __len__(self):

def __getitem__(self, i):
return self.force()[i]


_FLAG_FIRST = object()

def flatten_obs_func(input_dictionary):
flattened_dict = {}

def visit(subdict, results, partial_key):
for k, v in subdict.items():
newKey = k if partial_key == _FLAG_FIRST else partial_key + "_" + k
if isinstance(v, Mapping):
visit(v, flattened_dict, newKey)
else:
flattened_dict[newKey] = v

visit(input_dictionary, flattened_dict, _FLAG_FIRST)

return flattened_dict


class FlattenDictObs(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)

self.observation_space = spaces.Dict(flatten_obs_func(self.observation_space))

def observation(self, observation):

return flatten_obs_func(observation)
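
To illustrate how flatten_obs_func (and therefore FlattenDictObs) renames keys, a short sketch using made-up keys in the style of the wrapped observation dict:

nested = {
    "frame": "frame-array",
    "P1": {
        "actions": {"move": [0, 0], "attack": [0, 0]},
        "ownHealth": 187,
    },
}

flat = flatten_obs_func(nested)
# {'frame': 'frame-array',
#  'P1_actions_move': [0, 0],
#  'P1_actions_attack': [0, 0],
#  'P1_ownHealth': 187}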