In [1]:
import os
import pickle
import random
import warnings
from abc import ABC

import gym
import numpy as np
from ai_economist import foundation
from gym import spaces
from gym.utils import seeding
from ray.rllib.env.multi_agent_env import MultiAgentEnv

_BIG_NUMBER = 1e20

Inside covid19_components.py: 0 GPUs are available.
No GPUs found! Running the simulation on a CPU.
Inside covid19_env.py: 0 GPUs are available.
No GPUs found! Running the simulation on a CPU.


In [2]:
class CustomEnvWrapper(gym.Env):
    def __init__(self,env_config_dict,verbose=False):
        self.env = BaseEnvironment(env_config_dict)
        self.observation_space = spaces.Box(low=0,high=255,shape=(3,3),dtype=np.unit8)
        self.action_space = spaces.Discrete(4)

        self.verbose =verbose

      
    def reset(self):
        obs = self.reset()
        obs_dict = {
             'flat': obs['flat'],
            'world-idx_map': obs['world-idx_map'],
            'world-map': obs['world-map'],
            'time': np.array([0], dtype=np.float32),
            'action_mask': np.ones(50, dtype=np.float32)

        }
        return obs_dict
    def step(self,action):
        obs, reward, done, info = self.env.step(action)
        obs_dict = {
            'flat': obs['flat'],
            'world-idx_map': obs['world-idx_map'],
            'world-map': obs['world-map'],
            'time': np.array([info['time_step']], dtype=np.float32),
            'action_mask': np.array(info['action_mask'], dtype=np.float32)
        }
        return obs_dict, reward, done, info


In [3]:
env_config_dict = {
    # ===== SCENARIO CLASS =====
    # Which Scenario class to use: the class's name in the Scenario Registry (foundation.scenarios).
    # The environment object will be an instance of the Scenario class.
    'scenario_name': 'layout_from_file/simple_wood_and_stone',

    # ===== COMPONENTS =====
    # Which components to use (specified as list of ("component_name", {component_kwargs}) tuples).
    #   "component_name" refers to the Component class's name in the Component Registry (foundation.components)
    #   {component_kwargs} is a dictionary of kwargs passed to the Component class
    # The order in which components reset, step, and generate obs follows their listed order below.
    'components': [
        # (1) Building houses
        ('Build', {
            'skill_dist': 'pareto',
            'payment_max_skill_multiplier': 3,
            'build_labor': 10,
            'payment': 10
        }),
        # (2) Trading collectible resources
        ('ContinuousDoubleAuction', {
            'max_bid_ask': 10,
            'order_labor': 0.25,
            'max_num_orders': 5,
            'order_duration': 50
        }),
        # (3) Movement and resource collection
        ('Gather', {
            'move_labor': 1,
            'collect_labor': 1,
            'skill_dist': 'pareto'
        }),
        # (4) Planner
        ('PeriodicBracketTax', {
            'period': 100,
            'bracket_spacing': 'us-federal',
            'usd_scaling': 1000,
            'disable_taxes': False
        })
    ],

    # ===== SCENARIO CLASS ARGUMENTS =====
    # (optional) kwargs that are added by the Scenario class (i.e. not defined in BaseEnvironment)
    'env_layout_file': 'quadrant_25x25_20each_30clump.txt',
    'starting_agent_coin': 10,
    'fixed_four_skill_and_loc': True,

    # ===== STANDARD ARGUMENTS ======
    # kwargs that are used by every Scenario class (i.e. defined in BaseEnvironment)
    'n_agents': 4,  # Number of non-planner agents (must be > 1)
    'world_size': [25, 25],  # [Height, Width] of the env world
    'episode_length': 1000,  # Number of timesteps per episode

    # In multi-action-mode, the policy selects an action for each action subspace (defined in component code).
    # Otherwise, the policy selects only 1 action.
    'multi_action_mode_agents': False,
    'multi_action_mode_planner': True,

    # When flattening observations, concatenate scalar & vector observations before output.
    # Otherwise, return observations with minimal processing.
    'flatten_observations': True,
    # When Flattening masks, concatenate each action subspace mask into a single array.
    # Note: flatten_masks = True is required for masking action logits in the code below.
    'flatten_masks': True,

    # How often to save the dense logs
    'dense_log_frequency': 1
}

In [4]:
env_obj = CustomEnvWrapper({"env_config_dict": env_config_dict}, verbose=True)
print(env_obj)

TypeError: __init__() got an unexpected keyword argument 'verbose'

In [None]:
policies = {
    "a": (
        None,  # uses default policy
        env_obj.observation_space,
        env_obj.action_space,
        {}  # define a custom agent policy configuration.
    ),
    "p": (
        None,  # uses default policy
        env_obj.observation_space_pl,
        env_obj.action_space_pl,
        {}  # define a custom planner policy configuration.
    )
}

# In foundation, all the agents have integer ids and the social planner has an id of "p"
policy_mapping_fun = lambda i: "a" if str(i).isdigit() else "p"

policies_to_train = ["a", "p"]


In [None]:
trainer_config = {
    "multiagent": {
        "policies": policies,
        "policies_to_train": policies_to_train,
        "policy_mapping_fn": policy_mapping_fun,
    }
}

In [None]:
trainer_config.update(
    {
        "num_workers": 2,
        "num_envs_per_worker": 2,
        # Other training parameters
        "train_batch_size":  4000,
        "sgd_minibatch_size": 4000,
        "num_sgd_iter": 1
    }
)


In [None]:
# We also add the "num_envs_per_worker" parameter for the env. wrapper to index the environments.
env_config = {
    "env_config_dict": env_config_dict,
    "num_envs_per_worker": trainer_config.get('num_envs_per_worker'),
}

trainer_config.update(
    {
        "env_config": env_config
    }
)

In [None]:
import ray

In [None]:
ray.init(webui_host="127.0.0.1")

In [None]:
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
trainer = PPOTrainer(
    env = RLlibEnvWrapper,
    config = trainer_config

)

In [None]:


NUM_ITERS = 5
for iteration in range(NUM_ITERS):
    print(f'********** Iter : {iteration} **********')
    result = trainer.train()
    print(f'''episode_reward_mean: {result.get('episode_reward_mean')}''')



In [None]:

dense_logs = {}
# Note: worker 0 is reserved for the trainer actor
for worker in range((trainer_config["num_workers"] > 0), trainer_config["num_workers"] + 1):
    for env_id in range(trainer_config["num_envs_per_worker"]):
        dense_logs["worker={};env_id={}".format(worker, env_id)] = \
        trainer.workers.foreach_worker(lambda w: w.async_env)[worker].envs[env_id].env.previous_episode_dense_log


In [None]:
print(dense_logs.keys())


In [None]:
def generate_rollout_from_current_trainer_policy(
    trainer, 
    env_obj,
    num_dense_logs=1
):
    dense_logs = {}
    for idx in range(num_dense_logs):
        # Set initial states
        agent_states = {}
        for agent_idx in range(env_obj.env.n_agents):
            agent_states[str(agent_idx)] = trainer.get_policy("a").get_initial_state()
        planner_states = trainer.get_policy("p").get_initial_state()   

        # Play out the episode
        obs = env_obj.reset(force_dense_logging=True)
        for t in range(env_obj.env.episode_length):
            actions = {}
            for agent_idx in range(env_obj.env.n_agents):
                # Use the trainer object directly to sample actions for each agent
                actions[str(agent_idx)] = trainer.compute_action(
                    obs[str(agent_idx)], 
                    agent_states[str(agent_idx)], 
                    policy_id="a",
                    full_fetch=False
                )

            # Action sampling for the planner
            actions["p"] = trainer.compute_action(
                obs['p'], 
                planner_states, 
                policy_id='p',
                full_fetch=False
            )

            obs, rew, done, info = env_obj.step(actions)        
            if done['__all__']:
                break
        dense_logs[idx] = env_obj.env.dense_log
    return dense_logs

In [None]:


dense_logs = generate_rollout_from_current_trainer_policy(
    trainer, 
    env_obj,
    num_dense_logs=2
)



In [None]:
import plotting  # plotting utilities for visualizing env. state

dense_log_idx = 0
plotting.breakdown(dense_logs[dense_log_idx]);

