# Pommerman V4 Training in a Smaller Central Region

This notebook demonstrates how to train Pommerman agents. Please let us know at support@pommerman.com if you run into any issues.

In [1]:
import os
import sys
import numpy as np
import time

import pommerman
from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v4_fast_env
from pommerman.envs.v4 import Pomme as Pomme_v4
from pommerman.characters import Bomber
from pommerman import utility
from pommerman import agents
from pommerman import envs
from pommerman import constants
from pommerman import characters

# Print the IDs of every environment configuration registered with pommerman
print(pommerman.REGISTRY)

['PommeFFACompetition-v0', 'PommeFFACompetitionFast-v0', 'PommeFFAFast-v0', 'PommeFFA-v1', 'PommeFFAFast-v3', 'PommeFFAFast-v4', 'OneVsOne-v0', 'PommeRadioCompetition-v2', 'PommeRadio-v2', 'PommeTeamCompetition-v0', 'PommeTeamCompetitionFast-v0', 'PommeTeamCompetition-v1', 'PommeTeam-v0', 'PommeTeamFast-v0', 'PommeTeamSimple-v0']


# Train with Stable Baselines

In [2]:
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Subclass the Pommerman env to make it compatible with Stable Baselines

In [3]:
class CustomPomme(Pomme_v4):
    """Pommerman v4 environment wrapped for single-agent training.

    ``step`` accepts only the action of the agent being trained; the actions
    of the other agents are obtained from ``self.act``. ``reset`` and ``step``
    return the trained agent's observation flattened into a 1-D list (see
    ``_transform_obs``), while the un-flattened observations stay available
    through ``get_obs_raw``.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the parent environment."""
        super().__init__(**kwargs)
        self.obs_raw = None  # latest un-flattened observations, indexed by agent
        self.training_idx = 1  # index of the agent being trained

    def _transform_obs(self, obs_raw):
        """Flatten the trained agent's observation into a 1-D list.

        Args:
            obs_raw: observations for all agents; only the entry at
                ``self.training_idx`` is used.

        Returns:
            list: the flattened ``board``, ``bomb_blast_strength``,
            ``bomb_life`` and ``position`` arrays, followed by ``ammo``,
            ``blast_strength``, ``can_kick``, the teammate value and three
            enemy values.
        """
        # Observation of the agent selected by training_idx (index 1 by default).
        obs_training = obs_raw[self.training_idx]

        # construct flattened observation
        obs = [
            *np.array(obs_training["board"]).reshape(-1),
            *np.array(obs_training["bomb_blast_strength"]).reshape(-1),
            *np.array(obs_training["bomb_life"]).reshape(-1),
            *np.array(obs_training["position"]).reshape(-1),
            obs_training["ammo"],
            obs_training["blast_strength"],
            obs_training["can_kick"],
            obs_training["teammate"].value,
            obs_training["enemies"][0].value,

            # uncomment if training 1 v 1: the single enemy is repeated,
            # presumably so the vector keeps the same length as the
            # three-enemy layout below -- TODO confirm
            obs_training["enemies"][0].value,
            obs_training["enemies"][0].value,

            # uncomment if training 2 v 2 (and comment out the two lines above)
            # obs_training["enemies"][1].value,
            # obs_training["enemies"][2].value,
        ]
        return obs

    def get_obs_raw(self):
        """Return the un-flattened observations from the last reset/step."""
        return self.obs_raw

    def step(self, action_training):
        """Advance the game by one step using the trained agent's action.

        Args:
            action_training: action chosen for the agent being trained.

        Returns:
            tuple: ``(flattened observation, reward of the trained agent,
            done, info)``.
        """
        # Actions of the agents that are not being trained.
        # NOTE(review): assumes self.act returns them in agent order with the
        # trained agent left out -- confirm against the pommerman env.
        actions = list(self.act(self.obs_raw))
        # Put the trained agent's action at its own index rather than always
        # at the end, so the joint action stays aligned with the agent list
        # even when the trained agent is not the last one.
        actions.insert(self.training_idx, action_training)
        obs_raw, reward, done, info = super().step(actions)
        self.obs_raw = obs_raw
        return self._transform_obs(obs_raw), reward[self.training_idx], done, info

    def reset(self):
        """Reset the game and return the trained agent's flattened observation."""
        obs_raw = super().reset()
        self.obs_raw = obs_raw
        return self._transform_obs(obs_raw)

    def render(self,
               mode=None,
               close=False,
               record_pngs_dir=None,
               record_json_dir=None,
               do_sleep=True):
        """Render through the parent environment and return its result.

        All arguments are forwarded unchanged. The parent's return value is
        propagated so that any value it produces is not silently dropped.
        """
        return super().render(mode=mode,
                              close=close,
                              record_pngs_dir=record_pngs_dir,
                              record_json_dir=record_json_dir,
                              do_sleep=do_sleep)

In [4]:
# def team_v3_fast_env():
#     """Start up a FFA config with the default settings."""
#     env = CustomPomme
#     game_type = constants.GameType.Team
#     env_entry_point = 'CustomPomme'
#     env_id = 'PommeTeamFast-v3'
#     env_kwargs = {
#         'game_type': game_type,
#         'board_size': 8,
#         'num_rigid': 0,
#         'num_wood': 0,
#         'num_items': 0,
#         'max_steps': constants.MAX_STEPS,
#         'render_fps': 1000,
#         'env': env_entry_point,
#     }
#     agent = characters.Bomber
#     return locals()

# def one_vs_one_v3_env():
#     """Start up a FFA config with the default settings."""
#     env = CustomPomme
#     game_type = constants.GameType.OneVsOne
#     env_entry_point = 'CustomPomme'
#     env_id = 'PommeOneVsOneFast-v3'
#     env_kwargs = {
#         'game_type': game_type,
#         'board_size': 8,
#         'num_rigid': 0,
#         'num_wood': 0,
#         'num_items': 0,
#         'max_steps': constants.MAX_STEPS,
#         'render_fps': 1000,
#         'env': env_entry_point,
#     }
#     agent = characters.Bomber
#     return locals()

def one_vs_one_v4_env():
    """Build the configuration for a fast one-vs-one game on the v4 environment.

    Returns:
        dict: the entries ``env``, ``game_type``, ``env_entry_point``,
        ``env_id``, ``env_kwargs`` and ``agent``. ``env_kwargs`` holds the
        keyword arguments used to construct the environment.
    """
    entry_point = 'CustomPomme'
    one_vs_one = constants.GameType.OneVsOne

    # Keyword arguments for the environment constructor: an 11x11 board with
    # no rigid walls, wood or items.
    # NOTE(review): 'free_board_size' presumably limits play to a smaller
    # central region of the board -- confirm against the v4 env.
    kwargs = dict(
        game_type=one_vs_one,
        board_size=11,
        free_board_size=4,
        num_rigid=0,
        num_wood=0,
        num_items=0,
        max_steps=constants.MAX_STEPS,
        render_fps=1000,
        env=entry_point,
    )

    return {
        'env': CustomPomme,
        'game_type': one_vs_one,
        'env_entry_point': entry_point,
        'env_id': 'PommeOneVsOneFast-v4',
        'env_kwargs': kwargs,
        'agent': characters.Bomber,
    }

In [5]:
# Instantiate the environment from the one-vs-one v4 configuration

config = one_vs_one_v4_env()
env_pom = CustomPomme(**config["env_kwargs"])

# config agents
# NOTE(review): this rebinds the name `agents`, shadowing the
# `pommerman.agents` module imported in the first cell.
agents = []

# Add simple agents (range(1) creates a single SimpleAgent, built with id 0)
for agent_id in range(1):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
    
# add player agent(to train); it is built with id 1
agents.append(PlayerAgent(config["agent"](1, config["game_type"])))

env_pom.set_agents(agents)
# Mark the second entry of the list (the PlayerAgent) as the agent being trained
env_pom.set_training_agent(agents[1].agent_id)
env_pom.set_init_game_state(None)

# Seed the environment (no reset is performed in this cell)
env_pom.seed(0)

[0]

In [6]:
# log function during training, implement if needed
def log(local_var, global_var):
    """No-op logging hook passed as ``callback`` to ``model.learn``.

    Args:
        local_var: first callback argument (presumably the trainer's local
            variables -- TODO confirm).
        global_var: second callback argument (presumably the trainer's
            global variables -- TODO confirm).

    Returns:
        None: nothing is logged and nothing is returned.
    """
    # To inspect the values during training, uncomment:
    #     display(local_var)
    #     display(global_var)
    return None

In [14]:
import time
n_cpu = 2


def make_env(rank, seed=0):
    """Return a factory that builds one independent training environment.

    The original cell passed ``lambda: env_pom`` for every slot, so all
    vectorised slots shared a single environment object and stepped/reset the
    same game. Each factory returned here builds its own ``CustomPomme`` with
    its own agents, mirroring the setup of ``env_pom``.

    Args:
        rank: index of the environment slot; added to ``seed`` so that each
            slot is seeded differently.
        seed: base random seed (slot 0 keeps the seed 0 used for ``env_pom``).

    Returns:
        callable: zero-argument function creating the configured environment.
    """
    def _init():
        cfg = one_vs_one_v4_env()
        pomme = CustomPomme(**cfg["env_kwargs"])
        # Same line-up as env_pom: one SimpleAgent opponent (id 0) and the
        # PlayerAgent placeholder for the trained agent (id 1).
        opponent = SimpleAgent(cfg["agent"](0, cfg["game_type"]))
        learner = PlayerAgent(cfg["agent"](1, cfg["game_type"]))
        pomme.set_agents([opponent, learner])
        pomme.set_training_agent(learner.agent_id)
        pomme.set_init_game_state(None)
        pomme.seed(seed + rank)
        return pomme
    return _init


# One distinct environment per slot; DummyVecEnv calls each factory once.
env = DummyVecEnv([make_env(rank) for rank in range(n_cpu)])

model = PPO2(MlpPolicy, env, verbose=1, 
             n_steps = 3000, # batch_size = n_step * num_env
             ent_coef = 0.001, # entropy coefficient
             tensorboard_log="./ppo_pommerman_tensorboard/")
# Time the whole training run (wall-clock seconds).
startTime = time.time()
model = model.learn(total_timesteps=4000000, # num_update = total_timesteps // batch_size
                    callback = log)
endTime = time.time()
elapsedTime = endTime - startTime
print(elapsedTime)
# model.save("ppo2_pommerman_20000_2")
# model = model.learn(total_timesteps=5000000, # num_update = total_timesteps // batch_size
#                     callback = log)
model.save("ppo2_pommerman_v4_4")

--------------------------------------
| approxkl           | 0.00021511447 |
| clipfrac           | 0.0           |
| explained_variance | -0.484        |
| fps                | 404           |
| n_updates          | 1             |
| policy_entropy     | 1.7915729     |
| policy_loss        | -0.0009018528 |
| serial_timesteps   | 3000          |
| time_elapsed       | 2.15e-06      |
| total_timesteps    | 6000          |
| value_loss         | 0.11618769    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0005002811  |
| clipfrac           | 0.0           |
| explained_variance | -0.227        |
| fps                | 406           |
| n_updates          | 2             |
| policy_entropy     | 1.7897758     |
| policy_loss        | -0.0012685414 |
| serial_timesteps   | 6000          |
| time_elapsed       | 14.9          |
| total_timesteps    | 12000         |
| value_loss         | 0.081075415   |
-------------------------

---------------------------------------
| approxkl           | 0.00077191653  |
| clipfrac           | 0.0            |
| explained_variance | 0.00917        |
| fps                | 434            |
| n_updates          | 18             |
| policy_entropy     | 1.4725542      |
| policy_loss        | -0.00049163174 |
| serial_timesteps   | 54000          |
| time_elapsed       | 244            |
| total_timesteps    | 108000         |
| value_loss         | 0.079081364    |
---------------------------------------
--------------------------------------
| approxkl           | 0.0028776783  |
| clipfrac           | 0.038458332   |
| explained_variance | -0.000838     |
| fps                | 435           |
| n_updates          | 19            |
| policy_entropy     | 1.4518452     |
| policy_loss        | -0.0023887653 |
| serial_timesteps   | 57000         |
| time_elapsed       | 258           |
| total_timesteps    | 114000        |
| value_loss         | 0.0720797     |
------------

--------------------------------------
| approxkl           | 0.00048358174 |
| clipfrac           | 0.0           |
| explained_variance | 0.0301        |
| fps                | 437           |
| n_updates          | 34            |
| policy_entropy     | 1.334669      |
| policy_loss        | -0.0005592419 |
| serial_timesteps   | 102000        |
| time_elapsed       | 466           |
| total_timesteps    | 204000        |
| value_loss         | 0.07963874    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0039963606  |
| clipfrac           | 0.05245833    |
| explained_variance | 0.0179        |
| fps                | 436           |
| n_updates          | 35            |
| policy_entropy     | 1.3178447     |
| policy_loss        | -0.0028403876 |
| serial_timesteps   | 105000        |
| time_elapsed       | 479           |
| total_timesteps    | 210000        |
| value_loss         | 0.074807614   |
-------------------------

--------------------------------------
| approxkl           | 0.0010948445  |
| clipfrac           | 0.00041666668 |
| explained_variance | 0.025         |
| fps                | 438           |
| n_updates          | 50            |
| policy_entropy     | 1.3423265     |
| policy_loss        | -0.0009754554 |
| serial_timesteps   | 150000        |
| time_elapsed       | 686           |
| total_timesteps    | 300000        |
| value_loss         | 0.07658155    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0015178178  |
| clipfrac           | 0.030083332   |
| explained_variance | 0.0514        |
| fps                | 437           |
| n_updates          | 51            |
| policy_entropy     | 1.3533132     |
| policy_loss        | -0.0012429515 |
| serial_timesteps   | 153000        |
| time_elapsed       | 699           |
| total_timesteps    | 306000        |
| value_loss         | 0.08401557    |
-------------------------

--------------------------------------
| approxkl           | 0.004042419   |
| clipfrac           | 0.032791667   |
| explained_variance | 0.0577        |
| fps                | 436           |
| n_updates          | 67            |
| policy_entropy     | 1.2495171     |
| policy_loss        | -0.0017528718 |
| serial_timesteps   | 201000        |
| time_elapsed       | 919           |
| total_timesteps    | 402000        |
| value_loss         | 0.079835884   |
--------------------------------------
-------------------------------------
| approxkl           | 0.0042106602 |
| clipfrac           | 0.029041668  |
| explained_variance | 0.0338       |
| fps                | 436          |
| n_updates          | 68           |
| policy_entropy     | 1.2715764    |
| policy_loss        | -0.002246154 |
| serial_timesteps   | 204000       |
| time_elapsed       | 933          |
| total_timesteps    | 408000       |
| value_loss         | 0.08106926   |
-------------------------------------

--------------------------------------
| approxkl           | 0.0009449359  |
| clipfrac           | 0.0018750001  |
| explained_variance | 0.0569        |
| fps                | 437           |
| n_updates          | 84            |
| policy_entropy     | 1.2205815     |
| policy_loss        | -0.0013241434 |
| serial_timesteps   | 252000        |
| time_elapsed       | 1.15e+03      |
| total_timesteps    | 504000        |
| value_loss         | 0.09736623    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0013790244  |
| clipfrac           | 0.001625      |
| explained_variance | 0.0738        |
| fps                | 437           |
| n_updates          | 85            |
| policy_entropy     | 1.2268263     |
| policy_loss        | -0.0008749147 |
| serial_timesteps   | 255000        |
| time_elapsed       | 1.17e+03      |
| total_timesteps    | 510000        |
| value_loss         | 0.087886356   |
-------------------------

---------------------------------------
| approxkl           | 0.0006184267   |
| clipfrac           | 4.1666666e-05  |
| explained_variance | 0.0756         |
| fps                | 440            |
| n_updates          | 101            |
| policy_entropy     | 1.07611        |
| policy_loss        | -0.00055688247 |
| serial_timesteps   | 303000         |
| time_elapsed       | 1.39e+03       |
| total_timesteps    | 606000         |
| value_loss         | 0.09984091     |
---------------------------------------
--------------------------------------
| approxkl           | 0.0008374504  |
| clipfrac           | 0.0016666667  |
| explained_variance | 0.0871        |
| fps                | 439           |
| n_updates          | 102           |
| policy_entropy     | 1.0708232     |
| policy_loss        | -0.0015378068 |
| serial_timesteps   | 306000        |
| time_elapsed       | 1.4e+03       |
| total_timesteps    | 612000        |
| value_loss         | 0.08740346    |
------------

---------------------------------------
| approxkl           | 0.0010540711   |
| clipfrac           | 0.001          |
| explained_variance | 0.0902         |
| fps                | 438            |
| n_updates          | 118            |
| policy_entropy     | 0.92464        |
| policy_loss        | -0.00042116316 |
| serial_timesteps   | 354000         |
| time_elapsed       | 1.62e+03       |
| total_timesteps    | 708000         |
| value_loss         | 0.084241316    |
---------------------------------------
--------------------------------------
| approxkl           | 0.0025597503  |
| clipfrac           | 0.023208335   |
| explained_variance | 0.11          |
| fps                | 438           |
| n_updates          | 119           |
| policy_entropy     | 0.89018345    |
| policy_loss        | -0.0017130965 |
| serial_timesteps   | 357000        |
| time_elapsed       | 1.63e+03      |
| total_timesteps    | 714000        |
| value_loss         | 0.08802465    |
------------

--------------------------------------
| approxkl           | 0.0013527492  |
| clipfrac           | 0.005791666   |
| explained_variance | 0.125         |
| fps                | 437           |
| n_updates          | 135           |
| policy_entropy     | 0.71803916    |
| policy_loss        | -0.0010188927 |
| serial_timesteps   | 405000        |
| time_elapsed       | 1.85e+03      |
| total_timesteps    | 810000        |
| value_loss         | 0.08979756    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0013432545  |
| clipfrac           | 0.0060416665  |
| explained_variance | 0.109         |
| fps                | 437           |
| n_updates          | 136           |
| policy_entropy     | 0.6932734     |
| policy_loss        | -0.0015315295 |
| serial_timesteps   | 408000        |
| time_elapsed       | 1.87e+03      |
| total_timesteps    | 816000        |
| value_loss         | 0.08992804    |
-------------------------

--------------------------------------
| approxkl           | 0.00047955083 |
| clipfrac           | 0.0031250003  |
| explained_variance | 0.123         |
| fps                | 402           |
| n_updates          | 152           |
| policy_entropy     | 0.572739      |
| policy_loss        | -0.0011398988 |
| serial_timesteps   | 456000        |
| time_elapsed       | 2.1e+03       |
| total_timesteps    | 912000        |
| value_loss         | 0.08740221    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0008109651  |
| clipfrac           | 0.006583333   |
| explained_variance | 0.155         |
| fps                | 401           |
| n_updates          | 153           |
| policy_entropy     | 0.5603754     |
| policy_loss        | -0.0012352165 |
| serial_timesteps   | 459000        |
| time_elapsed       | 2.11e+03      |
| total_timesteps    | 918000        |
| value_loss         | 0.090317525   |
-------------------------

--------------------------------------
| approxkl           | 0.0008300486  |
| clipfrac           | 0.0047916663  |
| explained_variance | 0.149         |
| fps                | 431           |
| n_updates          | 169           |
| policy_entropy     | 0.44024312    |
| policy_loss        | -0.0013246641 |
| serial_timesteps   | 507000        |
| time_elapsed       | 2.35e+03      |
| total_timesteps    | 1014000       |
| value_loss         | 0.09064502    |
--------------------------------------
---------------------------------------
| approxkl           | 0.0005316943   |
| clipfrac           | 0.003          |
| explained_variance | 0.188          |
| fps                | 424            |
| n_updates          | 170            |
| policy_entropy     | 0.45071593     |
| policy_loss        | -0.00077502057 |
| serial_timesteps   | 510000         |
| time_elapsed       | 2.36e+03       |
| total_timesteps    | 1020000        |
| value_loss         | 0.091152266    |
-------------

--------------------------------------
| approxkl           | 0.0008778226  |
| clipfrac           | 0.00875       |
| explained_variance | 0.205         |
| fps                | 411           |
| n_updates          | 186           |
| policy_entropy     | 0.35747808    |
| policy_loss        | -0.0012891327 |
| serial_timesteps   | 558000        |
| time_elapsed       | 2.59e+03      |
| total_timesteps    | 1116000       |
| value_loss         | 0.08637524    |
--------------------------------------
--------------------------------------
| approxkl           | 0.00029410058 |
| clipfrac           | 0.0019166665  |
| explained_variance | 0.191         |
| fps                | 403           |
| n_updates          | 187           |
| policy_entropy     | 0.3623139     |
| policy_loss        | -0.0011716222 |
| serial_timesteps   | 561000        |
| time_elapsed       | 2.61e+03      |
| total_timesteps    | 1122000       |
| value_loss         | 0.08193247    |
-------------------------

---------------------------------------
| approxkl           | 0.00084437703  |
| clipfrac           | 0.007791667    |
| explained_variance | 0.305          |
| fps                | 405            |
| n_updates          | 203            |
| policy_entropy     | 0.29113743     |
| policy_loss        | -0.00088686775 |
| serial_timesteps   | 609000         |
| time_elapsed       | 2.85e+03       |
| total_timesteps    | 1218000        |
| value_loss         | 0.08591429     |
---------------------------------------
--------------------------------------
| approxkl           | 0.00047519326 |
| clipfrac           | 0.0052083326  |
| explained_variance | 0.308         |
| fps                | 396           |
| n_updates          | 204           |
| policy_entropy     | 0.27882743    |
| policy_loss        | -0.0009718945 |
| serial_timesteps   | 612000        |
| time_elapsed       | 2.86e+03      |
| total_timesteps    | 1224000       |
| value_loss         | 0.08408789    |
------------

--------------------------------------
| approxkl           | 0.0005012946  |
| clipfrac           | 0.0054999995  |
| explained_variance | 0.349         |
| fps                | 422           |
| n_updates          | 220           |
| policy_entropy     | 0.19009554    |
| policy_loss        | -0.0010125774 |
| serial_timesteps   | 660000        |
| time_elapsed       | 3.1e+03       |
| total_timesteps    | 1320000       |
| value_loss         | 0.093307495   |
--------------------------------------
---------------------------------------
| approxkl           | 0.00059793115  |
| clipfrac           | 0.0060833334   |
| explained_variance | 0.353          |
| fps                | 414            |
| n_updates          | 221            |
| policy_entropy     | 0.1817143      |
| policy_loss        | -0.00085142173 |
| serial_timesteps   | 663000         |
| time_elapsed       | 3.11e+03       |
| total_timesteps    | 1326000        |
| value_loss         | 0.10472649     |
-------------

--------------------------------------
| approxkl           | 0.0010097203  |
| clipfrac           | 0.023625      |
| explained_variance | 0.384         |
| fps                | 393           |
| n_updates          | 236           |
| policy_entropy     | 0.1452262     |
| policy_loss        | -0.0006745024 |
| serial_timesteps   | 708000        |
| time_elapsed       | 3.33e+03      |
| total_timesteps    | 1416000       |
| value_loss         | 0.10323969    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0006918519  |
| clipfrac           | 0.0069166664  |
| explained_variance | 0.44          |
| fps                | 399           |
| n_updates          | 237           |
| policy_entropy     | 0.13813499    |
| policy_loss        | -0.0010019318 |
| serial_timesteps   | 711000        |
| time_elapsed       | 3.35e+03      |
| total_timesteps    | 1422000       |
| value_loss         | 0.10044796    |
-------------------------

---------------------------------------
| approxkl           | 0.00070581585  |
| clipfrac           | 0.008958333    |
| explained_variance | 0.41           |
| fps                | 398            |
| n_updates          | 252            |
| policy_entropy     | 0.10300365     |
| policy_loss        | -0.00019734415 |
| serial_timesteps   | 756000         |
| time_elapsed       | 3.57e+03       |
| total_timesteps    | 1512000        |
| value_loss         | 0.10060625     |
---------------------------------------
---------------------------------------
| approxkl           | 0.00027162838  |
| clipfrac           | 0.0033750003   |
| explained_variance | 0.401          |
| fps                | 409            |
| n_updates          | 253            |
| policy_entropy     | 0.093780175    |
| policy_loss        | -0.00047647126 |
| serial_timesteps   | 759000         |
| time_elapsed       | 3.58e+03       |
| total_timesteps    | 1518000        |
| value_loss         | 0.09985373     |


--------------------------------------
| approxkl           | 0.00020450834 |
| clipfrac           | 0.0025833335  |
| explained_variance | 0.427         |
| fps                | 419           |
| n_updates          | 268           |
| policy_entropy     | 0.07008855    |
| policy_loss        | -0.0003105668 |
| serial_timesteps   | 804000        |
| time_elapsed       | 3.8e+03       |
| total_timesteps    | 1608000       |
| value_loss         | 0.09292202    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0010768619  |
| clipfrac           | 0.010708333   |
| explained_variance | 0.395         |
| fps                | 428           |
| n_updates          | 269           |
| policy_entropy     | 0.06900939    |
| policy_loss        | -0.0006884488 |
| serial_timesteps   | 807000        |
| time_elapsed       | 3.81e+03      |
| total_timesteps    | 1614000       |
| value_loss         | 0.101866476   |
-------------------------

---------------------------------------
| approxkl           | 0.0006233106   |
| clipfrac           | 0.0069999998   |
| explained_variance | 0.384          |
| fps                | 416            |
| n_updates          | 284            |
| policy_entropy     | 0.06215287     |
| policy_loss        | -0.00010298301 |
| serial_timesteps   | 852000         |
| time_elapsed       | 4.03e+03       |
| total_timesteps    | 1704000        |
| value_loss         | 0.1044118      |
---------------------------------------
---------------------------------------
| approxkl           | 0.00052951934  |
| clipfrac           | 0.0037500001   |
| explained_variance | 0.432          |
| fps                | 431            |
| n_updates          | 285            |
| policy_entropy     | 0.05746124     |
| policy_loss        | -0.00028569807 |
| serial_timesteps   | 855000         |
| time_elapsed       | 4.04e+03       |
| total_timesteps    | 1710000        |
| value_loss         | 0.0977076      |


---------------------------------------
| approxkl           | 0.00021259088  |
| clipfrac           | 0.0027916667   |
| explained_variance | 0.408          |
| fps                | 399            |
| n_updates          | 300            |
| policy_entropy     | 0.05099418     |
| policy_loss        | -0.00014567212 |
| serial_timesteps   | 900000         |
| time_elapsed       | 4.27e+03       |
| total_timesteps    | 1800000        |
| value_loss         | 0.095502704    |
---------------------------------------
---------------------------------------
| approxkl           | 0.00043290245  |
| clipfrac           | 0.006833333    |
| explained_variance | 0.394          |
| fps                | 375            |
| n_updates          | 301            |
| policy_entropy     | 0.051697824    |
| policy_loss        | -3.0458872e-05 |
| serial_timesteps   | 903000         |
| time_elapsed       | 4.29e+03       |
| total_timesteps    | 1806000        |
| value_loss         | 0.10009271     |


---------------------------------------
| approxkl           | 0.00036469472  |
| clipfrac           | 0.0047916663   |
| explained_variance | 0.43           |
| fps                | 422            |
| n_updates          | 316            |
| policy_entropy     | 0.04766588     |
| policy_loss        | -0.00027249425 |
| serial_timesteps   | 948000         |
| time_elapsed       | 4.5e+03        |
| total_timesteps    | 1896000        |
| value_loss         | 0.094101384    |
---------------------------------------
---------------------------------------
| approxkl           | 0.00068648637  |
| clipfrac           | 0.007958332    |
| explained_variance | 0.431          |
| fps                | 415            |
| n_updates          | 317            |
| policy_entropy     | 0.05444123     |
| policy_loss        | -0.00019218253 |
| serial_timesteps   | 951000         |
| time_elapsed       | 4.52e+03       |
| total_timesteps    | 1902000        |
| value_loss         | 0.09780135     |


--------------------------------------
| approxkl           | 0.00019280854 |
| clipfrac           | 0.0035833335  |
| explained_variance | 0.406         |
| fps                | 416           |
| n_updates          | 332           |
| policy_entropy     | 0.03747384    |
| policy_loss        | -9.177994e-05 |
| serial_timesteps   | 996000        |
| time_elapsed       | 4.73e+03      |
| total_timesteps    | 1992000       |
| value_loss         | 0.09590063    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0007544513  |
| clipfrac           | 0.008333333   |
| explained_variance | 0.384         |
| fps                | 423           |
| n_updates          | 333           |
| policy_entropy     | 0.035388503   |
| policy_loss        | -0.0005356316 |
| serial_timesteps   | 999000        |
| time_elapsed       | 4.75e+03      |
| total_timesteps    | 1998000       |
| value_loss         | 0.094130784   |
-------------------------

--------------------------------------
| approxkl           | 0.00016103564 |
| clipfrac           | 0.0010833334  |
| explained_variance | 0.446         |
| fps                | 432           |
| n_updates          | 348           |
| policy_entropy     | 0.037013233   |
| policy_loss        | 1.520475e-06  |
| serial_timesteps   | 1044000       |
| time_elapsed       | 4.96e+03      |
| total_timesteps    | 2088000       |
| value_loss         | 0.094095774   |
--------------------------------------
---------------------------------------
| approxkl           | 0.00015870716  |
| clipfrac           | 0.0017916667   |
| explained_variance | 0.417          |
| fps                | 425            |
| n_updates          | 349            |
| policy_entropy     | 0.038558632    |
| policy_loss        | -0.00011587114 |
| serial_timesteps   | 1047000        |
| time_elapsed       | 4.97e+03       |
| total_timesteps    | 2094000        |
| value_loss         | 0.09919407     |
-------------

--------------------------------------
| approxkl           | 0.0016336731  |
| clipfrac           | 0.013750001   |
| explained_variance | 0.405         |
| fps                | 424           |
| n_updates          | 364           |
| policy_entropy     | 0.051347815   |
| policy_loss        | -0.0011413957 |
| serial_timesteps   | 1092000       |
| time_elapsed       | 5.18e+03      |
| total_timesteps    | 2184000       |
| value_loss         | 0.099925555   |
--------------------------------------
--------------------------------------
| approxkl           | 0.0010126663  |
| clipfrac           | 0.007083334   |
| explained_variance | 0.369         |
| fps                | 427           |
| n_updates          | 365           |
| policy_entropy     | 0.050642516   |
| policy_loss        | 0.00048246072 |
| serial_timesteps   | 1095000       |
| time_elapsed       | 5.2e+03       |
| total_timesteps    | 2190000       |
| value_loss         | 0.10198132    |
-------------------------

--------------------------------------
| approxkl           | 0.0005142897  |
| clipfrac           | 0.008708334   |
| explained_variance | 0.369         |
| fps                | 436           |
| n_updates          | 380           |
| policy_entropy     | 0.04511083    |
| policy_loss        | -0.0005507248 |
| serial_timesteps   | 1140000       |
| time_elapsed       | 5.41e+03      |
| total_timesteps    | 2280000       |
| value_loss         | 0.09474182    |
--------------------------------------
---------------------------------------
| approxkl           | 0.0030890808   |
| clipfrac           | 0.014041668    |
| explained_variance | 0.388          |
| fps                | 438            |
| n_updates          | 381            |
| policy_entropy     | 0.037810985    |
| policy_loss        | -0.00021493046 |
| serial_timesteps   | 1143000        |
| time_elapsed       | 5.42e+03       |
| total_timesteps    | 2286000        |
| value_loss         | 0.09357205     |
-------------

--------------------------------------
| approxkl           | 0.00054859533 |
| clipfrac           | 0.0040833335  |
| explained_variance | 0.469         |
| fps                | 437           |
| n_updates          | 396           |
| policy_entropy     | 0.03786654    |
| policy_loss        | -0.000165259  |
| serial_timesteps   | 1188000       |
| time_elapsed       | 5.62e+03      |
| total_timesteps    | 2376000       |
| value_loss         | 0.08750243    |
--------------------------------------
-------------------------------------
| approxkl           | 0.00075265   |
| clipfrac           | 0.005458333  |
| explained_variance | 0.406        |
| fps                | 434          |
| n_updates          | 397          |
| policy_entropy     | 0.03581118   |
| policy_loss        | 9.530795e-07 |
| serial_timesteps   | 1191000      |
| time_elapsed       | 5.64e+03     |
| total_timesteps    | 2382000      |
| value_loss         | 0.0992728    |
-------------------------------------

---------------------------------------
| approxkl           | 0.00035714483  |
| clipfrac           | 0.0044583334   |
| explained_variance | 0.382          |
| fps                | 433            |
| n_updates          | 413            |
| policy_entropy     | 0.03802504     |
| policy_loss        | -0.00019933372 |
| serial_timesteps   | 1239000        |
| time_elapsed       | 5.86e+03       |
| total_timesteps    | 2478000        |
| value_loss         | 0.09762553     |
---------------------------------------
--------------------------------------
| approxkl           | 0.00017319292 |
| clipfrac           | 0.0030416667  |
| explained_variance | 0.455         |
| fps                | 437           |
| n_updates          | 414           |
| policy_entropy     | 0.03979485    |
| policy_loss        | -6.030138e-05 |
| serial_timesteps   | 1242000       |
| time_elapsed       | 5.87e+03      |
| total_timesteps    | 2484000       |
| value_loss         | 0.08909476    |
------------

---------------------------------------
| approxkl           | 0.0007525922   |
| clipfrac           | 0.010625       |
| explained_variance | 0.378          |
| fps                | 438            |
| n_updates          | 429            |
| policy_entropy     | 0.056557827    |
| policy_loss        | -0.00018279142 |
| serial_timesteps   | 1287000        |
| time_elapsed       | 6.08e+03       |
| total_timesteps    | 2574000        |
| value_loss         | 0.09963719     |
---------------------------------------
---------------------------------------
| approxkl           | 0.00041132717  |
| clipfrac           | 0.0065         |
| explained_variance | 0.375          |
| fps                | 441            |
| n_updates          | 430            |
| policy_entropy     | 0.05734613     |
| policy_loss        | -9.5502866e-05 |
| serial_timesteps   | 1290000        |
| time_elapsed       | 6.09e+03       |
| total_timesteps    | 2580000        |
| value_loss         | 0.09149142     |


---------------------------------------
| approxkl           | 0.00095969846  |
| clipfrac           | 0.011624999    |
| explained_variance | 0.369          |
| fps                | 439            |
| n_updates          | 445            |
| policy_entropy     | 0.054087028    |
| policy_loss        | -0.00029014106 |
| serial_timesteps   | 1335000        |
| time_elapsed       | 6.3e+03        |
| total_timesteps    | 2670000        |
| value_loss         | 0.09440828     |
---------------------------------------
---------------------------------------
| approxkl           | 0.0016292275   |
| clipfrac           | 0.026333332    |
| explained_variance | 0.406          |
| fps                | 439            |
| n_updates          | 446            |
| policy_entropy     | 0.063614935    |
| policy_loss        | -0.00043090314 |
| serial_timesteps   | 1338000        |
| time_elapsed       | 6.31e+03       |
| total_timesteps    | 2676000        |
| value_loss         | 0.09884153     |


---------------------------------------
| approxkl           | 0.00013942087  |
| clipfrac           | 0.0024583335   |
| explained_variance | 0.396          |
| fps                | 439            |
| n_updates          | 461            |
| policy_entropy     | 0.046575647    |
| policy_loss        | -3.1944706e-05 |
| serial_timesteps   | 1383000        |
| time_elapsed       | 6.52e+03       |
| total_timesteps    | 2766000        |
| value_loss         | 0.09164877     |
---------------------------------------
--------------------------------------
| approxkl           | 0.0002564123  |
| clipfrac           | 0.0022500001  |
| explained_variance | 0.386         |
| fps                | 440           |
| n_updates          | 462           |
| policy_entropy     | 0.046457756   |
| policy_loss        | -9.769234e-05 |
| serial_timesteps   | 1386000       |
| time_elapsed       | 6.53e+03      |
| total_timesteps    | 2772000       |
| value_loss         | 0.09323688    |
------------

---------------------------------------
| approxkl           | 0.00029578936  |
| clipfrac           | 0.004666666    |
| explained_variance | 0.39           |
| fps                | 437            |
| n_updates          | 477            |
| policy_entropy     | 0.046869975    |
| policy_loss        | -0.00020304188 |
| serial_timesteps   | 1431000        |
| time_elapsed       | 6.74e+03       |
| total_timesteps    | 2862000        |
| value_loss         | 0.086273775    |
---------------------------------------
--------------------------------------
| approxkl           | 0.0004367309  |
| clipfrac           | 0.0087916665  |
| explained_variance | 0.397         |
| fps                | 435           |
| n_updates          | 478           |
| policy_entropy     | 0.044951852   |
| policy_loss        | -3.438751e-05 |
| serial_timesteps   | 1434000       |
| time_elapsed       | 6.75e+03      |
| total_timesteps    | 2868000       |
| value_loss         | 0.09158306    |
------------

---------------------------------------
| approxkl           | 0.00068889686  |
| clipfrac           | 0.006875       |
| explained_variance | 0.402          |
| fps                | 441            |
| n_updates          | 493            |
| policy_entropy     | 0.04228045     |
| policy_loss        | -0.00020674264 |
| serial_timesteps   | 1479000        |
| time_elapsed       | 6.96e+03       |
| total_timesteps    | 2958000        |
| value_loss         | 0.10021853     |
---------------------------------------
--------------------------------------
| approxkl           | 0.000397631   |
| clipfrac           | 0.006208333   |
| explained_variance | 0.399         |
| fps                | 435           |
| n_updates          | 494           |
| policy_entropy     | 0.04482522    |
| policy_loss        | -0.0004975854 |
| serial_timesteps   | 1482000       |
| time_elapsed       | 6.97e+03      |
| total_timesteps    | 2964000       |
| value_loss         | 0.09733568    |
------------

----------------------------------------
| approxkl           | 0.00019568737   |
| clipfrac           | 0.003375        |
| explained_variance | 0.34            |
| fps                | 439             |
| n_updates          | 509             |
| policy_entropy     | 0.031873614     |
| policy_loss        | -0.000101775804 |
| serial_timesteps   | 1527000         |
| time_elapsed       | 7.17e+03        |
| total_timesteps    | 3054000         |
| value_loss         | 0.10518768      |
----------------------------------------
---------------------------------------
| approxkl           | 0.0013119719   |
| clipfrac           | 0.008541666    |
| explained_variance | 0.342          |
| fps                | 439            |
| n_updates          | 510            |
| policy_entropy     | 0.03452123     |
| policy_loss        | -4.5542663e-05 |
| serial_timesteps   | 1530000        |
| time_elapsed       | 7.19e+03       |
| total_timesteps    | 3060000        |
| value_loss         | 0.09

---------------------------------------
| approxkl           | 0.0004649407   |
| clipfrac           | 0.0062916665   |
| explained_variance | 0.38           |
| fps                | 440            |
| n_updates          | 525            |
| policy_entropy     | 0.03807444     |
| policy_loss        | -2.0915175e-05 |
| serial_timesteps   | 1575000        |
| time_elapsed       | 7.39e+03       |
| total_timesteps    | 3150000        |
| value_loss         | 0.10303327     |
---------------------------------------
---------------------------------------
| approxkl           | 0.00050270796  |
| clipfrac           | 0.009249999    |
| explained_variance | 0.38           |
| fps                | 437            |
| n_updates          | 526            |
| policy_entropy     | 0.039054036    |
| policy_loss        | -0.00022503757 |
| serial_timesteps   | 1578000        |
| time_elapsed       | 7.41e+03       |
| total_timesteps    | 3156000        |
| value_loss         | 0.10015494     |


---------------------------------------
| approxkl           | 0.00012287065  |
| clipfrac           | 0.0015833334   |
| explained_variance | 0.356          |
| fps                | 438            |
| n_updates          | 541            |
| policy_entropy     | 0.03147889     |
| policy_loss        | -2.5802452e-05 |
| serial_timesteps   | 1623000        |
| time_elapsed       | 7.61e+03       |
| total_timesteps    | 3246000        |
| value_loss         | 0.10558093     |
---------------------------------------
--------------------------------------
| approxkl           | 0.001030255   |
| clipfrac           | 0.010458333   |
| explained_variance | 0.381         |
| fps                | 439           |
| n_updates          | 542           |
| policy_entropy     | 0.03372068    |
| policy_loss        | -9.383203e-06 |
| serial_timesteps   | 1626000       |
| time_elapsed       | 7.63e+03      |
| total_timesteps    | 3252000       |
| value_loss         | 0.1058448     |
------------

---------------------------------------
| approxkl           | 0.00030283837  |
| clipfrac           | 0.0040833335   |
| explained_variance | 0.337          |
| fps                | 438            |
| n_updates          | 557            |
| policy_entropy     | 0.037750855    |
| policy_loss        | -0.00045969326 |
| serial_timesteps   | 1671000        |
| time_elapsed       | 7.83e+03       |
| total_timesteps    | 3342000        |
| value_loss         | 0.09688019     |
---------------------------------------
----------------------------------------
| approxkl           | 0.00047735535   |
| clipfrac           | 0.004833333     |
| explained_variance | 0.335           |
| fps                | 439             |
| n_updates          | 558             |
| policy_entropy     | 0.040729254     |
| policy_loss        | -0.000104569786 |
| serial_timesteps   | 1674000         |
| time_elapsed       | 7.85e+03        |
| total_timesteps    | 3348000         |
| value_loss         | 0.0924

--------------------------------------
| approxkl           | 0.00028095697 |
| clipfrac           | 0.0037500004  |
| explained_variance | 0.37          |
| fps                | 437           |
| n_updates          | 573           |
| policy_entropy     | 0.04169402    |
| policy_loss        | 0.00014909475 |
| serial_timesteps   | 1719000       |
| time_elapsed       | 8.05e+03      |
| total_timesteps    | 3438000       |
| value_loss         | 0.083647124   |
--------------------------------------
---------------------------------------
| approxkl           | 0.0003197575   |
| clipfrac           | 0.0042916667   |
| explained_variance | 0.387          |
| fps                | 434            |
| n_updates          | 574            |
| policy_entropy     | 0.03793738     |
| policy_loss        | -0.00018467377 |
| serial_timesteps   | 1722000        |
| time_elapsed       | 8.06e+03       |
| total_timesteps    | 3444000        |
| value_loss         | 0.08193628     |
-------------

---------------------------------------
| approxkl           | 0.00086643157  |
| clipfrac           | 0.007          |
| explained_variance | 0.396          |
| fps                | 434            |
| n_updates          | 589            |
| policy_entropy     | 0.028241735    |
| policy_loss        | -0.00037153473 |
| serial_timesteps   | 1767000        |
| time_elapsed       | 8.27e+03       |
| total_timesteps    | 3534000        |
| value_loss         | 0.08806822     |
---------------------------------------
--------------------------------------
| approxkl           | 0.00050912343 |
| clipfrac           | 0.0056666667  |
| explained_variance | 0.384         |
| fps                | 437           |
| n_updates          | 590           |
| policy_entropy     | 0.031784058   |
| policy_loss        | -0.000206747  |
| serial_timesteps   | 1770000       |
| time_elapsed       | 8.29e+03      |
| total_timesteps    | 3540000       |
| value_loss         | 0.0882097     |
------------

---------------------------------------
| approxkl           | 0.00035585312  |
| clipfrac           | 0.005208333    |
| explained_variance | 0.397          |
| fps                | 437            |
| n_updates          | 606            |
| policy_entropy     | 0.035527684    |
| policy_loss        | -0.00013612914 |
| serial_timesteps   | 1818000        |
| time_elapsed       | 8.51e+03       |
| total_timesteps    | 3636000        |
| value_loss         | 0.088676944    |
---------------------------------------
---------------------------------------
| approxkl           | 0.00033711246  |
| clipfrac           | 0.004833333    |
| explained_variance | 0.407          |
| fps                | 434            |
| n_updates          | 607            |
| policy_entropy     | 0.035927445    |
| policy_loss        | -3.7906153e-05 |
| serial_timesteps   | 1821000        |
| time_elapsed       | 8.52e+03       |
| total_timesteps    | 3642000        |
| value_loss         | 0.08870351     |


---------------------------------------
| approxkl           | 0.001161335    |
| clipfrac           | 0.00825        |
| explained_variance | 0.363          |
| fps                | 433            |
| n_updates          | 622            |
| policy_entropy     | 0.02691583     |
| policy_loss        | -0.00042047168 |
| serial_timesteps   | 1866000        |
| time_elapsed       | 8.73e+03       |
| total_timesteps    | 3732000        |
| value_loss         | 0.08923784     |
---------------------------------------
--------------------------------------
| approxkl           | 0.00063529855 |
| clipfrac           | 0.0057499995  |
| explained_variance | 0.384         |
| fps                | 437           |
| n_updates          | 623           |
| policy_entropy     | 0.027515084   |
| policy_loss        | -0.0005176809 |
| serial_timesteps   | 1869000       |
| time_elapsed       | 8.74e+03      |
| total_timesteps    | 3738000       |
| value_loss         | 0.09033298    |
------------

---------------------------------------
| approxkl           | 0.00032326416  |
| clipfrac           | 0.0036250004   |
| explained_variance | 0.38           |
| fps                | 434            |
| n_updates          | 638            |
| policy_entropy     | 0.027280817    |
| policy_loss        | -0.00024420783 |
| serial_timesteps   | 1914000        |
| time_elapsed       | 8.95e+03       |
| total_timesteps    | 3828000        |
| value_loss         | 0.08519528     |
---------------------------------------
---------------------------------------
| approxkl           | 0.00038898407  |
| clipfrac           | 0.0037083337   |
| explained_variance | 0.384          |
| fps                | 437            |
| n_updates          | 639            |
| policy_entropy     | 0.02650403     |
| policy_loss        | -0.00010391166 |
| serial_timesteps   | 1917000        |
| time_elapsed       | 8.96e+03       |
| total_timesteps    | 3834000        |
| value_loss         | 0.08523044     |


--------------------------------------
| approxkl           | 0.00085632433 |
| clipfrac           | 0.009249999   |
| explained_variance | 0.399         |
| fps                | 406           |
| n_updates          | 654           |
| policy_entropy     | 0.03352984    |
| policy_loss        | 0.00032894363 |
| serial_timesteps   | 1962000       |
| time_elapsed       | 9.17e+03      |
| total_timesteps    | 3924000       |
| value_loss         | 0.0913123     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00056624704 |
| clipfrac           | 0.0047083334  |
| explained_variance | 0.401         |
| fps                | 420           |
| n_updates          | 655           |
| policy_entropy     | 0.03332037    |
| policy_loss        | 2.5919213e-05 |
| serial_timesteps   | 1965000       |
| time_elapsed       | 9.19e+03      |
| total_timesteps    | 3930000       |
| value_loss         | 0.086989366   |
-------------------------

In [16]:
# Reload the policy from its saved checkpoint (a preceding `del model` would
# demonstrate that saving and loading round-trips).
model = PPO2.load("ppo2_pommerman_v4_4")

# Wrap the already-constructed `env_pom` in a single-environment vectorised env.
n_cpu = 1
env = DummyVecEnv([(lambda: env_pom) for _ in range(n_cpu)])
# NOTE(review): this is a plain attribute assignment, not a call to `set_env`;
# presumably `predict` below does not depend on it — confirm.
model.envs = env

# Play `total` rendered episodes with the loaded model and tally the outcomes.
total = 5  # number of playouts
num_win = num_tie = num_lose = 0
for i_episode in range(total):
    obs = env.reset()
    done, info = False, None
    while not done:
        env.render()
        action_training, _states = model.predict(obs)
        obs, rewards, dones, infos = env.step(action_training)
        # Only one env is wrapped (n_cpu == 1), so slot 0 carries its done flag
        # and info dict.
        done, info = dones[0], infos[0]
        time.sleep(0.1)  # pause between steps
    print('Episode {} finished'.format(i_episode))
    # Result value 2 is counted as a tie; value 0 is counted as a win when
    # index 1 is listed in "winners" and as a loss otherwise. Any other result
    # value is left uncounted.
    if info["result"].value == 2:
        num_tie += 1
    elif info["result"].value == 0:
        if 1 in info["winners"]:
            num_win += 1
        else:
            num_lose += 1
env.close()

print("Win ", num_win, "/", total, " games")
print("Tie ", num_tie, "/", total, " games")
print("Lose ", num_lose, "/", total, " games")


Loading a model without an environment, this model cannot be trained until it has a valid environment.
Episode 0 finished
Episode 1 finished
Episode 2 finished
Episode 3 finished
Episode 4 finished
Win  0 / 5  games
Tie  1 / 5  games
Lose  4 / 5  games


# Stable Baselines example code (CartPole)

In [None]:
import gym

from stable_baselines.common.policies import MlpPolicy
# DummyVecEnv is the wrapper actually used below, so import it in this cell
# instead of relying on an import made by an earlier cell.
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2

# Vectorised CartPole environment. With DummyVecEnv and n_cpu = 1 this is a
# single in-process environment, not a multiprocess one.
n_cpu = 1
env = DummyVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

# Train a PPO2 agent with an MLP policy and write it to "ppo2_cartpole".
model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo2_cartpole")

del model # remove to demonstrate saving and loading

# Restore the agent from the file that was just saved.
model = PPO2.load("ppo2_cartpole")

# Enjoy trained agent
obs = env.reset()



In [None]:
# Show the current observation, the shape of the env's `buf_obs[None]` entry,
# and the declared observation space.
print(obs)
print(env.buf_obs[None].shape, env.observation_space, sep="\n")

# Take a single step with the model's predicted action and print everything
# the env returned, one value per line.
action, _states = model.predict(obs)
obs, rewards, dones, info = env.step(action)
print(obs, rewards, dones, info, sep="\n")

# Continuous rendered rollout, left disabled:
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()