# Pommerman V4 Training in smaller central region

This notebook demonstrates how to train Pommerman agents. Please let us know at support@pommerman.com if you run into any issues.

In [1]:
import os
import sys
import numpy as np
import time

import pommerman
from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v4_fast_env
from pommerman.envs.v4 import Pomme as Pomme_v4
from pommerman.characters import Bomber
from pommerman import utility
from pommerman import agents
from pommerman import envs
from pommerman import constants
from pommerman import characters

# print all env configs
print(pommerman.REGISTRY)

['PommeFFACompetition-v0', 'PommeFFACompetitionFast-v0', 'PommeFFAFast-v0', 'PommeFFA-v1', 'PommeFFAFast-v3', 'PommeFFAFast-v4', 'OneVsOne-v0', 'PommeRadioCompetition-v2', 'PommeRadio-v2', 'PommeTeamCompetition-v0', 'PommeTeamCompetitionFast-v0', 'PommeTeamCompetition-v1', 'PommeTeam-v0', 'PommeTeamFast-v0', 'PommeTeamSimple-v0']


# Train with stable baseline

In [2]:
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Inherit pommerman env and make it compatible with stable-baseline

In [3]:
class CustomPomme(Pomme_v4):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.obs_raw = None # store the raw version of observation
        self.training_idx = 1 # idx of the agent being trained
    
    # function to flatten pommerman observation
    def _transform_obs(self, obs_raw):
        obs_training = obs_raw[self.training_idx] # default the first agent to be trained

        # construct flattened observation
        obs = [
            *np.array(obs_training["board"]).reshape(-1),
            *np.array(obs_training["bomb_blast_strength"]).reshape(-1),
            *np.array(obs_training["bomb_life"]).reshape(-1),
            *np.array(obs_training["position"]).reshape(-1),
            obs_training["ammo"],
            obs_training["blast_strength"],
            obs_training["can_kick"],
            obs_training["teammate"].value,
            obs_training["enemies"][0].value,
            
            # uncommon if training 1 v 1
            obs_training["enemies"][0].value,
            obs_training["enemies"][0].value,
            
            # uncommon if training 2 v 2
#             obs_training["enemies"][1].value,
#             obs_training["enemies"][2].value,
        ]
        return obs
    
    def get_obs_raw(self):
        return self.obs_raw

    def step(self, action_training):
        action_nontraining = self.act(self.obs_raw)
        actions = [*action_nontraining, action_training]
        obs_raw, reward, done, info = super().step(actions)
        self.obs_raw = obs_raw
        return self._transform_obs(obs_raw), reward[self.training_idx], done, info
    
    def reset(self):
        obs_raw = super().reset()
        self.obs_raw = obs_raw
        return self._transform_obs(obs_raw)
    
    def render(self,
               mode=None,
               close=False,
               record_pngs_dir=None,
               record_json_dir=None,
               do_sleep=True):
        super().render(mode=mode,
                       close=close,
                       record_pngs_dir=record_pngs_dir,
                       record_json_dir=record_json_dir,
                       do_sleep=do_sleep)

In [4]:
# def team_v3_fast_env():
#     """Start up a FFA config with the default settings."""
#     env = CustomPomme
#     game_type = constants.GameType.Team
#     env_entry_point = 'CustomPomme'
#     env_id = 'PommeTeamFast-v3'
#     env_kwargs = {
#         'game_type': game_type,
#         'board_size': 8,
#         'num_rigid': 0,
#         'num_wood': 0,
#         'num_items': 0,
#         'max_steps': constants.MAX_STEPS,
#         'render_fps': 1000,
#         'env': env_entry_point,
#     }
#     agent = characters.Bomber
#     return locals()

# def one_vs_one_v3_env():
#     """Start up a FFA config with the default settings."""
#     env = CustomPomme
#     game_type = constants.GameType.OneVsOne
#     env_entry_point = 'CustomPomme'
#     env_id = 'PommeOneVsOneFast-v3'
#     env_kwargs = {
#         'game_type': game_type,
#         'board_size': 8,
#         'num_rigid': 0,
#         'num_wood': 0,
#         'num_items': 0,
#         'max_steps': constants.MAX_STEPS,
#         'render_fps': 1000,
#         'env': env_entry_point,
#     }
#     agent = characters.Bomber
#     return locals()

def one_vs_one_v4_env():
    """Start up a FFA config with the default settings."""
    env = CustomPomme
    game_type = constants.GameType.OneVsOne
    env_entry_point = 'CustomPomme'
    env_id = 'PommeOneVsOneFast-v4'
    env_kwargs = {
        'game_type': game_type,
        'board_size': 11,
        'free_board_size': 4,
        'num_rigid': 0,
        'num_wood': 0,
        'num_items': 0,
        'max_steps': constants.MAX_STEPS,
        'render_fps': 1000,
        'env': env_entry_point,
    }
    agent = characters.Bomber
    return locals()

In [5]:
# Instantiate the environment

config = one_vs_one_v4_env()
env_pom = CustomPomme(**config["env_kwargs"])

# config agents
agents = []

# Add simple agents
for agent_id in range(1):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
    
# add player agent(to train)
agents.append(PlayerAgent(config["agent"](1, config["game_type"])))

env_pom.set_agents(agents)
env_pom.set_training_agent(agents[1].agent_id)
env_pom.set_init_game_state(None)

# Seed and reset the environment
env_pom.seed(0)

[0]

In [6]:
# log function during training, implement if needed
def log(local_var, global_var):
    pass
#     display(local_var)
#     display(global_var)

In [None]:
import time
n_cpu = 2
env = DummyVecEnv([lambda: env_pom for i in range(n_cpu)])

model = PPO2(MlpPolicy, env, verbose=1, 
             n_steps = 3000, # batch_size = n_step * num_env
             ent_coef = 0.001, # entropy coefficient
             tensorboard_log="./ppo_pommerman_tensorboard/")
startTime = time.time()
model = model.learn(total_timesteps=4000000, # num_update = total_timesteps // batch_size
                    callback = log)
endTime = time.time()
elapsedTime = endTime - startTime
print(elapsedTime)
# model.save("ppo2_pommerman_20000_2")
# model = model.learn(total_timesteps=5000000, # num_update = total_timesteps // batch_size
#                     callback = log)
model.save("ppo2_pommerman_v4_4")

--------------------------------------
| approxkl           | 0.00021511447 |
| clipfrac           | 0.0           |
| explained_variance | -0.484        |
| fps                | 404           |
| n_updates          | 1             |
| policy_entropy     | 1.7915729     |
| policy_loss        | -0.0009018528 |
| serial_timesteps   | 3000          |
| time_elapsed       | 2.15e-06      |
| total_timesteps    | 6000          |
| value_loss         | 0.11618769    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0005002811  |
| clipfrac           | 0.0           |
| explained_variance | -0.227        |
| fps                | 406           |
| n_updates          | 2             |
| policy_entropy     | 1.7897758     |
| policy_loss        | -0.0012685414 |
| serial_timesteps   | 6000          |
| time_elapsed       | 14.9          |
| total_timesteps    | 12000         |
| value_loss         | 0.081075415   |
-------------------------

---------------------------------------
| approxkl           | 0.00077191653  |
| clipfrac           | 0.0            |
| explained_variance | 0.00917        |
| fps                | 434            |
| n_updates          | 18             |
| policy_entropy     | 1.4725542      |
| policy_loss        | -0.00049163174 |
| serial_timesteps   | 54000          |
| time_elapsed       | 244            |
| total_timesteps    | 108000         |
| value_loss         | 0.079081364    |
---------------------------------------
--------------------------------------
| approxkl           | 0.0028776783  |
| clipfrac           | 0.038458332   |
| explained_variance | -0.000838     |
| fps                | 435           |
| n_updates          | 19            |
| policy_entropy     | 1.4518452     |
| policy_loss        | -0.0023887653 |
| serial_timesteps   | 57000         |
| time_elapsed       | 258           |
| total_timesteps    | 114000        |
| value_loss         | 0.0720797     |
------------

--------------------------------------
| approxkl           | 0.00048358174 |
| clipfrac           | 0.0           |
| explained_variance | 0.0301        |
| fps                | 437           |
| n_updates          | 34            |
| policy_entropy     | 1.334669      |
| policy_loss        | -0.0005592419 |
| serial_timesteps   | 102000        |
| time_elapsed       | 466           |
| total_timesteps    | 204000        |
| value_loss         | 0.07963874    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0039963606  |
| clipfrac           | 0.05245833    |
| explained_variance | 0.0179        |
| fps                | 436           |
| n_updates          | 35            |
| policy_entropy     | 1.3178447     |
| policy_loss        | -0.0028403876 |
| serial_timesteps   | 105000        |
| time_elapsed       | 479           |
| total_timesteps    | 210000        |
| value_loss         | 0.074807614   |
-------------------------

--------------------------------------
| approxkl           | 0.0010948445  |
| clipfrac           | 0.00041666668 |
| explained_variance | 0.025         |
| fps                | 438           |
| n_updates          | 50            |
| policy_entropy     | 1.3423265     |
| policy_loss        | -0.0009754554 |
| serial_timesteps   | 150000        |
| time_elapsed       | 686           |
| total_timesteps    | 300000        |
| value_loss         | 0.07658155    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0015178178  |
| clipfrac           | 0.030083332   |
| explained_variance | 0.0514        |
| fps                | 437           |
| n_updates          | 51            |
| policy_entropy     | 1.3533132     |
| policy_loss        | -0.0012429515 |
| serial_timesteps   | 153000        |
| time_elapsed       | 699           |
| total_timesteps    | 306000        |
| value_loss         | 0.08401557    |
-------------------------

--------------------------------------
| approxkl           | 0.004042419   |
| clipfrac           | 0.032791667   |
| explained_variance | 0.0577        |
| fps                | 436           |
| n_updates          | 67            |
| policy_entropy     | 1.2495171     |
| policy_loss        | -0.0017528718 |
| serial_timesteps   | 201000        |
| time_elapsed       | 919           |
| total_timesteps    | 402000        |
| value_loss         | 0.079835884   |
--------------------------------------
-------------------------------------
| approxkl           | 0.0042106602 |
| clipfrac           | 0.029041668  |
| explained_variance | 0.0338       |
| fps                | 436          |
| n_updates          | 68           |
| policy_entropy     | 1.2715764    |
| policy_loss        | -0.002246154 |
| serial_timesteps   | 204000       |
| time_elapsed       | 933          |
| total_timesteps    | 408000       |
| value_loss         | 0.08106926   |
-------------------------------------

--------------------------------------
| approxkl           | 0.0009449359  |
| clipfrac           | 0.0018750001  |
| explained_variance | 0.0569        |
| fps                | 437           |
| n_updates          | 84            |
| policy_entropy     | 1.2205815     |
| policy_loss        | -0.0013241434 |
| serial_timesteps   | 252000        |
| time_elapsed       | 1.15e+03      |
| total_timesteps    | 504000        |
| value_loss         | 0.09736623    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0013790244  |
| clipfrac           | 0.001625      |
| explained_variance | 0.0738        |
| fps                | 437           |
| n_updates          | 85            |
| policy_entropy     | 1.2268263     |
| policy_loss        | -0.0008749147 |
| serial_timesteps   | 255000        |
| time_elapsed       | 1.17e+03      |
| total_timesteps    | 510000        |
| value_loss         | 0.087886356   |
-------------------------

---------------------------------------
| approxkl           | 0.0006184267   |
| clipfrac           | 4.1666666e-05  |
| explained_variance | 0.0756         |
| fps                | 440            |
| n_updates          | 101            |
| policy_entropy     | 1.07611        |
| policy_loss        | -0.00055688247 |
| serial_timesteps   | 303000         |
| time_elapsed       | 1.39e+03       |
| total_timesteps    | 606000         |
| value_loss         | 0.09984091     |
---------------------------------------
--------------------------------------
| approxkl           | 0.0008374504  |
| clipfrac           | 0.0016666667  |
| explained_variance | 0.0871        |
| fps                | 439           |
| n_updates          | 102           |
| policy_entropy     | 1.0708232     |
| policy_loss        | -0.0015378068 |
| serial_timesteps   | 306000        |
| time_elapsed       | 1.4e+03       |
| total_timesteps    | 612000        |
| value_loss         | 0.08740346    |
------------

---------------------------------------
| approxkl           | 0.0010540711   |
| clipfrac           | 0.001          |
| explained_variance | 0.0902         |
| fps                | 438            |
| n_updates          | 118            |
| policy_entropy     | 0.92464        |
| policy_loss        | -0.00042116316 |
| serial_timesteps   | 354000         |
| time_elapsed       | 1.62e+03       |
| total_timesteps    | 708000         |
| value_loss         | 0.084241316    |
---------------------------------------
--------------------------------------
| approxkl           | 0.0025597503  |
| clipfrac           | 0.023208335   |
| explained_variance | 0.11          |
| fps                | 438           |
| n_updates          | 119           |
| policy_entropy     | 0.89018345    |
| policy_loss        | -0.0017130965 |
| serial_timesteps   | 357000        |
| time_elapsed       | 1.63e+03      |
| total_timesteps    | 714000        |
| value_loss         | 0.08802465    |
------------

--------------------------------------
| approxkl           | 0.0013527492  |
| clipfrac           | 0.005791666   |
| explained_variance | 0.125         |
| fps                | 437           |
| n_updates          | 135           |
| policy_entropy     | 0.71803916    |
| policy_loss        | -0.0010188927 |
| serial_timesteps   | 405000        |
| time_elapsed       | 1.85e+03      |
| total_timesteps    | 810000        |
| value_loss         | 0.08979756    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0013432545  |
| clipfrac           | 0.0060416665  |
| explained_variance | 0.109         |
| fps                | 437           |
| n_updates          | 136           |
| policy_entropy     | 0.6932734     |
| policy_loss        | -0.0015315295 |
| serial_timesteps   | 408000        |
| time_elapsed       | 1.87e+03      |
| total_timesteps    | 816000        |
| value_loss         | 0.08992804    |
-------------------------

--------------------------------------
| approxkl           | 0.00047955083 |
| clipfrac           | 0.0031250003  |
| explained_variance | 0.123         |
| fps                | 402           |
| n_updates          | 152           |
| policy_entropy     | 0.572739      |
| policy_loss        | -0.0011398988 |
| serial_timesteps   | 456000        |
| time_elapsed       | 2.1e+03       |
| total_timesteps    | 912000        |
| value_loss         | 0.08740221    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0008109651  |
| clipfrac           | 0.006583333   |
| explained_variance | 0.155         |
| fps                | 401           |
| n_updates          | 153           |
| policy_entropy     | 0.5603754     |
| policy_loss        | -0.0012352165 |
| serial_timesteps   | 459000        |
| time_elapsed       | 2.11e+03      |
| total_timesteps    | 918000        |
| value_loss         | 0.090317525   |
-------------------------

--------------------------------------
| approxkl           | 0.0008300486  |
| clipfrac           | 0.0047916663  |
| explained_variance | 0.149         |
| fps                | 431           |
| n_updates          | 169           |
| policy_entropy     | 0.44024312    |
| policy_loss        | -0.0013246641 |
| serial_timesteps   | 507000        |
| time_elapsed       | 2.35e+03      |
| total_timesteps    | 1014000       |
| value_loss         | 0.09064502    |
--------------------------------------
---------------------------------------
| approxkl           | 0.0005316943   |
| clipfrac           | 0.003          |
| explained_variance | 0.188          |
| fps                | 424            |
| n_updates          | 170            |
| policy_entropy     | 0.45071593     |
| policy_loss        | -0.00077502057 |
| serial_timesteps   | 510000         |
| time_elapsed       | 2.36e+03       |
| total_timesteps    | 1020000        |
| value_loss         | 0.091152266    |
-------------

--------------------------------------
| approxkl           | 0.0008778226  |
| clipfrac           | 0.00875       |
| explained_variance | 0.205         |
| fps                | 411           |
| n_updates          | 186           |
| policy_entropy     | 0.35747808    |
| policy_loss        | -0.0012891327 |
| serial_timesteps   | 558000        |
| time_elapsed       | 2.59e+03      |
| total_timesteps    | 1116000       |
| value_loss         | 0.08637524    |
--------------------------------------
--------------------------------------
| approxkl           | 0.00029410058 |
| clipfrac           | 0.0019166665  |
| explained_variance | 0.191         |
| fps                | 403           |
| n_updates          | 187           |
| policy_entropy     | 0.3623139     |
| policy_loss        | -0.0011716222 |
| serial_timesteps   | 561000        |
| time_elapsed       | 2.61e+03      |
| total_timesteps    | 1122000       |
| value_loss         | 0.08193247    |
-------------------------

In [8]:
# del model # remove to demonstrate saving and loading
model = PPO2.load("ppo2_pommerman_v4_4")

n_cpu = 1
env = DummyVecEnv([lambda: env_pom for i in range(n_cpu)])
model.envs = env

# test the learned model
num_win = 0
num_tie = 0
num_lose = 0
total = 50 # number of playouts
for i_episode in range(total):
    obs = env.reset()
    done = False
    info = None
    while not done:
        env.render()
        action_training, _states = model.predict(obs)
#         print(action_training)
        obs, rewards, dones, infos = env.step(action_training)
#         print(infos)
        done = dones[0]
        info = infos[0]
        time.sleep(0.1)
    print('Episode {} finished'.format(i_episode))
    if(info["result"].value == 0):
        if(1 in info["winners"]):
            num_win+=1
        else:
            num_lose+=1
    elif(info["result"].value == 2):
        num_tie+=1
#     print(info)
env.close()
print("Win ", num_win, "/", total, " games")
print("Tie ", num_tie, "/", total, " games")
print("Lose ", num_lose, "/", total, " games")


Loading a model without an environment, this model cannot be trained until it has a valid environment.
Episode 0 finished
Episode 1 finished
Episode 2 finished
Episode 3 finished
Episode 4 finished
Episode 5 finished
Episode 6 finished
Episode 7 finished
Episode 8 finished
Episode 9 finished
Win  2 / 10  games
Tie  1 / 10  games
Lose  7 / 10  games


# baseline example code

In [None]:
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import PPO2

# multiprocess environment
n_cpu = 1
env = DummyVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo2_cartpole")

del model # remove to demonstrate saving and loading

model = PPO2.load("ppo2_cartpole")

# Enjoy trained agent
obs = env.reset()



In [None]:
print(obs)
print(env.buf_obs[None].shape)
print(env.observation_space)

action, _states = model.predict(obs)
obs, rewards, dones, info = env.step(action)

print(obs)
print(rewards)
print(dones)
print(info)

# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     env.render()