In [1]:
import os
import sys
import numpy as np

In [2]:
#!git clone https://github.com/MultiAgentLearning/playground
!git clone https://PedFox:mads60612201@github.com/esbenlkruse/Pommerman

os.chdir('Pommerman')
os.chdir('playground')

!python setup.py install

Cloning into 'Pommerman'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 109 (delta 10), reused 103 (delta 9), pack-reused 0[K
Receiving objects: 100% (109/109), 1.12 MiB | 2.46 MiB/s, done.
Resolving deltas: 100% (10/10), done.
  File "setup.py", line 14
    """.format(*CURRENT_PYTHON, *MIN_PYTHON))
                                ^
SyntaxError: invalid syntax


In [None]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision --user
import torch

# install JSAnimation
#! pip install git+https://github.com/jakevdp/JSAnimation.git
  
# install gym
! pip install git+https://github.com/openai/gym --user

[31mtorch-0.4.1-cp36-cp36m-linux_x86_64.whl is not a supported wheel on this platform.[0m


In [None]:
# Confirm that gym is importable from this kernel; a package installed during
# the current session may only become visible after a restart.
try:
    import gym
except ImportError:
    print("Please restart and run all.")

In [None]:
# Run the `pom_battle` command in a subshell.
# NOTE(review): presumably the console script installed together with the
# pommerman package above; confirm it is on PATH for this kernel.
!pom_battle

# This is where the "Pommerman Demo" starts.

This notebook demonstrates how to train Pommerman agents. Please let us know at support@pommerman.com if you run into any issues.

In [None]:
import os
import sys
import numpy as np

from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent
from pommerman.configs import ffa_v0_fast_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility

# Random agents

The following code instantiates the environment with four random agents, which take actions until the game is finished. (This will be a quick game.)

In [None]:
# Instantiate the environment
# `config` is the mapping returned by ffa_v0_fast_env(); later cells also read
# its "agent" and "game_type" entries when building agents.
config = ffa_v0_fast_env()
# The "env_kwargs" entry is unpacked as keyword arguments for the Pomme env.
env = Pomme(**config["env_kwargs"])

In [None]:
# Build four RandomAgent players keyed by agent id (0-3); each one wraps the
# character object produced by the config's "agent" factory.
agents = {
    agent_id: RandomAgent(config["agent"](agent_id, config["game_type"]))
    for agent_id in range(4)
}

# Register the players with the environment and clear any preset game state.
env.set_agents(list(agents.values()))
env.set_init_game_state(None)

In [None]:
# Seed the environment and start a fresh game
env.seed(0)
obs = env.reset()

# Keep stepping the registered agents until the environment reports `done`
done = False
while True:
    #env.render()
    actions = env.act(obs)
    print(actions)
    obs, reward, done, info = env.step(actions)
    if done:
        break
#env.render(close=True)
env.close()

#print(info)

In [None]:
import pommerman.characters

# Running our own Code

Trying to understand the environment (`env`).

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym

In [None]:
# Instantiate the environment
# This rebinds `config` and `env`, replacing the objects created for the
# random-agent demo; agents are attached to the new env in a later cell.
config = ffa_v0_fast_env()
# OwnAgent reads env.action_space.n from this global when it is constructed.
env = Pomme(**config["env_kwargs"])

In [None]:
class QNetwork(nn.Module):
    """Linear Q-function approximator that owns its SGD optimizer.

    The model is a single bias-free linear layer mapping ``n_inputs`` features
    to ``n_outputs`` action values, with weights drawn uniformly from
    [0, 0.01].
    """

    def __init__(self, n_inputs, n_outputs, learning_rate):
        super().__init__()
        # network: one linear map, no bias term
        layer = nn.Linear(n_inputs, n_outputs, bias=False)
        nn.init.uniform_(layer.weight, 0, 0.01)
        self.out = layer
        # training: plain SGD over this module's parameters
        self.optimizer = optim.SGD(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return the action values for the feature rows in ``x``."""
        return self.out(x)

    def loss(self, q_outputs, q_targets):
        """Return the sum of squared differences between targets and outputs."""
        residual = q_targets - q_outputs
        return (residual ** 2).sum()
      
def one_hot(i, l):
    """One-hot encode a sequence of integer state indices.

    Args:
      i: sequence of integer indices, one per row of the result.
      l: width of the encoding (number of distinct index values).

    Returns:
      A float array of shape ``(len(i), l)`` with a single 1 per row, placed
      in the column given by the corresponding entry of ``i``.
    """
    n_rows = len(i)
    encoded = np.zeros((n_rows, l))
    encoded[np.arange(n_rows), i] = 1
    return encoded

In [None]:
'''This is the base abstraction for agents in pommerman.
All agents should inherit from this class'''


class BaseAgent:
    """Parent abstract Agent.

    An agent wraps a character object and forwards every attribute it does
    not define itself to that character (see ``__getattr__``). Subclasses
    must implement ``act``.
    """

    def __init__(self, character=Bomber):
        # Either an already-built character or a factory that ``init_agent``
        # later calls as ``character(id, game_type)``.
        self._character = character

    def __getattr__(self, attr):
        """Forward lookups of attributes missing on the agent to the character.

        Raises:
          AttributeError: if ``_character`` itself has not been set yet.
        """
        # __getattr__ only runs when normal lookup fails. If `_character` is
        # the missing name (e.g. while the object is being copied/unpickled,
        # before __init__ has run), reading self._character below would call
        # __getattr__ again and recurse until RecursionError.
        if attr == '_character':
            raise AttributeError(attr)
        return getattr(self._character, attr)

    def act(self, obs, action_space):
        """Choose an action for observation ``obs``; subclasses must override."""
        raise NotImplementedError()

    def episode_end(self, reward):
        """This is called at the end of the episode to let the agent know that
        the episode has ended and what is the reward.

        Args:
          reward: The single reward scalar to this agent.
        """
        pass

    def init_agent(self, id, game_type):
        """Replace the stored character factory with ``factory(id, game_type)``."""
        self._character = self._character(id, game_type)

    @staticmethod
    def has_user_input():
        """Return False: this base agent does not take user input."""
        return False

    def shutdown(self):
        """Hook for releasing resources; the base implementation does nothing."""
        pass


In [None]:
# Add own agent

class OwnAgent(BaseAgent):
    """Epsilon-greedy Q-learning agent backed by a linear ``QNetwork``.

    Every observation is reduced to ONE feature row: the fraction of board
    cells holding each tile value (column-wise mean of the per-cell one-hot
    matrix). The network therefore produces exactly one Q-value per action.
    Feeding the per-cell matrix straight into the network, as before, gave a
    (cells x actions) output, so an unqualified ``argmax()`` returned an index
    into the flattened tensor that is not a valid action, and the target only
    ever touched the first board cell.

    Relies on notebook-level names: ``QNetwork``, ``one_hot``, ``epsilon``
    (exploration rate maintained by the training loop) and, only as fallbacks,
    ``env`` and ``done``.
    """

    def __init__(self, character=Bomber, n_inputs=14, n_outputs=None,
                 learning_rate=0.1, gamma=0.99):
        """Build the agent and its Q-network.

        Args:
          character: character object (or factory) wrapped by this agent.
          n_inputs: one-hot width, i.e. number of distinct board tile values.
            NOTE(review): 14 is assumed to cover every value in obs['board'].
          n_outputs: number of actions; defaults to ``env.action_space.n``.
          learning_rate: SGD step size of the Q-network.
          gamma: discount rate.
        """
        self._character = character
        self.n_inputs = n_inputs
        self.n_outputs = env.action_space.n if n_outputs is None else n_outputs
        self.learning_rate = learning_rate
        self.gamma = gamma # discount rate

        self.qnet = QNetwork(self.n_inputs, self.n_outputs, self.learning_rate)

        self.Q = 0                  # placeholder until the first act()
        self._last_features = None  # features of the state last acted on

    def _encode(self, obs):
        """Return a (1, n_inputs) float tensor of tile-value frequencies."""
        cells = np.asarray(obs['board']).astype(int).ravel()
        per_cell = one_hot(cells, self.n_inputs)
        # Mean (not sum) keeps the feature scale independent of board size.
        return torch.from_numpy(per_cell.mean(axis=0, keepdims=True)).float()

    def act(self, obs, action_space):
        """Pick an action for ``obs`` with an epsilon-greedy policy.

        Args:
          obs: observation dict containing a 'board' array.
          action_space: space to sample exploratory actions from; falls back
            to ``env.action_space`` when None.

        Returns:
          The chosen action as an int.
        """
        # 1. forward pass of the current state: Q-values for all actions
        self._last_features = self._encode(obs)
        with torch.no_grad():
            self.Q = self.qnet(self._last_features)

        # 2. select action with epsilon-greedy strategy
        if np.random.rand() > epsilon:
            return self.Q.argmax(dim=1).item()
        space = env.action_space if action_space is None else action_space
        return space.sample()

    def afterstep(self, s1, r, a, done=None):
        """Run one Q-learning update for the last state this agent acted on.

        Args:
          s1: next observation dict for this agent.
          r: reward received by this agent for the step.
          a: action index that was executed.
          done: whether the episode ended; when omitted, the notebook-level
            ``done`` variable is used (False if it does not exist).

        Returns:
          The loss as a scalar tensor (zero if ``act`` was never called).
        """
        if done is None:
            done = globals().get('done', False)
        if self._last_features is None:
            return torch.zeros(())

        # 3. forward pass for the next state (no gradient needed)
        with torch.no_grad():
            q_next = self.qnet(self._encode(s1))

        # Recompute the prediction with the current weights so the graph is
        # fresh even when act() was skipped for this step.
        q_pred = self.qnet(self._last_features)

        # 4. set Q-target: only the executed action's entry differs
        q_target = q_pred.detach().clone()
        q_target[0, a] = r + self.gamma * q_next.max().item() * (not done)

        # 5. update network weights; clear gradients here so repeated calls
        # without an intervening act() cannot accumulate them
        self.qnet.optimizer.zero_grad()
        loss = self.qnet.loss(q_pred, q_target)
        loss.backward()
        self.qnet.optimizer.step()

        return loss

In [None]:
# Three SimpleAgent opponents (ids 0-2) plus our own learning agent (id 3)
agents = {
    agent_id: SimpleAgent(config["agent"](agent_id, config["game_type"]))
    for agent_id in range(3)
}
agents[3] = OwnAgent(config["agent"](3, config["game_type"]))

# Register the players with the environment and clear any preset game state.
env.set_agents(list(agents.values()))
env.set_init_game_state(None)

In [None]:
## Seed and reset the environment
#env.seed(0)
#obs = env.reset()
#
## Run the random agents until we're done
#done = False
#while not done:
#    #env.render()
#    actions = env.act(obs)
#    obs, reward, done, info = env.step(actions)
##env.render(close=True)
#env.close()
#
#print(info)'''

In [None]:
# train Q-network

num_episodes = 1000
episode_limit = 100

val_freq = 100 # validation frequency
own_id = 3     # index of OwnAgent in `agents` and in the per-agent env outputs

try:
    # `epsilon` and `done` are read as notebook globals by OwnAgent, so both
    # names must stay exactly as they are.
    epsilon = 1.0
    rewards, lengths, losses, epsilons = [], [], [], []
    print('start training')
    for i in range(num_episodes):
        # init new episode
        s, ep_reward, ep_loss = env.reset(), 0, 0

        for j in range(episode_limit):
            # every agent picks its action, then the env advances one step
            actions = env.act(s)
            s1, r_tot, done, _ = env.step(actions)
            r = r_tot[own_id]

            # one Q-learning update for our agent on this transition
            loss = agents[own_id].afterstep(s1[own_id], r, actions[own_id])

            # 6. bookkeeping
            s = s1

            ep_loss += loss.item()
            if done:
                break

        # only the reward of the episode's final step is recorded
        ep_reward += r

        # bookkeeping
        epsilon *= num_episodes/(i/(num_episodes/20)+num_episodes) # decrease epsilon
        epsilons.append(epsilon)
        rewards.append(ep_reward)
        lengths.append(j+1)
        losses.append(ep_loss)
        if (i+1) % val_freq == 0:
            print('{:5d} mean training reward: {:5.2f}'.format(i+1, np.mean(rewards[-val_freq:])))
    print('done')
except KeyboardInterrupt:
    print('interrupt')

In [None]:
# plot results

def moving_average(a, n=10):
    """Trailing moving average of ``a`` with the same length as the input.

    Entry ``k`` is the mean of the last ``min(k + 1, n)`` values up to and
    including ``a[k]``. The first ``n - 1`` entries are divided by the number
    of values actually in the window; dividing them by ``n`` (as before)
    biased the start of the curve toward zero.

    Args:
      a: sequence of numbers.
      n: window length, at least 1.

    Returns:
      A 1-D float array with one entry per element of ``a``.

    Raises:
      ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('window length n must be at least 1')
    ret = np.cumsum(a, dtype=float)
    # Turn running totals into sums over the last n values.
    ret[n:] = ret[n:] - ret[:-n]
    # Number of values contributing to each entry: 1, 2, ..., n, n, n, ...
    window = np.minimum(np.arange(1, ret.size + 1), n)
    return ret / window

# Plot against the number of episodes actually recorded: training may have
# been interrupted, in which case the series are shorter than num_episodes and
# plotting them against range(1, num_episodes+1) raises a shape error.
episodes = range(1, len(rewards) + 1)

# (title, series, whether to overlay the moving average)
panels = [
    ('training rewards', rewards, True),
    ('training lengths', lengths, True),
    ('training loss', losses, True),
    ('epsilon', epsilons, False),
]

fig, axes = plt.subplots(len(panels), 1, figsize=(16, 9))
for ax, (title, series, smooth) in zip(axes, panels):
    ax.set_title(title)
    ax.plot(episodes, series)
    if smooth:
        ax.plot(episodes, moving_average(series))
    # keep the planned horizon on the x-axis so partial runs are visible
    ax.set_xlim([0, num_episodes])
fig.tight_layout()
plt.show()