# Installing the dependencies:

In [6]:
!git clone https://github.com/Near32/comaze-python.git ; cd comaze-python; git checkout develop-rl-template; git pull; git status; pip install -e .

fatal: destination path 'comaze-python' already exists and is not an empty directory.
Already on 'develop-rl-template'
Your branch is up to date with 'origin/develop-rl-template'.
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 17 (delta 8), reused 17 (delta 8), pack-reused 0[K
Unpacking objects: 100% (17/17), done.
From https://github.com/Near32/comaze-python
   52a41aa..4117798  develop-rl-template -> origin/develop-rl-template
Updating 52a41aa..4117798
Fast-forward
 comaze/agents/abstract_agent.py                    |  14 [32m++[m[31m-[m
 comaze/agents/rl/abstract_on_policy_rl_agent.py    |  54 [32m++++++[m[31m----[m
 comaze/agents/rl/simple_on_policy_rl_agent.py      |  11 [32m+[m[31m-[m
 comaze/env/comaze.py                               |  24 [32m+++[m[31m--[m
 setup.py                                           |  17 [32m++[m[31m--[m
 .../rl/test_trainin

# Before continuing any further, please restart the kernel (Runtime->restart runtime) in order to make the installed packaged available.

# Create a simple On-Policy RL Agent:


---



In [4]:
from typing import Any
from typing import Dict
from typing import List
from typing import Callable
from typing import Optional

import numpy as np 

import gym
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch import distributions 

from comaze.agents.rl import AbstractOnPolicyRLAgent
from comaze.agents.utils import dict_encoded_pov_avail_moves_extract_exp_fn, discrete_direction_only_format_move_fn


class SimpleOnPolicyRLAgent(AbstractOnPolicyRLAgent):
  """
  Simple on-policy RL agents using PyTorch.
  
  Call init_rl_algo at the end of the init function.

  The output of select_action must be a dictionnary containing:
    - "action": the actual action that needs to be transformed 
                using the format_move_fn function.
    - "log_prob_action": the log likelihood over the action
                          distribution. 
  
  Note the default extract_exp_fn and format_move_fn functions.
  They are the minimum to allow any learning to take place.

  As AbstractAgent requests it, you also need to implement:
    - agent_id: Agent's unique id.
    - select_action: Agent's action selection logic.
  """

  def __init__(
    self, 
    learning_rate: float=1e-4,
    discount_factor: float=0.99,
    num_actions: int=5,
    pov_shape: List[int]=[7,7,12],
    agent_order: int=0, 
    environment: Optional[gym.Env]=None, 
    ) -> None:
    """
    Initializes the agent.
    """
    nn.Module.__init__(self=self)
    AbstractOnPolicyRLAgent.__init__(
      self=self,
      extract_exp_fn=dict_encoded_pov_avail_moves_extract_exp_fn, 
      format_move_fn=discrete_direction_only_format_move_fn,
      learning_rate=learning_rate,
      discount_factor=discount_factor,
      agent_order=agent_order,
      environment=environment,
    )

    self.num_actions = num_actions
    self.pov_shape = pov_shape
    self.build_agent()
    
    self.init_rl_algo()
  
  @property
  def agent_id(self) -> str:
    return "simple_onpolicy_rlagent"
  
  def build_agent(self):
    self.embed_pov_size = 256
    self.embed_pov = nn.Sequential(
      nn.Conv2d(in_channels=self.pov_shape[-1], out_channels=32, kernel_size=3, stride=1, padding=1),
      nn.ReLU(),
      nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2, padding=1),
      nn.ReLU(),
      nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
      nn.ReLU(),
      nn.Flatten(),
      nn.Linear(512, self.embed_pov_size),
      nn.ReLU(),
    )
    
    self.embed_action_size = 128
    self.embed_action_space = nn.Linear(self.num_actions, self.embed_action_size)
    
    policy_input_size = self.embed_pov_size+self.embed_action_size
    self.policy = nn.Linear(policy_input_size, self.num_actions)
  
  def get_formatted_inputs(self, obs):
    nobs = {}
    for k,v in obs.items():
      if 'pov' in k:
        # move channels around:
        assert len(v.shape)==3
        v = np.transpose(v, (2,0,1))
      nv = torch.from_numpy(v).unsqueeze(0).float()
      nobs[k] = nv
    return nobs

  def select_action(self, observation: Any) -> Dict[str, Any]:
    """
    Returns agent's action given `observation`.
    """

    obs = self.get_formatted_inputs(observation)

    pov_input = obs["encoded_pov"]
    action_space = obs["available_moves"]
    
    pov_emb = self.embed_pov(pov_input)
    action_emb = self.embed_action_space(action_space)
    
    pov_action_emb = torch.cat((pov_emb, action_emb), dim=1)
    action_pred = self.policy(pov_action_emb)
    
    action_prob = F.softmax(action_pred, dim = -1)  
    avail_action_prob = action_prob * obs["available_moves"]
    dist = distributions.Categorical(avail_action_prob)
    action = dist.sample()
    log_prob_action = dist.log_prob(action)

    action_dict = {
      "action": action.item(),
      "log_prob_action": log_prob_action
    }

    return action_dict

In [5]:
import random
from typing import Callable
import pandas as pd 

from functools import partial
from tqdm import tqdm 

from comaze.env import TwoPlayersCoMazeGym
from comaze.agents import AbstractAgent #, SimpleOnPolicyRLAgent


def two_players_environment_loop(
    agent1: AbstractAgent,
    agent2: AbstractAgent,
    environment,
    max_episode_length,
):
  """
  Loop runner for the environment.
  """

  # Setup environment.
  #environment = TwoPlayersCoMazeGym(**environment_kwargs)
  state = environment.reset()

  # Initialize agents.
  agent1.set_environment(environment=environment, agent_order=0)
  agent2.set_environment(environment=environment, agent_order=1)

  # Book-keeping.
  t = 0
  done = False
  trajectory = list()

  ebar = tqdm(total=max_episode_length, position=1)
  while not done and t<=max_episode_length:
    ebar.update(1)
    # Turn-based game.
    if t%2 == 0:
      move = agent1.select_move(state)
    else:
      move = agent2.select_move(state)
  
    # Progress simulation.
    next_state, reward, done, info = environment.step(move)

    # Used for logging.
    trajectory.append((t, state, move, reward, next_state, done, info))

    # Agent internals.
    """
    if t%2 == 0:
      agent1.update(move, next_state, reward, done)
    else:
      agent2.update(move, next_state, reward, done)
    """
    if t==max_episode_length:
      done = True
      reward = -1
    
    for agent in [agent1, agent2]:
      agent.update(move, next_state, reward, done)

    # Book-keeping.
    t = t + 1
    state = next_state
  

  # Dump logs.
  pd.DataFrame(trajectory).to_csv("{}-{}.csv".format(
      agent1.agent_id, agent2.agent_id)
  )


## Let us test the agent:

In [7]:
agent1 = SimpleOnPolicyRLAgent( 
  learning_rate=1e-4,
  discount_factor=0.99,
  num_actions=5,
  pov_shape=[7,7,12],
)

agent2 = SimpleOnPolicyRLAgent( 
  learning_rate=1e-4,
  discount_factor=0.99,
  num_actions=5,
  pov_shape=[7,7,12],
)

max_episode_length = 50
verbose = False 

environment_kwargs = {
    "level":"1",
    "verbose":verbose,
}
environment = TwoPlayersCoMazeGym(**environment_kwargs)

two_players_environment_loop(
    agent1=agent1,
    agent2=agent2,
    environment=environment,
    max_episode_length=max_episode_length,
)


  0%|          | 0/50 [00:00<?, ?it/s][A
  4%|▍         | 2/50 [00:00<00:02, 16.63it/s][A
  6%|▌         | 3/50 [00:00<00:03, 12.66it/s][A
  8%|▊         | 4/50 [00:00<00:04, 10.62it/s][A
 10%|█         | 5/50 [00:00<00:04,  9.76it/s][A
 12%|█▏        | 6/50 [00:00<00:04,  9.23it/s][A
 14%|█▍        | 7/50 [00:00<00:04,  8.81it/s][A
 16%|█▌        | 8/50 [00:00<00:04,  8.57it/s][A
 18%|█▊        | 9/50 [00:00<00:04,  8.38it/s][A
 20%|██        | 10/50 [00:01<00:04,  8.28it/s][A
 22%|██▏       | 11/50 [00:01<00:04,  8.26it/s][A
 24%|██▍       | 12/50 [00:01<00:04,  8.29it/s][A
 26%|██▌       | 13/50 [00:01<00:04,  8.32it/s][A
 28%|██▊       | 14/50 [00:01<00:04,  8.27it/s][A
 30%|███       | 15/50 [00:01<00:04,  8.25it/s][A
 32%|███▏      | 16/50 [00:01<00:04,  8.24it/s][A
 34%|███▍      | 17/50 [00:01<00:03,  8.26it/s][A
 36%|███▌      | 18/50 [00:02<00:03,  8.20it/s][A
 38%|███▊      | 19/50 [00:02<00:03,  8.20it/s][A
 40%|████      | 20/50 [00:02<00:03,  8.20it/s]

Loss -0.1445927619934082 :: EP reward -1
Loss 0.11999654769897461 :: EP reward -1


# Training a diad of SimpleOnPolicyRLAgent agents:

In [8]:
agent1 = SimpleOnPolicyRLAgent( 
  learning_rate=1e-4,
  discount_factor=0.99,
  num_actions=5,
  pov_shape=[7,7,12],
)

agent2 = SimpleOnPolicyRLAgent( 
  learning_rate=1e-4,
  discount_factor=0.99,
  num_actions=5,
  pov_shape=[7,7,12],
)

max_episode_length = 50
nbr_training_episodes = 1000
verbose = False 

tbar = tqdm(total=nbr_training_episodes, position=0)
for episode in range(nbr_training_episodes):
  tbar.update(1)
  environment_kwargs = {
      "level":"1",
      "verbose":verbose,
  }
  environment = TwoPlayersCoMazeGym(**environment_kwargs)

  two_players_environment_loop(
      agent1=agent1,
      agent2=agent2,
      environment=environment,
      max_episode_length=max_episode_length,
  )


  0%|          | 0/1000 [00:00<?, ?it/s]
  0%|          | 0/50 [00:00<?, ?it/s][A
  4%|▍         | 2/50 [00:00<00:02, 16.83it/s][A
  6%|▌         | 3/50 [00:00<00:03, 12.97it/s][A
  8%|▊         | 4/50 [00:00<00:04, 10.97it/s][A
 10%|█         | 5/50 [00:00<00:04,  9.83it/s][A
 12%|█▏        | 6/50 [00:00<00:04,  9.34it/s][A
 14%|█▍        | 7/50 [00:00<00:04,  8.93it/s][A
 16%|█▌        | 8/50 [00:00<00:04,  8.74it/s][A
 18%|█▊        | 9/50 [00:00<00:04,  8.63it/s][A
 20%|██        | 10/50 [00:01<00:04,  8.54it/s][A
 22%|██▏       | 11/50 [00:01<00:04,  8.38it/s][A
 24%|██▍       | 12/50 [00:01<00:04,  8.42it/s][A
 26%|██▌       | 13/50 [00:01<00:04,  8.32it/s][A
 28%|██▊       | 14/50 [00:01<00:04,  8.25it/s][A
 30%|███       | 15/50 [00:01<00:04,  8.13it/s][A
 32%|███▏      | 16/50 [00:01<00:04,  7.98it/s][A
 34%|███▍      | 17/50 [00:01<00:04,  8.04it/s][A
 36%|███▌      | 18/50 [00:02<00:03,  8.17it/s][A
 38%|███▊      | 19/50 [00:02<00:03,  8.23it/s][A
 40%|██

Loss 0.6091338992118835 :: EP reward -1
Loss 0.20674264430999756 :: EP reward -1



  0%|          | 0/50 [00:00<?, ?it/s][A
  4%|▍         | 2/50 [00:00<00:02, 17.52it/s][A
  6%|▌         | 3/50 [00:00<00:03, 13.08it/s][A
  8%|▊         | 4/50 [00:00<00:04, 11.20it/s][A
 10%|█         | 5/50 [00:00<00:04, 10.16it/s][A
 12%|█▏        | 6/50 [00:00<00:04,  9.13it/s][A
 14%|█▍        | 7/50 [00:00<00:04,  8.83it/s][A
 16%|█▌        | 8/50 [00:00<00:04,  8.66it/s][A
 18%|█▊        | 9/50 [00:00<00:04,  8.54it/s][A
 20%|██        | 10/50 [00:01<00:04,  8.42it/s][A
 22%|██▏       | 11/50 [00:01<00:04,  8.39it/s][A
 24%|██▍       | 12/50 [00:01<00:04,  8.24it/s][A
 26%|██▌       | 13/50 [00:01<00:04,  7.64it/s][A
 28%|██▊       | 14/50 [00:01<00:04,  7.80it/s][A
 30%|███       | 15/50 [00:01<00:04,  7.95it/s][A
 32%|███▏      | 16/50 [00:01<00:04,  8.05it/s][A
 34%|███▍      | 17/50 [00:01<00:04,  8.15it/s][A
 36%|███▌      | 18/50 [00:02<00:03,  8.10it/s][A
 38%|███▊      | 19/50 [00:02<00:03,  8.19it/s][A
 40%|████      | 20/50 [00:03<00:12,  2.37it/s]

Loss -0.6764953136444092 :: EP reward -1


RuntimeError: ignored