

In this env, we add a new ability: a ranged (special) attack.
- The attack is valid if the distance between the two agents is <= attack_distance. It never misses and always takes 1 health from the opponent.
- The attack has a cooldown of `cool_down` turns.
- Each player can see the other player's cooldown status.


In [47]:
import random
import numpy as np
from gym.spaces import Discrete,Tuple, Box

from ray.rllib.env.multi_agent_env import MultiAgentEnv

class LOREnv2(MultiAgentEnv):
    """Two-player, turn-based grid environment ("league of rookie", setup 2).

    The game is played on a ``space_size_n`` x ``space_size_n`` grid on which
    the two players always occupy distinct cells. Players act alternately
    (player1 first); every call to :meth:`step` carries exactly one player's
    action, which is one of:

    - Move one cell in one of the four directions.
    - Attack: only hits when the players are adjacent, and even then misses
      with probability ``attack_miss_rate``.
    - Special attack: never misses, but is only valid while the two players
      are within ``special_attack_distance`` of each other and the attacker's
      cooldown is 0.

    Each observation is a tuple of four length-2 integer arrays:

    - position (x, y) of the observing player
    - position (x, y) of the opponent
    - health of (self, opponent)
    - special-attack cooldown of (self, opponent)
    """

    # all the actions
    MOVEUP = 0
    MOVEDOWN = 1
    MOVELEFT = 2
    MOVERIGHT = 3
    ATTACK = 4
    SPECIALATTACK = 5

    action_string = {
        MOVEUP: "MoveUp",
        MOVEDOWN: "MoveDown",
        MOVELEFT: "MoveLeft",
        MOVERIGHT: "MoveRight",
        ATTACK: "Attack",
        SPECIALATTACK: "SpecialAttack",
    }

    # max health to start with
    max_health = 3

    # space is of size n x n
    # (0, 0) is at the top left corner
    # x represents the vertical direction
    # y represents the horizontal direction
    space_size_n = 4

    # miss rate of the normal attack (the special attack never misses)
    attack_miss_rate = 0.2

    # health removed by each successful attack (normal or special)
    attak_power = 1

    # reward for winning a game
    game_award = 100

    # reward for an invalid move or an invalid special attack
    invalid_action_penalty = -10

    # for special attack
    special_attack_cool_down = 3
    special_attack_distance = 2

    def generate_init_pos(self):
        """Return two distinct random ``[x, y]`` start cells (player1, player2)."""
        n = LOREnv2.space_size_n
        player1_init_pos = [random.randrange(n), random.randrange(n)]
        player2_init_pos = [random.randrange(n), random.randrange(n)]

        # re-draw player2 until the two players no longer share a cell
        while player1_init_pos == player2_init_pos:
            player2_init_pos = [random.randrange(n), random.randrange(n)]

        return player1_init_pos, player2_init_pos

    def __init__(self, config):
        """Create the spaces and the initial game state.

        Args:
            config: environment config handed in by the caller; it is not
                read by this environment.
        """
        self.action_space = Discrete(len(LOREnv2.action_string))

        # the observation is a tuple of four int arrays of length 2:
        # (self position, opponent position, healths, cooldowns)
        self.observation_space = Tuple(
            [
                # self position in x/y
                Box(low=0, high=LOREnv2.space_size_n - 1, shape=(2, ), dtype=np.int16),
                # opponent position in x/y
                Box(low=0, high=LOREnv2.space_size_n - 1, shape=(2, ), dtype=np.int16),
                # self health and opponent health
                Box(low=0, high=LOREnv2.max_health, shape=(2, ), dtype=np.int16),
                # self special attack cool down and opponent's cool down
                Box(low=0, high=LOREnv2.special_attack_cool_down, shape=(2, ), dtype=np.int16),
            ]
        )

        self.player1 = "player1"
        self.player2 = "player2"

        self._reset_state()

        # For test-case inspections (compare both players' scores).
        # Scores accumulate across episodes, so they are not part of the
        # per-episode state that reset() re-initialises.
        self.player1_score = self.player2_score = 0

    def _reset_state(self):
        """(Re)initialise all per-episode state; shared by __init__ and reset."""
        self.player1_init_pos, self.player2_init_pos = self.generate_init_pos()

        # Copy the start cells so that moving a player does not overwrite the
        # recorded initial positions.
        self.position = {
            self.player1: list(self.player1_init_pos),
            self.player2: list(self.player2_init_pos),
        }

        self.health = {
            self.player1: LOREnv2.max_health,
            self.player2: LOREnv2.max_health,
        }

        self.special_attack_cd = {
            self.player1: 0,
            self.player2: 0,
        }

        self.turn_reward_player1 = 0
        self.turn_reward_player2 = 0

        # reward the previous actor earned with its own action (see step)
        self.last_reward = 0

    def _observe(self, viewer, other):
        """Build the observation tuple from ``viewer``'s point of view.

        The arrays use int16 so that they match the dtype declared in
        ``observation_space``.
        """
        return tuple(
            [
                np.array(self.position[viewer], dtype=np.int16),
                np.array(self.position[other], dtype=np.int16),
                np.array([self.health[viewer], self.health[other]], dtype=np.int16),
                np.array(
                    [self.special_attack_cd[viewer], self.special_attack_cd[other]],
                    dtype=np.int16,
                ),
            ]
        )

    # reset the env
    # return the initial observation
    # the player1 always take action first
    def reset(self):
        """Start a new episode and return ``{player1: observation}``.

        Only player1 receives an observation because player1 acts first.
        """
        self._reset_state()
        return {self.player1: self._observe(self.player1, self.player2)}

    def move_agent(self, player, opponent, action):
        """Try to move ``player`` one cell according to ``action``.

        Returns:
            ``invalid_action_penalty`` if the move would leave the grid or
            step onto the opponent's cell (the position is left unchanged),
            otherwise 0. A player without health does not move, and a
            non-move action is a no-op; both return 0.
        """
        if self.health[player] <= 0:  # no health no action
            return 0

        if action == LOREnv2.MOVEUP or action == LOREnv2.MOVEDOWN:
            new_x = self.position[player][0] + (1 if action == LOREnv2.MOVEDOWN else -1)
            if new_x < 0 or new_x >= LOREnv2.space_size_n:  # off the grid
                return self.invalid_action_penalty
            if (self.position[opponent][0] == new_x
                    and self.position[opponent][1] == self.position[player][1]):
                return self.invalid_action_penalty  # cell taken by the opponent
            self.position[player][0] = new_x
            return 0

        if action == LOREnv2.MOVELEFT or action == LOREnv2.MOVERIGHT:
            new_y = self.position[player][1] + (1 if action == LOREnv2.MOVERIGHT else -1)
            if new_y < 0 or new_y >= LOREnv2.space_size_n:  # off the grid
                return self.invalid_action_penalty
            if (self.position[opponent][1] == new_y
                    and self.position[opponent][0] == self.position[player][0]):
                return self.invalid_action_penalty  # cell taken by the opponent
            self.position[player][1] = new_y
            return 0

        return 0

    def is_adjacent(self):
        """Return True if the players share a row or column and are at most 1 cell apart."""
        x1, y1 = self.position[self.player1]
        x2, y2 = self.position[self.player2]
        return (x1 == x2 and abs(y1 - y2) <= 1) or (y1 == y2 and abs(x1 - x2) <= 1)

    def is_in_distance(self, distance):
        """Return True if the Euclidean distance between the players is <= ``distance``."""
        dx = self.position[self.player1][0] - self.position[self.player2][0]
        dy = self.position[self.player1][1] - self.position[self.player2][1]

        # compare squared values to stay in integer arithmetic
        return dx * dx + dy * dy <= distance * distance

    def take_action(self, player, opponent, action):
        """Apply ``player``'s action and return the reward it earns.

        - Attack: ``attak_power`` on a hit, 0 on a miss or when not adjacent.
        - Special attack: ``attak_power`` when valid (and the cooldown is
          restarted); ``invalid_action_penalty`` while on cooldown or out of
          range.
        - Move: result of :meth:`move_agent`.

        If the action ends the game the reward is replaced by
        ``+game_award`` (player alive, opponent dead), ``-game_award``
        (player dead, opponent alive) or 0 (both dead).
        """
        reward = 0

        if action == LOREnv2.ATTACK:
            # the miss roll is only drawn when the players are adjacent
            missed = (not self.is_adjacent()) or random.random() < LOREnv2.attack_miss_rate
            hit = 0 if missed else 1

            # never let health drop below the lower bound of the observation space
            self.health[opponent] = max(0, self.health[opponent] - hit * LOREnv2.attak_power)
            reward = hit * LOREnv2.attak_power
        elif action == LOREnv2.SPECIALATTACK:
            on_cooldown = self.special_attack_cd[player] > 0
            if on_cooldown or not self.is_in_distance(LOREnv2.special_attack_distance):
                # invalid, cannot use special attack
                reward = self.invalid_action_penalty
            else:
                self.health[opponent] = max(0, self.health[opponent] - LOREnv2.attak_power)
                reward = LOREnv2.attak_power

                # restart the cooldown
                self.special_attack_cd[player] = LOREnv2.special_attack_cool_down
        else:  # move
            reward = self.move_agent(player, opponent, action)

        # check health: a finished game overrides the action reward
        player_alive = self.health[player] > 0
        opponent_alive = self.health[opponent] > 0
        if not player_alive and opponent_alive:
            reward = -1 * LOREnv2.game_award
        elif player_alive and not opponent_alive:
            reward = LOREnv2.game_award
        elif not player_alive and not opponent_alive:
            reward = 0

        return reward

    # update state and observation based on the single action of this turn
    def step(self, action_dict):
        """Apply the one action in ``action_dict`` and advance the turn.

        Args:
            action_dict: ``{agent_id: action}`` with exactly one entry, for
                the player whose turn it is.

        Returns:
            ``(obs, rew, done, info)``. ``obs`` and ``rew`` are keyed by the
            opponent, i.e. the player who acts next. The opponent's reward is
            the reward of its own previous action minus the reward the acting
            player just earned. When the game is over both dicts also contain
            the acting player. ``done`` only has the ``"__all__"`` key and
            ``info`` is empty.
        """
        # only one action each turn
        assert len(action_dict) == 1, action_dict

        if self.player1 in action_dict:
            player, opponent = self.player1, self.player2
        else:
            player, opponent = self.player2, self.player1

        # the acting player's cooldown ticks down at the start of its own turn
        if self.special_attack_cd[player] > 0:
            self.special_attack_cd[player] -= 1

        # take action
        reward = self.take_action(player, opponent, action_dict[player])

        # the opponent observes the new state because it acts next
        obs = {opponent: self._observe(opponent, player)}

        # get the reward
        rew = {opponent: -1 * reward + self.last_reward}
        self.last_reward = reward

        game_over = self.health[self.player1] <= 0 or self.health[self.player2] <= 0
        done = {"__all__": game_over}

        # it is required that when done["__all__"] == True, the obv/rew should include all live agent
        if game_over:
            obs[player] = self._observe(player, opponent)
            rew[player] = reward

        if self.health[self.player2] <= 0:
            self.player1_score += 1
        elif self.health[self.player1] <= 0:
            self.player2_score += 1

        return obs, rew, done, {}
    


In [48]:
import random
from ray.rllib.policy.policy import Policy

class LORHeuristicReckLessOrCautious(Policy):
    """Hand-written opponent that mixes a cautious and a reckless strategy.

    For every observation the reckless strategy is chosen with probability
    ``reckless_prob`` and the cautious one otherwise.

    cautious

    if self.health > 1:
        if self and opponent are adjacent:
            attack
        elif can use special attack and within range:
            special attack
        else:
            move towards the opponent
    else:
        if self and opponent are adjacent:
            move away from the opponent
        elif can use special attack and within range:
            special attack
        else:
            attack

    reckless

    if self and opponent are adjacent:
        attack
    elif can use special attack and within range:
        special attack
    else:
        move towards the opponent (x or y axis first, chosen at random)

    Every observation is indexed as a flat sequence:
    (self.x, self.y, opponent.x, opponent.y, self.health, opponent.health,
    self.cooldown, opponent.cooldown).
    """

    # Probability of playing the reckless strategy for one observation.
    # The default of 0 means the policy always plays cautiously.
    reckless_prob = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    def can_use_special_attack(self, obv, cd):
        """Return True if the opponent is in special-attack range and ``cd`` has run out."""
        dx = obv[0] - obv[2]
        dy = obv[1] - obv[3]
        in_range = dx * dx + dy * dy <= LOREnv2.special_attack_distance * LOREnv2.special_attack_distance
        return in_range and cd <= 0

    @staticmethod
    def _is_adjacent(self_x, self_y, op_x, op_y):
        """Return True if both cells share a row or column and are at most 1 apart."""
        return (self_x == op_x and abs(self_y - op_y) <= 1) or \
            (self_y == op_y and abs(self_x - op_x) <= 1)

    @staticmethod
    def _move_towards(self_x, self_y, op_x, op_y, x_first=True):
        """Return the move that closes the gap on one axis.

        The x axis is tried first when ``x_first`` is true, otherwise the y
        axis; the other axis is used only when the preferred one is already
        aligned with the opponent.
        """
        step_x = LOREnv2.MOVEUP if self_x > op_x else LOREnv2.MOVEDOWN
        step_y = LOREnv2.MOVELEFT if self_y > op_y else LOREnv2.MOVERIGHT
        if x_first:
            return step_x if self_x != op_x else step_y
        return step_y if self_y != op_y else step_x

    def take_cautious_action(self, obv):
        """Pick an action with the cautious strategy (see the class docstring)."""
        self_x, self_y, op_x, op_y = obv[0], obv[1], obv[2], obv[3]
        self_h = obv[4]
        self_cd = obv[6]

        adjacent = self._is_adjacent(self_x, self_y, op_x, op_y)

        if self_h > 1:
            # healthy enough to fight
            if adjacent:
                return LOREnv2.ATTACK
            if self.can_use_special_attack(obv, self_cd):
                return LOREnv2.SPECIALATTACK
            return self._move_towards(self_x, self_y, op_x, op_y, x_first=True)

        if adjacent:
            # low health: step off the shared row/column, moving away from
            # the far edge of the grid so that the step stays on the board
            on_far_edge_x = self_x == LOREnv2.space_size_n - 1
            on_far_edge_y = self_y == LOREnv2.space_size_n - 1
            if self_x == op_x:
                return LOREnv2.MOVEUP if on_far_edge_x else LOREnv2.MOVEDOWN
            return LOREnv2.MOVELEFT if on_far_edge_y else LOREnv2.MOVERIGHT
        if self.can_use_special_attack(obv, self_cd):
            return LOREnv2.SPECIALATTACK
        # not adjacent and no special attack available: attack anyway, which
        # keeps the player where it is
        return LOREnv2.ATTACK

    def take_reckless_action(self, obv):
        """Pick an action with the reckless strategy (see the class docstring)."""
        self_x, self_y, op_x, op_y = obv[0], obv[1], obv[2], obv[3]
        self_cd = obv[6]

        # drawn up front, even when the player ends up attacking
        move_x_first = random.random() < 0.5

        if self._is_adjacent(self_x, self_y, op_x, op_y):
            return LOREnv2.ATTACK
        if self.can_use_special_attack(obv, self_cd):
            return LOREnv2.SPECIALATTACK
        return self._move_towards(self_x, self_y, op_x, op_y, x_first=move_x_first)

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Return one heuristic action per observation in ``obs_batch``.

        Returns:
            ``(actions, [], {})``: the list of actions, no state outputs and
            no extra fetches. Only ``obs_batch`` is read.
        """
        actions = [
            self.take_reckless_action(x)
            if random.random() < self.reckless_prob
            else self.take_cautious_action(x)
            for x in obs_batch
        ]
        return actions, [], {}

    def learn_on_batch(self, samples):
        """No-op: the heuristic has nothing to learn."""
        pass

    def get_weights(self):
        """No-op: the heuristic has no weights."""
        pass

    def set_weights(self, weights):
        """No-op: the heuristic has no weights."""
        pass


In [49]:
import ray
from ray.rllib.agents.dqn import DQNTrainer

# Restart Ray: shut down any runtime left over from an earlier cell, then start a new one.
ray.shutdown()
ray.init()

def select_policy(agent_id):
    """Map an agent id to a policy id.

    "player1" is driven by the "learned" policy; every other agent id is
    driven by the heuristic policy.
    """
    return "learned" if agent_id == "player1" else "LORHeuristicReckLessOrCautious"

# Throwaway env instance: only its observation/action spaces are read for the
# policy specs below; `env` is rebound to the trainer's own env further down.
env = LOREnv2({})

# Trainer config: player1 uses the trainable "learned" policy, every other
# agent uses the fixed heuristic policy (see select_policy).
config = {
    "env": LOREnv2,
    "gamma": 0.9,
    "num_workers": 0,
    "num_envs_per_worker": 4,
    "rollout_fragment_length": 10,
    "train_batch_size": 500,
    "multiagent": {
        # only the "learned" policy is trained
        "policies_to_train": ["learned"],
        "policies": {
            # each spec is (policy class, observation space, action space, per-policy config)
            "LORHeuristicReckLessOrCautious": (LORHeuristicReckLessOrCautious, env.observation_space, env.action_space, {}),
            # NOTE(review): a policy class of None presumably selects the trainer's default policy class; confirm
            "learned": (None, env.observation_space, env.action_space, {
                "model": {
                        "use_lstm": True
                },
            }),
        },
        "policy_mapping_fn": select_policy,
    },
}

trainer_obj2 = DQNTrainer(config=config)
# Read the scores from the env held by the trainer's local worker.
# NOTE(review): with num_envs_per_worker=4 this is presumably just one of the
# sub-environments, so the printed scores may cover only part of the games; confirm.
env = trainer_obj2.workers.local_worker().env
for _ in range(50):
    results = trainer_obj2.train()
    #print(results)
    
    #if _ % 100 == 0:
    # cumulative win counts (player1, player2) after each training iteration
    print(env.player1_score, env.player2_score)

2020-06-23 05:48:36,770	INFO resource_spec.py:212 -- Starting Ray with 22.07 GiB memory available for workers and up to 11.05 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-23 05:48:37,156	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8272[39m[22m
2020-06-23 05:48:39,368	INFO trainable.py:217 -- Getting current IP.


0 12
2 30
2 41
2 61
3 70
4 84
7 86
7 86
14 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
16 92
27 94
49 96
71 99
91 100
116 101
139 102
150 102
170 104
180 104


In [50]:
class bcolors:
    """ANSI terminal escape sequences used to colour printed text."""

    HEADER = '\033[95m'     # bright magenta
    OKBLUE = '\033[94m'     # bright blue
    OKGREEN = '\033[92m'    # bright green
    WARNING = '\033[93m'    # bright yellow
    FAIL = '\033[91m'       # bright red
    ENDC = '\033[0m'        # reset all attributes
    BOLD = '\033[1m'        # bold
    UNDERLINE = '\033[4m'   # underline
    
def print_obv(env, obv, size):
    """Print the ``size`` x ``size`` board followed by both cooldowns.

    Each player is drawn as its current health (player1 in the FAIL colour,
    player2 in the OKBLUE colour) and an empty cell as "-". The cooldowns are
    appended to the last board row. The board is read from ``env``; ``obv``
    is accepted but not used.
    """
    def render_cell(row, col):
        # player1 takes precedence if both checks would match
        if row == env.position[env.player1][0] and col == env.position[env.player1][1]:
            return f"{bcolors.FAIL}{env.health[env.player1]}{bcolors.ENDC}"
        if row == env.position[env.player2][0] and col == env.position[env.player2][1]:
            return f"{bcolors.OKBLUE}{env.health[env.player2]}{bcolors.ENDC}"
        return "-"

    final_row = size - 1
    for row in range(size):
        line = "".join(render_cell(row, col) for col in range(size))
        if row < final_row:
            print(line)
        else:
            cooldowns = f"     CD: {bcolors.FAIL}{env.special_attack_cd[env.player1]}{bcolors.ENDC}, {bcolors.OKBLUE}{env.special_attack_cd[env.player2]}{bcolors.ENDC}"
            print(line + cooldowns)

        
    
def simulateTurns(env, trainer1, size, base_policy, max_turn = 100):
    """Play and print one episode: the "learned" policy versus ``base_policy``.

    Args:
        env: environment to play in; it is reset first.
        trainer1: trainer whose ``compute_action`` supplies both players'
            actions (player1 with policy id "learned", player2 with
            ``base_policy``).
        size: board size passed on to ``print_obv``.
        base_policy: policy id used for player2.
        max_turn: maximum number of rounds (one player1 action plus, unless
            the game has ended, one player2 action) before the rollout stops.
    """
    def print_actions(player1_text, player2_text):
        # one line per action: a row of '*' followed by both players' labels
        print("*" * size, end=" action ")
        print(f"{bcolors.FAIL}{player1_text}{bcolors.ENDC}", end=" , ")
        print(f"{bcolors.OKBLUE}{player2_text}{bcolors.ENDC}")

    obv = env.reset()
    done = {"__all__": False}
    turn = 0

    while not done["__all__"] and turn < max_turn:
        # board before player1 acts
        print_obv(env, obv, size)

        # player1 takes its action
        a1 = trainer1.compute_action(observation=obv[env.player1], policy_id="learned")
        print_actions(env.action_string[a1], "WAIT")
        obv, _, done, _ = env.step({env.player1: a1})

        if not done["__all__"]:
            # board before player2 acts
            print_obv(env, obv, size)

            # player2 takes its action
            a2 = trainer1.compute_action(observation=obv[env.player2], policy_id=base_policy)
            print_actions("WAIT", env.action_string[a2])
            obv, _, done, _ = env.step({env.player2: a2})

        turn += 1

    # final board
    print_obv(env, obv, size)

The strategy of player1 is simple: attack as long as its health is greater than or equal to the opponent's.

In [51]:
# Set explore=False in the learned policy's config before the demo rollouts.
# NOTE(review): presumably this makes compute_action act greedily; confirm the flag is read at action time.
trainer_obj2.get_policy("learned").config['explore'] = False
# Separate env instance for the printed rollouts.
sim_env = LOREnv2({})

# Play and print one episode on the 4x4 board against the heuristic policy.
simulateTurns(sim_env, trainer_obj2, 4, "LORHeuristicReckLessOrCautious")

---[94m3[0m
[91m3[0m---
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
---[94m3[0m
[91m3[0m---
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveDown[0m
----
[91m3[0m--[94m3[0m
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
[91m3[0m--[94m3[0m
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveLeft[0m
----
[91m3[0m-[94m3[0m-
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mSpecialAttack[0m , [94mWAIT[0m
----
[91m3[0m-[94m2[0m-
----
----     CD: [91m3[0m, [94m0[0m
**** action [91mWAIT[0m , [94mSpecialAttack[0m
----
[91m2[0m-[94m2[0m-
----
----     CD: [91m3[0m, [94m3[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
[91m2[0m-[94m2[0m-
----
----     CD: [91m2[0m, [94m3[0m
**** action [91mWAIT[0m , [94mMoveLeft[0m
----
[91m2[0m[94m2[0m--
----
----     CD: [91m2[0m, [94m2[0m
**** ac

In [53]:
# Play and print another episode with the same trainer and heuristic opponent.
simulateTurns(sim_env, trainer_obj2, 4, "LORHeuristicReckLessOrCautious")

---[94m3[0m
-[91m3[0m--
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
---[94m3[0m
-[91m3[0m--
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveDown[0m
----
-[91m3[0m-[94m3[0m
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
-[91m3[0m-[94m3[0m
----
----     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mSpecialAttack[0m
----
-[91m2[0m-[94m3[0m
----
----     CD: [91m0[0m, [94m3[0m
**** action [91mSpecialAttack[0m , [94mWAIT[0m
----
-[91m2[0m-[94m2[0m
----
----     CD: [91m3[0m, [94m3[0m
**** action [91mWAIT[0m , [94mMoveLeft[0m
----
-[91m2[0m[94m2[0m-
----
----     CD: [91m3[0m, [94m2[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
-[91m2[0m[94m1[0m-
----
----     CD: [91m2[0m, [94m2[0m
**** action [91mWAIT[0m , [94mMoveDown[0m
----
-[91m2[0m--
--[94m1[0m-
----     CD: [91m2[0m, [94m1[0m
**** ac

In [58]:
# Play and print another episode with the same trainer and heuristic opponent.
simulateTurns(sim_env, trainer_obj2, 4, "LORHeuristicReckLessOrCautious")

---[94m3[0m
----
----
[91m3[0m---     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
---[94m3[0m
----
----
[91m3[0m---     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveDown[0m
----
---[94m3[0m
----
[91m3[0m---     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
---[94m3[0m
----
[91m3[0m---     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveDown[0m
----
----
---[94m3[0m
[91m3[0m---     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
----
---[94m3[0m
[91m3[0m---     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveDown[0m
----
----
----
[91m3[0m--[94m3[0m     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
----
----
----
[91m3[0m--[94m3[0m     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveLeft[0m
----
----
----
[91m3[0m-[94m3[0m-     CD: [91m0[0m, [94m0[0m
**** action [91mSp

In [59]:
# Restart Ray before building the second trainer.
ray.shutdown()
ray.init()

# DQN trainer on the same env with an empty config (no multiagent section).
# NOTE(review): presumably both players are then driven by the single default policy; confirm.
trainer_obj3 = DQNTrainer(
    env = LOREnv2,
    config = {}
)

# Read the scores from the env held by the trainer's local worker.
env = trainer_obj3.workers.local_worker().env
for _ in range(50):
    results = trainer_obj3.train()
    #print(results)
    
    #if _ % 100 == 0:
    # cumulative win counts (player1, player2) after each training iteration
    print(env.player1_score, env.player2_score)


2020-06-23 06:08:32,886	INFO resource_spec.py:212 -- Starting Ray with 21.97 GiB memory available for workers and up to 11.0 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-23 06:08:33,266	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8272[39m[22m
2020-06-23 06:08:36,120	INFO trainable.py:217 -- Getting current IP.


8 13
19 24
35 34
53 45
78 64
113 81
133 105
152 126
156 126
174 141
174 141
174 141
176 141
177 141
178 141
180 141
182 142
182 142
182 142
182 142
182 142
182 142
182 143
182 143
182 143
182 144
183 145
183 146
183 147
183 147
184 148
185 148
185 148
185 148
186 149
187 149
187 149
187 149
188 149
188 150
188 150
188 150
188 150
189 150
189 151
189 151
189 151
189 151
189 152
189 152


In [60]:
def simulateTurnsSamePolicy(env, trainer1, size, max_turn = 100):
    """Play and print one episode in which both players use the same policy.

    Args:
        env: environment to play in; it is reset first.
        trainer1: trainer whose ``compute_action`` (called without a policy
            id) supplies the actions of both players.
        size: board size passed on to ``print_obv``.
        max_turn: maximum number of rounds (one player1 action plus, unless
            the game has ended, one player2 action) before the rollout stops.
    """
    def print_actions(player1_text, player2_text):
        # one line per action: a row of '*' followed by both players' labels
        print("*" * size, end=" action ")
        print(f"{bcolors.FAIL}{player1_text}{bcolors.ENDC}", end=" , ")
        print(f"{bcolors.OKBLUE}{player2_text}{bcolors.ENDC}")

    obv = env.reset()
    done = {"__all__": False}
    turn = 0

    while not done["__all__"] and turn < max_turn:
        # board before player1 acts
        print_obv(env, obv, size)

        # player1 takes its action
        a1 = trainer1.compute_action(observation=obv[env.player1])
        print_actions(env.action_string[a1], "WAIT")
        obv, _, done, _ = env.step({env.player1: a1})

        if not done["__all__"]:
            # board before player2 acts
            print_obv(env, obv, size)

            # player2 takes its action
            a2 = trainer1.compute_action(observation=obv[env.player2])
            print_actions("WAIT", env.action_string[a2])
            obv, _, done, _ = env.step({env.player2: a2})

        turn += 1

    # final board
    print_obv(env, obv, size)

In [62]:
# Set explore=False in the policy's config before the demo rollout.
# NOTE(review): presumably this makes compute_action act greedily; confirm the flag is read at action time.
trainer_obj3.get_policy().config['explore'] = False
# Separate env instance for the printed rollout.
sim_env = LOREnv2({})

# Play and print one episode on the 4x4 board with the same policy on both sides.
simulateTurnsSamePolicy(sim_env, trainer_obj3, 4)

--[91m3[0m-
----
----
---[94m3[0m     CD: [91m0[0m, [94m0[0m
**** action [91mAttack[0m , [94mWAIT[0m
--[91m3[0m-
----
----
---[94m3[0m     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveLeft[0m
--[91m3[0m-
----
----
--[94m3[0m-     CD: [91m0[0m, [94m0[0m
**** action [91mMoveRight[0m , [94mWAIT[0m
---[91m3[0m
----
----
--[94m3[0m-     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mAttack[0m
---[91m3[0m
----
----
--[94m3[0m-     CD: [91m0[0m, [94m0[0m
**** action [91mMoveLeft[0m , [94mWAIT[0m
--[91m3[0m-
----
----
--[94m3[0m-     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveLeft[0m
--[91m3[0m-
----
----
-[94m3[0m--     CD: [91m0[0m, [94m0[0m
**** action [91mMoveLeft[0m , [94mWAIT[0m
-[91m3[0m--
----
----
-[94m3[0m--     CD: [91m0[0m, [94m0[0m
**** action [91mWAIT[0m , [94mMoveRight[0m
-[91m3[0m--
----
----
--[94m3[0m-     CD: [91m0[0m, [94m0[0m
**** action 