

In this env, we define a rookie version of League of Legends: LOR
- the space is of size N x N
- there are 2 players
- at the beginning of the game, they start from (0,N-1) and (N-1,0); both have health = M
- the two players take actions in turns: move (left, right, up, down), attack, special attack, go_home, hold. Each turn, the player can take 1 action.
- the players cannot move into each other.
- each attack, if valid, takes 1 health from the opponent. An attack is valid when the two players are at distance 1. A special attack is valid when they are within distance 2, but it has a cooldown of 3 turns.
- if the player chooses go_home, he returns to his start point and regains full health.
- the three positions around the init position are considered the base of each player.
- if the player gets into the base of the opponent, the player wins the game
- if the player loses all his health on the way, he is set back to his starting point. He is not allowed to take any action for L turns, but he regains full health. During that time, he does not take damage.

An example of a 5x5 space

---*A
---**
-----
**---
B*---


In [10]:
import random
import numpy as np
from gym.spaces import Discrete,Tuple, Box

from ray.rllib.env.multi_agent_env import MultiAgentEnv

class LOREnv3(MultiAgentEnv):
    """Turn-based two-player duel on an N x N grid, exposed as an RLlib multi-agent env.

    Player 1 starts at (0, N-1) and player 2 at (N-1, 0). The players act
    alternately, player 1 first: every ``step`` call carries exactly one
    player's action and returns the observation for the player who acts next.
    A player wins by entering the 2 x 2 corner area that contains the
    opponent's start position.
    """

    # all the actions
    MOVEUP = 0
    MOVEDOWN = 1
    MOVELEFT = 2
    MOVERIGHT = 3
    ATTACK = 4
    SPECIALATTACK = 5
    GOBACK = 6
    HOLD = 7

    action_string = {
        MOVEUP: "MoveUp",
        MOVEDOWN: "MoveDown",
        MOVELEFT: "MoveLeft",
        MOVERIGHT: "MoveRight",
        ATTACK: "Attack",
        SPECIALATTACK: "SpecialAttack",
        GOBACK: "GoBack",
        HOLD: "Hold"
    }

    # max health to start with
    max_health = 2

    # space is of size n x n
    # (0, 0) is at the top left corner
    # x represents the vertical direction
    # y represents the horizontal direction
    space_size_n = 5

    # miss rate on any one attack
    attack_miss_rate = 0

    # each attack takes some health
    attak_power = 1

    # reward for winning a game
    game_award = 100

    invalid_action_penalty = -10

    # for special attack
    special_attack_cool_down = 3
    special_attack_distance = 2

    # turns to wait while dead
    dead_hold_turns = 3

    # Util function
    def generate_init_pos(self):
        """Return the start positions ``(player1, player2)`` as ``[x, y]`` lists."""
        player1_init_pos = [0, LOREnv3.space_size_n - 1]
        player2_init_pos = [LOREnv3.space_size_n - 1, 0]

        return player1_init_pos, player2_init_pos

    # RLLIB API
    def __init__(self, config):
        """Build the action/observation spaces and the initial game state.

        Args:
            config: RLlib env config; accepted for API compatibility but not read.
        """
        self.action_space = Discrete(len(LOREnv3.action_string))

        # the observation is a tuple:
        #    my pos (x,y)
        #    opponent pos (x,y)
        #    (my health, opponent health)
        #    (my attack cd, opponent attack cd)
        #    (my revive turns, opponent revive turns)
        self.observation_space = Tuple(
            [
                # self position in x/y
                Box(low=0, high=LOREnv3.space_size_n - 1, shape=(2, ), dtype=np.int16),
                # opponent position in x/y
                Box(low=0, high=LOREnv3.space_size_n - 1, shape=(2, ), dtype=np.int16),
                # self health and opponent health
                Box(low=0, high=LOREnv3.max_health, shape=(2, ), dtype=np.int16),
                # self special attack cool down and opponent's cool down
                Box(low=0, high=LOREnv3.special_attack_cool_down, shape=(2, ), dtype=np.int16),
                # remaining turns to revive, self and opponent. (0 means alive)
                Box(low=0, high=LOREnv3.dead_hold_turns, shape=(2, ), dtype=np.int16),
            ]
        )

        # player id is important in a multi-agent environment
        self.player1 = "player1"
        self.player2 = "player2"

        self.player1_init_pos, self.player2_init_pos = self.generate_init_pos()

        self.reset()

        # measure how many games each player won (kept across resets)
        self.player1_score = self.player2_score = 0

    # Util function
    def _observation(self, me, other):
        """Build the observation tuple seen by ``me`` with ``other`` as the opponent.

        The arrays use ``np.int16`` so they match the dtype declared in
        ``observation_space``.
        """
        pairs = (
            self.position[me],
            self.position[other],
            (self.health[me], self.health[other]),
            (self.special_attack_cd[me], self.special_attack_cd[other]),
            (self.turns_to_revive[me], self.turns_to_revive[other]),
        )
        return tuple(np.array(pair, dtype=np.int16) for pair in pairs)

    # Util function
    def _home_of(self, player):
        """Return a fresh copy of ``player``'s start position."""
        if player == self.player1:
            return self.player1_init_pos.copy()
        return self.player2_init_pos.copy()

    # RLLIB API
    # reset the env, return the initial observation
    # this is called by RLLIB when a game is done
    # the player1 always take action first
    def reset(self):
        """Restore the start-of-game state and return player 1's observation.

        Returns:
            A dict with a single key (player 1); RLlib asks exactly the agents
            present in this dict for their next action.
        """
        self.position = {
            self.player1: self.player1_init_pos.copy(),
            self.player2: self.player2_init_pos.copy()
        }

        self.health = {
            self.player1: LOREnv3.max_health,
            self.player2: LOREnv3.max_health
        }

        self.special_attack_cd = {
            self.player1: 0,
            self.player2: 0,
        }

        self.turns_to_revive = {
            self.player1: 0,
            self.player2: 0
        }

        self.last_reward = 0

        # based on the key in this dict, RLLIB will let the corresponding agent take action
        return {self.player1: self._observation(self.player1, self.player2)}

    # Util function
    def is_in_opponent_base(self, player):
        """Return True when ``player`` stands inside the opponent's 2 x 2 corner base."""
        x, y = self.position[player][0], self.position[player][1]
        far_edge = LOREnv3.space_size_n - 2

        if player == self.player1 and x >= far_edge and y <= 1:
            return True
        if player == self.player2 and x <= 1 and y >= far_edge:
            return True

        return False

    # Util function
    def move_agent(self, player, opponent, action):
        """Apply a move action for ``player`` and return the resulting reward.

        Returns ``invalid_action_penalty`` when the move would leave the grid
        or land on the opponent (the position is then left unchanged), and 0
        otherwise. Non-move actions and players without health are no-ops.
        """
        if self.health[player] <= 0:  # no health no action
            return 0

        if action in (LOREnv3.MOVEUP, LOREnv3.MOVEDOWN):
            new_x = self.position[player][0] + (1 if action == LOREnv3.MOVEDOWN else -1)
            if new_x < 0 or new_x >= LOREnv3.space_size_n:  # hit the wall
                return self.invalid_action_penalty
            if self.position[opponent][0] == new_x and self.position[opponent][1] == self.position[player][1]:
                return self.invalid_action_penalty  # ran into the opponent
            self.position[player][0] = new_x
            return 0

        if action in (LOREnv3.MOVELEFT, LOREnv3.MOVERIGHT):
            new_y = self.position[player][1] + (1 if action == LOREnv3.MOVERIGHT else -1)
            if new_y < 0 or new_y >= LOREnv3.space_size_n:  # hit the wall
                return self.invalid_action_penalty
            if self.position[opponent][1] == new_y and self.position[opponent][0] == self.position[player][0]:
                return self.invalid_action_penalty  # ran into the opponent
            self.position[player][1] = new_y
            return 0

        return 0

    # Util function
    def is_adjacent(self):
        """Return True when the players share a row or column and are at most 1 cell apart."""
        x1, y1 = self.position[self.player1][0], self.position[self.player1][1]
        x2, y2 = self.position[self.player2][0], self.position[self.player2][1]

        return (x1 == x2 and abs(y1 - y2) <= 1) or (y1 == y2 and abs(x1 - x2) <= 1)

    # Util function
    def is_in_distance(self, distance):
        """Return True when the Euclidean distance between the players is <= ``distance``."""
        dx = self.position[self.player1][0] - self.position[self.player2][0]
        dy = self.position[self.player1][1] - self.position[self.player2][1]

        # compare squared values to avoid a square root
        return dx * dx + dy * dy <= distance * distance

    # Util function
    # the agent is rewarded if:
    #   get a valid attack on the opponent
    #   kill the opponent
    #   gain health by going back to base
    #   get into opponent's base (win)
    # the agent gets penalty if:
    #   has invalid action (hit wall, run into opponent, attack when the cd is not done)
    def take_action(self, player, opponent, action):
        """Apply ``action`` for ``player`` and return the reward it earned.

        Mutates health, position and special-attack cooldown. Reviving the
        opponent after a kill is left to ``step``.
        """
        reward = 0

        if self.turns_to_revive[player] > 0:  # if dead, cannot take action besides HOLD
            if action == LOREnv3.HOLD:
                reward = 0
            else:
                reward = self.invalid_action_penalty
        else:
            if action == LOREnv3.ATTACK:
                hit1 = 0 if not(self.is_adjacent()) or random.random() < LOREnv3.attack_miss_rate else 1

                if self.turns_to_revive[opponent] > 0:  # cannot attack opponent if in revive
                    hit1 = 0

                self.health[opponent] = self.health[opponent] - hit1 * LOREnv3.attak_power
                reward = hit1 * LOREnv3.attak_power
            elif action == LOREnv3.SPECIALATTACK:
                if self.special_attack_cd[player] > 0:
                    reward = self.invalid_action_penalty
                elif not self.is_in_distance(LOREnv3.special_attack_distance):  # out of range
                    reward = self.invalid_action_penalty
                elif self.turns_to_revive[opponent] > 0:  # cannot attack opponent if in revive
                    reward = 0
                    # the attack is still spent: restart the cool down
                    self.special_attack_cd[player] = LOREnv3.special_attack_cool_down
                else:
                    self.health[opponent] = self.health[opponent] - LOREnv3.attak_power
                    reward = LOREnv3.attak_power
                    # restart the cool down
                    self.special_attack_cd[player] = LOREnv3.special_attack_cool_down
            elif action == LOREnv3.HOLD:
                reward = 0
            elif action == LOREnv3.GOBACK:
                reward = LOREnv3.max_health - self.health[player]

                # gain full health and go back to init position
                self.health[player] = LOREnv3.max_health
                self.position[player] = self._home_of(player)
            else:  # move
                reward = self.move_agent(player, opponent, action)

        # if the opponent is dead, more reward
        # (<= 0 so an attack stronger than the remaining health still counts as a kill)
        if self.health[opponent] <= 0:
            reward = reward * LOREnv3.dead_hold_turns

        # if the player reach the opponent's base
        if self.is_in_opponent_base(player):
            reward = LOREnv3.game_award

        return reward

    # RLLIB API
    # action_dict shows what action the agent took. it is corresponding to the agent id given the observation dict
    # update state and observation based on the actions, and decide reward
    # return the new observation, and the reward to the agent(s)
    def step(self, action_dict):
        """Advance the game by one player's turn.

        Args:
            action_dict: ``{agent_id: action}`` with exactly one entry.

        Returns:
            ``(obs, rew, done, info)``. ``obs`` and ``rew`` are keyed by the
            agent who acts next; when the game ends they contain both agents.
        """
        # only one action each turn
        assert len(action_dict) == 1, action_dict

        if self.player1 in action_dict:
            player = self.player1
            opponent = self.player2
        else:
            player = self.player2
            opponent = self.player1

        # update special attack CD
        if self.special_attack_cd[player] > 0:
            self.special_attack_cd[player] = self.special_attack_cd[player] - 1

        # take action
        reward = self.take_action(player, opponent, action_dict[player])

        # if opponent is killed, set him back to base and start revive CD
        # (<= 0 so overkill damage cannot leave him stuck with negative health)
        if self.health[opponent] <= 0:
            self.health[opponent] = LOREnv3.max_health
            self.position[opponent] = self._home_of(opponent)
            self.turns_to_revive[opponent] = LOREnv3.dead_hold_turns

        # update player's revive CD if needed
        if self.turns_to_revive[player] > 0:
            self.turns_to_revive[player] = self.turns_to_revive[player] - 1

        # get the new obs for the opponent, since the game goes by turn
        obs = {opponent: self._observation(opponent, player)}

        # get the reward for the opponent
        # which is the sum of last reward (the reward of the opponent's action) and the negative of this player's new reward
        rew = {
            opponent: -1 * reward + self.last_reward,
        }

        # update last reward
        self.last_reward = reward

        player1_won = self.is_in_opponent_base(self.player1)
        player2_won = self.is_in_opponent_base(self.player2)

        # the game is done if anyone gets into the other's base
        done = {
            "__all__": player1_won or player2_won
        }

        # it is required that when done["__all__"] == True, the obv/rew should include all live agent
        if done["__all__"]:
            obs[player] = self._observation(player, opponent)
            rew[player] = reward

        # update score
        if player1_won and not player2_won:
            self.player1_score += 1
        elif player2_won and not player1_won:
            self.player2_score += 1

        return obs, rew, done, {}
    


In [11]:
import random
from ray.rllib.policy.policy import Policy

# define a heuristic agent
class LORHeuristic(Policy):
    """
    Heuristic policy  - reckless player

    if still reviving:
        hold
    if can land a valid attack (opponent adjacent and not reviving):
        attack
    if can use special attack and land a valid attack (in range, off cool down,
    opponent not reviving):
        use special attack
    if can move towards opponent base
        move towards it
    else
        randomly choose from (move left, move down, hold)

    The policy assumes it plays player2, i.e. that its target base is the
    corner at (0, size - 1).
    """

    # RLLIB API
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()
        self.space_size = LOREnv3.space_size_n

    # Util func
    def can_use_special_attack(self, obv, cd):
        """Return True when the opponent is within special-attack range and ``cd`` has expired."""
        dx = obv[0] - obv[2]
        dy = obv[1] - obv[3]
        max_range = LOREnv3.special_attack_distance
        return dx * dx + dy * dy <= max_range * max_range and cd <= 0

    # Util func
    def can_move_torwards_opponent_base(self, obv):
        """Pick a random legal move (up or right) towards (0, size - 1).

        Returns:
            The chosen move action, or -1 when both moves are blocked by a
            wall or by the opponent.
        """
        # assume that the policy always play player2, and target (0, size -1)
        self_x = obv[0]
        self_y = obv[1]
        op_x = obv[2]
        op_y = obv[3]

        actions = []

        if self_x > 0 and not(op_y == self_y and op_x == self_x - 1):  # can reduce x
            actions.append(LOREnv3.MOVEUP)

        if self_y < LOREnv3.space_size_n - 1 and not(op_y == self_y + 1 and op_x == self_x):  # can increase y
            actions.append(LOREnv3.MOVERIGHT)

        if not actions:
            return -1

        return actions[random.randrange(len(actions))]

    # Util func
    def take_reckless_action(self, obv):
        """Choose one action for a single flattened observation.

        ``obv`` is indexed as (self.x, self.y, opponent.x, opponent.y,
        self.health, opponent.health, self.cd, opponent.cd,
        self.revive_turns, opponent.revive_turns).
        """
        self_x = obv[0]
        self_y = obv[1]
        op_x = obv[2]
        op_y = obv[3]
        self_cd = obv[6]
        self_revive_cd = obv[8]
        op_revive_cd = obv[9]

        if self_revive_cd > 0:
            return LOREnv3.HOLD

        # A reviving opponent takes no damage, so attacking him would only
        # waste the turn (and the special attack cool down).
        if op_revive_cd <= 0:
            if (self_x == op_x and abs(self_y - op_y) <= 1) or (self_y == op_y and abs(self_x - op_x) <= 1):
                return LOREnv3.ATTACK
            if self.can_use_special_attack(obv, self_cd):
                return LOREnv3.SPECIALATTACK

        try_move_towards_base = self.can_move_torwards_opponent_base(obv)
        if try_move_towards_base >= 0:
            return try_move_towards_base

        # randomly move left or down or hold
        actions = [LOREnv3.HOLD]

        if self_y > 0 and not(op_y == self_y - 1 and op_x == self_x):
            actions.append(LOREnv3.MOVELEFT)

        if self_x < self.space_size - 1 and not(op_y == self_y and op_x == self_x + 1):
            actions.append(LOREnv3.MOVEDOWN)

        return actions[random.randrange(len(actions))]

    # RLLIB API
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Return one heuristic action per observation, with no RNN state and no extra info."""
        return [self.take_reckless_action(x) for x in obs_batch], [], {}

    def learn_on_batch(self, samples):
        """No-op: the heuristic has nothing to learn."""
        pass

    def get_weights(self):
        """No-op: the heuristic has no weights."""
        pass

    def set_weights(self, weights):
        """No-op: the heuristic has no weights."""
        pass


In [3]:
import ray
from ray.rllib.agents.dqn import DQNTrainer

# Shut down any Ray instance that is already running, then start a new one.
ray.shutdown()
ray.init()

def select_policy(agent_id):
    """Return the policy id for an agent: "player1" is trained, any other agent uses the heuristic."""
    return "learned" if agent_id == "player1" else "LORHeuristic"

# Throwaway env instance, used here only to read the observation/action spaces.
env = LOREnv3({})

config = {
    "env": LOREnv3,
    "gamma": 0.9,
    "num_workers": 0,
    "num_envs_per_worker": 8,
    "rollout_fragment_length": 10,
    "train_batch_size": 500,
    "multiagent": {
        # only the "learned" policy is updated; the heuristic stays fixed
        "policies_to_train": ["learned"],
        "policies": {
            "LORHeuristic": (LORHeuristic, env.observation_space, env.action_space, {}),
            "learned": (None, env.observation_space, env.action_space, {
                "model": {
                    "use_lstm": True
                },
            }),
        },
        "policy_mapping_fn": select_policy,
    },
}

trainer_obj2 = DQNTrainer(config=config)
# Re-bind `env` to the env held by the trainer's local worker, whose win
# counters are printed below.
env = trainer_obj2.workers.local_worker().env
for iteration in range(500):
    results = trainer_obj2.train()

    # report cumulative wins (player1, player2) every 10 training iterations
    if iteration % 10 == 0:
        print(env.player1_score, env.player2_score)

2020-06-25 22:54:35,495	INFO resource_spec.py:212 -- Starting Ray with 66.65 GiB memory available for workers and up to 32.58 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-25 22:54:36,668	INFO services.py:563 -- Failed to connect to the redis server, retrying.
2020-06-25 22:54:38,175	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
2020-06-25 22:54:40,052	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-06-25 22:54:40,393	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2020-06-25 22:54:45,485	INFO trainable.py:217 -- Getting current IP.


0 8
0 87
0 167
0 234
0 292
0 345
0 407
0 470
0 511
0 555
0 594
0 630
0 661
0 696
0 725
0 756
0 790
0 818
0 839
0 866
0 885
0 913
0 933
0 953
0 977
0 993
0 1012
0 1028
0 1046
0 1066
0 1084
0 1094
0 1101
0 1109
0 1122
0 1126
0 1126
0 1128
0 1132
0 1134
0 1135
0 1137
0 1138
0 1138
0 1140
0 1141
0 1141
0 1142
0 1142
0 1145


In [4]:
# Continue training the same trainer for another 1000 iterations.
for iteration in range(1000):
    results = trainer_obj2.train()

    # report cumulative wins (player1, player2) every 10 training iterations
    if iteration % 10 == 0:
        print(env.player1_score, env.player2_score)

0 1146
0 1148
0 1150
0 1153
0 1153
0 1154
0 1156
0 1157
2 1158
4 1160
5 1164
15 1168
36 1171
67 1190
102 1195
163 1197
222 1203
286 1205
360 1206
431 1207
505 1208
575 1209
643 1212
715 1214
782 1214
848 1215
921 1216
990 1217
1061 1219
1132 1221
1204 1223
1276 1223
1342 1225
1411 1228
1486 1228
1556 1230
1629 1231
1702 1231
1773 1232
1844 1233
1914 1237
1982 1239
2056 1240
2121 1241
2191 1242
2259 1245
2331 1245
2400 1246
2470 1249
2543 1250
2617 1250
2688 1250
2762 1251
2834 1251
2907 1252
2980 1254
3052 1255
3123 1255
3194 1255
3261 1258
3331 1258
3402 1260
3470 1260
3541 1260
3610 1261
3682 1262
3754 1263
3826 1263
3899 1265
3972 1267
4043 1268
4114 1270
4186 1270
4260 1271
4330 1272
4399 1274
4471 1276
4542 1278
4614 1280
4688 1280
4762 1280
4826 1283
4899 1284
4966 1286
5038 1287
5109 1289
5184 1290
5257 1291
5326 1291
5398 1291
5473 1291
5546 1292
5618 1293
5690 1293
5763 1293
5835 1293
5907 1294
5974 1295
6042 1298
6115 1298


In [12]:
class bcolors:
    """ANSI escape sequences used to colour and style terminal output."""

    # colours
    HEADER = "\x1b[95m"
    OKBLUE = "\x1b[94m"
    OKGREEN = "\x1b[92m"
    WARNING = "\x1b[93m"
    FAIL = "\x1b[91m"

    # reset all attributes
    ENDC = "\x1b[0m"

    # text styles
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
    
def print_obv(env, obv, size):
    """Print the board as ``size`` rows of ``size`` cells.

    Each player's cell shows that player's health (player1 red, player2
    blue); every other cell is "-". The last row is followed by both
    players' special-attack cool downs and revive counters. All values are
    read from ``env``; ``obv`` is accepted but not used.
    """
    p1, p2 = env.player1, env.player2

    def red(value):
        return f"{bcolors.FAIL}{value}{bcolors.ENDC}"

    def blue(value):
        return f"{bcolors.OKBLUE}{value}{bcolors.ENDC}"

    for row in range(size):
        cells = []
        for col in range(size):
            if row == env.position[p1][0] and col == env.position[p1][1]:
                cells.append(red(env.health[p1]))
            elif row == env.position[p2][0] and col == env.position[p2][1]:
                cells.append(blue(env.health[p2]))
            else:
                cells.append("-")
        line = "".join(cells)

        # the status summary is appended to the final board row only
        if row == size - 1:
            line += f"    CD: {red(env.special_attack_cd[p1])}, {blue(env.special_attack_cd[p2])}"
            line += f"    Revive: {red(env.turns_to_revive[p1])}, {blue(env.turns_to_revive[p2])}"

        print(line)

        
    
def simulateTurns(env, trainer1, size, base_policy, max_turn = 100):
    """Play and print one game: the "learned" policy as player1 against ``base_policy`` as player2.

    The game stops when the env reports done or after ``max_turn`` rounds
    (one round = player1's action followed by player2's). The board is
    printed before every action and once more at the end.
    """
    def show_actions(p1_label, p2_label):
        # one line: separator, player1's action in red, player2's in blue
        print(
            f"{'*' * size} action "
            f"{bcolors.FAIL}{p1_label}{bcolors.ENDC} , "
            f"{bcolors.OKBLUE}{p2_label}{bcolors.ENDC}"
        )

    obv = env.reset()
    finished = False
    rounds_played = 0

    while not finished and rounds_played < max_turn:
        # player1 acts on the current board
        print_obv(env, obv, size)
        p1_action = trainer1.compute_action(observation = obv[env.player1], policy_id = "learned")
        show_actions(env.action_string[p1_action], "WAIT")
        obv, reward, done, info = env.step({env.player1: p1_action})
        finished = done["__all__"]

        if not finished:
            # player2 answers on the updated board
            print_obv(env, obv, size)
            p2_action = trainer1.compute_action(observation = obv[env.player2], policy_id = base_policy)
            show_actions("WAIT", env.action_string[p2_action])
            obv, reward, done, info = env.step({env.player2: p2_action})
            finished = done["__all__"]

        rounds_played += 1

    # final board
    print_obv(env, obv, size)

In [8]:
# Turn exploration off in the learned policy's config before simulating.
trainer_obj2.get_policy("learned").config['explore'] = False
# Separate env instance for the simulation (not the trainer's env).
sim_env = LOREnv3({})

# Play one game on the 5x5 board: learned policy (player1) vs. the heuristic (player2).
simulateTurns(sim_env, trainer_obj2, 5, "LORHeuristic")

----[91m2[0m
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mGoBack[0m , [94mWAIT[0m
----[91m2[0m
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveUp[0m
----[91m2[0m
-----
-----
[94m2[0m----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
---[91m2[0m-
-----
-----
[94m2[0m----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveUp[0m
---[91m2[0m-
-----
[94m2[0m----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
--[91m2[0m--
-----
[94m2[0m----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveUp[0m
--[91m2[0m--
[94m2[0m----
-----
-----
-----    CD:

In [9]:
# Play another game with the same env and trainer (the env is reset inside simulateTurns).
simulateTurns(sim_env, trainer_obj2, 5, "LORHeuristic")

----[91m2[0m
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mGoBack[0m , [94mWAIT[0m
----[91m2[0m
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveRight[0m
----[91m2[0m
-----
-----
-----
-[94m2[0m---    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveDown[0m , [94mWAIT[0m
-----
----[91m2[0m
-----
-----
-[94m2[0m---    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveUp[0m
-----
----[91m2[0m
-----
-[94m2[0m---
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-----
---[91m2[0m-
-----
-[94m2[0m---
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveUp[0m
-----
---[91m2[0m-
-[94m2[0m---
-----
-----    

In [13]:
# Shut down the running Ray instance and start a new one for this experiment.
ray.shutdown()
ray.init()

# No multi-agent config is given here, so this trainer uses its default
# configuration for both players.
trainer_obj3 = DQNTrainer(
    env = LOREnv3,
    config = {}
)

env = trainer_obj3.workers.local_worker().env
for iteration in range(1000):
    results = trainer_obj3.train()

    # report cumulative wins (player1, player2) every 10 training iterations
    if iteration % 10 == 0:
        print(env.player1_score, env.player2_score)


2020-06-26 05:26:58,707	INFO resource_spec.py:212 -- Starting Ray with 66.46 GiB memory available for workers and up to 32.48 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-26 05:26:59,075	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
2020-06-26 05:27:00,755	INFO trainable.py:217 -- Getting current IP.


2 0
133 36
145 75
161 153
162 197
163 210
165 216
167 221
168 227
169 244
171 256
171 271
173 276
176 276
180 278
180 278
183 279
186 279
187 279
190 280
193 282
197 287
198 290
198 293
205 301
217 307
299 320
457 322
464 334
612 335
647 336
651 336
655 336
659 336
659 336
666 337
666 337
667 338
673 350
675 350
676 350
680 351
682 351
682 356
688 356
699 359
721 359
741 362
775 368
819 379
883 401
935 496
941 706
989 725
1000 729
1002 729
1023 733
1028 733
1039 733
1061 734
1061 734
1070 737
1076 741
1079 744
1087 745
1097 748
1098 748
1099 748
1103 748
1116 750
1117 751
1117 751
1119 752
1119 754
1121 756
1121 757
1124 760
1133 763
1135 763
1136 764
1136 765
1142 767
1148 768
1160 768
1171 770
1180 773
1181 782
1187 785
1189 790
1190 792
1192 792
1193 794
1195 801
1200 817
1203 817
1204 818
1204 820
1217 821
1219 823
1219 826


In [14]:
def simulateTurnsSamePolicy(env, trainer1, size, max_turn = 100):
    """Play and print one game in which the trainer's default policy controls both players.

    The game stops when the env reports done or after ``max_turn`` rounds
    (one round = player1's action followed by player2's). The board is
    printed before every action and once more at the end.
    """
    stars = "*" * size
    red_wait = f"{bcolors.FAIL}WAIT{bcolors.ENDC}"
    blue_wait = f"{bcolors.OKBLUE}WAIT{bcolors.ENDC}"

    obv = env.reset()
    game_over = False
    rounds_played = 0

    while not game_over and rounds_played < max_turn:
        # player1's half of the round
        print_obv(env, obv, size)
        first_action = trainer1.compute_action(observation = obv[env.player1])
        first_label = f"{bcolors.FAIL}{env.action_string[first_action]}{bcolors.ENDC}"
        print(f"{stars} action {first_label} , {blue_wait}")
        obv, reward, done, info = env.step({env.player1: first_action})
        game_over = done["__all__"]

        # player2's half, skipped when player1 has just ended the game
        if not game_over:
            print_obv(env, obv, size)
            second_action = trainer1.compute_action(observation = obv[env.player2])
            second_label = f"{bcolors.OKBLUE}{env.action_string[second_action]}{bcolors.ENDC}"
            print(f"{stars} action {red_wait} , {second_label}")
            obv, reward, done, info = env.step({env.player2: second_action})
            game_over = done["__all__"]

        rounds_played += 1

    # final board
    print_obv(env, obv, size)

In [15]:
# Turn exploration off in the trainer's default policy config before simulating.
trainer_obj3.get_policy().config['explore'] = False
# Separate env instance for the simulation (not the trainer's env).
sim_env = LOREnv3({})

# Play one game on the 5x5 board with the same policy controlling both players.
simulateTurnsSamePolicy(sim_env, trainer_obj3, 5)

----[91m2[0m
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
---[91m2[0m-
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveUp[0m
---[91m2[0m-
-----
-----
[94m2[0m----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
--[91m2[0m--
-----
-----
[94m2[0m----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveDown[0m
--[91m2[0m--
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-[91m2[0m---
-----
-----
-----
[94m2[0m----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mMoveRight[0m
-[91m2[0m---
-----
-----
-----
-[94m2[0m---

-----
-----    CD: [91m0[0m, [94m1[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-[94m1[0m[91m1[0m--
-----
-----
-----
-----    CD: [91m0[0m, [94m1[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mAttack[0m
-[94m1[0m--[91m2[0m
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m3[0m, [94m0[0m
***** action [91mMoveDown[0m , [94mWAIT[0m
-[94m1[0m--[91m2[0m
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m2[0m, [94m0[0m
***** action [91mWAIT[0m , [94mHold[0m
-[94m1[0m--[91m2[0m
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m2[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-[94m1[0m--[91m2[0m
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m1[0m, [94m0[0m
***** action [91mWAIT[0m , [94mHold[0m
-[94m1[0m--[91m2[0m
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [9

-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mHold[0m
-[94m1[0m--[91m2[0m
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-[94m1[0m-[91m2[0m-
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mHold[0m
-[94m1[0m-[91m2[0m-
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-[94m1[0m[91m2[0m--
-----
-----
-----
-----    CD: [91m0[0m, [94m0[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mWAIT[0m , [94mSpecialAttack[0m
-[94m1[0m[91m1[0m--
-----
-----
-----
-----    CD: [91m0[0m, [94m3[0m    Revive: [91m0[0m, [94m0[0m
***** action [91mMoveLeft[0m , [94mWAIT[0m
-[94m1[0m[91m1[0m--
-----
-----
-----
-----    CD: [91m0[0m, [94m3[0m  