In [27]:
import random
import numpy as np
from gym.spaces import Discrete,Tuple, Box

from ray.rllib.env.multi_agent_env import MultiAgentEnv

class LOREnv1(MultiAgentEnv):
    """Two-player environment for league of rookie setup1
    The game happens in a 5x5 2D space. Two players are put into two spots. 
    In each turn, the play can take one of the following actions
    - Move 1 step (one of the 4 directions)
    - Attack the opponent
    The attack action is evaluated after any move action
    
    The observation has the followings.
        - 2D position of self
        - health of self
        - 2D position of the opponent
        - health of the opponent
    """

    # all the actions
    MOVEUP = 0
    MOVEDOWN = 1
    MOVELEFT = 2
    MOVERIGHT = 3
    ATTACK = 4
    
    action_string = {
        MOVEUP: "MoveUp",
        MOVEDOWN: "MoveDown",
        MOVELEFT: "MoveLeft",
        MOVERIGHT: "MoveRight",
        ATTACK: "Attack"
    }
    
    # max heath to start with
    max_health = 3
    
    # space is of size n x n 
    # (0, 0) is at the top left corner
    # x represents the vertical direction
    # y represents the horizontal direction
    space_size_n = 3
    
    # miss rate on any one attack
    attack_miss_rate = 0.1
    
    # each attack takes some health
    attak_power = 1
    
    # reward of win a game
    game_award = 100
    
        
    def generate_init_pos(self):
        player1_init_pos = [random.randrange(LOREnv1.space_size_n), random.randrange(LOREnv1.space_size_n)]
        player2_init_pos = [random.randrange(LOREnv1.space_size_n), random.randrange(LOREnv1.space_size_n)]
        
        while player1_init_pos == player2_init_pos:
          player2_init_pos = [random.randrange(LOREnv1.space_size_n), random.randrange(LOREnv1.space_size_n)]
        
        return player1_init_pos, player2_init_pos

    def __init__(self, config):
        self.action_space = Discrete(5)
        
        # the observation is a tuple: [self_pos_x, self_pos_y, self.health, pos_x, pos_y, health]
        # start with a discrete space
        self.observation_space = Tuple(
            [
                # self position in x/y
                Box(low = 0, high = LOREnv1.space_size_n - 1, shape=(2, ), dtype=np.int16),
                # opponent position in x/y
                Box(low = 0, high = LOREnv1.space_size_n - 1, shape=(2, ), dtype=np.int16),
                # self health and opponent health
                Box(low = 0, high = LOREnv1.max_health, shape=(2, ), dtype=np.int16),
                
            ]
        )
        
        self.player1 = "player1"
        self.player2 = "player2"
        
        # set init position
        self.player1_init_pos, self.player2_init_pos = self.generate_init_pos()
        
        self.position = {
                self.player1: self.player1_init_pos,
                self.player2: self.player2_init_pos
        }
        
        self.health = {
            self.player1: LOREnv1.max_health,
            self.player2: LOREnv1.max_health
        }
        
        # For test-case inspections (compare both players' scores).
        self.player1_score = self.player2_score = 0

    # reset the env
    # return the initial observation
    def reset(self):
        self.player1_init_pos, self.player2_init_pos = self.generate_init_pos()
        
        self.position = {
                self.player1: self.player1_init_pos,
                self.player2: self.player2_init_pos
        }
        
        self.health = {
            self.player1: LOREnv1.max_health,
            self.player2: LOREnv1.max_health
        }
        
        return {
            self.player1: tuple(
                [
                    np.array([self.position[self.player1][0], self.position[self.player1][1]]),
                    np.array([self.position[self.player2][0], self.position[self.player2][1]]),
                    np.array([self.health[self.player1], self.health[self.player2]])
                ]
            ),
            self.player2: tuple(
                [
                    np.array([self.position[self.player2][0], self.position[self.player2][1]]),
                    np.array([self.position[self.player1][0], self.position[self.player1][1]]),
                    np.array([self.health[self.player2], self.health[self.player1]])
                ]
            )
        }
    
    def move_agent(self, player, opponent, action):
        if self.health[player] <= 0:  # no health no action
            return
        
        if action == LOREnv1.MOVEUP or action == LOREnv1.MOVEDOWN:
            new_x = self.position[player][0] + (1 if action == LOREnv1.MOVEDOWN else -1)
            if new_x < 0 or new_x >= LOREnv1.space_size_n \
            or (self.position[opponent][0] == new_x and self.position[opponent][1] == self.position[player][1]):
                return # invalid move
            else:
                self.position[player][0] = new_x
                
        if action == LOREnv1.MOVELEFT or action == LOREnv1.MOVERIGHT:
            new_y = self.position[player][1] + (1 if action == LOREnv1.MOVERIGHT else -1)
            if new_y < 0 or new_y >= LOREnv1.space_size_n \
            or (self.position[opponent][1] == new_y and self.position[opponent][0] == self.position[player][0]):
                return # invalid move
            else:
                self.position[player][1] = new_y
        
        return # not a move action

    def attack_agent(self, player, opponent, action):
        if action != LOREnv1.ATTACK or self.health[player] <= 0:
            return 0 # 0 attack gain
        
        
        hit =  0 if random.random() < LOREnv1.attack_miss_rate else 1
        # attack is only valid if the two agents are adjacent (not diagonal)
        if (self.position[player][0] == self.position[opponent][0] and abs(self.position[player][1] - self.position[opponent][1]) <= 1) \
            or (self.position[player][1] == self.position[opponent][1] and abs(self.position[player][0] - self.position[opponent][0]) <= 1):
            self.health[opponent] = self.health[opponent] - hit * LOREnv1.attak_power
            
            return hit * LOREnv1.attak_power
        else:
            return 0 
    
    def get_reward(self, player, opponent, attack_gain):
        if self.health[player] <=0 and self.health[opponent] > 0:
            return -1 * LOREnv1.game_award
        
        if self.health[player] > 0 and self.health[opponent] <= 0:
            return LOREnv1.game_award
        
        if self.health[player] == 0 and self.health[opponent] == 0:
            return 0
        
        return attack_gain
    
    def get_reward2(self, attack_gain_player1, attack_gain_player2):
        if self.health[self.player1] <=0 and self.health[self.player2] > 0:
            return [-1 * LOREnv1.max_health, LOREnv1.max_health]
        
        if self.health[self.player1] > 0 and self.health[self.player2] <= 0:
            return [LOREnv1.max_health, -1 * LOREnv1.max_health]
        
        if self.health[self.player1] == 0 and self.health[self.player2] == 0:
            return [-1 * LOREnv1.max_health, -1 * LOREnv1.max_health]
        
        return [attack_gain_player1, attack_gain_player2]
        
    
    
    # update state and observation based on the 2 actions
    def step(self, action_dict):        
        # update position     
        # randomly pick who to move first (if both decide to move)
        who_moves_first = self.player1 if random.random() < 0.5 else self.player2
        
        if who_moves_first == self.player1: 
            self.move_agent(self.player1, self.player2, action_dict[self.player1])
            self.move_agent(self.player2, self.player1, action_dict[self.player2])
        else:
            self.move_agent(self.player2, self.player1, action_dict[self.player2])
            self.move_agent(self.player1, self.player2, action_dict[self.player1])
        
        # update attack 
        # randomly pick who to attack first (if both decide to attach)
        who_attacks_first = self.player1 if random.random() < 0.5 else self.player2
        
        if who_attacks_first == self.player1:
            attack_gain_player1 = self.attack_agent(self.player1, self.player2, action_dict[self.player1])
            attack_gain_player2 = self.attack_agent(self.player2, self.player1, action_dict[self.player2])
        else:
            attack_gain_player2 = self.attack_agent(self.player2, self.player1, action_dict[self.player2])
            attack_gain_player1 = self.attack_agent(self.player1, self.player2, action_dict[self.player1])
            
        # get the new obs
        obs = {
            self.player1: tuple(
                [
                    np.array([self.position[self.player1][0], self.position[self.player1][1]]),
                    np.array([self.position[self.player2][0], self.position[self.player2][1]]),
                    np.array([self.health[self.player1], self.health[self.player2]])
                ]
            ),
            self.player2: tuple(
                [
                    np.array([self.position[self.player2][0], self.position[self.player2][1]]),
                    np.array([self.position[self.player1][0], self.position[self.player1][1]]),
                    np.array([self.health[self.player2], self.health[self.player1]])
                ]
            )
        }
        
        # get the reward
        rew = {
            self.player1: self.get_reward(self.player1, self.player2, attack_gain_player1),
            self.player2: self.get_reward(self.player2, self.player1, attack_gain_player2),
        }
        
        done = {
            "__all__": self.health[self.player1] == 0 or self.health[self.player2] == 0,
        }

        if rew["player1"] == LOREnv1.game_award:
            self.player1_score += 1
        elif rew["player2"] == LOREnv1.game_award:
            self.player2_score += 1

        return obs, rew, done, {}
    


In [17]:
from ray.rllib.policy.policy import Policy

class LORHeuristicCautious(Policy):
    """
    Heuristic policy
    if self.health >= opponent.health and self.health > 1:
        if self and opponent is adjacent:
            attack
        else:
            move torwards the opponent
    else:
        if self and opponent is adjacent:
            move away from the opponent 
        else:
            attack   
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()
    
    @staticmethod
    def take_action(obv):
        # each ob is np array (self.x, self.y, oponent.x, oppoennt.y, self.health, opponent.health)
        self_x = obv[0]
        self_y = obv[1]
        op_x = obv[2]
        op_y = obv[3]
        self_h = obv[4]
        op_h = obv[4]
        
        if self_h >= op_h and self_h > 1:
            if (self_x == op_x and abs(self_y - op_y) <= 1) or (self_y == op_y and abs(self_x - op_x) <= 1):
                return LOREnv1.ATTACK
            else:
                if self_x != op_x:
                    return LOREnv1.MOVEUP if self_x > op_x else LOREnv1.MOVEDOWN
                else:
                    return LOREnv1.MOVELEFT if self_y > op_y else LOREnv1.MOVERIGHT
        else:
            if (self_x == op_x and abs(self_y - op_y) <= 1) or (self_y == op_y and abs(self_x - op_x) <= 1):
                if self_x == op_x:
                    return LOREnv1.MOVEUP if self_x == LOREnv1.space_size_n -1  else LOREnv1.MOVEDOWN
                else:
                    return LOREnv1.MOVELEFT if self_y == LOREnv1.space_size_n -1  else LOREnv1.MOVERIGHT
            else:
                return LOREnv1.ATTACK

                

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):

        return [LORHeuristicCautious.take_action(x) for x in obs_batch], [], {}
    
    def learn_on_batch(self, samples):
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass

    
class LORHeuristicReckless(Policy):
    """
    Heuristic policy
    if self and opponent is adjacent:
        attack
    else
        move torwards the opponent
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()
    
    @staticmethod
    def take_action(obv):
        # each ob is np array (self.x, self.y, oponent.x, oppoennt.y, self.health, opponent.health)
        self_x = obv[0]
        self_y = obv[1]
        op_x = obv[2]
        op_y = obv[3]
        self_h = obv[4]
        op_h = obv[4]
        
        if (self_x == op_x and abs(self_y - op_y) <= 1) or (self_y == op_y and abs(self_x - op_x) <= 1):
            return LOREnv1.ATTACK
        else:
            if self_x != op_x:
                return LOREnv1.MOVEUP if self_x > op_x else LOREnv1.MOVEDOWN
            else:
                return LOREnv1.MOVELEFT if self_y > op_y else LOREnv1.MOVERIGHT
                

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):

        return [LORHeuristicReckless.take_action(x) for x in obs_batch], [], {}
    
    def learn_on_batch(self, samples):
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass

In [18]:
import ray
from ray.rllib.agents.dqn import DQNTrainer

ray.shutdown()
ray.init()

def select_policy(agent_id):
    if agent_id == "player1":
        return "learned"
    else:
        return "LORHeuristicReckless"

env = LOREnv1({})
    
config = {
    "env": LOREnv1,
    "gamma": 0.9,
    "num_workers": 0,
    "num_envs_per_worker": 4,
    "rollout_fragment_length": 10,
    "train_batch_size": 500,
    "multiagent": {
        "policies_to_train": ["learned"],
        "policies": {
            "LORHeuristicReckless": (LORHeuristicReckless, env.observation_space, env.action_space, {}),
            "learned": (None, env.observation_space, env.action_space, {
                "model": {
                        "use_lstm": True
                },
            }),
        },
        "policy_mapping_fn": select_policy,
    },
}

trainer_obj = DQNTrainer(config=config)
env = trainer_obj.workers.local_worker().env
for _ in range(50):
    results = trainer_obj.train()
    #print(results)
    
    #if _ % 100 == 0:
    print(env.player1_score, env.player2_score)

2020-06-16 03:11:06,410	INFO resource_spec.py:212 -- Starting Ray with 27.2 GiB memory available for workers and up to 13.61 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-16 03:11:06,850	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8268[39m[22m
2020-06-16 03:11:09,221	INFO trainable.py:217 -- Getting current IP.


3 30
13 51
30 76
46 100
64 116
81 132
99 150
120 162
139 169
181 182
233 189
259 196
297 213
339 216
395 221
448 229
504 233
555 240
609 245
663 249
719 251
772 253
828 255
883 255
936 261
986 263
1041 265
1095 265
1146 268
1201 269
1252 272
1314 272
1371 273
1423 275
1475 277
1528 281
1586 281
1641 286
1697 288
1755 288
1812 288
1870 289
1926 289
1984 290
2037 290
2092 291
2147 291
2199 291
2258 291
2318 292


In [45]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    
def print_obv(env, obv, size):
    for i in range(size):
        for j in range(size):
            if i == obv[env.player1][0][0] and j == obv[env.player1][0][1]:
                print(f"{bcolors.FAIL}{obv[env.player1][2][0]}{bcolors.ENDC}", end = "")
            elif i == obv[env.player2][0][0] and j == obv[env.player2][0][1]:
                print(f"{bcolors.OKBLUE}{obv[env.player2][2][0]}{bcolors.ENDC}", end = "")
            else:
                print("-", end = "")
        print("")

        
def simulate(env, trainer1, trainer2, size):
    obv = env.reset()
    done = {"__all__" : False}
    
    while done["__all__"] == False:
        a1 = trainer1.compute_action(obv[env.player1])
        a2 = trainer2.compute_action(obv[env.player2])
        
        
        print_obv(env, obv, size)
        print("".join(['*']*(size)) , end = " action ")
        print(f"{bcolors.FAIL}{env.action_string[a1]}{bcolors.ENDC}", end = " , ")
        print(f"{bcolors.OKBLUE}{env.action_string[a2]}{bcolors.ENDC}")
        
        obv, reward, done, info = env.step({env.player1: a1, env.player2: a2})
    
    print_obv(env, obv, size)
    
def simulate2(env, trainer1, size, base_policy):
    obv = env.reset()
    done = {"__all__" : False}
    
    while done["__all__"] == False:
        a1 = trainer1.compute_action(observation = obv[env.player1], policy_id = "learned")
        a2 = trainer1.compute_action(observation = obv[env.player2], policy_id = base_policy)
        
        
        print_obv(env, obv, size)
        print("".join(['*']*(size)) , end = " action ")
        print(f"{bcolors.FAIL}{env.action_string[a1]}{bcolors.ENDC}", end = " , ")
        print(f"{bcolors.OKBLUE}{env.action_string[a2]}{bcolors.ENDC}")
        
        obv, reward, done, info = env.step({env.player1: a1, env.player2: a2})
    
    print_obv(env, obv, size)

In [35]:
# disable random explore of the DQN traininer so that it follows the policy prediction.
trainer_obj.get_policy("learned").config['explore'] = False


In [41]:
trainer_obj.get_policy("learned").model.base_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 6)]          0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          1792        observations[0][0]               
__________________________________________________________________________________________________
fc_out (Dense)                  (None, 256)          65792       fc_1[0][0]                       
__________________________________________________________________________________________________
value_out (Dense)               (None, 1)            257         fc_1[0][0]                       
Total params: 67,841
Trainable params: 67,841
Non-trainable params: 0
________________________

In [38]:
sim_env = LOREnv1({})
    
simulate2(sim_env, trainer_obj, 3, "LORHeuristicReckless")

---
--[91m3[0m
--[94m3[0m
*** action [91mMoveLeft[0m , [94mAttack[0m
---
-[91m3[0m-
--[94m3[0m
*** action [91mAttack[0m , [94mMoveUp[0m
---
-[91m3[0m[94m2[0m
---
*** action [91mAttack[0m , [94mAttack[0m
---
-[91m2[0m[94m1[0m
---
*** action [91mAttack[0m , [94mAttack[0m
---
-[91m2[0m[94m0[0m
---


In [39]:
sim_env = LOREnv1({})
    
simulate2(sim_env, trainer_obj, 3, "LORHeuristicReckless")

--[94m3[0m
[91m3[0m--
---
*** action [91mAttack[0m , [94mMoveDown[0m
---
[91m3[0m-[94m3[0m
---
*** action [91mAttack[0m , [94mMoveLeft[0m
---
[91m3[0m[94m2[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
---
[91m2[0m[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
---
[91m2[0m[94m0[0m-
---


In [42]:
import ray
from ray.rllib.agents.dqn import DQNTrainer

ray.shutdown()
ray.init()

def select_policy(agent_id):
    if agent_id == "player1":
        return "learned"
    else:
        return "LORHeuristicCautious"

env_constant = LOREnv1({})
    
config = {
    "env": LOREnv1,
    "gamma": 0.9,
    "num_workers": 0,
    "num_envs_per_worker": 4,
    "rollout_fragment_length": 10,
    "train_batch_size": 500,
    "multiagent": {
        "policies_to_train": ["learned"],
        "policies": {
            "LORHeuristicCautious": (LORHeuristicCautious, env_constant.observation_space, env_constant.action_space, {}),
            "learned": (None, env_constant.observation_space, env_constant.action_space, {
                "model": {
                        "use_lstm": True
                },
            }),
        },
        "policy_mapping_fn": select_policy,
    },
}

trainer_obj2 = DQNTrainer(config=config)
local_env = trainer_obj2.workers.local_worker().env
for _ in range(50):
    results = trainer_obj2.train()
    #print(results)
    
    #if _ % 100 == 0:
    print(local_env.player1_score, local_env.player2_score)

2020-06-16 03:30:17,867	INFO resource_spec.py:212 -- Starting Ray with 27.15 GiB memory available for workers and up to 13.58 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-16 03:30:18,242	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8268[39m[22m
2020-06-16 03:30:20,549	INFO trainable.py:217 -- Getting current IP.


0 19
0 42
0 60
0 70
0 86
0 89
0 90
0 90
0 91
0 92
0 92
0 93
0 93
0 93
0 93
0 93
0 93
0 93
0 93
0 93
0 93
0 94
0 94
0 94
0 94
0 94
0 95
0 95
0 95
0 95
0 95
0 95
0 95
0 96
0 96
0 96
0 96
0 98
0 99
0 100
0 100
0 100
0 102
0 103
0 103
0 107
0 108
0 108
0 111
0 113


In [43]:
trainer_obj2.get_policy("learned").config['explore'] = False

In [47]:
sim_env = LOREnv1({})
    
simulate2(sim_env, trainer_obj2, 3, "LORHeuristicCautious")

-[91m3[0m-
---
-[94m3[0m-
*** action [91mMoveLeft[0m , [94mMoveUp[0m
[91m3[0m--
-[94m3[0m-
---
*** action [91mAttack[0m , [94mMoveUp[0m
[91m3[0m[94m2[0m-
---
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m[94m1[0m-
---
---
*** action [91mAttack[0m , [94mMoveDown[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1[0m-
---
*** action [91mAttack[0m , [94mAttack[0m
[91m2[0m--
-[94m1

KeyboardInterrupt: 