In [1]:
"""
Code to locally create and train an agent to play Halite IV, hosted by Kaggle - for rules of competition see link:
https://www.kaggle.com/c/halite

In summary the rules are 4 players on a board of size 21*21. Each player has some numbers of ships
(which can move, stay still, or become a shipyard), and shipyards, which can produce ships or skip their turn. 
All actions for all pieces are resolved simultaneously. The goal of the game is to collect as much of a resource,
'halite' as possible. Each square has some halite. Ships using their turn to stay still collect 25% of the halite 
on their respective square. Building ships and shipyards costs halite. Collisions between any two pieces lead to
all (or all but one) of them being destroyed, even if they are from the same player, with rules as specified in the link 
above. The game terminates after 400 moves, or earlier if only one player is still 'alive' (i.e has ships or shipyards
and enough halite to build a new ship).


The approach taken in this code is to train a neural network to play this game. In particular, two neural networks are 
trained, such that one takes as input the observation of a single ship and recommends one of five actions (move in one of
the four directions North/East/South/West, stay still) and an analogous neural network for a shipyard recommending one of 
two actions (build ship/skip). 

This is a reinforcement learning problem. An environment is created which converts the game board information to an
array to use as input for the neural network, then converts the resulting output into moves, and resolves these moves 
to produce the next set of "observations" for all relevant pieces. When a game is terminated for one of the pieces (i.e
it is destroyed or the game turn limit is reached), the environment also calculates a reward for each piece that played
in the episode so that the neural network could be updated.

The particular training algorithm used is Proximal Policy Optimization (PPO) - https://arxiv.org/abs/1707.06347

After training, the resulting neural network is saved (along with checkpoints throughout). A game is played (against
itself or basic hard-coded opponents) and then visualized.


"""

import numpy as np 

#import the game environment and its helper functions
from kaggle_environments import make, evaluate
from kaggle_environments.envs.halite.helpers import *


import sys
import random

#Use a gym to train the agents
import gym
from gym import spaces


#ray rllib is used to create and train the agents - an earlier version (ray 0.8.6) must be used
#to be compatible with this code.
import ray
from ray.tune.result import (NODE_IP, TRAINING_ITERATION, TIME_TOTAL_S,
                             TIMESTEPS_TOTAL, EXPR_PARAM_FILE,
                             EXPR_PARAM_PICKLE_FILE, EXPR_PROGRESS_FILE,
                             EXPR_RESULT_FILE)
from ray.tune.utils import flatten_dict
from ray import tune

VALID_SUMMARY_TYPES = [int, float, np.float32, np.float64, np.int32, np.int64]


print(ray.__version__)

Loading environment football failed: No module named 'gfootball'
0.8.6


In [2]:
#override some functions in ray libraries (fixes have yet to be rolled out while this code was written)

def _on_result(self, result):
    """Replacement for ``TBXLogger.on_result``: write one training result to TensorBoardX.

    Scalars whose exact type is listed in ``VALID_SUMMARY_TYPES`` (and that are
    not NaN) are logged with ``add_scalar``; non-empty lists and numpy arrays are
    logged with ``add_histogram``. Everything that was accepted is also stored on
    ``self.last_result``.

    Args:
        self: The logger instance; this function reads ``self._file_writer`` and
            writes ``self.last_result``.
        result (dict): Training result. Must contain ``TRAINING_ITERATION`` when
            ``TIMESTEPS_TOTAL`` is missing or falsy.
    """
    # Use the total timestep count as the x-axis, falling back to the iteration number.
    step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

    tmp = result.copy()
    for k in ["config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION]:
        tmp.pop(k, None)  # not useful to log these

    flat_result = flatten_dict(tmp, delimiter="/")
    path = ["ray", "tune"]
    valid_result = {}

    for attr, value in flat_result.items():
        full_attr = "/".join(path + [attr])
        if type(value) in VALID_SUMMARY_TYPES and not np.isnan(value):
            valid_result[full_attr] = value
            self._file_writer.add_scalar(full_attr, value, global_step=step)
        elif (type(value) == list and len(value) > 0) or (
                type(value) == np.ndarray and value.size > 0):
            valid_result[full_attr] = value
            try:
                self._file_writer.add_histogram(
                    full_attr, value, global_step=step)
            # In case TensorboardX still doesn't think it's a valid value
            # (e.g. `[[]]`), warn and move on.
            except (ValueError, TypeError):
                # Imported here because the notebook never brings `log_once` or a
                # `logger` into scope; without this the handler itself raised
                # NameError instead of warning.
                import logging
                from ray.util.debug import log_once

                if log_once("invalid_tbx_value"):
                    logging.getLogger(__name__).warning(
                        "You are trying to log an invalid value ({}={}) "
                        "via {}!".format(full_attr, value,
                                         type(self).__name__))

    self.last_result = valid_result
    self._file_writer.flush()
def same_padding(in_size, filter_size, stride_size):
    """Compute the zero padding that reproduces TF conv2d `same` padding.

    See www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution

    Args:
        in_size (tuple): Rows (Height), Column (Width) for input
        filter_size (Union[int,Tuple[int, int]]): Rows (Height), column (Width)
            for filter. If int, height == width.
        stride_size (Union[int,Tuple[int, int]]): Rows (Height), column (Width)
            for stride. If int, height == width.

    Returns:
        padding (tuple): ``(left, right, top, bottom)`` for input into
            torch.nn.ZeroPad2d. Never negative.
        output (tuple): ``(height, width)`` after padding and convolution, as
            the floats produced by ``np.ceil``.
    """
    in_height, in_width = in_size
    if isinstance(filter_size, int):
        filter_height, filter_width = filter_size, filter_size
    else:
        filter_height, filter_width = filter_size
    # The docstring always allowed an int stride, but the code used to unpack
    # it unconditionally and raised TypeError for ints.
    if isinstance(stride_size, int):
        stride_height, stride_width = stride_size, stride_size
    else:
        stride_height, stride_width = stride_size

    out_height = np.ceil(float(in_height) / float(stride_height))
    out_width = np.ceil(float(in_width) / float(stride_width))

    # TF clamps the total padding at zero; when the filter is smaller than the
    # stride the raw formula can go negative, which ZeroPad2d would interpret
    # as cropping.
    pad_along_height = max(
        int((out_height - 1) * stride_height + filter_height - in_height), 0)
    pad_along_width = max(
        int((out_width - 1) * stride_width + filter_width - in_width), 0)

    # Any odd leftover pixel goes to the bottom/right side.
    pad_top = pad_along_height // 2
    pad_bottom = pad_along_height - pad_top
    pad_left = pad_along_width // 2
    pad_right = pad_along_width - pad_left
    padding = (pad_left, pad_right, pad_top, pad_bottom)
    output = (out_height, out_width)
    return padding, output

# Monkeypatch: every TBXLogger now uses the `_on_result` function defined above
# as its `on_result` method.
tune.logger.TBXLogger.on_result = _on_result

from ray.rllib.agents.registry import get_agent_class
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.models.torch.misc import SlimFC, AppendBiasLayer,normc_initializer,SlimConv2d
#from ray.rllib.models.tf.visionnet_v1 import _get_filter_config
from ray.tune import run_experiments
from ray.tune.registry import register_env
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.utils.framework import try_import_tf, try_import_torch

In [3]:
#we use the pytorch framework for the NN
torch, nn = try_import_torch()
tf = try_import_tf()

In [4]:
#As the environment has many active "players", the number of which is not constant through the game, 
#a multiagent environment is used.
from ray.rllib.env.multi_agent_env import MultiAgentEnv



#Define the Gym
class HaliteGym(MultiAgentEnv):
    def __init__(self, return_agent_actions = False, part=False,opponents=["random","random","random"]):
        """Create the Kaggle Halite environment and the per-game bookkeeping.

        Args:
            return_agent_actions: Not read anywhere in this constructor.
            part: Not read anywhere in this constructor.
            opponents: Opponent specification stored on ``self.opponents``. By
                default three "random" opponents. A list argument is copied so the
                instance never aliases the shared default list or the caller's list.
        """
        # Create game environment using kaggle framework
        board_size = 21
        self.configuration = {"size": board_size,"maxCellHalite":500}
        self.ks_env = make("halite", configuration=self.configuration)

        # The maximum number of active agents
        self.num_agents = 50

        # The number of players
        self.player_count = 4
        self.ks_env.reset(self.player_count)

        # Unless otherwise specified, play against random opponents, i.e ones which pick all moves at random.
        # Copy list arguments: the default is a single list object shared by every call.
        self.opponents = list(opponents) if isinstance(opponents, list) else opponents

        self.env = self.ks_env
        state = self.ks_env.state[0]

        # From here on, use the configuration object held by the environment
        # (attribute access) instead of the plain dict passed to make().
        self.configuration = self.ks_env.configuration
        self.startingHalite = self.ks_env.configuration.startingHalite
        self.size = self.ks_env.configuration.size

        # These will be used to normalize the inputs to the NN
        self.maximumHalite = 10000 + self.startingHalite
        self.maximumCargo = self.configuration.maxCellHalite

        self.board = Board(state.observation,self.configuration)

        # Index -> action lookup tables for the discrete network outputs.
        self.ship_actions = [None, ShipAction.NORTH,ShipAction.SOUTH,ShipAction.EAST,
                             ShipAction.WEST]
        self.shipyard_actions = [ShipyardAction.SPAWN,None]

        # NOTE(review): this player object belongs to the board built above and is
        # not refreshed anywhere in this constructor.
        self.player = self.board.current_player

        # initial values
        self.ship_num = 1
        self.shipyard_num = 0
        self.halite_store = self.player.halite

        # Trackers used throughout the game, all of which are reset after each game is completed.
        #
        # - piece_history contains all the pieces, both dead and alive. In the case that the piece
        #   is a ship, the value saved for that key is the halite stored on that ship.
        # - dead_pieces contains all pieces that were on the board at this game but were destroyed.
        # - reward_tracker saves the cumulative reward given for each piece.
        # - collect_reward is a counter saved for giving rewards to a ship which collected some
        #   halite in its turn.
        # - destroyed_shipyard_penalty is a counter for penalising ships if they allowed a shipyard
        #   to be destroyed.
        # - changed_pieces is a list of all the pieces for which the action provided by the NN was
        #   cancelled (e.g if they would cause a collision, we would like to prevent that).
        self.piece_history = {}
        self.dead_pieces ={}
        self.reward_tracker = {}
        self.collect_reward = 0
        self.destroyed_shipyard_penalty = 0
        self.changed_pieces = []

        # reward to add to each piece if turn 400 is reached relating to which place the overall player scored.
        self.place_rewards = [1.0,0.2,-0.2,-1]

        # Define observation and action spaces. Four planes of size x size:
        # planes 0 and 3 are bounded to [0, 1], planes 1 and 2 to [-1, 1].
        # Sized from the environment's board size rather than a literal 21.
        plane_shape = (self.size, self.size)
        obs_low = np.stack((np.zeros(plane_shape),-1*np.ones(plane_shape),
                            -1*np.ones(plane_shape),np.zeros(plane_shape)))
        obs_high = np.ones((4,) + plane_shape)

        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(obs_low,obs_high)

        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-1, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None
        
    """
    This function takes in the "raw" observation (state) given by the kaggle environment 
    and returns a 3D array corresponding to the observation passed to the NN. The observation
    returned has 4 layers of 21*21, containing:
    1. the normalised halite on each square of the board
    2. 0.5 + normalised ship's cargo if a friendly ship is present on that square, -1*that amount for enemy ships
    3. 1 if a friendly shipyard is present, -1 if an enemy shipyard is present, 0 otherwise.
    4. Some other hyperparameters relevant to the observation: the amount of halite a player has available to use,
    and the turn number, both normalised
    
    """
    def obs_array(self,state,player_number):
        board_size = self.size
        raw_obs = state

        
        halite = np.array(state['halite'])/(self.configuration.maxCellHalite)
        
        player_1_ships = np.zeros(board_size*board_size)
        player_1_shipyards = np.zeros(board_size*board_size)
         
        other_params_array = np.zeros(board_size*board_size)
        
        #Iterate over friendly pieces and add them to array
        for key in raw_obs['players'][player_number][2]:
            player_1_ships[raw_obs['players'][player_number][2][key][0]] += \
            (0.5 + raw_obs['players'][player_number][2][key][1]/(100*self.maximumCargo)) 
        for key in raw_obs['players'][player_number][1]:
            player_1_shipyards[raw_obs['players'][player_number][1][key]] += 1
            halite[raw_obs['players'][player_number][1][key]] = 0
        
        #Other players
        player_2 = (player_number+1)%4
        player_3 = (player_number+2)%4
        player_4 = (player_number+3)%4

        #iterate over other players and add them to array
        for key in raw_obs['players'][player_2][2]:
            player_1_ships[raw_obs['players'][player_2][2][key][0]] +=\
            -(0.5 + raw_obs['players'][player_2][2][key][1]/(100*self.maximumCargo)) 
        for key in raw_obs['players'][player_2][1]:
            player_1_shipyards[raw_obs['players'][player_2][1][key]] -= 1
            halite[raw_obs['players'][player_2][1][key]] = 0
            
            
        for key in raw_obs['players'][player_3][2]:
            player_1_ships[raw_obs['players'][player_3][2][key][0]] +=\
            -(0.5 + raw_obs['players'][player_3][2][key][1]/(100*self.maximumCargo)) 
        for key in raw_obs['players'][player_3][1]:
            player_1_shipyards[raw_obs['players'][player_3][1][key]] -= 1
            halite[raw_obs['players'][player_3][1][key]] = 0

            
            
        for key in raw_obs['players'][player_4][2]:
            player_1_ships[raw_obs['players'][player_4][2][key][0]] +=\
            -(0.5 + raw_obs['players'][player_4][2][key][1]/(100*self.maximumCargo)) 
        for key in raw_obs['players'][player_4][1]:
            player_1_shipyards[raw_obs['players'][player_4][1][key]] -= 1
            halite[raw_obs['players'][player_4][1][key]] = 0


        halite = halite.reshape(board_size,board_size)
        player_1_ships = player_1_ships.reshape(board_size,board_size)
        player_1_shipyards = player_1_shipyards.reshape(board_size,board_size)

        
        """
        Here we help the NN by slightly manipulating the observations slightly.
        If a shipyard is present, we reduce the observed halite at each square by a factor
        equal to the distance between the square and where the shipyard is, to place a higher
        importance on squares near the shipyard.
        
        Otherwise, the ships tend to stray too far from the shipyard and become lose.
        """
        if len(self.board.players[player_number].shipyards)>0:
            shipyard_pos = np.asarray(np.where(player_1_shipyards==1)).T[0]
            #print(shipyard_pos)
            x_coord,y_coord = shipyard_pos[0],shipyard_pos[1]
            #print(x_coord,y_coord)
            for i in range(0,21):
                for j in range(0,21):
                    if (i,j) != (x_coord,y_coord):
                        rel_dist = self.distance(i,x_coord,j,y_coord)
                        halite[i][j]/=rel_dist
                    
        else:
            shipyard_pos = None
            rel_dist = 1
            

        #stack the arrays and return 3D array
        other_params_array = np.zeros((board_size,board_size))
        other_params_array[0][0] = raw_obs['players'][player_number][0]/self.maximumHalite
        other_params_array[0][1] = raw_obs.step/400

        obs = np.stack((halite,player_1_ships,player_1_shipyards,\
                        other_params_array))



        return obs
    
    
    """
    This function takes the observation array produced by the function above and permutes the arrays so that
    they are centred around the current ship that we are predicting for, to contextualise the actions.
    
    """
    def recentre(self,observation,piece,player):
        position = piece.position
        x_coord = position[0]
        y_coord = position[1]
        
        to_shift = observation[:-1,:,:]
        other_params_array = observation[-1,:,:]
        
        shifted = np.roll(to_shift, shift =(y_coord-10,10-x_coord)  , axis = (1,2))
        
        new_obs = np.concatenate((shifted,[other_params_array]),axis=0)
        
        #If piece ship, add parameter to observation equal to cargo ship is carrying. If ship is shipyard,
        # add parameter for number of ships our player has.
        if piece in player.ships:
            new_obs[-1,0,2] = piece.halite/(50*self.maximumCargo)
            
            """
            Again we manipulate the observations slightly to help contextualise the observation. Once a ship is 
            carrying a lot of halite, we want it to return to the shipyard to "deposit" it. So once we are carrying a lot
            of halite, we reduce the observed halite on surrounding squares and increase the preceived halite on the square
            where the shipyard is.
            """
            if piece.halite>0:
                if len(player.shipyards)>0:
                    
                    scaledown_factor = max(1,piece.halite/100)
                    shipyard_val = min(1,piece.halite/150)
                    
                    shipyard_pos = np.asarray(np.where(new_obs[2,:,:]==1)).T[0]
                    x_coord,y_coord = shipyard_pos[0],shipyard_pos[1]

                    new_obs[0,:,:] /= scaledown_factor
                    new_obs[0,x_coord,y_coord] = shipyard_val
        elif piece in player.shipyards:
            new_obs[-1,0,2] = len(player.ships)/(21**2)
            
            

            
        return new_obs
    

    #function to calculate distance between points
    
    def distance(self,x1,x2,y1,y2):
        """Euclidean distance between (x1, y1) and (x2, y2) on a wrap-around board.

        Note the argument order: both x values first, then both y values.

        Each axis separation is measured the short way round the board, so it is
        never more than half the board size. The previous check only wrapped
        separations strictly greater than 11, so a separation of exactly 11 on a
        21-wide board was reported as 11 instead of 10.

        Args:
            x1, x2: coordinates along the first axis.
            y1, y2: coordinates along the second axis.

        Returns:
            The distance as returned by ``np.sqrt``.
        """
        # Use the instance's board size when it has one; 21 is the size this
        # method previously hard-coded.
        board_size = getattr(self, "size", 21)

        dx = abs(x2 - x1)
        dy = abs(y2 - y1)
        dx = min(dx, board_size - dx)
        dy = min(dy, board_size - dy)

        return np.sqrt(dx**2 + dy**2)
        
        
    #Shipyard building is automated. Out of all the ships that can build a shipyard, pick the most central one.
    def building_shipyard(self,player):
        possible_builds = [ship for ship in player.ships if ship.halite+player.halite >500]
        if len(possible_builds)==0:
            return None
        else:
            average_x = np.mean(np.array([ship.position.x for ship in player.ships]))
            average_y = np.mean(np.array([ship.position.y for ship in player.ships]))
            smallest_distance = 10**6
            ship_id = None
            for ship in possible_builds:
                if self.distance(ship.position.x,average_x,ship.position.y,average_y) < smallest_distance:
                    ship_id = ship.id
                    smallest_distance = self.distance(ship.position.x,average_x,ship.position.y,average_y)
            return ship_id
        
        
    #calculate the final positions of each player at the end of the game based on whether they survived and how much 
    #halite they had at the end
    def final_positions(self):
        final_rewards = {}
        for player_id,player in self.board.players.items():

            if self.env.state[player_id].status == 'ACTIVE':
                relevant_rewards = {k:v for k,v in self.reward_tracker.items() if int(k[6])==player_id}
                final_rewards[player_id] = sum(relevant_rewards.values())
                
                
        return final_rewards
            
            
    

    #Give reward for winning the game
    def win_reward(self,done):

        
        if done:

            return max(0,(self.player.halite-5000))/self.startingHalite
                
        else:
            
            return 0
        

        
    #If a shipyard was destroyed but could have survived by building a ship, penalise it
    def destroyed_shipyard_reward(self,halite_prev):
        self.destroyed_shipyard_penalty = -0.2
        if halite_prev >= 500:
            return -0.5
        else:
            return 0
    
    #penalise ships for being destroyed
    def destroyed_ship_reward(self,piece):
        """Flat penalty for a destroyed ship.

        Args:
            piece: key of the destroyed ship; not used in the calculation.

        Returns:
            Always -1.0.
        """
        destroyed_penalty = -1.0
        return destroyed_penalty
        


    #If a shipyard built a ship when there weren't many, reward it 
    def surviving_shipyard_reward(self,shipyard,player):
        if 'player'+str(player.id)+'shipyard'+shipyard.id in self.prev_action_dict.keys():
            
            
            
            pos = shipyard.position
            action = self.prev_action_dict['player'+str(player.id)+'shipyard'+shipyard.id]
            
            if shipyard.cell.ship in player.ships:
                if 'player'+str(player.id)+'ship'+shipyard.cell.ship.id not in self.prev_action_dict.keys():
                    if len(player.ships)<10:
                        return 0.1
                    else:
                        return 0
                else:
                    return 0
            else:
                return 0
            
            
 
        else:
            return 0

                
                
    #if a ship survived the turn, give it a reward based on how much halite it collected or deposited this turn
    def surviving_ship_reward(self,ship,player_id):
        pos = ship.cell.position
        treward = 0
        #if the ship existed the prior turn (i.e if it wasn't only just created)
        if 'player'+str(player_id)+'ship'+ship.id in self.prev_action_dict.keys():
            action = self.prev_action_dict['player'+str(player_id)+'ship'+ship.id]
            
            #if the action taken was due to an override, don't reward ship for this action
            if 'player'+str(player_id)+'ship'+ship.id not in self.changed_pieces:

                
                #Only reward ships for collecting as long as they aren't already carrying a lot
                if ((self.piece_history['player'+str(player_id)+'ship'+ship.id] <150)):

                    if len(self.board.players[player_id].shipyards)>0:
                        shipyard_pos = self.board.players[player_id].shipyards[0].position
                        rel_dist = self.distance(ship.position.x,shipyard_pos.x,ship.position.y,shipyard_pos.y)
                    else:
                        rel_dist = 10
                    if (max(0,ship.halite-self.piece_history['player'+str(player_id)+'ship'+ship.id]))>0:
                        collected_halite = (max(0,ship.halite-self.piece_history['player'+str(player_id)+'ship'+ship.id]))/(rel_dist*self.maximumCargo)
                        treward += collected_halite
                        self.collect_reward += 0.01 * collected_halite

                
                
                if ((self.piece_history['player'+str(player_id)+'ship'+ship.id] >0)):
                    deposited_halite = (max(0,self.piece_history['player'+str(player_id)+'ship'+ship.id]-ship.halite))/(self.maximumCargo)
                    if deposited_halite > 1:
                        deposited_halite = 1
                    self.reward_tracker['player'+str(player_id)+'ship'+ship.id] += deposited_halite
                    self.collect_reward += 0.01* deposited_halite
                    treward += deposited_halite






                


        else:
            self.reward_tracker['player'+str(player_id)+'ship'+ship.id] = 0
            treward = 0


        return treward

            
            
    
        
            
        
            
        
        
    
    #At end of game, reset game environment and produce new starting observations
    def reset(self):
        """Start a new game and return the first observation of every piece.

        Resets the Kaggle environment for ``self.player_count`` players, rebuilds
        ``self.board`` and clears the per-game trackers.

        Returns:
            dict mapping piece key (``'player<id>ship<ship id>'`` or
            ``'player<id>shipyard<shipyard id>'``) to that piece's recentred
            observation.
        """
        self.raw_obs = self.env.reset(self.player_count)[0].observation
        self.board = Board(self.raw_obs,self.configuration)

        self.piece_history = {}
        self.dead_pieces = {}
        self.dist_to_shipyard = {}
        self.prev_action_dict = {}
        self.next_pos_dict = {}
        self.reward_tracker = {}
        self.changed_pieces = []

        obs = {}
        # Give results for initial pieces - just the one ship in our case.
        for player_id,player in self.board.players.items():
            prefix = 'player'+str(player_id)

            self.base_view = self.obs_array(self.raw_obs,player_id)
            # The halite plane is blanked in the very first observation.
            self.base_view[0,:,:] = 0

            for ship in player.ships:
                ship_key = prefix+'ship'+ship.id
                self.piece_history[ship_key] = 0
                obs[ship_key] = self.recentre(self.base_view,ship,player)
                self.reward_tracker[ship_key] = 0

            # Iterate the shipyards of the player being processed. This used to
            # read self.player.shipyards, i.e. the player object captured in
            # __init__, for every player id.
            for shipyard in player.shipyards:
                yard_key = prefix+'shipyard'+shipyard.id
                obs[yard_key] = self.recentre(self.base_view,shipyard,player)
                self.piece_history[yard_key] = 1

        return obs
    
    #Translate output of NN into actions, pass them into environment parser, and return new observations and rewards
    def step(self, action_dict):
        """Apply the network's actions, advance the game one turn and report the results.

        Args:
            action_dict: maps piece key (``'player<id>ship<ship id>'`` /
                ``'player<id>shipyard<shipyard id>'``) to an index into
                ``self.ship_actions`` / ``self.shipyard_actions``. The dict is
                modified in place when an action is overridden to avoid a
                collision between pieces of the same player, and is kept as
                ``self.prev_action_dict``.

        Returns:
            ``(obs, reward, done, info)`` dicts keyed by piece key. ``done`` also
            carries ``"__all__"``. Pieces that disappeared this turn get a final
            entry with ``done=True``.
        """
        self.board = Board(self.raw_obs,self.configuration)

        self.prev_action_dict = action_dict
        self.collect_reward = 0
        self.destroyed_shipyard_penalty = 0

        obs, reward, done, info = {}, {}, {}, {}

        self.dist_to_shipyard = {}
        self.changed_pieces = []
        self.ship_ids = []

        # In the case where we are training all players at the same time, i.e need to repeat loop for all players
        for player_id,player in self.board.players.items():
            # if there are no shipyards, choose one ship to build into a shipyard
            if len(player.shipyards) == 0:
                builder_id = self.building_shipyard(player)
                if builder_id is not None:
                    self.ship_ids.append(builder_id)

            # A converting ship ends its episode right away. Its last observation
            # is whatever self.base_view currently holds (computed by the previous
            # reset/step).
            for ship in player.ships:
                if ship.id in self.ship_ids:
                    key = 'player'+str(player_id)+'ship'+ship.id
                    obs[key], reward[key], done[key], info[key] = self.base_view,0,True,{}
                    self.dead_pieces[key] = 1

            # Translate action dictionary into moves for all ships and shipyards of this player
            self._plan_moves(player_id, player, action_dict)

            # Collisions detector. If collisions between friendly ships will happen, cancel moves until
            # no more collisions are detected. This is not ideal as would want NN to learn this itself,
            # but was difficult to find successful reward policy to teach this
            self._resolve_friendly_collisions(player_id, player, action_dict)

        # Play the move. End game if ending conditions satisfied
        next_actions = [player.next_actions for player in self.board.players.values()]
        step_result = self.env.step(next_actions)
        active_counter = sum(1 for agent_state in step_result if agent_state.status == 'ACTIVE')
        game_done = (active_counter < 2) or (step_result[0].observation.step == 398)

        self.raw_obs = step_result[0].observation
        self.board = Board(self.raw_obs, self.configuration)

        # update counters, rewards and observations for all pieces for all players
        for player_id,player in self.board.players.items():
            prefix = 'player'+str(player_id)
            self.base_view = self.obs_array(self.raw_obs,player_id)

            for ship in player.ships:
                key = prefix+'ship'+ship.id
                obs[key], reward[key], done[key], info[key] = \
                    self.recentre(self.base_view,ship,player)[:,:,:], \
                    self.surviving_ship_reward(ship,player_id), game_done, {}

            for shipyard in player.shipyards:
                key = prefix+'shipyard'+shipyard.id
                self.piece_history[key] = 1
                obs[key], reward[key], done[key], info[key] = \
                    self.recentre(self.base_view,shipyard,player)[:,:,:], \
                    self.surviving_shipyard_reward(shipyard,player), game_done, {}

            # NOTE(review): self.collect_reward is only reset at the top of step(),
            # so it keeps accumulating across players inside this loop - confirm
            # that sharing it between players is intended.
            for ship in player.ships:
                key = prefix+'ship'+ship.id
                reward[key] += self.collect_reward
                self.piece_history[key] = ship.halite

            # A shipyard whose action was overridden earns nothing this turn.
            # This used to iterate self.player.shipyards (the player object
            # captured in __init__) instead of the current player's shipyards.
            for shipyard in player.shipyards:
                key = prefix+'shipyard'+shipyard.id
                if key in self.changed_pieces:
                    reward[key] = 0

        # Also do this for pieces that died and do not appear in the list of active pieces for the players
        for piece in self.piece_history.keys():
            if (piece not in obs.keys()) and (piece not in self.dead_pieces.keys()):
                if 'shipyard' in piece:
                    # Character 6 of the key is the owning player's id.
                    piece_reward = self.destroyed_shipyard_reward(self.board.players[int(piece[6])].halite)
                else:
                    piece_reward = self.destroyed_ship_reward(piece) + self.reward_tracker[piece]

                obs[piece],reward[piece],done[piece],info[piece] = self.base_view[:,:,:],piece_reward,True,{}
                self.dead_pieces[piece] = 1

        # If the game is finished update the rewards based on final placings.
        if game_done:
            place = 0
            final_rewards = self.final_positions()
            # Repeatedly take the best remaining player: first place, second place, ...
            while len(final_rewards.keys()) > 0:
                player_id = max(final_rewards, key=final_rewards.get)
                player = self.board.players[player_id]

                for ship in player.ships:
                    reward['player'+str(player_id)+'ship'+ship.id] += self.place_rewards[place]

                place += 1
                del final_rewards[player_id]

        done["__all__"] = game_done

        # end of step, return the observation, reward, whether the episode is done, and other info for each piece.
        return obs, reward, done, info

    def _plan_moves(self, player_id, player, action_dict):
        """Set next_action on every piece of `player` and rebuild self.next_pos_dict with the cells they claim."""
        # Which neighbouring cell each movement action leads to.
        neighbour_of = {ShipAction.NORTH: 'north', ShipAction.SOUTH: 'south',
                        ShipAction.WEST: 'west', ShipAction.EAST: 'east'}
        prefix = 'player'+str(player_id)

        # counter of where the action would take the ship if it was taken. Used to detect collisions
        self.next_pos_dict = {}

        for ship in player.ships:
            if ship.id in self.ship_ids:
                # Ships chosen to become a shipyard claim no cell.
                ship.next_action = ShipAction.CONVERT
                continue

            key = prefix+'ship'+ship.id
            ship.next_action = self.ship_actions[action_dict[key]]

            if ship.next_action is None:
                self.next_pos_dict[key] = ship.position
            elif ship.next_action in neighbour_of:
                self.next_pos_dict[key] = getattr(ship.cell, neighbour_of[ship.next_action]).position

        for shipyard in player.shipyards:
            key = prefix+'shipyard'+shipyard.id
            shipyard.next_action = self.shipyard_actions[action_dict[key]]
            # Only a spawning shipyard claims its own cell.
            if shipyard.next_action == ShipyardAction.SPAWN:
                self.next_pos_dict[key] = shipyard.position

    def _resolve_friendly_collisions(self, player_id, player, action_dict):
        """Override actions in `action_dict` until no two pieces of `player` claim the same cell."""
        while True:
            targets = list(self.next_pos_dict.values())
            # No claims at all, or all claims distinct: nothing left to fix.
            if len(targets) == len(set(targets)):
                return

            for pos in set(targets):
                pieces = [k for k, v in self.next_pos_dict.items() if v == pos]
                if len(pieces) < 2:
                    continue

                if any('shipyard' in piece for piece in pieces):
                    # A spawn would land on an incoming/parked ship: cancel the spawn.
                    for piece in pieces:
                        if 'shipyard' in piece:
                            action_dict[piece] = 1
                            self.changed_pieces.append(piece)

                elif any(action_dict[piece] == 0 for piece in pieces):
                    # Somebody is staying put on this cell: everyone else stays put too.
                    for piece in pieces:
                        if action_dict[piece] != 0:
                            self.changed_pieces.append(piece)
                            action_dict[piece] = 0

                else:
                    # All contenders are moving in: the one recorded with the most halite goes.
                    halites = dict((k,self.piece_history[k]) for k in pieces)
                    piece_can_move = max(halites, key=halites.get)
                    for piece in pieces:
                        if piece != piece_can_move:
                            action_dict[piece] = 0
                            self.changed_pieces.append(piece)

            # update the moves played after this pass and check again
            self._plan_moves(player_id, player, action_dict)

In [5]:
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.annotations import override



"""

Vision Network - the exact configuration is specified later. 
The network is convolutional due to the spatial nature of the input data.

"""

class TorchCustomModel(TorchModelV2, nn.Module):
    """Convolutional policy/value network for a single piece (ship by default).

    The observation is a ``(channels, w, h)`` tensor. All channels except the
    last are fed through a convolutional trunk; the first three entries of row
    0 of the last channel are treated as scalar "extras" and go through a
    small linear layer. Both parts are concatenated (100 + 25 = 125 features)
    and mapped by a final linear layer to ``_NUM_ACTIONS`` logits. The value
    function uses the same scheme with its own set of layers unless
    ``vf_share_layers`` is set in the model config.

    Subclasses only need to override ``_NUM_ACTIONS``. Layer attribute names,
    shapes and creation order are the same for every subclass.

    Args:
        obs_space: observation space; ``obs_space.shape`` must be
            ``(channels, w, h)``.
        action_space: forwarded unchanged to ``TorchModelV2``.
        num_outputs: forwarded unchanged to ``TorchModelV2``; the width of the
            policy head is taken from ``_NUM_ACTIONS`` instead.
        model_config: RLlib model config; reads ``conv_filters``,
            ``conv_activation``, ``no_final_linear`` and ``vf_share_layers``.
        name: forwarded unchanged to ``TorchModelV2``.
    """

    # Width of the policy head (number of discrete actions the piece can take).
    _NUM_ACTIONS = 5
    # Number of channels produced by the conv stacks before the extras are
    # concatenated (the linear heads below expect 100 + 25 inputs).
    _HIDDEN_SIZE = 100

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        if not model_config.get("conv_filters"):
            model_config["conv_filters"] = _get_filter_config(obs_space.shape)

        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        activation = self.model_config.get("conv_activation")
        filters = self.model_config["conv_filters"]
        no_final_linear = self.model_config.get("no_final_linear")
        vf_share_layers = self.model_config.get("vf_share_layers")

        # Debug output of the effective model configuration (printed once per
        # constructed policy).
        print(activation)
        print(filters)
        print(no_final_linear)
        print(vf_share_layers)

        # Kept for compatibility: the flattened variant can never be selected
        # because the hidden size below is a non-zero constant.
        self.last_layer_is_flattened = False
        self._logits = None

        (obs_channels, w, h) = obs_space.shape
        # The last observation channel only carries the scalar extras, so it
        # is not part of the image fed to the convolutions.
        image_channels = obs_channels - 1
        num_outputs = self._HIDDEN_SIZE

        # ---- policy trunk -------------------------------------------------
        layers, in_channels, in_size = self._same_padded_convs(
            image_channels, [w, h], filters[:-1], activation)
        out_channels, kernel, stride = filters[-1]

        if no_final_linear:
            # Last conv directly produces the hidden features.
            layers.append(
                SlimConv2d(
                    in_channels,
                    num_outputs,
                    kernel,
                    stride,
                    None,  # padding=valid
                    activation_fn=activation))
            out_channels = num_outputs
        else:
            # Regular last conv followed by a (1,1) conv to the hidden size.
            layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    None,  # padding=valid
                    activation_fn=activation))
            in_size = [
                np.ceil((in_size[0] - kernel[0]) / stride),
                np.ceil((in_size[1] - kernel[1]) / stride)
            ]
            padding, _ = same_padding(in_size, [1, 1], [1, 1])
            self._logits = SlimConv2d(
                out_channels,
                num_outputs, [1, 1],
                1,
                padding,
                activation_fn=None)

        self._convs = nn.Sequential(*layers)

        # ---- value trunk --------------------------------------------------
        self._value_branch_separate = self._value_branch = None
        if vf_share_layers:
            self._value_branch = SlimFC(
                out_channels,
                1,
                initializer=normc_initializer(0.01),
                activation_fn=None)
        else:
            vf_layers, in_channels, _ = self._same_padded_convs(
                image_channels, [w, h], filters[:-1], activation)
            out_channels, kernel, stride = filters[-1]
            vf_layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    None,
                    activation_fn=activation))
            vf_layers.append(
                SlimConv2d(
                    in_channels=out_channels,
                    out_channels=num_outputs,
                    kernel=1,
                    stride=1,
                    padding=None,
                    activation_fn=None))
            self._value_branch_separate = nn.Sequential(*vf_layers)

        # ---- heads combining conv features (100) with extras (25) ---------
        self.extras_layer = nn.Sequential(
            nn.Linear(3, 25)
        )

        self.vf_extras_layer = nn.Sequential(
            nn.Linear(3, 25)
        )

        self.add_layer = nn.Sequential(
            nn.Linear(125, self._NUM_ACTIONS)
        )

        self.vf_add_layer = nn.Sequential(
            nn.Linear(125, 1)
        )

        # Input of the value branch, stored by forward(): the raw image
        # channels when the value trunk is separate, otherwise the output of
        # the shared conv trunk.
        self._features = None
        # Scalar extras of the most recent forward() call.
        self.extras = None

    @staticmethod
    def _same_padded_convs(in_channels, in_size, filters, activation):
        """Build one 'same'-padded SlimConv2d per ``[out_channels, kernel, stride]`` entry.

        Returns:
            ``(layers, out_channels, out_size)`` where ``out_channels`` and
            ``out_size`` describe the output of the last layer built (or the
            inputs unchanged when ``filters`` is empty).
        """
        layers = []
        for out_channels, kernel, stride in filters:
            padding, out_size = same_padding(in_size, kernel, [stride, stride])
            layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    padding,
                    activation_fn=activation))
            in_channels = out_channels
            in_size = out_size
        return layers, in_channels, in_size

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits for a batch of observations.

        Raises:
            ValueError: if the conv stack does not reduce the spatial
                dimensions to 1x1.
        """
        obs = input_dict["obs"].float()

        # Split the observation into image channels and scalar extras.
        self._features = obs[:, :-1, :, :]
        self.extras = obs[:, -1, 0, :3]

        conv_out = self._convs(self._features)
        # Store features to save forward pass when getting value_function out.
        if self._value_branch_separate is None:
            self._features = conv_out

        if self._logits is not None:
            conv_out = self._logits(conv_out)
        if conv_out.shape[2] != 1 or conv_out.shape[3] != 1:
            raise ValueError(
                "Given `conv_filters` ({}) do not result in a [B, {} "
                "(`num_outputs`), 1, 1] shape (but in {})! Please adjust "
                "your Conv2D stack such that the last 2 dims are both "
                "1.".format(self.model_config["conv_filters"],
                            self.num_outputs, list(conv_out.shape)))

        hidden = conv_out.squeeze(3).squeeze(2)
        extras = self.extras_layer(self.extras)
        logits = self.add_layer(torch.cat((hidden, extras), dim=1))
        return logits, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate for the batch last passed to forward()."""
        assert self._features is not None, "must call forward() first"
        if self._value_branch_separate is not None:
            hidden = self._value_branch_separate(self._features)
            hidden = hidden.squeeze(3).squeeze(2)
            extras = self.vf_extras_layer(self.extras)
            value = self.vf_add_layer(torch.cat((hidden, extras), dim=1))
            return value.squeeze(1)

        # Shared trunk: the value head reads the conv features directly.
        features = self._features.squeeze(3).squeeze(2)
        return self._value_branch(features).squeeze(1)


# Same network as above for the shipyard model; only the number of actions
# (width of the policy head) differs.
class TorchCustomModel2(TorchCustomModel):
    """Shipyard variant of ``TorchCustomModel`` with a two-action policy head."""

    _NUM_ACTIONS = 2



In [6]:
#create instance of environment
def env_creator(_):
    """Environment factory: build a fresh ``HaliteGym``; the argument is ignored."""
    halite_env = HaliteGym()
    return halite_env
# Stand-alone environment instance; the following cell reads its
# observation/action spaces and agent count from it.
single_env = HaliteGym()
env_name = "HaliteGym"
# Make the environment factory available under env_name.
register_env(env_name, env_creator)





In [7]:
#generate instances of policies and define model architecture of them

# Spaces shared by all policies, read from the stand-alone environment.
obs_space = single_env.observation_space
act_space = single_env.action_space
# NOTE(review): num_agents is not referenced in the visible cells below - confirm it is still needed.
num_agents = single_env.num_agents


#Define the model configuration - each conv_filters entry is [number of output channels, [kernel size], stride].
def gen_ship_policy():
    """Return the policy spec tuple for a ship policy.

    The tuple is ``(None, obs_space, act_space, config)`` where ``config``
    selects ``TorchCustomModel`` with ReLU activations and three conv filters.
    A new config dict is built on every call.
    """
    model_settings = {
        "custom_model": TorchCustomModel,
        "conv_activation": nn.ReLU,
        "conv_filters": [[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]],
    }
    policy_config = {"model": model_settings}
    return (None, obs_space, act_space, policy_config)
def gen_shipyard_policy():
    """Return the policy spec tuple for a shipyard policy.

    Same layout as the ship policy spec, but the model is
    ``TorchCustomModel2`` and the action space is ``spaces.Discrete(2)``.
    A new config dict is built on every call.
    """
    model_settings = {
        "custom_model": TorchCustomModel2,
        "conv_activation": nn.ReLU,
        "conv_filters": [[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]],
    }
    policy_config = {"model": model_settings}
    shipyard_action_space = spaces.Discrete(2)
    return (None, obs_space, shipyard_action_space, policy_config)
# Eight policies in total: one ship policy and one shipyard policy for each of
# the four players. Giving every player its own pair allows self play in which
# the players do not all follow the same strategy.
# Position in each list = player index (see policy_mapping_fn below).
ship_policies = ['default_policy','ship_policy2','ship_policy3','ship_policy4']
shipyard_policies = ['shipyard_policy1','shipyard_policy2','shipyard_policy3','shipyard_policy4']

# Every entry gets its own freshly generated policy spec; ship policies are
# inserted first, then shipyard policies.
policy_graphs = {name: gen_ship_policy() for name in ship_policies}
policy_graphs.update((name, gen_shipyard_policy()) for name in shipyard_policies)

#map the agent name to the correct policy that it should follow,following naming convention used
#throughout in the gym environment.


def policy_mapping_fn(agent_id):
    """Map an agent id to the name of the policy that controls it.

    The character at index 6 of ``agent_id`` is parsed as the player index.
    Ids containing ``'shipyard'`` are looked up in ``shipyard_policies``, all
    other ids in ``ship_policies``.
    """
    policy_names = shipyard_policies if 'shipyard' in agent_id else ship_policies
    player_index = int(agent_id[6])
    return policy_names[player_index]

In [8]:
#Import training algorithms. PPO used. In principle should work for other training algorithms but had some issues
#with other ones and some dependencies - further improvement potential here.

import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.a3c as a3c
import ray.rllib.agents.sac as sac
from ray.tune.logger import pretty_print


import sys
# Print numpy arrays in full instead of truncating them.
np.set_printoptions(threshold=sys.maxsize)

# NOTE(review): log_dir is not referenced anywhere in the visible part of this
# notebook, and it is an absolute path while the restore below uses the
# relative './checkpoints' - confirm which one is intended.
log_dir = "/checkpoints"

# Training configuration. For explanation of what variables mean see https://docs.ray.io/en/master/rllib-training.html
config={
    "log_level": "INFO",
    "num_workers":0,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 0,
    "lr": 5e-3,


    # All eight policies defined above are registered and all of them are trained.
    "multiagent": {
        "policies": policy_graphs,
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": [
             "default_policy","shipyard_policy1","shipyard_policy2","shipyard_policy3","shipyard_policy4",\
        'ship_policy2','ship_policy3','ship_policy4'],
    },

    "train_batch_size": 1000,
    "rollout_fragment_length":500,
    # The environment class is passed directly here.
    "env": HaliteGym,
    "framework": "torch",
}



# Restart ray so that re-running this cell starts from a clean state.
ray.shutdown()
ray.init()

# Create trainer instance.
trainer = ppo.PPOTrainer(config=config)

# If saved checkpoints exist, they can be restored here and either used to predict moves or to continue training.
# NOTE(review): the restore is unconditional - presumably this fails when the checkpoint file is missing; confirm.
trainer.restore('./checkpoints/checkpoint-380')

# Train for specified number of iterations (with this set up, each iteration takes approximately 30 minutes).
# The loop below is disabled by wrapping it in a string literal; because that
# string is the last expression of the cell, its repr is shown as the cell output.

"""
for i in range(1,31):
    # Perform one iteration of training the policy with PPO
    result = trainer.train()
    del result['info']
    print(pretty_print(result))

    if i % 10 == 0:
        checkpoint = trainer.save('/kaggle/working')
        print("checkpoint saved at", checkpoint)
"""




2020-10-21 20:00:23,215	INFO resource_spec.py:212 -- Starting Ray with 0.73 GiB memory available for workers and up to 0.38 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-10-21 20:00:24,233	INFO services.py:1165 -- View the Ray dashboard at localhost:8265
2020-10-21 20:00:28,178	ERROR syncer.py:46 -- Log sync requires rsync to be installed.
2020-10-21 20:00:28,186	INFO ppo.py:132 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2020-10-21 20:00:28,245	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,408	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,486	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,770	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,845	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel2'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,876	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel2'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,967	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel2'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:28,992	INFO catalog.py:308 -- Wrapping <class '__main__.TorchCustomModel2'> as None


<class 'torch.nn.modules.activation.ReLU'>
[[16, [3, 3], 1], [32, [5, 5], 5], [32, [4, 4], 4]]
False
False


2020-10-21 20:00:29,154	INFO rollout_worker.py:941 -- Built policy map: {'default_policy': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6AA086C8>, 'ship_policy2': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6AAD3708>, 'ship_policy3': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6AAD3D08>, 'ship_policy4': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6AB07E08>, 'shipyard_policy1': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6FEFE808>, 'shipyard_policy2': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6FEFE848>, 'shipyard_policy3': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6FF06D88>, 'shipyard_policy4': <ray.rllib.policy.torch_policy_template.PPOTorchPolicy object at 0x0000023B6AAA7808>}
2020-10-21 20:00:29,157	INFO rollout_worker.py:942 -- Built preprocessor map: {'default_polic

'\nfor i in range(1,31):\n    # Perform one iteration of training the policy with PPO\n    result = trainer.train()\n    del result[\'info\']\n    print(pretty_print(result))\n\n    if i % 10 == 0:\n        checkpoint = trainer.save(\'/kaggle/working\')\n        print("checkpoint saved at", checkpoint)\n'

In [9]:
#load policies.
# Ship policies of players 1-4, fetched from the trainer by policy name.
ship_policy, ship_policy2, ship_policy3, ship_policy4 = [
    trainer.get_policy(policy_name)
    for policy_name in ('default_policy', 'ship_policy2', 'ship_policy3', 'ship_policy4')
]

# Shipyard policies of players 1-4.
shipyard_policy, shipyard_policy2, shipyard_policy3, shipyard_policy4 = [
    trainer.get_policy(policy_name)
    for policy_name in ('shipyard_policy1', 'shipyard_policy2', 'shipyard_policy3', 'shipyard_policy4')
]


In [10]:
"""
Define agents to feed into kaggle environment. These work in a very similar way to the agent defined within the 
gym environment above, with some syntax differences to fit the required structure for kaggle_environments.

Unfortunately cannot add argument to def(obs,config,policies) as this will not be allowed by kaggle_environments, and so
to play game between all four trained agents, need to copy the definition of agent 4 times, but use different policies to
predict the moves in each, which is somewhat less aesthetic.

"""

def _trained_policy_actions(obs, config, ship_policy, shipyard_policy):
    """Choose the next actions of the current player using two trained policies.

    Steps:
      1. If the player has no shipyard, pick (at most) one ship to convert.
      2. Build the board observation once, re-centre it on every piece and
         ask the matching policy for a deterministic action (explore=False).
      3. Repeatedly resolve collisions between the player's own pieces until
         no two of them target the same cell.

    Args:
        obs: kaggle_environments observation of the current player.
        config: kaggle_environments configuration.
        ship_policy: object with ``compute_single_action(view, explore=False)``
            whose first return value indexes ``[None, NORTH, SOUTH, EAST, WEST]``.
        shipyard_policy: same interface; first return value indexes
            ``[SPAWN, None]``.

    Returns:
        ``me.next_actions`` of the board's current player.
    """
    board_size = 21
    ship_actions = [None, ShipAction.NORTH, ShipAction.SOUTH, ShipAction.EAST,
                    ShipAction.WEST]
    shipyard_actions = [ShipyardAction.SPAWN, None]

    board = Board(obs, config)
    me = board.current_player
    maximumHalite = 10000 + config.startingHalite

    def distance(x1, x2, y1, y2):
        """Euclidean distance between (x1, y1) and (x2, y2) with wrap-around.

        NOTE(review): a coordinate gap is only wrapped when it is > 11, so a
        gap of exactly 11 is not shortened to 10. Kept unchanged because the
        policies were presumably trained on observations built the same way -
        confirm against the gym environment.
        """
        x1, x2 = min(x1, x2), max(x1, x2)
        y1, y2 = min(y1, y2), max(y1, y2)
        if x2 - x1 > 11:
            x1 += 21
        if y2 - y1 > 11:
            y1 += 21
        return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

    def obs_array(state):
        """Encode the raw observation as a (4, 21, 21) array.

        Channels: halite (scaled by maxCellHalite, zeroed on shipyards and
        divided by the distance to the first own shipyard), ships, shipyards
        (own pieces positive, opponents negative) and a scalar channel holding
        the player's halite and the game step in row 0.
        """
        halite = np.array(state['halite']) / (config.maxCellHalite)
        ships_layer = np.zeros(board_size * board_size)
        shipyards_layer = np.zeros(board_size * board_size)

        # Offset 0 is the current player (positive entries); offsets 1-3 are
        # the opponents in turn order (negative entries).
        for offset in range(4):
            sign = 1 if offset == 0 else -1
            player = state['players'][(state.player + offset) % 4]
            player_shipyards, player_ships = player[1], player[2]
            for key in player_ships:
                cell, cargo = player_ships[key][0], player_ships[key][1]
                ships_layer[cell] += sign * (0.5 + cargo / (100 * config.maxCellHalite))
            for key in player_shipyards:
                shipyards_layer[player_shipyards[key]] += sign
                halite[player_shipyards[key]] = 0

        halite = halite.reshape(board_size, board_size)
        ships_layer = ships_layer.reshape(board_size, board_size)
        shipyards_layer = shipyards_layer.reshape(board_size, board_size)

        if len(me.shipyards) > 0:
            # Discount halite by its distance to the first own shipyard found.
            yard_row, yard_col = np.asarray(np.where(shipyards_layer == 1)).T[0]
            for i in range(0, 21):
                for j in range(0, 21):
                    if (i, j) != (yard_row, yard_col):
                        halite[i][j] /= distance(i, yard_row, j, yard_col)

        other_params_array = np.zeros((board_size, board_size))
        other_params_array[0][0] = state['players'][state.player][0] / maximumHalite
        other_params_array[0][1] = state.step / 400

        return np.stack((halite, ships_layer, shipyards_layer,
                         other_params_array))

    def recentre(observation, piece):
        """Return a copy of ``observation`` rolled so ``piece`` is at the centre.

        The scalar channel is not rolled. Its entry [0, 2] is set to the
        ship's scaled cargo (ships) or the scaled ship count (shipyards). For
        a ship carrying halite while the player owns a shipyard, the halite
        channel is scaled down and the first own shipyard cell is highlighted.
        """
        x_coord, y_coord = piece.position[0], piece.position[1]

        shifted = np.roll(observation[:-1, :, :],
                          shift=(y_coord - 10, 10 - x_coord), axis=(1, 2))
        new_obs = np.concatenate((shifted, [observation[-1, :, :]]), axis=0)

        if piece in me.ships:
            new_obs[-1, 0, 2] = piece.halite / (50 * config.maxCellHalite)
            if piece.halite > 0 and len(me.shipyards) > 0:
                scaledown_factor = max(1, piece.halite / 100)
                shipyard_val = min(1, piece.halite / 150)

                yard_row, yard_col = np.asarray(np.where(new_obs[2, :, :] == 1)).T[0]

                new_obs[0, :, :] /= scaledown_factor
                new_obs[0, yard_row, yard_col] = shipyard_val
        elif piece in me.shipyards:
            new_obs[-1, 0, 2] = len(me.ships) / (21 ** 2)

        return new_obs

    def building_shipyard():
        """Return the id of the ship that should convert, or None.

        Candidates are ships whose cargo plus the player's halite exceeds 500;
        the candidate closest to the mean position of all own ships wins.
        """
        possible_builds = [ship for ship in me.ships if ship.halite + me.halite > 500]
        if len(possible_builds) == 0:
            return None

        average_x = np.mean(np.array([ship.position.x for ship in me.ships]))
        average_y = np.mean(np.array([ship.position.y for ship in me.ships]))
        smallest_distance = 10 ** 6
        chosen_id = None
        for ship in possible_builds:
            ship_distance = distance(ship.position.x, average_x, ship.position.y, average_y)
            if ship_distance < smallest_distance:
                chosen_id = ship.id
                smallest_distance = ship_distance
        return chosen_id

    def target_position(ship):
        """Cell the ship occupies after carrying out ``ship.next_action``."""
        if ship.next_action is None:
            return ship.position
        if ship.next_action == ShipAction.NORTH:
            return ship.cell.north.position
        if ship.next_action == ShipAction.SOUTH:
            return ship.cell.south.position
        if ship.next_action == ShipAction.WEST:
            return ship.cell.west.position
        return ship.cell.east.position

    def apply_actions():
        """Write ``action_dict`` onto the pieces and return their target cells.

        The converting ship and shipyards that do not spawn have no entry in
        the returned dict.
        """
        positions = {}
        for ship in me.ships:
            if ship.id == ship_id:
                ship.next_action = ShipAction.CONVERT
                continue
            ship.next_action = ship_actions[action_dict['ship' + ship.id]]
            positions['ship' + ship.id] = target_position(ship)

        for shipyard in me.shipyards:
            shipyard.next_action = shipyard_actions[action_dict['shipyard' + shipyard.id]]
            if shipyard.next_action == ShipyardAction.SPAWN:
                positions['shipyard' + shipyard.id] = shipyard.position
        return positions

    # Only consider converting a ship when the player has no shipyard left.
    ship_id = building_shipyard() if len(me.shipyards) == 0 else None

    action_dict = {}     # piece key -> action index proposed/kept for it
    piece_history = {}   # piece key -> cargo (ships) or 1 (shipyards)
    input_array = obs_array(obs)

    for ship in me.ships:
        if ship.id == ship_id:
            continue
        recentred_view = recentre(input_array, ship)
        action_info = ship_policy.compute_single_action(recentred_view[:, :, :], explore=False)
        action_dict['ship' + ship.id] = action_info[0]
        piece_history['ship' + ship.id] = ship.halite

    for shipyard in me.shipyards:
        recentred_view = recentre(input_array, shipyard)
        action_info = shipyard_policy.compute_single_action(recentred_view[:, :, :], explore=False)
        action_dict['shipyard' + shipyard.id] = action_info[0]
        piece_history['shipyard' + shipyard.id] = 1

    next_pos_dict = apply_actions()

    # Resolve collisions between own pieces until all target cells are unique.
    while True:
        values = list(next_pos_dict.values())
        if len(values) == len(set(values)):
            break

        for pos in set(values):
            pieces = [k for k, v in next_pos_dict.items() if v == pos]
            if len(pieces) <= 1:
                continue

            if any([piece.startswith('shipyard') for piece in pieces]):
                # A spawn would collide with a ship: cancel the spawn.
                for piece in pieces:
                    if piece.startswith('shipyard'):
                        action_dict[piece] = 1
            elif any([action_dict[piece] == 0 for piece in pieces]):
                # One ship is staying on the contested cell: everyone stays.
                for piece in pieces:
                    action_dict[piece] = 0
            else:
                # Only the ship with the most cargo keeps its move.
                halites = dict((k, piece_history[k]) for k in pieces)
                piece_can_move = max(halites, key=halites.get)
                for piece in pieces:
                    if piece != piece_can_move:
                        action_dict[piece] = 0

        next_pos_dict = apply_actions()

    return me.next_actions


def agent1(obs,config):
    """Kaggle agent for player 1: acts with ``ship_policy`` / ``shipyard_policy``.

    kaggle_environments calls agents as ``agent(obs, config)``, so the policies
    are bound here and the work is delegated to ``_trained_policy_actions``.
    """
    return _trained_policy_actions(obs, config, ship_policy, shipyard_policy)



def _policy_agent_turn(obs, config, ship_policy, shipyard_policy):
    """Choose this turn's actions for every piece owned by the current player.

    Shared implementation behind ``agent2``, ``agent3`` and ``agent4``; those
    three functions used to be copies of the same code that differed only in
    the policy objects they queried.

    Steps:
      1. If the player owns no shipyard, pick one ship to convert into one.
      2. Build a 4-channel board observation, re-centre it on each remaining
         piece and ask the matching policy for an action index.
      3. Repeatedly cancel actions that would send two of the player's own
         pieces to the same cell, until no such clash remains.

    Args:
        obs: raw observation; read through ``obs['halite']``,
            ``obs['players']``, ``obs.player`` and ``obs.step``.
        config: game configuration; ``maxCellHalite`` and ``startingHalite``
            are read.
        ship_policy: object whose ``compute_single_action(view, explore=False)``
            returns a sequence whose first item is the ship action index
            (0 = stay, 1 = north, 2 = south, 3 = east, 4 = west).
        shipyard_policy: same interface for shipyards (0 = spawn, 1 = nothing).

    Returns:
        ``next_actions`` of the board's current player, read after every
        piece's ``next_action`` attribute has been assigned.
    """
    # Layout constants of the observation handed to the policies.
    # NOTE(review): presumably these must match the values used when the
    # policies were trained - confirm before changing any of them.
    board_size = 21
    centre = board_size // 2
    max_turns = 400
    convert_budget = 500

    # Indexed by the policy output: (engine action, neighbouring-cell attribute).
    ship_moves = (
        (None, None),
        (ShipAction.NORTH, 'north'),
        (ShipAction.SOUTH, 'south'),
        (ShipAction.EAST, 'east'),
        (ShipAction.WEST, 'west'),
    )
    shipyard_moves = (ShipyardAction.SPAWN, None)

    me = Board(obs, config).current_player
    maximumHalite = 10000 + config.startingHalite

    def distance(x1, x2, y1, y2):
        """Euclidean distance between (x1, y1) and (x2, y2) with wrap-around."""
        x1, x2 = min(x1, x2), max(x1, x2)
        y1, y2 = min(y1, y2), max(y1, y2)
        # NOTE(review): on a 21-wide torus a gap of exactly 11 is shorter the
        # other way round (10), so this threshold looks off by one. It is left
        # untouched because it shapes the features the policies receive.
        if x2 - x1 > 11:
            x1 += 21
        if y2 - y1 > 11:
            y1 += 21
        return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

    def obs_array(state):
        """Encode the board as a (4, board_size, board_size) float array.

        Channels: halite per cell (scaled), ships (+ own / - opponents),
        shipyards (+1 own / -1 opponents), and a plane holding scalar extras.
        """
        players = state['players']
        cell_count = board_size * board_size

        halite = np.array(state['halite']) / (config.maxCellHalite)
        ships = np.zeros(cell_count)
        shipyards = np.zeros(cell_count)

        # Own pieces first, then the opponents in cyclic order. Using
        # len(players) instead of a literal 4 keeps smaller games from raising
        # IndexError; four-player games are processed exactly as before.
        for offset in range(len(players)):
            player = players[(state.player + offset) % len(players)]
            sign = 1 if offset == 0 else -1
            for ship_info in player[2].values():
                ships[ship_info[0]] += sign * (
                    0.5 + ship_info[1] / (100 * config.maxCellHalite))
            for yard_index in player[1].values():
                shipyards[yard_index] += sign
                halite[yard_index] = 0

        halite = halite.reshape(board_size, board_size)
        ships = ships.reshape(board_size, board_size)
        shipyards = shipyards.reshape(board_size, board_size)

        if len(me.shipyards) > 0:
            # Discount halite by its distance from the first own shipyard found.
            home_row, home_col = np.asarray(np.where(shipyards == 1)).T[0]
            for i in range(board_size):
                for j in range(board_size):
                    if (i, j) != (home_row, home_col):
                        halite[i][j] /= distance(i, home_row, j, home_col)

        extras = np.zeros((board_size, board_size))
        extras[0][0] = players[state.player][0] / maximumHalite
        extras[0][1] = state.step / max_turns

        return np.stack((halite, ships, shipyards, extras))

    def recentre(observation, piece):
        """Return a copy of ``observation`` rolled so ``piece`` sits at the centre.

        The last channel (scalar extras) is not rolled; slot [0, 2] of it is
        filled with a piece-specific value.
        """
        x_coord = piece.position[0]
        y_coord = piece.position[1]

        shifted = np.roll(observation[:-1, :, :],
                          shift=(y_coord - centre, centre - x_coord),
                          axis=(1, 2))
        view = np.concatenate((shifted, [observation[-1, :, :]]), axis=0)

        if piece in me.ships:
            view[-1, 0, 2] = piece.halite / (50 * config.maxCellHalite)
            if piece.halite > 0 and len(me.shipyards) > 0:
                # A loaded ship sees less attractive halite and a more
                # attractive shipyard cell.
                home_row, home_col = np.asarray(np.where(view[2, :, :] == 1)).T[0]
                view[0, :, :] /= max(1, piece.halite / 100)
                view[0, home_row, home_col] = min(1, piece.halite / 150)
        elif piece in me.shipyards:
            view[-1, 0, 2] = len(me.ships) / (board_size ** 2)
        return view

    def building_shipyard():
        """Return the id of the ship that should convert, or None.

        Among ships that can afford the conversion together with the player's
        bank, the one closest to the mean ship position is chosen.
        """
        candidates = [ship for ship in me.ships
                      if ship.halite + me.halite > convert_budget]
        if len(candidates) == 0:
            return None
        average_x = np.mean(np.array([ship.position.x for ship in me.ships]))
        average_y = np.mean(np.array([ship.position.y for ship in me.ships]))
        # min() keeps the first of equally distant ships, like the strict '<'
        # comparison it replaces.
        nearest = min(candidates,
                      key=lambda ship: distance(ship.position.x, average_x,
                                                ship.position.y, average_y))
        return nearest.id

    def apply_actions():
        """Write ``action_dict`` onto the pieces; return {piece key: target cell}.

        Only pieces that will occupy a cell are listed: every non-converting
        ship, and every shipyard that spawns.
        """
        targets = {}
        for ship in me.ships:
            if ship.id == ship_id:
                ship.next_action = ShipAction.CONVERT
                continue
            key = 'ship' + ship.id
            move, neighbour = ship_moves[action_dict[key]]
            ship.next_action = move
            if neighbour is None:
                targets[key] = ship.position
            else:
                targets[key] = getattr(ship.cell, neighbour).position
        for shipyard in me.shipyards:
            key = 'shipyard' + shipyard.id
            shipyard.next_action = shipyard_moves[action_dict[key]]
            if shipyard.next_action == ShipyardAction.SPAWN:
                targets[key] = shipyard.position
        return targets

    ship_id = building_shipyard() if len(me.shipyards) == 0 else None

    input_array = obs_array(obs)
    action_dict = {}
    ship_cargo = {}

    # Query the policies: ships first, then shipyards.
    for ship in me.ships:
        if ship.id == ship_id:
            continue
        key = 'ship' + ship.id
        action_info = ship_policy.compute_single_action(
            recentre(input_array, ship), explore=False)
        action_dict[key] = action_info[0]
        ship_cargo[key] = ship.halite

    for shipyard in me.shipyards:
        action_info = shipyard_policy.compute_single_action(
            recentre(input_array, shipyard), explore=False)
        action_dict['shipyard' + shipyard.id] = action_info[0]

    # Resolve self-collisions. Every pass turns at least one moving/spawning
    # piece into a passive one, so the loop terminates.
    while True:
        occupants = {}
        for key, target in apply_actions().items():
            occupants.setdefault(target, []).append(key)
        clashes = [group for group in occupants.values() if len(group) > 1]
        if not clashes:
            break

        for group in clashes:
            yards = [key for key in group if key.startswith('shipyard')]
            if yards:
                # A spawn would land on an incoming/resting ship: skip the spawn.
                for key in yards:
                    action_dict[key] = 1
            elif any(action_dict[key] == 0 for key in group):
                # Someone is already sitting on the cell: everybody holds.
                for key in group:
                    action_dict[key] = 0
            else:
                # Only the ship carrying the most halite keeps its move.
                keeper = max(group, key=ship_cargo.get)
                for key in group:
                    if key != keeper:
                        action_dict[key] = 0

    return me.next_actions


def agent2(obs,config):
    """Agent driven by the global ``ship_policy2`` / ``shipyard_policy2``."""
    return _policy_agent_turn(obs, config, ship_policy2, shipyard_policy2)


def agent3(obs,config):
    """Agent driven by the global ``ship_policy3`` / ``shipyard_policy3``."""
    return _policy_agent_turn(obs, config, ship_policy3, shipyard_policy3)


def agent4(obs,config):
    """Agent driven by the global ``ship_policy4`` / ``shipyard_policy4``."""
    return _policy_agent_turn(obs, config, ship_policy4, shipyard_policy4)

def random_agent(obs,config):
    """Baseline agent: every piece gets a uniformly random action.

    Ships draw from convert / stay (None) / the four directions; shipyards
    draw from spawn / do nothing (None). Ships are processed before shipyards.

    Returns:
        ``next_actions`` of the board's current player after the draws.
    """
    fleet_options = (ShipAction.CONVERT, None, ShipAction.NORTH,
                     ShipAction.SOUTH, ShipAction.EAST, ShipAction.WEST)
    yard_options = (ShipyardAction.SPAWN, None)

    player = Board(obs, config).current_player

    for vessel in player.ships:
        vessel.next_action = random.choice(fleet_options)
    for yard in player.shipyards:
        yard.next_action = random.choice(yard_options)

    return player.next_actions

In [11]:
# Run a single game between 4 specified players and render it.
# NOTE: agent1..agent4 are not defined in this cell; they must already exist
# in the kernel when it runs.


# Halite environment with board size 21 and debug mode switched on.
test_env = make("halite",configuration = {"size": 21}, debug=True)
agent_count = 4
test_env.reset(agent_count)     

# Play the episode; `steps` keeps whatever run() returns for later inspection.
steps = test_env.run([agent1,agent2,agent3,agent4])

# Show the episode inline in the notebook (800x800 viewer).
test_env.render(mode="ipython", width=800, height=800)

Traceback (most recent call last):
  File "C:\Users\guymo\Anaconda3\lib\site-packages\ray\dashboard/dashboard.py", line 960, in <module>
    metrics_export_address=metrics_export_address)
  File "C:\Users\guymo\Anaconda3\lib\site-packages\ray\dashboard/dashboard.py", line 513, in __init__
    build_dir = setup_static_dir(self.app)
  File "C:\Users\guymo\Anaconda3\lib\site-packages\ray\dashboard/dashboard.py", line 414, in setup_static_dir
    "&& npm run build)", build_dir)
FileNotFoundError: [Errno 2] Dashboard build directory not found. If installing from source, please follow the additional steps required to build the dashboard(cd python/ray/dashboard/client && npm ci && npm run build): 'C:\\Users\\guymo\\Anaconda3\\lib\\site-packages\\ray\\dashboard\\client/build'



In [12]:
"""
run several games and record placements to compare performance of all agents
"""


def get_win_percentages(agent1, agent2,agent3,agent4, n_rounds=10):
    """Play several 4-player Halite games and report each agent's placements.

    For every game the final reward of each agent is recorded. Afterwards the
    agents are ranked per game (place 1 = highest reward, place 4 = lowest;
    ties go to the agent listed first) and the placements are printed.

    Args:
        agent1, agent2, agent3, agent4: the four agents handed to ``run``,
            in seat order.
        n_rounds: number of games to play.

    Returns:
        A list with one entry per game, each entry being
        ``[reward1, reward2, reward3, reward4]`` from the last step of that
        game (values are returned exactly as reported by the environment).
    """
    agents = [agent1, agent2, agent3, agent4]
    agent_count = len(agents)
    outcomes = []
    pos_list = [[] for _ in agents]

    for _ in range(n_rounds):
        test_env = make("halite")
        test_env.reset(agent_count)
        # Pass a fresh list each round, as the original call did.
        steps = test_env.run(list(agents))
        final = steps[-1]
        outcomes.append([final[seat].reward for seat in range(agent_count)])

    for outcome in outcomes:
        print(outcome)
        # NOTE(review): the environment presumably reports a reward of None for
        # an agent that errored or timed out -- confirm. Such an entry is
        # ranked last here instead of making the comparison raise after all
        # games have already been played.
        scores = np.array(
            [-np.inf if reward is None else reward for reward in outcome],
            dtype=float,
        )
        # Stable descending sort: equal rewards keep seat order, matching the
        # first-index behaviour of a repeated argmax.
        ranking = np.argsort(-scores, kind="stable")
        for place, seat in enumerate(ranking, start=1):
            pos_list[seat].append(place)

    for number, places in enumerate(pos_list, start=1):
        print(f'agent{number} places:', places)

    return outcomes

#results = get_win_percentages(agent1,"random","random","random")