<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Training-AI-Agents" data-toc-modified-id="Training-AI-Agents-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Training AI Agents</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span><ul class="toc-item"><li><span><a href="#Stable-Baselines-Imports" data-toc-modified-id="Stable-Baselines-Imports-1.0.1.1"><span class="toc-item-num">1.0.1.1&nbsp;&nbsp;</span>Stable Baselines Imports</a></span></li></ul></li><li><span><a href="#Creating-the-Player-Class" data-toc-modified-id="Creating-the-Player-Class-1.0.2"><span class="toc-item-num">1.0.2&nbsp;&nbsp;</span>Creating the Player Class</a></span></li><li><span><a href="#Creating-a-custom-Environment" data-toc-modified-id="Creating-a-custom-Environment-1.0.3"><span class="toc-item-num">1.0.3&nbsp;&nbsp;</span>Creating a custom Environment</a></span></li><li><span><a href="#Checking-the-environment" data-toc-modified-id="Checking-the-environment-1.0.4"><span class="toc-item-num">1.0.4&nbsp;&nbsp;</span>Checking the environment</a></span></li><li><span><a href="#Random-Agent-Decisions" data-toc-modified-id="Random-Agent-Decisions-1.0.5"><span class="toc-item-num">1.0.5&nbsp;&nbsp;</span>Random Agent Decisions</a></span><ul class="toc-item"><li><span><a href="#Analysis:" data-toc-modified-id="Analysis:-1.0.5.1"><span class="toc-item-num">1.0.5.1&nbsp;&nbsp;</span>Analysis:</a></span></li></ul></li><li><span><a href="#PPO---Promixal-Policy-Optimization" data-toc-modified-id="PPO---Promixal-Policy-Optimization-1.0.6"><span class="toc-item-num">1.0.6&nbsp;&nbsp;</span>PPO - Promixal Policy Optimization</a></span><ul class="toc-item"><li><span><a href="#Analysis:" data-toc-modified-id="Analysis:-1.0.6.1"><span class="toc-item-num">1.0.6.1&nbsp;&nbsp;</span>Analysis:</a></span></li></ul></li><li><span><a 
href="#PPO2---Promixal-Policy-Optimization" data-toc-modified-id="PPO2---Promixal-Policy-Optimization-1.0.7"><span class="toc-item-num">1.0.7&nbsp;&nbsp;</span>PPO2 - Promixal Policy Optimization</a></span><ul class="toc-item"><li><span><a href="#Analysis:" data-toc-modified-id="Analysis:-1.0.7.1"><span class="toc-item-num">1.0.7.1&nbsp;&nbsp;</span>Analysis:</a></span></li></ul></li><li><span><a href="#A2C---Advantage-Actor-Critic" data-toc-modified-id="A2C---Advantage-Actor-Critic-1.0.8"><span class="toc-item-num">1.0.8&nbsp;&nbsp;</span>A2C - Advantage Actor Critic</a></span><ul class="toc-item"><li><span><a href="#Analysis:" data-toc-modified-id="Analysis:-1.0.8.1"><span class="toc-item-num">1.0.8.1&nbsp;&nbsp;</span>Analysis:</a></span></li></ul></li></ul></li></ul></li></ul></div>

# Training AI Agents

### Imports

I am using Stable Baselines and OpenAI's gym to create and test the environment. I am also bringing in player.py, which will provide the probabilities of an action happening; these are determined from NBA player statistics from the current NBA season.

In [209]:
#Lots of help from Dustin Pierce at General Assembly
#https://stable-baselines.readthedocs.io/en/master/guide/custom_env.html
#https://github.com/koulanurag/ma-gym/blob/master/ma_gym/envs/pong_duel/pong_duel.py
#https://github.com/hardmaru/slimevolleygym/blob/master/slimevolleygym/slimevolley.py
#https://medium.com/@m.alzantot/you-can-see-what-is-the-observation-space-by-print-env-observation-space-c4e59e64ac52

import copy
import logging

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding

logger = logging.getLogger(__name__)

import random
import time
# import ball  --potential future add on
# from player import Player

from ..utils.draw import draw_grid, fill_cell, draw_circle, write_cell_text, draw_score_board

#### Stable Baselines Imports

In [33]:
from stable_baselines.common.env_checker import check_env

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import PPO1, PPO2, A2C, results_plotter

### Creating the Player Class

In [377]:
class Player():
    """Team A player that starts with the ball; ratings are drawn at random.

    Args:
        player_name: Display name stored on ``player_name``.
    """

    def __init__(self, player_name):

        self.player_name = player_name
        # Situation flags read and written by the environment's game logic.
        self._is_defended = False
        self._has_ball = True
        self._close_range = False
        self._midrange = False
        self._three_point_range = True

        # Per-attempt success probabilities, each an integer percentage / 100.
        self.steal = random.randint(0, 3)/100
        self.block = random.randint(0, 4)/100
        self.shooting_close = random.randint(80, 95)/100
        # Spelled correctly so that ``player.shooting_midrange`` lookups succeed.
        self.shooting_midrange = random.randint(40, 60)/100
        self.shooting3pts = random.randint(30, 45)/100
        self.stamina = random.randint(70,95)/100

    @property
    def shooing_midrange(self):
        """Alias kept for code that still uses the original misspelled name."""
        return self.shooting_midrange

    @shooing_midrange.setter
    def shooing_midrange(self, value):
        self.shooting_midrange = value
        

In [378]:
class Player2():
    """Team A player that starts without the ball; ratings are drawn at random.

    Args:
        player_name: Display name stored on ``player_name``.
    """

    def __init__(self, player_name):

        self.player_name = player_name
        # Situation flags read and written by the environment's game logic.
        self._is_defended = False
        self._has_ball = False
        self._close_range = False
        self._midrange = False
        self._three_point_range = True

        # Per-attempt success probabilities, each an integer percentage / 100.
        self.steal = random.randint(0, 3)/100
        self.block = random.randint(0, 4)/100
        self.shooting_close = random.randint(80, 95)/100
        # Spelled correctly so that ``player2.shooting_midrange`` lookups succeed.
        self.shooting_midrange = random.randint(40, 60)/100
        self.shooting3pts = random.randint(30, 45)/100
        self.stamina = random.randint(70,95)/100

    @property
    def shooing_midrange(self):
        """Alias kept for code that still uses the original misspelled name."""
        return self.shooting_midrange

    @shooing_midrange.setter
    def shooing_midrange(self, value):
        self.shooting_midrange = value
        

### Creating a custom Environment

In [386]:
class BasketballEnv(gym.Env):
    """
    Custom Environment that follows gym interface.
    This is a simple env where multiple agents learn strategies to put the ball in the hoop.
    For this simple iteration, actions will be determined by probabilities rather than physics.
    """
    # In google colab, we cannot implement the GUI ('human' render mode)
    metadata = {'render.modes': ['human', 'rgb_array']}

    
    def __init__(self, step_cost=0, reward=-0.1, max_rounds=1):
        """Create a 2-on-2 half-court environment with agents at their starting spots.

        Args:
            step_cost: Value each team's reward starts from on every step.
            reward: Stored on ``self.reward``.
            max_rounds: Number of rounds after which the episode is done.
        """
        #Grid size will be standard basketball halfcourt at 6"=1'-0" scale
        self._grid_shape = (100, 94)
        court_w, court_h = self._grid_shape

        #Number of players
        self.n_agents = 4
        self.n_teams = 2
        self.n_agents_team_A = int(self.n_agents / 2)
        self.n_agents_team_B = int(self.n_agents / 2)
        self.reward = reward
        self._max_rounds = max_rounds
        self.action_space = spaces.MultiDiscrete([9, 2, 2])
        # One flag per Team A player: which of them currently holds the ball.
        self.player_w_ball=[True, False]

        self._step_count = 0
        self._step_cost = step_cost
        self._total_episode_reward = [0 for _ in range(self.n_teams)]
        self.agent_pos = {_: None for _ in range(self.n_agents)}

        offense_y = court_h - 2
        defense_y = court_h - 8

        #Set starting positions for agents in Team A
        # x starts at 1 because out_of_bounds() treats x == 0 as off the court.
        self.agent_pos[0] = (court_w // 2, offense_y)
        self.agent_pos[1] = (random.randint(1, court_w // 2), offense_y)

        #Set starting positions for agents in Team B
        # Each defender lines up on the x coordinate of the attacker it guards.
        self.agent_pos[2] = (self.agent_pos[0][0], defense_y)
        self.agent_pos[3] = (self.agent_pos[1][0], defense_y)

        # Previous positions start equal to the current ones.
        self.agent_prev_pos = dict(self.agent_pos)

        # Coordinate snapshots, taken only after every agent has a position.
        self.x = [self.agent_pos[i][0] for i in range(self.n_agents)]
        self.y = [self.agent_pos[i][1] for i in range(self.n_agents)]

        self._agent_dones = None
        self.__rounds = 0

        # Observing agent positions for 4 agents: (x, y) per agent, scaled to [0, 1].
        # Bounds are float32 so they match the dtype declared on the Box.
        self._obs_low = np.zeros(2 * self.n_agents, dtype=np.float32)
        self._obs_high = np.ones(2 * self.n_agents, dtype=np.float32)
        self.observation_space = spaces.Box(low=self._obs_low, high=self._obs_high,
                                        dtype=np.float32)

        self.viewer = None
        self.seed()
        
    def is_done(self):
        self.__rounds == self._max_rounds
        
    def get_action_meanings(self, agent_i=None):
        """Return the ``ACTION_MEANING`` labels for the action space.

        Args:
            agent_i: Index of one component of the ``MultiDiscrete`` action
                space, or None for all components.

        Returns:
            A list of labels for component ``agent_i``, or a list of such lists
            (one per component) when ``agent_i`` is None.
        """
        # ``nvec`` holds the number of choices per component; it is available on
        # MultiDiscrete regardless of whether the space supports indexing.
        sizes = [int(n) for n in self.action_space.nvec]
        if agent_i is not None:
            assert 0 <= agent_i < len(sizes)
            return [ACTION_MEANING[i] for i in range(sizes[agent_i])]
        return [[ACTION_MEANING[i] for i in range(n)] for n in sizes]

    def __create_grid(self):
        """Build a fresh grid of 'empty' markers, indexed as [axis-0][axis-1] of ``_grid_shape``."""
        outer_len = self._grid_shape[0]
        inner_len = self._grid_shape[1]
        grid = []
        for _ in range(outer_len):
            grid.append([PRE_IDS['empty'] for _ in range(inner_len)])
        return grid

    def __update_agent_view(self, agent_i):
        """Move agent ``agent_i``'s marker in ``_full_obs`` from its previous cell to its current one.

        Each agent occupies exactly one cell. Positions outside the grid are
        ignored so that an agent off the court cannot raise IndexError.
        """
        outer_len, inner_len = self._grid_shape

        def _paint(pos, label):
            # Write ``label`` at ``pos`` if it falls inside the grid.
            row, col = pos
            if 0 <= row < outer_len and 0 <= col < inner_len:
                self._full_obs[row][col] = label

        _paint(self.agent_prev_pos[agent_i], PRE_IDS['empty'])
        # Offset suffix is always 0 because the agent spans a single cell.
        _paint(self.agent_pos[agent_i], PRE_IDS['agent'] + str(agent_i + 1) + '_' + str(0))

#     def __draw_base_img(self):
#         self._base_img = draw.draw_grid(self._grid_shape[0], self._grid_shape[1],
#                                    cell_size=CELL_SIZE, fill='white', line_color='white')

    def __init_full_obs(self):
        """Rebuild ``_full_obs`` as an empty grid and mark every agent on it once."""
        self._full_obs = self.__create_grid()
        for agent_i in range(self.n_agents):
            self.__update_agent_view(agent_i)

#         self.__draw_base_img()

    #Countdown timer as 24 second shot clock for each round
    #https://www.geeksforgeeks.org/how-to-create-a-countdown-timer-using-python/
    def countdown(self, t=24):     
        while t: 
            mins, secs = divmod(t, 60) 
            timer = '{:02d}:{:02d}'.format(mins, secs)  
            time.sleep(1) 
            t -= 1
        return t 
        

    def get_agent_obs(self):
        _obs = []

        for agent_i in range(self.n_agents):
            pos = self.agent_pos[agent_i]
            _agent_i_obs_a = pos[0] / self._grid_shape[0]
            _agent_i_obs_b = pos[1] / self._grid_shape[1]
            
            _obs.append(_agent_i_obs_a)
            _obs.append(_agent_i_obs_b)

        return np.array(_obs)
    
##############
#Define Reset#   
##############

    def reset(self):
        """Start a new episode and return the initial observation.

        Resets the round and step counters, the per-team reward totals and the
        done flag, and places all four agents at their starting spots.

        Returns:
            The observation produced by ``get_agent_obs``.
        """
        self.__rounds = 0
        # No countdown() call here: its result was discarded and it only
        # delayed every reset.

        court_w, court_h = self._grid_shape
        offense_y = court_h - 2
        defense_y = court_h - 8

        #Set starting positions for agents in Team A
        # x starts at 1 because out_of_bounds() treats x == 0 as off the court.
        self.agent_pos[0] = (court_w // 2, offense_y)
        self.agent_pos[1] = (random.randint(1, court_w // 2), offense_y)

        #Set starting positions for agents in Team B
        self.agent_pos[2] = (court_w // 2, defense_y)
        self.agent_pos[3] = (random.randint(1, court_w // 2), defense_y)

        # Keep the coordinate snapshots in line with the new positions.
        self.x = [self.agent_pos[i][0] for i in range(self.n_agents)]
        self.y = [self.agent_pos[i][1] for i in range(self.n_agents)]

        self.agent_prev_pos = {_: self.agent_pos[_] for _ in range(self.n_agents)}
        self._agent_dones = False
        self.__init_full_obs()
        self._step_count = 0
        self._total_episode_reward = [0 for _ in range(self.n_teams)]

        return self.get_agent_obs()

    
###############################
#Define Properties and Actions#   
###############################
    
            
    #This will determine success of an action
    def action_success(self, p_1):
        """Draw 1 with probability ``p_1`` and 0 otherwise, using numpy's global RNG."""
        outcomes = [0, 1]
        weights = [1 - p_1, p_1]
        return np.random.choice(outcomes, p=weights)

    #Determine if a player is close to the goal
    def close_range(self):
        """Refresh each Team A player's ``_close_range`` flag from its agent's position.

        A player is in close range when its agent is strictly inside the court
        and at most 6 grid units from ``GOAL``. Agent 0 is paired with the
        module-level ``player`` and agent 1 with ``player2``, the same pairing
        ``rebound`` uses for ``player_w_ball``.
        """
        court_w, court_h = self._grid_shape
        for agent_i, shooter in zip(range(self.n_agents_team_A), (player, player2)):
            pos_x, pos_y = self.agent_pos[agent_i]
            on_court = 0 < pos_x < court_w and 0 < pos_y < court_h
            dist_to_goal = np.sqrt((pos_x - GOAL[0])**2 + (pos_y - GOAL[1])**2)
            # Written as a plain bool so the flag also clears once the player moves away.
            shooter._close_range = bool(on_court and dist_to_goal <= 6)

    #Determine if a player is mid-range from the goal
    def midrange(self):
        """Refresh each Team A player's ``_midrange`` flag.

        A player is mid-range when its agent is strictly inside the court and
        the player is flagged neither close-range nor three-point-range. Agent 0
        is paired with the module-level ``player`` and agent 1 with ``player2``,
        the same pairing ``rebound`` uses for ``player_w_ball``.
        """
        court_w, court_h = self._grid_shape
        for agent_i, shooter in zip(range(self.n_agents_team_A), (player, player2)):
            pos_x, pos_y = self.agent_pos[agent_i]
            on_court = 0 < pos_x < court_w and 0 < pos_y < court_h
            shooter._midrange = bool(
                on_court and not shooter._close_range and not shooter._three_point_range
            )

    #Determine if a player is in three point range
    def _three_point_range(self):
        """Refresh each Team A player's ``_three_point_range`` flag.

        A player is in three-point range when its agent is strictly inside the
        court and either stands in a corner (y <= 19.67 and x <= 6.67 or
        x >= 93.33) or, above the corners, is more than 44.3 grid units from
        ``GOAL``. Agent 0 is paired with the module-level ``player`` and agent 1
        with ``player2``, the same pairing ``rebound`` uses for ``player_w_ball``.
        """
        court_w, court_h = self._grid_shape
        for agent_i, shooter in zip(range(self.n_agents_team_A), (player, player2)):
            pos_x, pos_y = self.agent_pos[agent_i]
            on_court = 0 < pos_x < court_w and 0 < pos_y < court_h
            # Parenthesised so both corner conditions require y <= 19.67.
            in_corner = pos_y <= 19.67 and (pos_x <= 6.67 or pos_x >= 93.33)
            dist_to_goal = np.sqrt((pos_x - GOAL[0])**2 + (pos_y - GOAL[1])**2)
            beyond_arc = pos_y > 19.67 and dist_to_goal > 44.3
            shooter._three_point_range = bool(on_court and (in_corner or beyond_arc))

    #define defensive rebound, will be a reward for the defensive team
    def d_rebound(self):
        """Signal a defensive rebound; always returns True."""
        return True
        
    def rebound(self):
        """Resolve a rebound after a missed shot.

        With probability 0.3 the offense recovers the ball: a random Team A
        player is given possession (``player_w_ball`` and the ``_has_ball``
        flags of the module-level ``player``/``player2`` are updated) and 0.2 is
        returned. Otherwise the result of ``d_rebound()`` is returned. Returns
        None when Team A has no agents.
        """
        if self.n_agents_team_A < 1:
            return None

        if not self.action_success(0.3):
            return self.d_rebound()

        #UPDATE THE PLAYER_W_BALL:
        possession = random.choice([[True, False], [False, True]])
        self.player_w_ball = possession
        first_player_has_it = possession == [True, False]
        player._has_ball = first_player_has_it
        player2._has_ball = not first_player_has_it
        return 0.2

    #define shot, a made shot will be a reward for the offensive team
    def shot(self):
        """Let the Team A ball holder attempt a shot from its current zone.

        ``player_w_ball[i]`` says whether Team A agent ``i`` holds the ball;
        index 0 is the module-level ``player`` and index 1 is ``player2`` (the
        pairing ``rebound`` uses). The zone is read from the holder's
        ``_close_range`` / ``_midrange`` / ``_three_point_range`` flags, checked
        in that order.

        Returns:
            2 for a made close- or mid-range shot, 3 for a made three-pointer,
            the result of ``rebound()`` after a miss, or None when nobody holds
            the ball or the holder is in no zone.
        """
        for agent_i, shooter in zip(range(self.n_agents_team_A), (player, player2)):

            #Can only shoot if the player has the ball -- [1,0]
            if not self.player_w_ball[agent_i]:
                continue

            if shooter._close_range:
                chance, points = shooter.shooting_close, 2
            elif shooter._midrange:
                # Accept either spelling of the mid-range rating attribute.
                chance = getattr(shooter, 'shooting_midrange', None)
                if chance is None:
                    chance = shooter.shooing_midrange
                points = 2
            elif shooter._three_point_range:
                chance, points = shooter.shooting3pts, 3
            else:
                return None

            if self.action_success(chance) == 1:
                return points
            return self.rebound()

        return None
                        
    def ball_pass(self):
        """Pass the ball from the current Team A holder to the teammate.

        Updates the ``_has_ball`` flags on the module-level ``player`` and
        ``player2`` and keeps ``player_w_ball`` as a two-element list in the
        same order. Does nothing when neither player holds the ball.
        """
        # if/elif so a pass is applied once rather than swapped straight back.
        if player._has_ball:
            player._has_ball = False
            player2._has_ball = True
            self.player_w_ball = [False, True]
        elif player2._has_ball:
            player._has_ball = True
            player2._has_ball = False
            self.player_w_ball = [True, False]
                
            

    def defended(self):
        """Refresh each Team A player's ``_is_defended`` flag.

        A Team A player is defended when any Team B agent is less than 5 grid
        units away. Team A agents are indices ``0..n_agents_team_A - 1`` and
        Team B agents follow them; agent 0 is paired with the module-level
        ``player`` and agent 1 with ``player2``.
        """
        first_defender = self.n_agents_team_A
        defender_ids = range(first_defender, first_defender + self.n_agents_team_B)
        for agent_j, attacker in zip(range(self.n_agents_team_A), (player, player2)):
            att_x, att_y = self.agent_pos[agent_j]
            attacker._is_defended = any(
                np.sqrt((self.agent_pos[agent_i][0] - att_x)**2
                        + (self.agent_pos[agent_i][1] - att_y)**2) < 5
                for agent_i in defender_ids
            )
    
    def steal(self):
        """Attempt a steal against the Team A ball holder.

        Checks the module-level ``player`` and ``player2``; the first one that
        both holds the ball and is defended is stolen from with probability 0.02.

        Returns:
            1 on a successful steal, 0 otherwise (including when no defended
            ball holder exists).
        """
        for holder in (player, player2):
            if holder._has_ball and holder._is_defended:
                return self.action_success(0.02)
        return 0
    
    def block(self):
        """Attempt a block against the Team A ball holder.

        Checks the module-level ``player`` and ``player2``. For a holder that
        is defended, the block chance depends on the first zone flag set on
        that player: 0.04 close range, 0.03 mid-range, 0.02 three-point range.

        Returns:
            1 on a successful block, 0 otherwise (including when no defended
            ball holder exists).
        """
        zone_chances = (
            ('_close_range', 0.04),
            ('_midrange', 0.03),
            ('_three_point_range', 0.02),
        )
        for holder in (player, player2):
            if not (holder._has_ball and holder._is_defended):
                continue
            for flag_name, chance in zone_chances:
                if getattr(holder, flag_name):
                    return self.action_success(chance)
        return 0
                
    def out_of_bounds(self):
        for agent_x in range(self.n_agents):
            if self.x[agent_x] > 0 and self.x[agent_x] < self._grid_shape[0] and self.y[agent_x] > 0 and self.y[agent_x] < self._grid_shape[1]:
                return False
            else:
                return True

    
    
###############
#Define Render#   
###############

    def render(self, mode='human'):
        """Draw the court with one coloured cell per agent.

        Args:
            mode: 'rgb_array' returns the frame as a numpy array; 'human' shows
                it in a gym image viewer and returns whether the viewer is open.
                Any other value returns None.
        """
        # The blank court is drawn once, on first use, and copied for each frame.
        if getattr(self, '_base_img', None) is None:
            self._base_img = draw_grid(self._grid_shape[0], self._grid_shape[1],
                                       cell_size=CELL_SIZE, fill='white', line_color='white')

        img = copy.copy(self._base_img)
        for agent_i in range(self.n_agents):
            # Each agent occupies the single cell at its current position.
            agent_cell = (self.agent_pos[agent_i][0], self.agent_pos[agent_i][1])
            fill_cell(img, agent_cell, cell_size=CELL_SIZE, fill=AGENT_COLORS[agent_i])

        # NOTE(review): draw_border is not in the list of names this notebook
        # imports from utils.draw -- confirm it is available before rendering.
        img = draw_border(img, border_width=2, fill='gray')

        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def __update_agent_pos(self, agent_i, move):

        curr_pos = copy.copy(self.agent_pos[agent_i])
        for agent_i in range(self.n_agents):
            if self.out_of_bounds() == False:
                if move == 0:  # noop
                    next_pos = None
                elif move == 1:  # up
                    next_pos = [curr_pos[0] - 1, curr_pos[1]]
                elif move == 2:  # upright
                    next_pos = [curr_pos[0] - 1, curr_pos[1] + 1]
                elif move == 3:  # right
                    next_pos = [curr_pos[0], curr_pos[1] + 1]
                elif move == 4:  # downright
                    next_pos = [curr_pos[0] + 1, curr_pos[1] + 1]
                elif move == 5:  # down
                    next_pos = [curr_pos[0] + 1, curr_pos[1]]
                elif move == 6:  # downleft
                    next_pos = [curr_pos[0] + 1, curr_pos[1] - 1]
                elif move == 7:  # left
                    next_pos = [curr_pos[0], curr_pos[1] - 1]
                elif move == 8:  # upleft
                    next_pos = [curr_pos[0] - 1, curr_pos[1] - 1]
                else:
                    raise Exception('Action Not found!')


                if next_pos is not None:
                    self.agent_prev_pos[agent_i] = self.agent_pos[agent_i]
                    self.agent_pos[agent_i] = next_pos
                    self.__update_agent_view(agent_i)

#############
#Define Seed#   
#############

    def seed(self, n=None):
        """Create ``self.np_random`` from seed ``n`` and return the seed used, wrapped in a list."""
        rng, used_seed = seeding.np_random(n)
        self.np_random = rng
        return [used_seed]

#############
#Define Step#   
#############
    
    def step(self, action_n=None):
        """Advance the game by one step.

        Args:
            action_n: Sequence of move codes (0..8), one per agent in agent
                order. Defaults to "stay" for every agent. Agents without an
                entry do not move.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is Team A's
            reward minus Team B's and ``info`` carries the round counter.

        Each random event (shot, steal, block) is evaluated once per step.
        When several events fire in the same step the later check overrides
        the earlier reward, in the order shot, steal, block, shot clock, out of
        bounds, and the round counter advances at most once.
        """
        shot_clock_steps = 24

        if action_n is None:
            action_n = [0] * self.n_agents
        self._step_count += 1
        # The first step of an episode also starts the first round's shot clock.
        if self._step_count == 1:
            self._round_start_step = 0
        rewards = [self._step_cost for _ in range(self.n_teams)]
        round_over = False

        # shot() already resolves the rebound after a miss, so its single
        # result covers made shots and both kinds of rebound.
        outcome = self.shot()
        if outcome == 2 or outcome == 3:
            rewards = [outcome, 0]
            round_over = True
        elif outcome == 0.2:
            # Offensive rebound: the offense keeps the ball, round continues.
            rewards = [0.2, 0]
        elif outcome == 1:
            # Defensive rebound (d_rebound returns True, which equals 1).
            rewards = [0, 1]
            round_over = True

        # if steal made, new round
        if self.steal():
            rewards = [0, 2]
            round_over = True

        # if block made, new round
        if self.block():
            rewards = [0, 2]
            round_over = True

        # if Offense fails to get off a shot within time limit
        steps_this_round = self._step_count - getattr(self, '_round_start_step', 0)
        if steps_this_round >= shot_clock_steps:
            rewards = [0, 2]
            round_over = True

        if self.out_of_bounds():
            rewards = [-10, -10]
            round_over = True

        if round_over:
            self.__rounds += 1
            self._round_start_step = self._step_count

        if self.__rounds >= self._max_rounds:
            self._agent_dones = True
        else:
            # Agent i follows entry i, so Team B agents are moved as well.
            for agent_i, move in enumerate(list(action_n)[:self.n_agents]):
                self.__update_agent_pos(agent_i, move)

        for i in range(self.n_teams):
            self._total_episode_reward[i] += rewards[i]

        return self.get_agent_obs(), rewards[0]-rewards[1], bool(self._agent_dones), {'rounds': self.__rounds}

# Module-level constants shared by the environment.

# Edge length passed to the drawing helpers as ``cell_size``.
CELL_SIZE = 5

# Goal location on the grid.
GOAL = [50, 10.5]

# Action code -> readable label; the code is the position in this tuple.
ACTION_MEANING = dict(enumerate((
    'NOOP',
    'UP',
    'UPRIGHT',
    'RIGHT',
    'DOWNRIGHT',
    'DOWN',
    'DOWNLEFT',
    'LEFT',
    'UPLEFT',
    'BALL_PASS',
    'SHOOT',
    'STEAL',
    'BLOCK',
)))

# Agent index -> team letter.
AGENT_TEAMS = {0: 'A', 1: 'A', 2: 'B', 3: 'B'}

# Agent index -> colour used when rendering.
AGENT_COLORS = {0: 'red', 1: 'red', 2: 'blue', 3: 'blue'}

WALL_COLOR = 'black'

# each pre-id should be unique and single char
PRE_IDS = dict(agent='A', goal='G', empty='O')


In [387]:
print(env._agent_dones)

True


In [383]:
player = Player('Luka Doncic')

In [388]:
player._has_ball

True

In [390]:
env.reward

-0.1

### Checking the environment

In [None]:
player = Player('Luka Doncic')
player2 = Player2('Kristaps Porzingis')
env = BasketballEnv()
# It will check your custom environment and output additional warnings if needed
check_env(env)



### Random Agent Decisions

In [338]:
class RandomAgent:
    """Baseline agent that samples a random action from the env's action space.

    Attributes are limited to ``total_reward``, the running sum of rewards
    received through ``step``.
    """

    def __init__(self):
        self.total_reward = 0.0

    def step(self, env):
        """Take one random action in ``env`` and add its reward to the total.

        Args:
            env: Object exposing ``action_space.sample()`` and a gym-style
                ``step(action)`` returning ``(obs, reward, done, info)``.

        Returns:
            The ``done`` flag reported by the environment.
        """
        action = env.action_space.sample()
        # step() returns a 4-tuple; only the scalar reward is accumulated.
        _obs, reward, done, _info = env.step(action)
        #print(f"Took action {action} and got reward {reward}")
        self.total_reward += reward
        return done

In [None]:
env = BasketballEnv()
agent = RandomAgent()

curry = Player('Stephen Curry')

# Upper bound on the rollout so the cell always terminates.
MAX_RANDOM_STEPS = 1000

env.reset()
for _ in range(MAX_RANDOM_STEPS):
    agent.step(env)
    if env.is_done():
        break
print(f"Total reward: {agent.total_reward}")

#### Analysis:
Analysis of the random agent results goes here.

### PPO - Promixal Policy Optimization

In [None]:
# The env class is passed directly: 'BasketballEnv' is not a registered gym id.
# monitor_dir makes make_vec_env wrap the env in a Monitor that logs episodes.
PPO1_LOG_DIR = "logs/ppo1"
env = make_vec_env(BasketballEnv, n_envs=1, monitor_dir=PPO1_LOG_DIR)

model = PPO1(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo1_Basketball")

# Load the checkpoint saved above.
model = PPO1.load("ppo1_Basketball")

# Bounded demo rollout; the vectorized env resets itself when an episode ends.
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
# plot_results is a function of results_plotter (imported with the Stable
# Baselines imports). It expects Monitor logs under the given directory, e.g.
# as written by make_vec_env(..., monitor_dir="logs/ppo1").
results_plotter.plot_results(["logs/ppo1"], 25000, results_plotter.X_TIMESTEPS, "PPO1 - BasketballEnv")

#### Analysis:
Analysis of the PPO Model results goes here.

### PPO2 - Promixal Policy Optimization

In [None]:
# multiprocess environment
# The env class is passed directly: 'BasketballEnv' is not a registered gym id.
# monitor_dir makes make_vec_env wrap each env in a Monitor that logs episodes.
PPO2_LOG_DIR = "logs/ppo2"
env = make_vec_env(BasketballEnv, n_envs=4, monitor_dir=PPO2_LOG_DIR)

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo2_basketball")

del model # remove to demonstrate saving and loading

model = PPO2.load("ppo2_basketball")

# Enjoy trained agent
# Bounded demo rollout; the vectorized env resets itself when an episode ends.
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
# plot_results is a function of results_plotter (imported with the Stable
# Baselines imports). It expects Monitor logs under the given directory, e.g.
# as written by make_vec_env(..., monitor_dir="logs/ppo2").
results_plotter.plot_results(["logs/ppo2"], 25000, results_plotter.X_TIMESTEPS, "PPO2 - BasketballEnv")

#### Analysis:
Analysis of the PPO2 Model results goes here.

### A2C - Advantage Actor Critic

In [None]:
# https://stable-baselines.readthedocs.io/en/master/modules/a2c.html
# Parallel environments
# The env class is passed directly: 'BasketballEnv' is not a registered gym id.
# monitor_dir makes make_vec_env wrap each env in a Monitor that logs episodes.
A2C_LOG_DIR = "logs/a2c"
env = make_vec_env(BasketballEnv, n_envs=4, monitor_dir=A2C_LOG_DIR)

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("a2c_basketball")

del model # remove to demonstrate saving and loading

# Load the checkpoint saved above.
model = A2C.load("a2c_basketball")

# Bounded demo rollout; the vectorized env resets itself when an episode ends.
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
# plot_results is a function of results_plotter (imported with the Stable
# Baselines imports). It expects Monitor logs under the given directory, e.g.
# as written by make_vec_env(..., monitor_dir="logs/a2c").
results_plotter.plot_results(["logs/a2c"], 25000, results_plotter.X_TIMESTEPS, "A2C - BasketballEnv")

#### Analysis:
Analysis of the A2C Model results goes here.

class BasketballEnv(gym.Env):
    """
    Custom Environment that follows gym interface.
    This is a simple env where multiple agents learn strategies to put the ball in the hoop.
    For this simple iteration, actions will be determined by probabilities rather than physics.
    """
    # In google colab, we cannot implement the GUI ('human' render mode)
    metadata = {'render.modes': ['human', 'rgb_array']}

    
    def __init__(self, step_cost=0, reward=0, max_rounds=1):
        """Create a 2-on-2 half-court environment with agents at their starting spots.

        Args:
            step_cost: Stored on ``self._step_cost``.
            reward: Stored on ``self.reward``.
            max_rounds: Stored on ``self._max_rounds``; ``is_done`` compares the
                round counter against it.

        The step counter, reward totals, done flags and round counter stay
        None until ``reset`` is called.
        """
        #Grid size will be standard basketball halfcourt at 6"=1'-0" scale
        self._grid_shape = (100, 94)
        court_w, court_h = self._grid_shape

        #Number of players
        self.n_agents = 4
        self.n_agents_team_A = int(self.n_agents / 2)
        self.n_agents_team_B = int(self.n_agents / 2)
        self.reward = reward
        self._max_rounds = max_rounds
        self.action_space = spaces.MultiDiscrete([9, 2, 2])

        self._step_count = None
        self._step_cost = step_cost
        self._total_episode_reward = None
        self.agent_pos = {_: None for _ in range(self.n_agents)}

        offense_y = court_h - 2
        defense_y = court_h - 8

        #Set starting positions for agents in Team A
        self.agent_pos[0] = (court_w // 2, offense_y)
        self.agent_pos[1] = (random.randint(0, court_w // 2), offense_y)

        #Set starting positions for agents in Team B
        # Each defender takes the x coordinate (not the whole position tuple)
        # of the attacker it lines up against.
        self.agent_pos[2] = (self.agent_pos[0][0], defense_y)
        self.agent_pos[3] = (self.agent_pos[1][0], defense_y)

        self._agent_dones = None
        self.__rounds = None

        # Observing agent positions for 4 agents: (x, y) per agent, scaled to [0, 1].
        # Bounds are float32 so they match the dtype declared on the Box.
        self._obs_low = np.zeros(2 * self.n_agents, dtype=np.float32)
        self._obs_high = np.ones(2 * self.n_agents, dtype=np.float32)
        self.observation_space = spaces.Box(low=self._obs_low, high=self._obs_high,
                                        dtype=np.float32)

        self.viewer = None
        self.seed()
        
    def is_done(self):
        self.__rounds == self._max_rounds
        
    def get_action_meanings(self, agent_i=None):
        """Return the ``ACTION_MEANING`` labels for the action space.

        Args:
            agent_i: Index of one component of the ``MultiDiscrete`` action
                space, or None for all components.

        Returns:
            A list of labels for component ``agent_i``, or a list of such lists
            (one per component) when ``agent_i`` is None.
        """
        # ``nvec`` holds the number of choices per component; it is available on
        # MultiDiscrete regardless of whether the space supports indexing.
        sizes = [int(n) for n in self.action_space.nvec]
        if agent_i is not None:
            assert 0 <= agent_i < len(sizes)
            return [ACTION_MEANING[i] for i in range(sizes[agent_i])]
        return [[ACTION_MEANING[i] for i in range(n)] for n in sizes]

    def __create_grid(self):
        """Build a fresh grid of 'empty' markers, indexed as [axis-0][axis-1] of ``_grid_shape``."""
        outer_len = self._grid_shape[0]
        inner_len = self._grid_shape[1]
        grid = []
        for _ in range(outer_len):
            grid.append([PRE_IDS['empty'] for _ in range(inner_len)])
        return grid

    def __update_agent_view(self, agent_i):
        """Move agent ``agent_i``'s marker in ``_full_obs`` from its previous cell to its current one.

        Each agent occupies exactly one cell. Positions outside the grid are
        ignored so that an agent off the court cannot raise IndexError.
        """
        outer_len, inner_len = self._grid_shape

        def _paint(pos, label):
            # Write ``label`` at ``pos`` if it falls inside the grid.
            row, col = pos
            if 0 <= row < outer_len and 0 <= col < inner_len:
                self._full_obs[row][col] = label

        _paint(self.agent_prev_pos[agent_i], PRE_IDS['empty'])
        # Offset suffix is always 0 because the agent spans a single cell.
        _paint(self.agent_pos[agent_i], PRE_IDS['agent'] + str(agent_i + 1) + '_' + str(0))

#     def __draw_base_img(self):
#         self._base_img = draw.draw_grid(self._grid_shape[0], self._grid_shape[1],
#                                    cell_size=CELL_SIZE, fill='white', line_color='white')

    def __init_full_obs(self):
        """Rebuild ``_full_obs`` as an empty grid and mark every agent on it once."""
        self._full_obs = self.__create_grid()
        for agent_i in range(self.n_agents):
            self.__update_agent_view(agent_i)

#         self.__draw_base_img()

    #Countdown timer as 24 second shot clock for each round
    #https://www.geeksforgeeks.org/how-to-create-a-countdown-timer-using-python/
    def countdown(self, t=24):     
        while t: 
            mins, secs = divmod(t, 60) 
            timer = '{:02d}:{:02d}'.format(mins, secs)  
            time.sleep(1) 
            t -= 1
        return t 

    def get_agent_obs(self):
        _obs = []

        for agent_i in range(self.n_agents):
            pos = self.agent_pos[agent_i]
            _agent_i_obs = [pos[0] / self._grid_shape[0], pos[1] / self._grid_shape[1]]
            
            _obs.append(_agent_i_obs)

        return _obs
    
##############
#Define Reset#   
##############

    def reset(self):
        """Start a new episode and return the initial observation.

        Resets the round and step counters, the reward totals and the done
        flags, and places all four agents at their starting spots.

        Returns:
            A flat float32 array of length ``2 * n_agents`` holding the scaled
            (x, y) of every agent, matching ``observation_space``.
        """
        self.__rounds = 0
        # No countdown() call here: its result was discarded and it only
        # delayed every reset.

        court_w, court_h = self._grid_shape
        offense_y = court_h - 2
        defense_y = court_h - 8

        #Set starting positions for agents in Team A
        self.agent_pos[0] = (court_w // 2, offense_y)
        self.agent_pos[1] = (random.randint(0, court_w // 2), offense_y)

        #Set starting positions for agents in Team B
        self.agent_pos[2] = (court_w // 2, defense_y)
        self.agent_pos[3] = (random.randint(0, court_w // 2), defense_y)

        self.agent_prev_pos = {_: self.agent_pos[_] for _ in range(self.n_agents)}
        self._agent_dones = [False, False]
        self.__init_full_obs()
        self._step_count = 0
        self._total_episode_reward = [0 for _ in range(self.n_agents)]

        # Flattened so the shape is (8,) like the declared Box, whether
        # get_agent_obs returns per-agent pairs or an already flat sequence.
        return np.array(self.get_agent_obs(), dtype=np.float32).ravel()

    
###############################
#Define Properties and Actions#   
###############################
    
            
    #This will determine success of an action
    def action_success(self, p_1):
        """Draw 1 with probability ``p_1`` and 0 otherwise, using numpy's global RNG."""
        outcomes = [0, 1]
        weights = [1 - p_1, p_1]
        return np.random.choice(outcomes, p=weights)

    #Determine if a player is close to the goal
    def close_range(self):
        """Intended to flag players within 6 grid units of ``GOAL`` as close range.

        NOTE(review): this draft cannot run as written --
          * ``n_agents`` is a bare name; presumably ``range(self.n_agents)``.
          * ``self.x`` / ``self.y`` are not assigned in this class's ``__init__``.
          * ``x.self`` / ``y.self`` look like transposed ``self.x`` / ``self.y``.
          * ``agent_i.player`` assumes agents are objects carrying a ``player``;
            ``agent_pos`` is keyed by plain ints -- TODO confirm the intended model.
        """
        for agent_i in n_agents:
            # In-bounds test: strictly inside the grid on both axes.
            if self.y > 0 and self.y < self._grid_shape[1] and self.x > 0 and self.x < self._grid_shape[0]:
                # Euclidean distance to the goal, threshold 6.
                if np.sqrt((x.self - GOAL[0])**2 + (y.self-GOAL[1])**2) <= 6:
                    agent_i.player._close_range = True

    #Determine if a player is mid-range from the goal
    def midrange(self):
        """Intended to flag in-bounds players that are neither close nor three-point range.

        NOTE(review): this draft cannot run as written --
          * ``n_agents`` is a bare name; presumably ``range(self.n_agents)``.
          * ``self.x`` / ``self.y`` are not assigned in this class's ``__init__``.
          * the condition reads the global ``player`` while the assignment goes
            through ``agent_i.player`` -- TODO confirm which object is meant.
        """
        for agent_i in n_agents:
            # In-bounds test: strictly inside the grid on both axes.
            if self.y > 0 and self.y < self._grid_shape[1] and self.x > 0 and self.x < self._grid_shape[0]:
                if player._close_range == False and player._three_point_range == False:
                    agent_i.player._midrange = True

    #Determine if a player is in three point range
    def _three_point_range(self):
        """Intended to flag players in the corners or beyond 44.3 grid units of ``GOAL``.

        NOTE(review): this draft cannot run as written --
          * ``n_agents`` is a bare name; presumably ``range(self.n_agents)``.
          * ``self.x`` / ``self.y`` are not assigned in this class's ``__init__``.
          * ``x.self`` / ``y.self`` look like transposed ``self.x`` / ``self.y``.
          * one branch writes to the global ``player``, the other to
            ``agent_i.player`` -- TODO confirm which object is meant.
        """
        for agent_i in n_agents:
            # In-bounds test: strictly inside the grid on both axes.
            if self.y > 0 and self.y < self._grid_shape[1] and self.x > 0 and self.x < self._grid_shape[0]:
                # NOTE(review): ``and`` binds tighter than ``or``, so the
                # ``x >= 93.33`` case is accepted for any y -- probably
                # ``y <= 19.67 and (x <= 6.67 or x >= 93.33)`` was meant.
                if self.y <= 19.67 and self.x <= 6.67 or self.x >= 93.33:
                    player._three_point_range = True
                elif self.y > 19.67 and np.sqrt((x.self - GOAL[0])**2 + (y.self-GOAL[1])**2) > 44.3:
                    agent_i.player._three_point_range = True

    #define defensive rebound, will be a reward for the defensive team
    def d_rebound(self):
        """Signal a defensive rebound; always returns True."""
        return True
        
    def rebound(self):
        """Intended to resolve a rebound: 30% offensive, otherwise defensive.

        NOTE(review): this draft cannot run as written --
          * ``n_agents_team_A``, ``action_success`` and ``d_rebound`` are bare
            names; presumably the ``self.`` attributes/methods were meant.
          * ``[agent_j in n_agents_team_A]`` is a one-element list holding a
            membership test, not a list of candidate rebounders.
          * the offensive branch returns nothing, so the loop continues.
        """
        for agent_i in n_agents_team_A:
            if action_success(0.3):
                o_rebounder = random.choice([agent_j in n_agents_team_A])
                o_rebounder.player._has_ball = True
            else:
                return d_rebound()

    #define shot, a made shot will be a reward for the offensive team
    def shot(self):
        """Intended to let the ball holder shoot: returns 2 or 3 on a make.

        NOTE(review): this draft cannot run as written --
          * ``self.n_agents`` is an int, so ``for agent_i in self.n_agents`` is
            not iterable; presumably ``range(self.n_agents)``.
          * ``_has_ball``, ``_close_range`` and ``_midrange`` are called like
            methods; the Player classes in this notebook set them as booleans.
          * ``action_success``, ``rebound`` and ``_three_point_range`` are bare
            names; presumably the ``self.`` methods (or player flags) were meant.
          * ``player.shooting_midrange`` is read here; the Player classes in
            this notebook assign ``shooing_midrange``.
          * the result of ``rebound()`` is discarded after a miss, so execution
            falls through to the next zone check.
        """
        
        for agent_i in self.n_agents:

            #Can only shoot if the player has the ball
            if player._has_ball():

                #Close range shot
                if player._close_range():
                    shot = action_success(player.shooting_close)
                    if shot == 1:
                        return 2
                    else:
                        rebound()

                #Midrange shot
                if player._midrange():
                    shot = action_success(player.shooting_midrange)
                    if shot == 1:
                        return 2
                    else:
                        rebound()

                #3 point shot
                if _three_point_range():
                    shot = action_success(player.shooting3pts)
                    if shot == 1:
                        return 3
                    else:
                        rebound()
                        
    def ball_pass(self):
        """Intended to hand the ball from one Team A player to the other.

        NOTE(review): this draft cannot run as written --
          * ``n_agents_team_A`` is a bare name, and the int stored on
            ``self.n_agents_team_A`` cannot be unpacked into agent pairs.
          * ``_has_ball`` is called like a method; the Player classes in this
            notebook set it as a boolean.
          * ``agent_i.player`` assumes agent objects carrying a ``player`` --
            TODO confirm the intended model.
        """
        for agent_i, agent_j in n_agents_team_A:
            if agent_i.player._has_ball():
                agent_j.player._has_ball = True
                agent_i.player._has_ball = False
            else:
                agent_i.player._has_ball = True
                agent_j.player._has_ball = False

    def defended(self):
        """Intended to mark Team A players as defended when a Team B agent is near.

        NOTE(review): this draft cannot run as written --
          * ``n_agents_team_B`` / ``n_agents_team_A`` are bare names; the
            ``self.`` attributes of those names are ints, not iterables.
          * ``agent_i.self.x`` is not a valid attribute path on an agent.
          * the expression measures distance from the grid origin rather than
            between ``agent_i`` and ``agent_j`` -- presumably the
            defender-to-attacker distance was meant.
        """
        for agent_i in n_agents_team_B:
            for agent_j in n_agents_team_A:
                if np.sqrt(agent_i.self.x**2 + agent_i.self.y**2) < 5:
                    agent_j.player._is_defended=True
                else:
                    agent_j.player._is_defended=False
    
    def steal(self):
        """Intended to give a defended ball holder a 2% chance of being stolen from.

        NOTE(review): this draft cannot run as written --
          * ``n_agents_team_B`` / ``n_agents_team_A`` are bare names; the
            ``self.`` attributes of those names are ints, not iterables.
          * ``action_success`` is a bare name; presumably ``self.action_success``.
          * returns None implicitly when no defended ball holder is found.
        """
        for agent_i in n_agents_team_B:
            for agent_j in n_agents_team_A:
                if agent_j.player._has_ball and agent_j.player._is_defended:
                    return action_success(0.02)
    
    def block(self):
        """Intended to give a defended ball holder a zone-dependent chance of being blocked.

        Chances in the code: 0.04 close range, 0.03 mid-range, 0.02 three-point.

        NOTE(review): this draft cannot run as written --
          * ``n_agents_team_B`` / ``n_agents_team_A`` are bare names; the
            ``self.`` attributes of those names are ints, not iterables.
          * ``action_success`` is a bare name; presumably ``self.action_success``.
          * returns None implicitly when no branch matches.
        """
        for agent_i in n_agents_team_B:
            for agent_j in n_agents_team_A:
                if agent_j.player._has_ball and agent_j.player._is_defended and agent_j.player._close_range:
                    return action_success(0.04)
                if agent_j.player._has_ball and agent_j.player._is_defended and agent_j.player._midrange:
                    return action_success(0.03)
                # NOTE(review): this branch reads ``_defended``; the two
                # branches above read ``_is_defended`` -- likely a typo.
                if agent_j.player._has_ball and agent_j.player._defended and agent_j.player._three_point_range:
                    return action_success(0.02)
    
    
    
    
###############
#Define Render#   
###############

    def render(self, mode='human'):
        """Draw every agent on a copy of the base image.

        :param mode: ``'rgb_array'`` returns the frame as a numpy array;
            ``'human'`` shows it in a ``SimpleImageViewer`` window and returns
            whether that window is still open. Any other value returns ``None``.
        """
        img = copy.copy(self._base_img)
        for agent_i in range(self.n_agents):
            # An agent position is a single [row, col] pair (see
            # ``__update_agent_pos``). The old loop iterated over the row value
            # itself, which is a scalar and therefore not iterable.
            row, col = self.agent_pos[agent_i]
            fill_cell(img, (row, col), cell_size=CELL_SIZE, fill=AGENT_COLORS[agent_i])

        img = draw_border(img, border_width=2, fill='gray')

        img = np.asarray(img)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            # Imported lazily so that 'rgb_array' rendering does not need the
            # classic-control rendering module.
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def __update_agent_pos(self, agent_i, move):
        """Apply one movement action to a single agent.

        :param agent_i: index of the agent to move.
        :param move: 0 is a no-op; 1-8 step one cell in the eight compass
            directions, clockwise starting from "up" (row - 1).
        :raises Exception: ``'Out of Bounds!'`` if the agent is already outside
            the grid, ``'Action Not found!'`` for any other ``move`` value.

        A move whose destination lies outside the grid is ignored and the agent
        stays where it is. The previous version checked ``self.x`` / ``self.y``
        instead of the agent's own position and never validated the
        destination, so an agent could step off the grid.
        """
        # (row delta, col delta) for each movement action.
        deltas = {
            1: (-1, 0),   # up
            2: (-1, 1),   # upright
            3: (0, 1),    # right
            4: (1, 1),    # downright
            5: (1, 0),    # down
            6: (1, -1),   # downleft
            7: (0, -1),   # left
            8: (-1, -1),  # upleft
        }
        n_rows, n_cols = self._grid_shape[0], self._grid_shape[1]

        def in_bounds(pos):
            return 0 <= pos[0] < n_rows and 0 <= pos[1] < n_cols

        curr_pos = copy.copy(self.agent_pos[agent_i])
        if not in_bounds(curr_pos):
            raise Exception('Out of Bounds!')

        if move == 0:  # noop
            return
        if move not in deltas:
            raise Exception('Action Not found!')

        d_row, d_col = deltas[move]
        next_pos = [curr_pos[0] + d_row, curr_pos[1] + d_col]
        if not in_bounds(next_pos):
            # Walking into the edge of the court leaves the agent in place.
            return

        self.agent_prev_pos[agent_i] = self.agent_pos[agent_i]
        self.agent_pos[agent_i] = next_pos
        self.__update_agent_view(agent_i)

#############
#Define Seed#   
#############

    def seed(self, n=None):
        """Seed the environment's random number generator.

        :param n: seed value handed to ``seeding.np_random``; defaults to ``None``.
        :return: one-element list holding the seed reported by ``seeding.np_random``.
        """
        rng, used_seed = seeding.np_random(n)
        self.np_random = rng
        return [used_seed]

#############
#Define Step#   
#############
    
    def step(self, action_n):
        """Advance the game by one step.

        :param action_n: one action per agent, indexed by agent id.
        :return: ``(observations, rewards, dones, info)`` where ``info`` carries
            the number of rounds played so far under ``'rounds'``.

        Changes from the previous version:
        * the shot is resolved once per step -- calling ``shot()`` again in the
          ``elif`` re-rolled the attempt and discarded the first outcome;
        * ``steal`` and ``block`` are methods of this class and are called
          through ``self``;
        * the episode ends once the round counter reaches *or passes*
          ``_max_rounds``; several events can bump the counter in one step, so
          an equality check could be skipped over.
        """
        self._step_count += 1
        rewards = [self._step_cost for _ in range(self.n_agents)]

        # NOTE(review): the event rewards below are two-element lists (offense,
        # defense) while the accumulation loop indexes ``rewards`` per agent;
        # with more than two agents that raises IndexError -- confirm the
        # intended per-agent mapping.
        # NOTE(review): ``shot``, ``d_rebound`` and ``countdown`` are not defined
        # in the visible part of this class, so they are left as bare calls.

        # if shot made, new round
        shot_points = shot()
        if shot_points == 2 or shot_points == 3:
            rewards = [shot_points, 0]
            self.__rounds += 1

        # if steal made, new round
        if self.steal():
            rewards = [0, 2]
            self.__rounds += 1

        # if defensive rebound made, new round
        if d_rebound():
            rewards = [0, 1]
            self.__rounds += 1

        # if block made, new round
        if self.block():
            rewards = [0, 2]
            self.__rounds += 1

        # if Offense fails to get off a shot within time limit
        if countdown() < 1:
            rewards = [0, 2]
            self.__rounds += 1

        if self.__rounds >= self._max_rounds:
            self._agent_dones = [True for _ in range(self.n_agents)]
        else:
            # NOTE(review): ACTION_MEANING also lists non-movement actions (9-12);
            # ``__update_agent_pos`` raises for anything outside 0-8.
            for agent_i in range(self.n_agents):
                self.__update_agent_pos(agent_i, action_n[agent_i])

        for i in range(self.n_agents):
            self._total_episode_reward[i] += rewards[i]

        return self.get_agent_obs(), rewards, self._agent_dones, {'rounds': self.__rounds}

# Module-level constants used by the environment code above.

# Passed to fill_cell() as ``cell_size`` when agents are rendered.
CELL_SIZE = 5

# Goal location
GOAL = [50, 10.5]

# Name of each discrete action, keyed by action index.
ACTION_MEANING = dict(enumerate([
    'NOOP',
    'UP',
    'UPRIGHT',
    'RIGHT',
    'DOWNRIGHT',
    'DOWN',
    'DOWNLEFT',
    'LEFT',
    'UPLEFT',
    'BALL_PASS',
    'SHOOT',
    'STEAL',
    'BLOCK',
]))

# Team letter of each agent, keyed by agent index.
AGENT_TEAMS = dict(enumerate(['A', 'A', 'B', 'B']))

# Fill colour used when rendering each agent, keyed by agent index.
AGENT_COLORS = dict(enumerate(['red', 'red', 'blue', 'blue']))

WALL_COLOR = 'black'

# each pre-id should be unique and single char
PRE_IDS = dict(agent='A', goal='G', empty='O')


In [None]:
        
        
        
    def _get_obs(self, obs):
        """
        Concatenate the time feature to the current observation.
        :param obs: (np.ndarray)
        :return: (np.ndarray)
        """
        # Remaining time is more general
        time_feature = 1 - (self._current_step / self._max_steps)
        if self._test_mode:
            time_feature = 1.0
        # Optionnaly: concatenate [time_feature, time_feature ** 2]
    return np.concatenate((obs, [time_feature]))

    def reset(self):
        """
        Put the agent back on its start cell and return the first observation.

        Important: the observation must be a numpy array
        :return: (np.array) one-element float32 array holding the agent position
        """
        # Initialize the agent at the right of the grid
        self.agent_pos = self.grid_size - 1

        # here we convert to float32 to make it more general (in case we want to use continuous actions)
        # The docstring and this return used to sit at ``def`` indentation, which
        # left the method without a valid body.
        return np.array([self.agent_pos]).astype(np.float32)

    def step(self, action):
        """
        Move the agent one cell left or right and report the outcome.

        :param action: ``self.LEFT`` or ``self.RIGHT``
        :return: ``(observation, reward, done, info)``; the observation is a
            one-element float32 array with the agent position, the reward is 1
            on reaching cell 0 and 0 otherwise, ``done`` is True at cell 0.
        :raises ValueError: if ``action`` is neither LEFT nor RIGHT
        """
        if action == self.LEFT:
            self.agent_pos -= 1
        elif action == self.RIGHT:
            self.agent_pos += 1
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        # Everything below used to be dedented out of the function body.

        # Account for the boundaries of the grid
        self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

        # Are we at the left of the grid?
        done = bool(self.agent_pos == 0)

        # Null reward everywhere except when reaching the goal (left of the grid)
        reward = 1 if self.agent_pos == 0 else 0

        # Optionally we can pass additional info, we are not using that for now
        info = {}

        return np.array([self.agent_pos]).astype(np.float32), reward, done, info

    def render(self, mode='console'):
        """
        Print the grid as one line of text.

        :param mode: only ``'console'`` is supported
        :raises NotImplementedError: for any other mode
        """
        if mode != 'console':
            raise NotImplementedError()
        # The prints used to be nested under the ``raise`` above, which made them
        # unreachable: console mode printed nothing.
        # agent is represented as a cross, rest as a dot
        print("." * self.agent_pos, end="")
        print("x", end="")
        print("." * (self.grid_size - self.agent_pos))

    def close(self):
        """Nothing to release: this method is a no-op and returns ``None``."""
        return None
    


In [None]:
#Lots of help from Dustin Pierce
class Environment:
    """Work-in-progress grid world for a single basketball player.

    The player sits at integer coordinates ``(x, y)`` with ``x`` bounded by
    ``SIZE[0]`` and ``y`` bounded by ``SIZE[1]``. ``time_left`` is the game clock:
    every call to :meth:`action` uses up one unit, and the episode is over
    once it reaches zero.

    The previous version of this class did not parse (invalid identifier,
    empty bodies, a ``def`` without a parameter list, mis-indented ``if``
    bodies). The holes are filled with explicit placeholders marked
    TODO / NOTE(review) below.
    """

    SIZE = [50, 94]
    # NOTE(review): GOAL_A does not mirror GOAL_B the way BACKBOARD_A mirrors
    # BACKBOARD_B -- confirm the intended coordinates.
    GOAL_A = [10.5, 50]
    BACKBOARD_A = [25, 4]
    BACKBOARD_B = [25, 90]
    GOAL_B = [25, 88.75]
    # Was ``3PT_LINE = [Coods]``: identifiers cannot start with a digit and
    # ``Coods`` was never defined. TODO: fill in the three-point line coordinates.
    THREE_PT_LINE = []
    SCORE = 0

    # NOTE(review): at_goal() and action() read ``self.GOAL``, which was never
    # defined. Aliasing GOAL_A is a placeholder -- confirm which basket is meant.
    GOAL = GOAL_A
    # NOTE(review): is_clear() reads ``self.WALLS``, which was never defined.
    # An empty list means every cell is clear.
    WALLS = []
    # Initial value of the game clock (the original passed 300 to countdown()).
    GAME_LENGTH = 300

    def __init__(self):
        # The clock is a plain counter. The previous ``countdown(300)`` call
        # referenced an unbound name and would have slept inside the constructor.
        self.time_left = self.GAME_LENGTH

        self.x = 44
        self.y = 25

        # NOTE(review): placeholder state behind has_dribble(); the original
        # method had no body. Presumably a player starts with a live dribble.
        self.dribbling = True

    def shot(self):
        """Attempt a shot. Not implemented yet."""
        # TODO: the original sketch compared ``self.x**2 + self.y**2`` against an
        # undefined ``AMOUNT`` and had no body.
        raise NotImplementedError("shot is not implemented yet")

    # Countdown timer as game clock
    # https://www.geeksforgeeks.org/how-to-create-a-countdown-timer-using-python/
    @staticmethod
    def countdown(t=300):
        """Tick the clock once: sleep one second and return ``t - 1``.

        The original returned from inside its ``while`` loop, so it only ever
        performed a single tick; that behaviour is kept. When ``t`` is already
        zero the function returns 0 (it used to fall through and return None).
        Declared static because the original signature had no ``self``.
        """
        if not t:
            return 0
        time.sleep(1)
        return t - 1

    def num_states(self):
        """Number of distinct ``(x, y)`` cells (the original returned an undefined ``t``)."""
        return self.SIZE[0] * self.SIZE[1]

    def num_actions(self):
        return 10

    def get_observation(self):
        """Current position as ``[x, y]``."""
        return [self.x, self.y]

    def get_state_num(self):
        """Encode the position as one integer, ``x * SIZE[1] + y``.

        ``SIZE`` is a two-element list, so the original ``self.x * self.SIZE``
        could not produce a number.
        """
        return self.x * self.SIZE[1] + self.y

    def get_pos_from_state_num(self, state_num):
        """Inverse of :meth:`get_state_num`; returns ``(x, y)``."""
        return divmod(state_num, self.SIZE[1])

    def has_ball(self):
        """True when the player stands on the ball's cell."""
        # NOTE(review): ``ball`` is not defined in this class; presumably a
        # module-level object with ``x`` / ``y`` attributes -- confirm.
        return self.x == ball.x and self.y == ball.y

    def has_dribble(self):
        """True while the player may still dribble (placeholder flag)."""
        return self.dribbling

    def on_offense(self):
        """Whether the player's team is on offense. Not implemented yet."""
        raise NotImplementedError("on_offense is not implemented yet")

    def get_actions(self):
        """List the actions available in the current situation.

        The original compared the bare names ``has_ball`` / ``has_dribble``
        to booleans instead of calling the methods.
        """
        if not self.has_ball():
            return ["up", "up-right", "right", "down-right", "down", "down-left", "left", "up-left", "jump", "screen"]
        if self.has_dribble():
            return ['pass', 'shoot', 'up', 'down', 'left', 'right']
        # NOTE(review): the original fell through and returned None here.
        # Presumably a player who has used up the dribble can only pass or shoot.
        return ['pass', 'shoot']

    def is_done(self):
        """True once the game clock has run out."""
        return self.time_left <= 0

    def at_goal(self):
        """True when the player stands exactly on ``GOAL``."""
        return self.x == self.GOAL[0] and self.y == self.GOAL[1]

    def is_clear(self, x, y):
        """True when no entry of ``WALLS`` occupies cell ``(x, y)``."""
        return not any(x == w[0] and y == w[1] for w in self.WALLS)

    def action(self, action):
        """Apply a movement action and return the reward.

        :param action: ``"up"``, ``"down"``, ``"left"`` or ``"right"``; any
            other value only consumes time.
        :return: 1.0 when the player ends up on ``GOAL``, otherwise 0.0
        :raises Exception: if the episode is already over
        """
        if self.is_done():
            raise Exception("Episode is already over")
        # The original decremented an undefined ``steps_left`` while is_done()
        # watched ``time_left``; both now use ``time_left``.
        self.time_left -= 1

        max_x = self.SIZE[0] - 1
        max_y = self.SIZE[1] - 1
        if action == "up" and self.y > 0:
            if self.is_clear(self.x, self.y - 1):
                self.y -= 1
        elif action == "down" and self.y < max_y:
            if self.is_clear(self.x, self.y + 1):
                self.y += 1
        elif action == "left" and self.x > 0:
            if self.is_clear(self.x - 1, self.y):
                self.x -= 1
        elif action == "right" and self.x < max_x:
            if self.is_clear(self.x + 1, self.y):
                self.x += 1

        return 1.0 if self.at_goal() else 0.0