In [54]:
import random


# An agent at any time has the following state
#  pos : int
#  health : int (>=0)
class AgentState:
    """Per-agent snapshot: where the agent stands and how much health it has."""

    def __init__(self, pos, health):
        # pos: position of the agent; health: remaining health points
        self.pos, self.health = pos, health

# A game has the following state at any time. The state is visible to both agents
#  state of agent_0
#  state of agent_1
class GameState:
    """Full game snapshot: the states of agent 0 and agent 1, in that order."""

    def __init__(self, agent_0_state, agent_1_state):
        # Index in this list is the agent index used throughout the simulator.
        self.agents = [agent_0_state, agent_1_state]

    def __str__(self):
        """Render as "pos0,health0,pos1,health1"."""
        first = self.agents[0]
        second = self.agents[1]
        fields = (first.pos, first.health, second.pos, second.health)
        return ",".join(map(str, fields))

# at any time in a game, an agent can take one of the following actions
class GameAction:
    """Integer codes for the actions an agent may choose on a step."""

    MOVELEFT, MOVERIGHT, ATTACK = 1, 2, 3
        
# The simulator manages the following
#   the game state at any time
#   reward judgment
class GameSimulator:
    """Two-agent duel on a one-dimensional line.

    Owns the shared ``GameState`` and turns a pair of agent actions into the
    next state plus one reward per agent (see ``take_action``).
    """

    def __init__(self):
        # Game rules / tunables.
        self.init_health = 3
        self.init_pos_0 = 5
        self.init_pos_1 = 7
        self.miss_rate = 0.1     # probability that an attack deals no damage
        self.move_speed = 1      # distance covered by one move action
        self.attack_power = 1    # health removed by a successful hit
        self.attack_range = 1    # max distance at which an attack can land
        self.final_award = 100   # reward for the attack that finishes the opponent

        # Build the start-of-game state (same construction as reset()).
        self.reset()

    def move_agent(self, agent_idx, agent_action):
        """Apply a MOVELEFT / MOVERIGHT action for one agent.

        Any other action is ignored. The move is cancelled when the agent has
        no health left, or when the opponent stands anywhere between the
        agent's current position and the target position (both inclusive).
        The state is updated in place; nothing is returned.
        """
        mover = self.state.agents[agent_idx]

        # no health, no action
        if mover.health <= 0:
            return

        blocker = self.state.agents[(agent_idx + 1) % 2]

        if agent_action == GameAction.MOVELEFT:
            target = mover.pos - self.move_speed
            blocked = target <= blocker.pos <= mover.pos
        elif agent_action == GameAction.MOVERIGHT:
            target = mover.pos + self.move_speed
            blocked = mover.pos <= blocker.pos <= target
        else:
            return

        # the opponent must not be on the path to the target spot
        if not blocked:
            mover.pos = target

        # [Q] should we reward for being closer to the opponent ?

    def attack_agent(self, agent_idx, agent_action):
        """Resolve an ATTACK action for one agent and return the attacker's gain.

        Returns:
            0 when the agent is dead, did not attack, was out of range, or
            missed; ``attack_power`` for a hit that leaves the opponent alive;
            ``final_award`` when the opponent's health is not positive after
            the attack. The return value is always a number.
        """
        attacker = self.state.agents[agent_idx]

        # A dead agent cannot act. Return 0 (not None) so every path yields a
        # number and callers can always do arithmetic on the result.
        if attacker.health <= 0:
            return 0

        if agent_action != GameAction.ATTACK:
            return 0

        target = self.state.agents[(agent_idx + 1) % 2]

        # There is a chance to miss the hit. The roll happens before the range
        # check so the sequence of random draws is unchanged.
        hit = 0 if random.random() < self.miss_rate else 1

        # attack is only valid if the two agents are within attack range
        if abs(target.pos - attacker.pos) > self.attack_range:
            return 0

        damage = hit * self.attack_power
        target.health = target.health - damage

        # opponent still alive -> reward is the damage dealt
        # opponent at zero or negative health -> reward is the final award
        if target.health > 0:
            return damage
        return self.final_award

    def take_action(self, action_0, action_1):
        """Advance the game by one step given both agents' actions.

        Moves are resolved first, then attacks; in each phase a coin flip
        decides which agent acts first.

        Returns:
            (state, reward_0, reward_1) where ``state`` is the simulator's
            live state object (not a copy). A surviving agent's reward is its
            own attack gain; an agent left without health receives the
            negative of the opponent's gain.
        """
        actions = (action_0, action_1)

        # first make the move update
        first_mover = 0 if random.random() < 0.5 else 1
        for idx in (first_mover, 1 - first_mover):
            self.move_agent(idx, actions[idx])

        # then make the attack update
        gains = [0, 0]
        first_attacker = 0 if random.random() < 0.5 else 1
        for idx in (first_attacker, 1 - first_attacker):
            gains[idx] = self.attack_agent(idx, actions[idx])

        agents = self.state.agents
        reward_0 = gains[0] if agents[0].health > 0 else -1 * gains[1]
        reward_1 = gains[1] if agents[1].health > 0 else -1 * gains[0]

        return self.state, reward_0, reward_1

    def reset(self):
        """Replace the state with a fresh start-of-game state and return it."""
        agent_0 = AgentState(self.init_pos_0, self.init_health)
        agent_1 = AgentState(self.init_pos_1, self.init_health)
        self.state = GameState(agent_0, agent_1)

        return self.state

    def is_finished(self):
        """Return the winner's index, or -1 while both agents have health left.

        Agent 0 is checked first: if its health is <= 0 the winner is 1;
        otherwise if agent 1's health is <= 0 the winner is 0.
        """
        if self.state.agents[0].health <= 0:
            return 1

        if self.state.agents[1].health <= 0:
            return 0

        return -1
        

In [15]:
import numpy as np

# Scratch check: max() over an all-zero list
a = [0] * 4

max(a)

0

In [48]:
import random
import numpy as np

class AgentClass1:
    """Tabular Q-learning agent with an explore/exploit action policy.

    Q-values live in ``q_dict``, keyed by ``str(game_state)``; each value is a
    list with one entry per action in ``action_list``.
    """

    def __init__(self, learning_rate=0.1, discount=0.95, exploration_rate=1.0):
        self.learning_rate = learning_rate  # How much we appreciate new q-value over current
        self.discount = discount  # How much we appreciate future reward over current
        # Initial exploration rate (honours the constructor argument).
        self.exploration_rate = exploration_rate
        self.exploration_decay = 0.01  # Shift from exploration to exploitation

        self.action_list = [GameAction.MOVELEFT, GameAction.MOVERIGHT, GameAction.ATTACK]

        # q_dict keeps the estimated reward of each action for each state key.
        # The key is "agent_0_pos,agent_0_health,agent_1_pos,agent_1_health";
        # the value is a list of len(self.action_list) action values.
        self.q_dict = {"init": [0] * len(self.action_list)}

    def _q_values(self, state_key):
        """Return the action-value list for ``state_key``, creating it (all zeros) if missing."""
        if state_key not in self.q_dict:
            self.q_dict[state_key] = [0] * len(self.action_list)
        return self.q_dict[state_key]

    def get_next_action(self, game_state):
        """Pick the next action: random with probability ``exploration_rate``, else greedy."""
        self._q_values(str(game_state))  # always init the state if not seen yet

        if random.random() > self.exploration_rate:  # exploit (greedy)
            return self.greedy_action(game_state)
        return self.random_action()  # explore (gamble)

    def greedy_action(self, game_state):
        """Return an action with the highest Q-value for this state.

        Ties are broken uniformly at random among the best actions.
        """
        q_values = self._q_values(str(game_state))

        best_value = max(q_values)
        best_indices = [i for i, value in enumerate(q_values) if value == best_value]

        # Choose one of the argmax indices themselves; indexing action_list
        # with a position inside best_indices would select the wrong action.
        return self.action_list[random.choice(best_indices)]

    def random_action(self):
        """Return an action drawn uniformly from ``action_list``."""
        return self.action_list[random.randrange(len(self.action_list))]

    def update(self, cur_state, new_state, action, reward):
        """Apply one Q-learning update for (cur_state, action) and decay exploration.

        Args:
            cur_state: state in which ``action`` was taken.
            new_state: state observed after the action.
            action: the action taken; must be a member of ``action_list``.
            reward: reward received for the transition.
        """
        action_idx = self.action_list.index(action)

        # Unseen states (current or future) start with all-zero values.
        cur_values = self._q_values(str(cur_state))
        cur_value = cur_values[action_idx]

        # What would be our best next action?
        future_reward = max(self._q_values(str(new_state)))

        # Main Q-table updating algorithm
        cur_values[action_idx] = cur_value + self.learning_rate * (
            reward + self.discount * future_reward - cur_value
        )

        # Finally shift our exploration_rate toward zero (less gambling)
        if self.exploration_rate > 0:
            self.exploration_rate *= (1 - self.exploration_decay)

In [55]:
import random
import json
import time
import copy

# Run configuration
SEED = 42           # fixed seed so Restart & Run All reproduces the same games
iterations = 10000  # number of simulation steps (not games) to run

random.seed(SEED)

# setup simulation
master = GameSimulator()
master.reset()
total_reward = 0  # Score keeping

# setup agents
agent_0 = AgentClass1()
agent_1 = AgentClass1()

agent_0_win = 0
agent_1_win = 0

# main loop
for step in range(iterations):
    # Snapshot the state: the simulator mutates its state object in place.
    old_state = copy.deepcopy(master.state)

    # Both agents choose from the same observed state.
    action_0 = agent_0.get_next_action(old_state)
    action_1 = agent_1.get_next_action(old_state)

    # Take action, get new state and reward
    new_state, reward_0, reward_1 = master.take_action(action_0, action_1)

    agent_0.update(old_state, new_state, action_0, reward_0)
    agent_1.update(old_state, new_state, action_1, reward_1)

    total_reward += max(reward_0, reward_1)  # Keep score

    winner = master.is_finished()

    if winner >= 0:  # game is finished with winner
        if winner == 0:
            agent_0_win += 1
        if winner == 1:
            agent_1_win += 1

        # reset to start a new game; new_state still refers to the finished
        # game's state object, so the log line below shows the final state
        master.reset()

    # Agent 1's action is logged under "a1", mirroring "a0" for agent 0.
    print(json.dumps({'i': step, "o_state": str(old_state), "n_state": str(new_state), "a0": action_0, "a1": action_1, 't_re': total_reward, '0_win': agent_0_win, '1_win': agent_1_win}))

    time.sleep(0.0001)  # Avoid spamming stdout too fast!




{"i": 0, "o_state": "5,3,7,3", "n_state": "4,3,7,3", "a0": 1, "1": 3, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 1, "o_state": "4,3,7,3", "n_state": "4,3,8,3", "a0": 3, "1": 2, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 2, "o_state": "4,3,8,3", "n_state": "5,3,8,3", "a0": 2, "1": 3, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 3, "o_state": "5,3,8,3", "n_state": "6,3,7,3", "a0": 2, "1": 1, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 4, "o_state": "6,3,7,3", "n_state": "5,3,8,3", "a0": 1, "1": 2, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 5, "o_state": "5,3,8,3", "n_state": "6,3,9,3", "a0": 2, "1": 2, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 6, "o_state": "6,3,9,3", "n_state": "7,3,10,3", "a0": 2, "1": 2, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 7, "o_state": "7,3,10,3", "n_state": "6,3,9,3", "a0": 1, "1": 1, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 8, "o_state": "6,3,9,3", "n_state": "7,3,9,3", "a0": 2, "1": 3, "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 9, "o_state": "7,3,9,3", "n_state": "8,2,9,3", 

In [56]:
# Dump every state for which the selected agent has at least one positive action value
agent = agent_0

for state_key, action_values in agent.q_dict.items():
    if max(action_values) <= 0:
        continue
    print(json.dumps({"state": state_key, "action_reward": action_values}))




{"state": "5,3,7,3", "action_reward": [0.0009025000000000002, 0.0, 0.1]}
{"state": "4,3,7,3", "action_reward": [0, 0.009500000000000001, 0.0]}
{"state": "2,3,4,2", "action_reward": [0, 0, 0.1]}
{"state": "3,2,4,1", "action_reward": [0, 0, 10.0]}
{"state": "6,3,8,3", "action_reward": [0, 0, 0.1]}
{"state": "6,3,7,2", "action_reward": [0, 0, 0.1]}
{"state": "6,2,7,1", "action_reward": [0, 0, 10.0]}
{"state": "13,2,14,3", "action_reward": [0, 0.0, 0.1]}
{"state": "4,3,6,3", "action_reward": [0.0, 0.009500000000000001, 0.0]}
{"state": "2,3,4,3", "action_reward": [0, 0, 0.1]}
{"state": "14,3,16,3", "action_reward": [0.0, 0, 0.1]}
{"state": "13,3,15,2", "action_reward": [0, 0, 0.1]}
{"state": "8,3,10,1", "action_reward": [0, 0, 10.0]}
