In [52]:
import random


class AgentState:
    """Per-agent state tracked during a game.

    Attributes:
        health: remaining health of the agent. The simulator in this notebook
            treats an agent whose health is <= 0 as defeated.
    """

    def __init__(self, health):
        # Health is the only per-agent value stored here; the distance between
        # the two agents is kept on the game state, not on each agent.
        self.health = health

class GameState:
    """Complete state of a game at one point in time, visible to both agents.

    Attributes:
        agents: two-element list ``[agent_0_state, agent_1_state]``.
        agents_distance: current distance between the two agents.
    """

    def __init__(self, agent_0_state, agent_1_state, agents_distance):
        self.agents_distance = agents_distance
        self.agents = [agent_0_state, agent_1_state]

    def __str__(self):
        """Return ``"<distance>,<agent_0 health>,<agent_1 health>"``."""
        fields = (
            self.agents_distance,
            self.agents[0].health,
            self.agents[1].health,
        )
        return ",".join(str(field) for field in fields)

class GameAction:
    """Names of the actions an agent can choose from on each turn."""

    # Step towards the opponent.
    MOVETOWARDS = "MOVETOWARDS"
    # Step away from the opponent.
    MOVEAWAY = "MOVEAWAY"
    # Try to hit the opponent.
    ATTACK = "ATTACK"
        
class GameSimulator:
    """Simulator for a two-agent duel on a line.

    The simulator owns the current ``GameState`` (distance between the agents
    plus each agent's health), applies the actions both agents chose for a
    round, and decides the reward each agent receives.

    The rule attributes set in ``__init__`` are plain attributes, so callers
    may change them (for example ``init_agents_distance``) and call
    ``reset()`` to start a game with the new rules.
    """

    def __init__(self):
        # Game rules.
        self.init_health = 3            # health each agent starts a game with
        self.miss_rate = 0.1            # probability that an attack misses
        self.move_speed = 1             # distance change per move action
        self.attack_power = 1           # health removed by a successful hit
        self.attack_range = 1           # attacks only count at distance <= this
        self.final_award = 100          # reward for bringing the opponent to <= 0 health
        self.init_agents_distance = 2   # distance at the start of a game
        self.max_agents_distance = 5    # agents cannot be further apart than this

        # Build the initial state through reset() so a new game is created in
        # exactly one place.
        self.state = None
        self.reset()

    def move_agent(self, agent_idx, agent_action):
        """Apply the movement part of ``agent_action`` for one agent.

        Args:
            agent_idx: index (0 or 1) of the acting agent.
            agent_action: one of the ``GameAction`` values; anything other
                than MOVETOWARDS / MOVEAWAY leaves the distance unchanged.

        Returns:
            None. ``self.state.agents_distance`` is updated in place.
        """
        # no health no action
        if self.state.agents[agent_idx].health <= 0:
            return

        distance = self.state.agents_distance
        if agent_action == GameAction.MOVETOWARDS:
            # cannot run into each other: the distance never drops below 1
            self.state.agents_distance = max(1, distance - self.move_speed)
        elif agent_action == GameAction.MOVEAWAY:
            # cannot be more than max_agents_distance apart
            self.state.agents_distance = min(distance + self.move_speed, self.max_agents_distance)

        # [Q] should we reward for being closer to the opponent ?

    def attack_agent(self, agent_idx, agent_action):
        """Apply the attack part of ``agent_action`` for one agent.

        Args:
            agent_idx: index (0 or 1) of the acting agent.
            agent_action: one of the ``GameAction`` values; only ATTACK has an
                effect here.

        Returns:
            The attacker's gain for this round, always a number:
            * 0 when the attacker has no health, did not attack, or is out of
              range (previously the "no health" case returned ``None``);
            * ``hit * attack_power`` (0 on a miss) when the opponent still has
              positive health after the attack;
            * ``final_award`` when the opponent's health is <= 0 after the
              attack.
        """
        # no health no action -- return 0 like every other "nothing happened" path
        if self.state.agents[agent_idx].health <= 0:
            return 0

        # no attack action return 0
        if agent_action != GameAction.ATTACK:
            return 0

        # There is a chance to miss the hit. The roll is made for every ATTACK,
        # before the range check, exactly as before.
        hit = 0 if random.random() < self.miss_rate else 1

        # attack is only valid if the two agents are within attack range
        if self.state.agents_distance > self.attack_range:
            return 0

        opponent = self.state.agents[(agent_idx + 1) % 2]
        opponent.health = opponent.health - hit * self.attack_power

        # if the opponent still has positive health, reward by attack power
        # if the opponent has zero or negative health, reward by final reward
        if opponent.health > 0:
            return hit * self.attack_power
        return self.final_award

    # consider the action of both agent_0 and agent_1, decide the reward to each
    def take_action(self, action_0, action_1):
        """Play one round with the given actions of agent 0 and agent 1.

        All movement is resolved first, then all attacks. Within each phase
        the agent that acts first is chosen by a fair coin flip.

        Returns:
            ``(state, reward_0, reward_1)`` where ``state`` is ``self.state``
            itself (mutated in place, not a copy).
        """
        actions = (action_0, action_1)

        # first make the move update
        first_mover = 0 if random.random() < 0.5 else 1
        for idx in (first_mover, 1 - first_mover):
            self.move_agent(idx, actions[idx])

        # then make the attack update
        gains = [0, 0]
        first_attacker = 0 if random.random() < 0.5 else 1
        for idx in (first_attacker, 1 - first_attacker):
            gains[idx] = self.attack_agent(idx, actions[idx])

        # if the agent still has positive health, reward is its own attack result
        # otherwise, the reward is the negated gain of the opponent
        reward_0 = gains[0] if self.state.agents[0].health > 0 else -1 * gains[1]
        reward_1 = gains[1] if self.state.agents[1].health > 0 else -1 * gains[0]

        return self.state, reward_0, reward_1

    def reset(self):
        """Start a new game from the current rule attributes and return its state."""
        agent_0 = AgentState(self.init_health)
        agent_1 = AgentState(self.init_health)
        self.state = GameState(agent_0, agent_1, self.init_agents_distance)

        return self.state

    def is_finished(self):
        """Return the winner's index, or -1 while the game is still running.

        The game is finished once an agent's health is <= 0; the winner is the
        other agent. Agent 0's health is checked first.
        """
        if self.state.agents[0].health <= 0:
            return 1

        if self.state.agents[1].health <= 0:
            return 0

        return -1
        

In [76]:
import json
import random

import numpy as np

class AgentClass1:
    """Tabular Q-learning agent with an exploration rate that decays over time.

    Q-values live in ``q_dict``: the key is ``str(game_state)``
    ("agents_distance,agent_0_health,agent_1_health" for ``GameState``) and the
    value is a list with one entry per action in ``action_list``.

    Args:
        learning_rate: how much a new estimate outweighs the current Q-value.
        discount: how much future reward is valued relative to immediate reward.
        exploration_rate: initial probability of picking a random action.
        exploration_decay: fraction by which the exploration rate shrinks on
            every ``update()`` call.
    """

    def __init__(self, learning_rate=0.2, discount=0.95, exploration_rate=1.0, exploration_decay=0.01):
        self.learning_rate = learning_rate
        self.discount = discount
        self.init_exploration_rate = exploration_rate  # restored by reset()
        self.exploration_rate = self.init_exploration_rate
        self.exploration_decay = exploration_decay  # shift from exploration to exploitation

        self.action_list = [GameAction.MOVETOWARDS, GameAction.MOVEAWAY, GameAction.ATTACK]

        # The "init" placeholder entry is present from construction (and thus
        # also in dumped tables); real states are added lazily on first use.
        self.q_dict = {"init": [0] * len(self.action_list)}

    def _q_values(self, state_key):
        """Return the Q-value list for ``state_key``, creating a zeroed one if missing."""
        return self.q_dict.setdefault(state_key, [0] * len(self.action_list))

    def get_next_action(self, game_state):
        """Choose the next action for ``game_state``.

        A random action is returned with probability ``exploration_rate``;
        otherwise the greedy action is returned. The state is added to the
        Q-table if it has not been seen before.
        """
        self._q_values(str(game_state))  # always init the state if not init yet

        if random.random() > self.exploration_rate:  # exploit (greedy)
            return self.greedy_action(game_state)
        return self.random_action()  # explore (gamble)

    def greedy_action(self, game_state):
        """Return an action with the highest Q-value; ties are broken at random.

        Unlike before, an unseen state no longer raises ``KeyError``: it is
        initialised with zeros, so any action may be returned for it.
        """
        q_values = self._q_values(str(game_state))
        best_value = max(q_values)
        best_actions = [action for action, value in zip(self.action_list, q_values) if value == best_value]
        return random.choice(best_actions)

    def random_action(self):
        """Return an action drawn uniformly from ``action_list``."""
        return random.choice(self.action_list)

    def update(self, cur_state, new_state, action, reward):
        """Apply one Q-learning update for the transition that was just observed.

        Args:
            cur_state: state in which ``action`` was taken.
            new_state: state reached afterwards.
            action: the action taken; must be a member of ``action_list``.
            reward: reward received for the transition.

        Raises:
            ValueError: if ``action`` is not in ``action_list``.
        """
        action_idx = self.action_list.index(action)

        # Both states are initialised on demand, so update() also works for a
        # state that never went through get_next_action() (used to be a KeyError).
        cur_values = self._q_values(str(cur_state))
        new_values = self._q_values(str(new_state))

        cur_value = cur_values[action_idx]
        future_reward = max(new_values)  # value of the best next action

        # Main Q-table updating algorithm
        cur_values[action_idx] = cur_value + self.learning_rate * (
            reward + self.discount * future_reward - cur_value
        )

        # Finally shift our exploration_rate toward zero (less gambling)
        if self.exploration_rate > 0:
            self.exploration_rate *= (1 - self.exploration_decay)

    def reset(self):
        """Restore the exploration rate to its initial value; the Q-table is kept."""
        self.exploration_rate = self.init_exploration_rate

    def dump_qtable(self, file_name):
        """Write the Q-table to ``file_name`` as JSON."""
        with open(file_name, 'w') as outfile:
            json.dump(self.q_dict, outfile)

    def load_qtable(self, file_name):
        """Replace the Q-table with the JSON content of ``file_name``."""
        with open(file_name) as json_file:
            self.q_dict = json.load(json_file)

In [54]:
import random
import json
import time
import copy

# ---- Training configuration -------------------------------------------------
iterations = 10000

# ---- Simulator ---------------------------------------------------------------
master = GameSimulator()
master.reset()
total_reward = 0  # running sum of the larger of the two per-round rewards

# ---- Learners: each side keeps its own, independent Q-table ------------------
agent_0 = AgentClass1()
agent_1 = AgentClass1()

agent_0_win = 0
agent_1_win = 0

# ---- Main loop: one simulator round per iteration ----------------------------
for step in range(iterations):
    # Snapshot the state first: take_action() mutates master.state in place.
    old_state = copy.deepcopy(master.state)

    # Both agents decide from the same snapshot; agent_0 is queried first.
    action_0 = agent_0.get_next_action(old_state)
    action_1 = agent_1.get_next_action(old_state)

    # Play the round, then let both agents learn from the transition.
    new_state, reward_0, reward_1 = master.take_action(action_0, action_1)

    agent_0.update(old_state, new_state, action_0, reward_0)
    agent_1.update(old_state, new_state, action_1, reward_1)

    total_reward += max(reward_0, reward_1)

    # is_finished() yields the winner's index, or -1 while the game goes on.
    winner = master.is_finished()
    if winner == 0:
        agent_0_win += 1
    elif winner == 1:
        agent_1_win += 1

    if winner >= 0:
        # Game over: start a new game. Agent.reset() restores the initial
        # exploration rate; the learned Q-tables are kept.
        master.reset()
        agent_0.reset()
        agent_1.reset()

    # Progress line every 100 steps and on the very last step.
    if step == iterations - 1 or not step % 100:
        print(json.dumps({
            'i': step,
            "o_state": str(old_state),
            "n_state": str(new_state),
            "a0": action_0,
            "a1": action_1,
            't_re': total_reward,
            '0_win': agent_0_win,
            '1_win': agent_1_win,
        }))

    time.sleep(0.0001)  # Avoid spamming stdout too fast!




{"i": 0, "o_state": "2,3,3", "n_state": "2,3,3", "a0": "MOVETOWARDS", "a1": "MOVEAWAY", "t_re": 0, "0_win": 0, "1_win": 0}
{"i": 100, "o_state": "2,3,2", "n_state": "2,3,2", "a0": "ATTACK", "a1": "ATTACK", "t_re": 309, "0_win": 0, "1_win": 3}
{"i": 200, "o_state": "4,3,3", "n_state": "3,3,3", "a0": "MOVETOWARDS", "a1": "ATTACK", "t_re": 514, "0_win": 1, "1_win": 4}
{"i": 300, "o_state": "5,2,1", "n_state": "5,2,1", "a0": "MOVEAWAY", "a1": "MOVEAWAY", "t_re": 723, "0_win": 1, "1_win": 6}
{"i": 400, "o_state": "4,3,3", "n_state": "5,3,3", "a0": "ATTACK", "a1": "MOVEAWAY", "t_re": 1134, "0_win": 2, "1_win": 9}
{"i": 500, "o_state": "1,3,2", "n_state": "1,3,2", "a0": "MOVETOWARDS", "a1": "MOVEAWAY", "t_re": 1443, "0_win": 4, "1_win": 10}
{"i": 600, "o_state": "1,1,3", "n_state": "1,1,3", "a0": "MOVEAWAY", "a1": "MOVETOWARDS", "t_re": 1852, "0_win": 7, "1_win": 11}
{"i": 700, "o_state": "3,3,1", "n_state": "3,3,1", "a0": "MOVETOWARDS", "a1": "MOVEAWAY", "t_re": 2365, "0_win": 7, "1_win": 16

In [62]:
# Inspect the learned Q-table of one agent: one JSON line per known state.
agent = agent_1

# Every state is printed, including those whose values are all zero.
for k, action_reward in agent.q_dict.items():
    print(json.dumps({"state": k, "action_reward": action_reward}))




{"state": "init", "action_reward": [0, 0, 0]}
{"state": "2,3,3", "action_reward": [59.192142687799844, 56.65228233812724, 65.44567293450208]}
{"state": "1,2,3", "action_reward": [69.15128893578579, 69.76665041855351, 78.59548100181057]}
{"state": "2,2,3", "action_reward": [69.25138849436537, 63.82757891767544, 74.24422879961443]}
{"state": "1,1,3", "action_reward": [82.99353329742051, 71.81920038223006, 88.15828459672574]}
{"state": "2,1,3", "action_reward": [79.88893356331212, 61.08745423177678, 74.9794432740392]}
{"state": "4,1,3", "action_reward": [60.243033819705346, 42.13384237451874, 39.46841279651556]}
{"state": "5,1,3", "action_reward": [54.58499325816181, 21.35297968364941, 29.35571625960204]}
{"state": "3,1,3", "action_reward": [76.19856113105766, 42.91057836084131, 59.174265295364116]}
{"state": "1,0,3", "action_reward": [0, 0, 0]}
{"state": "1,0,2", "action_reward": [0, 0, 0]}
{"state": "4,3,3", "action_reward": [59.98392509352531, 52.674083582439515, 54.949421056311586]}
{

In [69]:
import os

# Persist both learned Q-tables as JSON so the replay cells can load them later.
# dump_qtable() opens the file with open(..., 'w'), which fails when the target
# directory is missing, so make sure it exists first.
os.makedirs('agent_train_results', exist_ok=True)

agent_0.dump_qtable('agent_train_results/class1_f_class1_qtable_health3_distance2_agent0.txt')
agent_1.dump_qtable('agent_train_results/class1_f_class1_qtable_health3_distance2_agent1.txt')


{'1,0,1': [0, 0, 0],
 '1,0,2': [0, 0, 0],
 '1,0,3': [0, 0, 0],
 '1,1,0': [0, 0, 0],
 '1,1,1': [-2.275699636562667, 76.79297957747167, 60.0771497216827],
 '1,1,2': [80.84136151873172, 79.86561660330482, 88.0045305065819],
 '1,1,3': [81.53172710654489, 69.78442248335446, 89.57285545508955],
 '1,2,0': [0, 0, 0],
 '1,2,1': [-36.0215486258103, 44.08544553588937, -13.394185841663841],
 '1,2,2': [54.35594310190142, 64.48191627869619, 76.82315114299134],
 '1,2,3': [73.60243605674052, 65.62248410574226, 78.94397230451997],
 '1,3,0': [0, 0, 0],
 '1,3,1': [-47.746064101162986, 16.551592779218947, 2.5786318768098253],
 '1,3,2': [43.21917713840395, 50.69484712644265, 59.1992745039743],
 '1,3,3': [64.1073632276718, 62.9937394407941, 70.2950241393042],
 '2,1,1': [8.074868078032573, 71.32975533327567, 82.41861813153588],
 '2,1,2': [79.02272583752763, 73.38146271959127, 87.98928253993225],
 '2,1,3': [82.8058206414583, 67.11766764820308, 73.60182195090086],
 '2,2,1': [-37.19379482890021, 40.939458608712

In [None]:
import textwrap

def print_round(game_state, action_0, action_1):
    """Print one round as three left-aligned, 40-character wide columns.

    The columns are agent 0's action, agent 1's action and a small picture of
    the board: agent 0's health, one '.' per unit of distance beyond 1, then
    agent 1's health (e.g. ``3..3`` for distance 3).
    """
    health_0, health_1 = (str(agent.health) for agent in game_state.agents[:2])
    board = health_0 + '.' * (game_state.agents_distance - 1) + health_1

    columns = ("Agent0: " + action_0, "Agent1: " + action_1, board)
    print(" ".join('{:<40s}'.format(column) for column in columns))

# Rebuild both agents from the Q-tables saved after training.
# Setting exploration_rate to 0 makes get_next_action() take the greedy branch
# (it only explores when random.random() <= exploration_rate).
agent_0 = AgentClass1()
agent_0.load_qtable('agent_train_results/class1_f_class1_qtable_health3_distance2_agent0.txt')
agent_0.exploration_rate = 0

agent_1 = AgentClass1()
agent_1.load_qtable('agent_train_results/class1_f_class1_qtable_health3_distance2_agent1.txt')
agent_1.exploration_rate = 0


In [83]:

    
# Replay a single game between the loaded agents, starting at distance 3.
master.init_agents_distance = 3
master.reset()

print_round(master.state, "init", "init")

# At most 100 rounds; stop as soon as there is a winner.
for step in range(100):
    # Snapshot the state both agents decide on (take_action mutates it in place).
    old_state = copy.deepcopy(master.state)

    # agent_0 is queried before agent_1.
    action_0, action_1 = (
        agent_0.get_next_action(old_state),
        agent_1.get_next_action(old_state),
    )

    # No learning here: the rewards are received but not fed back to the agents.
    new_state, reward_0, reward_1 = master.take_action(action_0, action_1)

    print_round(new_state, action_0, action_1)

    # is_finished() returns -1 while the game is running, else the winner (0 or 1).
    if master.is_finished() != -1:
        break

Agent0: init                             Agent1: init                             3..3                                    
Agent0: MOVETOWARDS                      Agent1: MOVETOWARDS                      33                                      
Agent0: ATTACK                           Agent1: ATTACK                           23                                      
Agent0: ATTACK                           Agent1: ATTACK                           13                                      
Agent0: MOVEAWAY                         Agent1: ATTACK                           1.3                                     
Agent0: ATTACK                           Agent1: MOVETOWARDS                      12                                      
Agent0: MOVEAWAY                         Agent1: ATTACK                           1.2                                     
Agent0: ATTACK                           Agent1: ATTACK                           1.2                                     
Agent0: ATTACK  

In [85]:
# Replay another game with the same setup (start distance 3, greedy agents).
# NOTE(review): this cell repeats the previous replay cell; outcomes differ
# only through the simulator's random rolls and random tie-breaking.
master.init_agents_distance = 3
master.reset()

print_round(master.state, "init", "init")

# At most 100 rounds; stop as soon as there is a winner.
for step in range(100):
    # Snapshot the state both agents decide on (take_action mutates it in place).
    old_state = copy.deepcopy(master.state)

    # agent_0 is queried before agent_1.
    action_0, action_1 = (
        agent_0.get_next_action(old_state),
        agent_1.get_next_action(old_state),
    )

    # No learning here: the rewards are received but not fed back to the agents.
    new_state, reward_0, reward_1 = master.take_action(action_0, action_1)

    print_round(new_state, action_0, action_1)

    # is_finished() returns -1 while the game is running, else the winner (0 or 1).
    if master.is_finished() != -1:
        break

Agent0: init                             Agent1: init                             3..3                                    
Agent0: MOVETOWARDS                      Agent1: MOVETOWARDS                      33                                      
Agent0: ATTACK                           Agent1: ATTACK                           22                                      
Agent0: ATTACK                           Agent1: ATTACK                           11                                      
Agent0: ATTACK                           Agent1: MOVEAWAY                         1.1                                     
Agent0: ATTACK                           Agent1: ATTACK                           1.1                                     
Agent0: ATTACK                           Agent1: ATTACK                           1.1                                     
Agent0: ATTACK                           Agent1: ATTACK                           1.1                                     
Agent0: ATTACK  