In [1]:
import numpy as np
from random import random
import tensorflow as tf

In [2]:
# q_table = np.zeros([3**9,9])

def gen_q_table():
    num_states = 3**9
    lookup_dict = {}
    for one in range(3):
        for two in range(3):
            for three in range(3):
                for four in range(3):
                    for five in range(3):
                        for six in range(3):
                            for seven in range(3):
                                for eight in range(3):
                                    for nine in range(3):
                                        state = str(one) + str(two) + str(three) + str(four) + str(five) + str(six) + str(seven) + str(eight) + str(nine)
                                        lookup_dict[state] = [0 for i in range(9)]
    return lookup_dict 

In [3]:
num_games = 100
learning_rate = .1
discount_factor = .9

In [4]:
def find_available_moves(state):
    available_moves = []
    for i in range(9):
        if state[i] == 0:
            available_moves.append(i)
            
    return available_moves

In [5]:
# 0 = Empty
# 1 = Player placed X
# 2 = Opponent placed O
def random_agent(state, player):
    available_moves = find_available_moves(state)
    move = available_moves[int(random()*len(available_moves))]
    next_state = state
    next_state[move] = player
    
    return move, next_state, False

In [6]:
def always_block_and_solve_agent(state, player):
    next_state = state
    if state == [0 for i in range(9)]:
        move = 4
        next_state[move] = player
        
        return move, next_state, False
    
    if player == 1:
        opponent = 2
    else:
        opponent = 1
        
    # First determine if agent can solve
    for a,b,c in [(0,1,2), (3,4,5), (6,7,8),
          (0,3,6), (1,4,7), (2,5,8),
          (0,4,8), (2,4,6)]:
        if state[a] == state[b] == player or state[a] == state[b] == player or state[b] == state[c] == player or state[a] == state[c] == player:
            if min(state[a], state[b], state[c]) == 0:
                move_idx = [state[a],state[b],state[c]].index(0)
                move = [a,b,c][move_idx]
                next_state[move] = player

                return move, next_state, False
            
    # If can't solve, determine if agent can block
        # First determine if agent can solve
    for a,b,c in [(0,1,2), (3,4,5), (6,7,8),
          (0,3,6), (1,4,7), (2,5,8),
          (0,4,8), (2,4,6)]:
        if state[a] == state[b] == opponent or state[a] == state[b] == opponent or state[b] == state[c] == opponent or state[a] == state[c] == opponent:
            if min(state[a], state[b], state[c]) == 0:
                move_idx = [state[a],state[b],state[c]].index(0)
                move = [a,b,c][move_idx]
                next_state[move] = player

                return move, next_state, False
            
    # If can't block or win, become random_agent
    move, next_state, illegal = random_agent(state, player)
            
    return move, next_state, False

In [7]:
def player_move(state, value_function, player):
    illegal = False
#     available_moves = []
#     for i in range(9):
#         if state[i] == 0:
#             available_moves.append(i)
            
#     max_action_vals = [value_function[val] for val in range(len(value_function)) if val in available_moves]
#     max_action = max(max_action_vals)
#     max_action_idx = max_action_vals.index(max_action)
#     move = available_moves[max_action_idx]
    
    max_action = max(value_function)
    max_action_idx = value_function.index(max_action)
    move = max_action_idx
    
    if state[move] != 0:
        illegal = True
        return move, state, illegal
    else:
        next_state = state
        next_state[move] = player

        return move, next_state, illegal

In [8]:
def evaluate_game(state, illegal, player_val):
    reward = 0
    if illegal:
        reward = -10
        return True, reward

    if player_val == 1:
        opponent_val = 2
    else:
        opponent_val = 1
        
    for a,b,c in [(0,1,2), (3,4,5), (6,7,8),
              (0,3,6), (1,4,7), (2,5,8),
              (0,4,8), (2,4,6)]:
        if state[a] == state[b] == state[c] == player_val:
            reward = 1
            return True, reward
        if state[a] == state[b] == state[c] == opponent_val:
            reward = -2
            return True, reward
    if min(state) != 0:
        reward = 0
        return True, reward
    
    return False, reward

In [9]:
def print_game(state):
    print('-------')
    print('|' + str(state[0]) + '|' + str(state[1]) + '|' + str(state[2]) + '|')
    print('|' + str(state[3]) + '|' + str(state[4]) + '|' + str(state[5]) + '|')
    print('|' + str(state[6]) + '|' + str(state[7]) + '|' + str(state[8]) + '|')
    print('-------')

In [10]:
def train(num_games, learning_rate, discount_factor, explore_factor, self_play_prob=.5, q_table=gen_q_table(), print_results=False, agent_name=random_agent):
    agent = lambda agent_name, state, player: agent_name(state, player)
    
    for i in range(num_games):
        state = [0 for j in range(9)]
        game_over = False
        first_move = random() <= .5
        play_self = random() <= self_play_prob

        if not first_move:
            if play_self:
                state_id = ''.join([str(i) for i in state])
                _, state, illegal = player_move(state, q_table[state_id], 2)
            else:
                _, state, illegal = agent(agent_name, state, 2)
                
            if illegal:
                break
                
        while not game_over:
            state_id = ''.join([str(j) for j in state])
                
            # Currently always goes first (fix)
            explore = random() <= explore_factor
            
            if explore:
                q_move, q_state, illegal = random_agent(state, 1)
            else:
                q_move, q_state, illegal = player_move(state, q_table[state_id], 1) 
            game_over, reward = evaluate_game(q_state, illegal, 1)
            if game_over == True:
                if print_results:
                    if reward == 1:
                        print("Player wins")
                    else:
                        print("Cat Game")
                    print_game(q_state)
                q_table[state_id][q_move] = reward
                
                break
        
            if print_results:
                print_game(q_state)
                
            if play_self:
                if print_results:
                    print("Playing Self")
                q_state_id = ''.join([str(s) for s in q_state])
                opponent_move, new_state, illegal = player_move(state, q_table[q_state_id], 2)
            else:
                if print_results:
                    print("Playing Opponent")
                opponent_move, new_state, illegal = agent(agent_name, q_state, 2) 
            game_over, reward = evaluate_game(q_state, illegal, 1)
            if game_over == True:
                if print_results:
                    if reward < 0:
                        print("Opponent Wins")
                    else:
                        print("Cat Game")
                    print_game(new_state)
                
                q_table[state_id][q_move] = reward
                
                break

            # Update q table
            new_state_id = ''.join([str(s) for s in new_state])
            max_next = max(q_table[new_state_id])
            
            q_table_update = learning_rate*(discount_factor * max_next - q_table[state_id][q_move])
            if print_results:
                print(q_table_update)
            q_table[state_id][q_move] += q_table_update

            if game_over == True:
                break

            state = new_state
            if print_results:
                print_game(state)

        if i % (num_games // 10) == 0:
            explore_factor = explore_factor * (num_games - i) / num_games
            print("Num iteration: %d, Explore Factor: %f" % (i, explore_factor))
            print("Number games lost: %d\nAverage Reward: %f" % evaluate_against_block_and_solve(q_table,1000))
#             print("Number games lost: %d\nAverage Reward: %f" % evaluate_against_random(q_table,1000))
            
    return q_table

In [11]:
def q_play(q_table, state):
    state_id = ''.join([str(i) for i in state])
    value_function = q_table[state_id]
    available_moves = []
    for i in range(9):
        if state[i] == 0:
            available_moves.append(i)
            
    max_action_vals = [value_function[val] for val in range(len(value_function)) if val in available_moves]
    max_action = max(max_action_vals)
    max_action_idx = max_action_vals.index(max_action)
    move = available_moves[max_action_idx]
    
    next_state = state
    next_state[move] = 1
    
    return next_state

In [12]:
def play_random_agent(q_table, print_results=True):
    state = [0 for j in range(9)]
    for j in range(6):
        state_id = ''.join([str(i) for i in state])

        q_move, q_state, illegal = player_move(state, q_table[state_id], 1) 
        game_over, reward = evaluate_game(q_state, illegal, 1)

        if game_over == True:
            if print_results:
                if reward == 1:
                    print("Player wins")
                else:
                    print("Cat Game")
                print_game(q_state)
            q_table[state_id][q_move] = reward

            return reward

        random_move, new_state, illegal = random_agent(q_state, 2) 
        game_over, reward = evaluate_game(q_state, illegal, 1)
        if game_over == True:
            if print_results:
                if reward < 0:
                    print("Opponent Wins")
                else:
                    print("Cat Game")
                print_game(new_state)

            q_table[state_id][q_move] = reward

            return reward
        
        state = new_state
        if print_results:
            print_game(state)

In [13]:
def play_block_and_solve(q_table, print_results=True):
    state = [0 for j in range(9)]
    first_move = random() <= .5

    if not first_move:
        if print_results:
            print("Opponent first move")
            
        _, state, illegal = always_block_and_solve_agent(state, 2)
        
    else:
        if print_results:
            print("Player first")
                
    for j in range(6):
        state_id = ''.join([str(i) for i in state])

        q_move, q_state, illegal = player_move(state, q_table[state_id], 1) 
        game_over, reward = evaluate_game(q_state, illegal, 1)
        if game_over == True:
            if print_results:
                if reward == 1:
                    print("Player wins")
                else:
                    print("Cat Game")
                print_game(q_state)
            q_table[state_id][q_move] = reward

            return reward
        
        if print_results:
            print_game(q_state)

        random_move, new_state, illegal = always_block_and_solve_agent(q_state, 2) 
        game_over, reward = evaluate_game(q_state, illegal,1)
        if game_over == True:
            if print_results:
                if reward < 0:
                    print("Opponent Wins")
                else:
                    print("Cat Game")
                print_game(new_state)

            q_table[state_id][q_move] = reward

            return reward
        
        state = new_state
        if print_results:
            print_game(state)

In [14]:
def evaluate_against_random(q_table, num_loops):
    num_lost = 0
    average_reward = 0
    for _ in range(num_loops):
        reward = play_random_agent(q_table,print_results=False)
        if reward < 0:
            num_lost += 1
        average_reward += reward / num_loops
        
    return num_lost, average_reward

In [15]:
def evaluate_against_block_and_solve(q_table, num_loops):
    num_lost = 0
    average_reward = 0
    for _ in range(num_loops):
        reward = play_block_and_solve(q_table,print_results=False)
        if reward < 0:
            num_lost += 1
        average_reward += reward / num_loops
        
    return num_lost, average_reward

In [50]:
def train_player():
    q_table = train(200000, .1, .2, 1,self_play_prob=0,agent_name=random_agent)
    q_table = train(200000, .1, .99, 1,self_play_prob=0,q_table=q_table,agent_name=always_block_and_solve_agent)
    q_table = train(400000, .1, .2, .5,self_play_prob=0,q_table=q_table,agent_name=random_agent)
    q_table = train(400000, .1, .99, .5,self_play_prob=0,q_table=q_table,agent_name=always_block_and_solve_agent)
    
    return q_table

In [16]:
move_freqs = {key: 0 for key in range(2)}
num_iters = 100000
for i in range(num_iters):
    move = int(random()*2)
    move_freqs[move] += 1
    

In [17]:
move_freqs

{0: 49989, 1: 50011}

In [18]:
test = [0 for i in range(9)]
print(''.join([str(i) for i in test]))
test

000000000


[0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
state = [1, 2, 1, 0, 0, 0, 1, 1, 1]
available_moves = []
for i in range(9):
    if state[i] == 0:
        available_moves.append(i)

In [20]:
available_moves

[3, 4, 5]

In [22]:
evaluate_game([2, 2, 1, 2, 0, 0, 2, 0, 0], False, 1)
# print_game([2, 2, 1, 2, 0, 0, 2, 0, 0])

(True, -2)

In [51]:
%%time
# q_table = train(1, .1, .9, 0,q_table,True)
q_table = train(400000, .1, .99, 1,self_play_prob=0,agent_name=always_block_and_solve_agent)
# q_table = train(10000, .1, .2, 1,self_play_prob=1,q_table=q_table)
# q_table = train(500000, .2, .3, 1, q_table=q_table, agent_name=random_agent)

Num iteration: 0, Explore Factor: 1.000000
Number games lost: 111
Average Reward: -0.078000
Num iteration: 40000, Explore Factor: 0.900000
Number games lost: 0
Average Reward: 0.420000
Num iteration: 80000, Explore Factor: 0.720000
Number games lost: 0
Average Reward: 0.477000
Num iteration: 120000, Explore Factor: 0.504000
Number games lost: 0
Average Reward: 0.510000
Num iteration: 160000, Explore Factor: 0.302400
Number games lost: 0
Average Reward: 0.503000
Num iteration: 200000, Explore Factor: 0.151200
Number games lost: 0
Average Reward: 0.493000
Num iteration: 240000, Explore Factor: 0.060480
Number games lost: 0
Average Reward: 0.498000
Num iteration: 280000, Explore Factor: 0.018144
Number games lost: 0
Average Reward: 0.503000
Num iteration: 320000, Explore Factor: 0.003629
Number games lost: 0
Average Reward: 0.470000
Num iteration: 360000, Explore Factor: 0.000363
Number games lost: 0
Average Reward: 0.489000
CPU times: user 31.2 s, sys: 77.4 ms, total: 31.2 s
Wall time: 3

In [52]:
%%time
q_table = train_player()

Num iteration: 0, Explore Factor: 1.000000
Number games lost: 0
Average Reward: 0.470000
Num iteration: 20000, Explore Factor: 0.900000
Number games lost: 0
Average Reward: 0.462000
Num iteration: 40000, Explore Factor: 0.720000
Number games lost: 0
Average Reward: 0.472000
Num iteration: 60000, Explore Factor: 0.504000
Number games lost: 0
Average Reward: 0.285000
Num iteration: 80000, Explore Factor: 0.302400
Number games lost: 0
Average Reward: 0.390000
Num iteration: 100000, Explore Factor: 0.151200
Number games lost: 0
Average Reward: 0.476000
Num iteration: 120000, Explore Factor: 0.060480
Number games lost: 0
Average Reward: 0.224000
Num iteration: 140000, Explore Factor: 0.018144
Number games lost: 0
Average Reward: 0.255000
Num iteration: 160000, Explore Factor: 0.003629
Number games lost: 0
Average Reward: 0.378000
Num iteration: 180000, Explore Factor: 0.000363
Number games lost: 0
Average Reward: 0.276000
Num iteration: 0, Explore Factor: 1.000000
Number games lost: 0
Avera

In [24]:
state = [0 for i in range(9)]
# new_state = q_play(q_table, state)
_, new_state = always_block_and_solve_agent(state, 1)
new_state

ValueError: too many values to unpack

In [25]:
state = [1, 1, 0, 1, 2, 0, 0, 0, 2]
print(convert_state(state))
print_game(state)
state[6] = 2
# new_state = q_play(q_table, state)
_, new_state = always_block_and_solve_agent(state, 1)
print(new_state)
print_game(new_state)

NameError: name 'convert_state' is not defined

In [26]:
evaluate_game([1, 2, 1, 1, 2, 0, 0, 2, 1])

TypeError: evaluate_game() takes exactly 3 arguments (1 given)

In [26]:
list(q_table.keys())[1]

'200221020'

In [53]:
q_table['000000000']

[0.6585606013976042,
 0.6910349083309222,
 0.6443684584124908,
 0.6573302579538166,
 0.6850609599488371,
 0.6854595703193759,
 0.9555289349572427,
 0.6742865201224162,
 0.6565291132415879]

In [28]:
q_table['000000000'].index(max(q_table['000000000']))

2

In [47]:
reward = play_random_agent(q_table)

-------
|0|0|0|
|0|0|2|
|0|1|0|
-------
-------
|0|0|0|
|0|0|2|
|1|1|2|
-------
-------
|0|0|1|
|0|2|2|
|1|1|2|
-------
-------
|1|2|1|
|0|2|2|
|1|1|2|
-------
Player wins
-------
|1|2|1|
|1|2|2|
|1|1|2|
-------


In [30]:
print(evaluate_against_random(q_table,100))

(47, -4.050000000000011)


In [24]:
play_block_and_solve(q_table, print_results=True)

Player first
-------
|0|0|0|
|0|0|0|
|0|0|1|
-------
-------
|2|0|0|
|0|0|0|
|0|0|1|
-------
-------
|2|0|1|
|0|0|0|
|0|0|1|
-------
-------
|2|0|1|
|0|0|2|
|0|0|1|
-------
-------
|2|0|1|
|0|0|2|
|1|0|1|
-------
-------
|2|0|1|
|0|0|2|
|1|2|1|
-------
Player wins
-------
|2|0|1|
|0|1|2|
|1|2|1|
-------


1

In [49]:
num_games = 100000
# num_lost, reward = evaluate_against_block_and_solve(q_table,num_games)
num_lost, reward = evaluate_against_random(q_table,num_games)
print("Percent lost: %.5f, Average Reward: %.2f" % (num_lost / num_games, reward))

Percent lost: 0.00543, Average Reward: 0.97


In [33]:
class QNetwork:
    def __init__(self, learning_rate=0.01, state_size=18, 
                 action_size=9, hidden_size=10, 
                 name='QNetwork'):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            
            # One hot encode the actions to later choose the Q-value for the action
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            one_hot_actions = tf.one_hot(self.actions_, action_size)
            
            # Target Q values for training
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')
            
            # ReLU hidden layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size)
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size)

            # Linear output layer
            self.output = tf.contrib.layers.fully_connected(self.fc2, action_size, 
                                                            activation_fn=None)
            
            ### Train with loss (targetQ - Q)^2
            # output has length 2, for two actions. This lext line chooses
            # one value from output (per row) according to the one-hot encoded actions.
            self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [33]:
from collections import deque
class Memory():
    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]
    
train_episodes = 1000          # max number of episodes to learn from
max_steps = 200                # max steps in an episode
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # expotentional decay rate for exploration prob

# Network parameters
hidden_size = 64               # number of units in each Q-network hidden layer
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 10000            # memory capacity
batch_size = 20                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

In [None]:
# Initialize the simulation
env.reset()
# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())

memory = Memory(max_size=memory_size)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Make a random action
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    if done:
        # The simulation fails so no next state
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        
        # Start new episode
        env.reset()
        # Take one random step to get the pole and cart moving
        state, reward, done, _ = env.step(env.action_space.sample())
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        state = next_state

In [34]:
def convert_state(state):
    new_state = [0 for i in range(18)]
    
    for s in range(len(state)):
        if state[s] != 0:
            new_state_idx = int(9 * (state[s] - 1) + s)
            new_state[new_state_idx] = 1
    return new_state

In [35]:
train_episodes = 100000          # max number of episodes to learn from
gamma = 0.9                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # expotentional decay rate for exploration prob

# Network parameters
hidden_size = 64               # number of units in each Q-network hidden layer
learning_rate = 0.01         # Q-network learning rate
player_val = 1
agent_name = always_block_and_solve_agent

In [36]:
tf.reset_default_graph()
mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)

In [37]:
def step_game(state, action, player_val, agent=random_agent):
    if state[action] != 0:
        illegal = True
    else:
        illegal = False
    
    if player_val == 1:
        opponent_val = 2
    else:
        opponent_val = 1
        
    q_state = state
    q_state[action] = player_val
    
    game_over, reward = evaluate_game(state, illegal, player_val)
    
    if game_over:
        return state, game_over, reward
    
    _, new_state, illegal = agent_name(state, opponent_val)
    
    game_over, reward = evaluate_game(state, illegal, player_val)
    
    return new_state, game_over, reward

In [38]:
def initialize_game(player_val, agent=random_agent):
    state = [0 for s in range(9)]
    if np.random.rand() <= .5:
        return state
    else:
        if player_val == 1:
            opponent_val = 2
        else:
            opponent_val = 1
            
        _, state, _ = agent(state, opponent_val)
    
        return state

In [47]:
# Now train with experiences
saver = tf.train.Saver()
rewards_list = []
loss_list = []
with tf.Session() as sess:
    # Initialize variables
    sess.run(tf.global_variables_initializer())
    
    step = 0
    average_reward = 0
    for ep in range(1, train_episodes):
        total_reward = 0
        state = initialize_game(player_val, agent=agent_name)
#         print("Initial State")
#         print(state)
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        while not done:
            # Explore or Exploit
            explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*(ep/10)) 
#             explore_p = 0
            if explore_p > np.random.rand():
#                 # Make a random action
#                 print("Random action")
                available_moves = find_available_moves(state)
                action = available_moves[int(random()*len(available_moves))]
#                 print("After random action")
#                 print(state)
            else:
                # Get action from Q-network
                
#                 q_move, q_state, illegal = player_move(state, q_table[state_id], 1) 
                converted_state = np.asarray(convert_state(state))
                feed = {mainQN.inputs_: converted_state.reshape(1,*converted_state.shape)}
                Qs = sess.run(mainQN.output, feed_dict=feed)
#                 print("Qs")
#                 print(Qs)
                available_moves = find_available_moves(state)
    
                max_action_vals = [state[val] for val in range(len(state)) if val in available_moves]
                max_action = max(max_action_vals)
                max_action_idx = max_action_vals.index(max_action)
                action = available_moves[max_action_idx]
#                 print("Available_moves")
#                 print(available_moves)
#                 move_vals = [s for s in state if s in available_moves]
#                 print("State")
#                 print(state)
#                 print("Move Vals")
#                 print(move_vals)
#                 action = np.argmax(move_vals)
#                 print(action)
            actions.append(action)
            
            # Take action, get new state and reward
            
#             print("state")
#             print_game(state)
#             print("action")
#             print(action)
            next_state, done, reward = step_game(state, action, player_val, agent=agent_name)
#             print("Next state")
#             print_game(next_state)
            total_reward += reward
            average_reward += reward/train_episodes
            
            if done:
                # the episode ends so no next state
#                 next_state = np.zeros(9)
#                 next_state = [-1 for z in range(9)]
                
                if ep % (train_episodes // 10) == 0:
                    print('Episode: {}'.format(ep),
                          'Average reward: {}'.format(np.mean(rewards_list)),
#                           'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(explore_p))
                    rewards_list = []
                    
                rewards_list.append(reward)

            else:
                state = next_state
            next_states.append(convert_state(next_state))
            states.append(convert_state(state))
            rewards.append(reward)
            
#             # Sample mini-batch from memory
#             batch = memory.sample(batch_size)
#             states = np.array([each[0] for each in batch])
#             actions = np.array([each[1] for each in batch])
#             rewards = np.array([each[2] for each in batch])
#             next_states = np.array([each[3] for each in batch])
            
#             # Train network
        target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})

#             episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
#         print("Target Qs")
#         print(target_Qs)
#             target_Qs[episode_ends] = (0, 0)
#         print("states")
#         print(states)
#         print("Outputs")
#         print(Qs)
#         print("Rewards")
#         print(rewards)
        targets = rewards + gamma * np.max(target_Qs, axis=1)
#         print("Targets")
#         print(targets)
#         print("Actions")
#         print(actions)
        loss, _ = sess.run([mainQN.loss, mainQN.opt],
                            feed_dict={mainQN.inputs_: states,
                                       mainQN.targetQs_: targets,
                                       mainQN.actions_: actions})
    
        loss_list.append(loss)
        if ep % (train_episodes // 10) == 0:
            print("Average Loss: %.2f" % np.mean(loss_list))
            loss_list = []
    print(average_reward)
    saver.save(sess, "checkpoints/tic-tac-toe.ckpt")

Episode: 10000 Average reward: -1.625062506250625 Explore P: 0.9058
Average Loss: 0.26
Episode: 20000 Average reward: -1.6166 Explore P: 0.8205
Average Loss: 0.08
Episode: 30000 Average reward: -1.6253 Explore P: 0.7434
Average Loss: 0.07
Episode: 40000 Average reward: -1.6601 Explore P: 0.6736
Average Loss: 0.27
Episode: 50000 Average reward: -1.665 Explore P: 0.6105
Average Loss: 0.04
Episode: 60000 Average reward: -1.6754 Explore P: 0.5533
Average Loss: 0.15
Episode: 70000 Average reward: -1.6846 Explore P: 0.5016
Average Loss: 0.22
Episode: 80000 Average reward: -1.6998 Explore P: 0.4548
Average Loss: 0.15
Episode: 90000 Average reward: -1.712 Explore P: 0.4125
Average Loss: 0.07
-1.6673299999976314


In [978]:
test_episodes = 2

with tf.Session() as sess:
    saver.restore(sess, "checkpoints/tic-tac-toe.ckpt")
    
    for ep in range(1, test_episodes):
        game_over = False
        state = initialize_game(player_val, agent=agent_name)
        
        while not game_over:
            print_game(state)

            
            # Get action from Q-network
            converted_state = np.asarray(convert_state(state))
            feed = {mainQN.inputs_: converted_state.reshape(1,*converted_state.shape)}
            Qs = sess.run(mainQN.output, feed_dict=feed)
            action = np.argmax(Qs)
            next_state, game_over, reward = step_game(state, action, player_val, agent=agent_name)
            print(Qs)
            print(action)
            if game_over:
                print(reward)

            else:
                state = next_state

-------
|0|0|0|
|0|0|0|
|0|0|0|
-------
[[ 1.17139781  1.21745515  0.96601009  1.22202933  1.25242293  1.27167892
   1.25516498  1.24944532  1.15785098]]
5
-------
|0|0|0|
|0|0|1|
|0|0|2|
-------
[[ 0.77960378  0.7806859   0.59929162  0.75685883  0.76190478  0.7611509
   0.79187161  0.80504137  0.7632966 ]]
7
-------
|0|0|0|
|0|2|1|
|0|1|2|
-------
[[ 0.90489191  0.94092888  0.74684429  0.91692197  0.96818149  0.94752759
   0.96581078  0.96296114  0.88441658]]
4
-10


In [683]:
len(np.zeros(9))

9