In [1]:
import uuid
import time
import pickle
import sys
import gym.spaces
import itertools
import numpy as np
import random
import tensorflow                as tf
import tensorflow.contrib.layers as layers
from collections import namedtuple
import TicTacToe
from collections import Counter
import Players
from importlib import reload
reload(Players)
reload(TicTacToe)

  from ._conv import register_converters as _register_converters


<module 'TicTacToe' from '/Users/christophermiller/Documents/GitHub/ai/TicTacToe/venv/TicTacToe.py'>

In [2]:
tf.reset_default_graph()

def TicTacToe_model(placeholder, scope, num_actions = 9):
    '''A model for a TicTacToe Q-function
    Inputs:
        placeholder: [None, ob_dim] placeholder representing inputs to our neural network
        scope: a string that becomes the scope of all layers in this network
        reuse: 
        num_actions: an int representing the number of possible actions (the output dimension)
    
    The final layer outputs values in the range [-1,1], which matches the range of possible target q-values
    placeholder = tf.contrib.layers.flatten(placeholder)
    
    The Q-function is thought of as a function of two varables Q(s,a). Here we treat it as a num_actions-dimensional
    function of one variable, so that Q(s,a) = Q(s)[a]
    
    We initialize bias and weights to zero, except for the final layer, where the weights are initialized to one.  
    
    Returns:
        model: [None, num_actions] variable representing the outputs of our q-function
    '''
    with tf.variable_scope(scope):
        out = placeholder
        out = tf.cast(out, tf.float32)
        out = tf.layers.dense(out, 64  , bias_initializer = tf.zeros_initializer(), activation = tf.nn.softmax)
        out = tf.layers.dense(out, 64  , bias_initializer = tf.zeros_initializer(), activation = tf.nn.softmax)
        out = tf.layers.dense(out, 64  , bias_initializer = tf.zeros_initializer(), activation = tf.nn.softmax)
        out = tf.layers.dense(out, num_actions , kernel_initializer = tf.zeros_initializer(), bias_initializer = tf.zeros_initializer(), activation = tf.nn.sigmoid)
        out = (out*2)-1
    return out

    
def sample_action(model, mask_placeholder):
    '''Symbolically selects an action from logits with restrictions
    Inputs: 
        model: a [None, action_dim] variable consisting of logits
        mask_placeholder: a [None, action_dim] placeholder that will be fed boolean vectors
    
    Returns:
        A random legal action, where legal values are those which mask_placeholder assigns 1
        The probabilities are weighted according to the logits
    '''
    out = model
    dist = tf.distributions.Categorical(probs=maskedSoftmax(out, mask_placeholder))
    return dist.sample()
    
    
def maskedSoftmax(logits, mask):
    '''Computes the softmax of our logits, given that some moves are illegal
    Inputs:
        Masked softmax over dim 1
        param logits: [None, ac_dim]
        param mask: [None, ac_dim]
        
        ***This code is edited from code we found online***
        We do not want there to be any probability of making illegal moves. 
        Intuitively, we are computing softmax of our logits, but pretending that the only entries 
            are the legal ones.
        This is actually implemented via SparseTensor calculations.
        
    Returns: 
        result: [None, ac_dim] a sequence of probability distributions, with zero probability of illegal moves
    '''
    indices = tf.where(mask)
    values = tf.gather_nd(logits, indices)
    denseShape = tf.cast(tf.shape(logits), tf.int64)
    
    # Tensorflow will automatically set output probabilities to zero of 
    # undesignated entries in sparse vector
    sparseResult = tf.sparse_softmax(tf.SparseTensor(indices, values, denseShape))
    
    result = tf.scatter_nd(sparseResult.indices, sparseResult.values, sparseResult.dense_shape)
    result.set_shape(logits.shape)
    return result


def batch_rollout(player,opponent, env, max_time_steps = 100):
    '''Produces a batch of rollouts from the environment.
    Inputs:
        player: realization of Player.Player abstract class
        opponent: realization of Player.Player abstract class
        env: an environment
        max_time_steps: an integer
    
    This function plays a number of rounds of a two-player game, and returns the trajectories observed by player
    
    Returns:
        paths: a list of dictionaries. Each dictionary is a rollout, and takes the keys:
            'observation': [None, obs_dime] np.array of the observations of player
            'action': [None,] np.array of the actions of player
            'reward': [None,] np.array of the rewards gotten by player
        batch_winners: TODO
    '''
    paths = []
    batch_winners = Counter({0: 0, 1: 0, 2:0})
    time_steps = 0
    while time_steps < max_time_steps:
        path = sample_trajectory(player,opponent,env)
        paths += [path]
        batch_winners[env.current_winner] +=1
        time_steps += len(path['observation'])
    return paths, batch_winners
    
    
    
def sample_trajectory(player, opponent, env):
    """Produces a single rollout of the environment following the player policy
    Inputs:
        player:   realization of Player.Player abstract class
        opponent: realization of Player.Player abstract class
        env:      environment which follows open ai gym environment structure and has a current_player int either 1 or 2
        TODO: it doesn't quite match the reward structure, no?^
       
    Returns:
    a list of dictionaries. Each dictionary is a rollout, and takes the keys:
        'observation': [None, obs_dime] np.array of the observations of player
        'action': [None,] np.array of the actions of player
        'reward': [None,] np.array of the rewards gotten by player
    """
    
    obs, acs, rewards, masks = [], [], [], []
    ob = env.reset()
    done = False
    player_has_acted = False
    action = None
    
    #Do rest of moves
    while not done:
        #Get current observation of current player
        ob = env.get_observation(env.current_player)
        legal_moves = env.legal_moves()
        if env.current_player == 1:
            #Reward is recorded as results of state,action pair... need to check player 1 has acted already
            if player_has_acted:
                rewards.append(env.get_reward(1))
            else:
                player_has_acted = True
                
            action = player.policy(np.array([ob]), np.array([legal_moves]))
            obs.append(ob)
            acs.append(action[0])
            masks.append(legal_moves)
        else:
            action = opponent.policy(np.array([ob]), np.array([legal_moves]))
        done, _ = env.step(action[0]) 

    #Need to record final reward for player 1
    rewards.append(env.get_reward(1))
    
    path = {"observation" : np.array(obs, dtype=np.int32), 
                "reward" : np.array(rewards, dtype=np.float32), 
                "action" : np.array(acs, dtype=np.int32),
                "mask" : np.array(masks, dtype=np.int32)}
    return path

    
    
def sum_of_rewards(paths, gamma = .6): 
    re_n = [path["reward"] for path in paths]
    q_n = []
    for seq_of_rewards in re_n:
        for t in range(len(seq_of_rewards)):
            weighted_sequence = seq_of_rewards[t:] * np.array([gamma**i for i in range(len(seq_of_rewards[t:]))])
            q_n.append(np.sum(weighted_sequence))
    adv_n = q_n
    return adv_n
        
def standardize_advantage(adv_n):
    adv_n = (adv_n - np.mean(adv_n)) 
    adv_n = adv_n * (1.0/(np.std(adv_n)+.0000001))
    return adv_n

def get_log_prob(model, action_placeholder, mask_placeholder):
    action_dim = 9 
    logits = model
    
    indices = tf.where(mask_placeholder)
    values = tf.gather_nd(logits, indices)
    denseShape = tf.cast(tf.shape(logits), tf.int64)
    
    """THIS IS THE KEY: tensorflow will automatically set output probabilities to zero of undesignated entries in sparse vector"""
    sparseResult = tf.sparse_softmax(tf.SparseTensor(indices, values, denseShape))
    
    probability_dist = tf.scatter_nd(sparseResult.indices, sparseResult.values, sparseResult.dense_shape)
#     probability_dist = probability_dist.set_shape(logits.shape)
    log_probability_dist = tf.scatter_nd(sparseResult.indices, tf.log(sparseResult.values), sparseResult.dense_shape)

    """Want to emulate this:"""
#     probability_dist = tf.nn.softmax(logits)
#     legal_pseudo_probability_dist = probability_dist*values
#     legalprobability_dist = tf.divide(legal_pseudo_probability_dist, tf.reduce_sum(legal_pseudo_probability_dist, axis= 1))
    
    prod = tf.multiply(probability_dist, tf.one_hot(action_placeholder, action_dim ))
    
    entropy = - tf.reduce_sum(probability_dist * log_probability_dist, axis = 1)
    
    
    
    log_prob = tf.log(tf.reduce_sum(prod , axis = 1 ))
#    log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels= action_placeholder, logits= tf.SparseTensor(indices, values, denseShape))
    return log_prob, entropy

def loss_and_update_op(log_prob, entropy, adv_n, entropy_coeff = .1):
    loss = -tf.reduce_mean(log_prob * adv_n) -  entropy_coeff * entropy
    optimizer = tf.train.AdamOptimizer(5e-3)
    update_op = optimizer.minimize(loss)
    return loss, update_op, optimizer
    

In [5]:
#Main code for running policy gradient

tf.reset_default_graph()

#define the board, models *symbolically*
observation_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
adv_n_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)
mask_placeholder = tf.placeholder(shape=[None, 9], dtype = tf.int32)


model = TicTacToe_model(observation_placeholder, 9, scope = "policy_gradient")
#old_model = TicTacToe_model(board_placeholder, 9, scope = "model-2")
model_input_s = sample_action(model, mask_placeholder)

#Define Loss functions *symbolically*
log_prob, entropy = get_log_prob(model, action_placeholder, mask_placeholder)
loss, update_op, optimizer = loss_and_update_op(log_prob, entropy, adv_n_placeholder, entropy_coeff = 0)

#start a session
sess =tf.Session()
sess.run(tf.global_variables_initializer())
#Defines player, opponent
player = Players.NN_Player(model, model_input_s, sess, observation_placeholder, mask_placeholder, duplicate=False, deterministic = False)
opponent = Players.Random_Player()

#Loads old player,opponent
# temp_file_name = './bot_10_28_v6.ckpt'

#Want to duplicate session
# saver = tf.train.Saver()
# saver.restore(sess, temp_file_name)


# opponent = Players.NN_Player(model, model_input_s, sess, observation_placeholder, mask_placeholder)




#start an environment
env = TicTacToe.TicTacToe()

number_updates_per_expert_update = 5
number_expert_updates = 1000

for k in range(number_expert_updates):
    print("iteration number", k)
    
    batch_adv_n = []
    iteration_winners = Counter({0:0,1:0,2:0})
    
    tic = time.time()
    for i in range(number_updates_per_expert_update):
        paths, batch_winners = batch_rollout(player, opponent, env, max_time_steps=1000)
        iteration_winners += batch_winners
        
        adv_n = sum_of_rewards(paths)
        batch_adv_n = batch_adv_n + adv_n
        

        boards = np.concatenate([path['observation'] for path in paths])
        masks = np.concatenate([path['mask'] for path in paths])
        actions = np.squeeze(np.concatenate([path["action"] for path in paths])).astype(int)
        
        sess.run(update_op, feed_dict = {mask_placeholder: masks, adv_n_placeholder: adv_n, observation_placeholder: boards , action_placeholder: actions})
    
    
    #Unwind win data:
#     print(iteration_winners)
    print("mean adv", np.mean(batch_adv_n))
    print("iteration time", time.time() - tic)
#     print(paths[0])
    
    
    expert_player = Players.Expert_Player()
    _, expert_batch_winners = batch_rollout(player, expert_player, env, max_time_steps=900)
    player_loss_percentage_vs_expert = expert_batch_winners[2]*1.0/(expert_batch_winners[0] + expert_batch_winners[1] + expert_batch_winners[2])
    print("loss percent vs expert", player_loss_percentage_vs_expert)
    opponent = Players.NN_Player(model, model_input_s, sess, observation_placeholder, mask_placeholder)
            
    


TypeError: TicTacToe_model() got multiple values for argument 'scope'

In [46]:
#Save current net

temp_file_name = './bot_11_01_q_v2.ckpt'

#Want to duplicate session
saver = tf.train.Saver()
saver.save(sess, temp_file_name)

'./bot_11_01_q_v2.ckpt'

In [48]:
#Load current net

temp_file_name = './bot_11_01_q_v2.ckpt'

#Want to duplicate session
saver = tf.train.Saver()
saver.restore(sess, temp_file_name)

INFO:tensorflow:Restoring parameters from ./bot_11_01_q_v2.ckpt


In [None]:
from importlib import reload
human = Players.Human_Player()
player.epsilon = 0
env = TicTacToe.TicTacToe()
paths, batch_winners = batch_rollout(opponent,human,env,max_time_steps=10000)
print(batch_winners)
    

1
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Say something: 0
[0]
1
[[ 1  0  0]
 [ 0 -1  0]
 [ 0  0  0]]
Say something: 8
[8]
1
[[ 1  0  0]
 [ 0 -1 -1]
 [ 0  0  1]]
Say something: 3
[3]
1
[[ 1  0  0]
 [ 1 -1 -1]
 [-1  0  1]]
Say something: 2
[2]
1
[[ 1  0  1]
 [ 1 -1 -1]
 [-1 -1  1]]
Say something: 1
[1]
1
[[ 0 -1  0]
 [ 0  0  0]
 [ 0  0  0]]


# Implementing Q-Learning

In [7]:
reload(Players)


def symbolic_Q_update(model, target_placeholder, action_placeholder, learning_rate = .01):
    '''Produce the symbolic variables for loss, the update, and the optimizer
    Inputs:
        model: [None, action_dim] variable consisting of Q values
        target_placeholder: [None,] placeholder that will be fed target values for the Q-function
        action_placeholder: [None,] placeholder that will be fed the action values
        learning_rate: float representing the size of gradient step
        
        The loss is the mean squared error ||Q(s,a) - Q'(s,a)||^2
        We use AdamOptimizer with no bells or whistles
        
    Returns:
        update_op: a method to be called when we desire to take a gradient step
    '''
    q_action_s = tf.reduce_sum(tf.multiply(model, tf.one_hot(action_placeholder, 9)), 1)
    loss = tf.losses.mean_squared_error(q_action_s, target_placeholder)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_op = optimizer.minimize(loss)
    return update_op, loss

def compute_target_values(model, next_state, masks, not_end_of_path, reward, decay = .99, verbose = False):
    '''Computes the target values for our Q-function update
    Inputs: 
        model: [None, action_dim] variable consisting of Q values
        next_state: [None, ob_dim] np.array of states
        masks: [None, ac_dim] np.array of masks (legal moves)
        not_end_of_path: [None,] np.array of 0,1 integers (0 denotes the end of a rolllout)
        reward: [None,] np.array of real numbers representing the return of the action resulting in next_state
        decay: real number in [0,1] representing the decay rate (often called gamma)
        verbose: Boolean
    
    This function is used to compute the Bellman backup values used to update our Q-function. 
        Recall: Q(s,a,s') <~~  r(s,a) +  max_a' [Q(s',a')]
        The right side of this equation is called the target value.
    
    Returns:
        target: [None,] batch of real numbers, indicating target values
    ''' 
    next_state_Qs = sess.run(model, feed_dict= {observation_placeholder: next_state})
    future_expected_reward = []
    for next_state_Q, mask in zip(next_state_Qs,masks):
        indices = np.where(mask)
        values = next_state_Q[indices]
        future_expected_reward.append(np.max(values))
    future_reward_if_not_done = [eop * fer for eop, fer in zip(not_end_of_path.tolist(), future_expected_reward)]
    target = reward + future_reward_if_not_done
    if verbose:
        print("not end of path", not_end_of_path)
        print("future expected reward", future_expected_reward)
        print("future reward if not done", future_reward_if_not_done)
        print("reward", reward)
        print("target", target)
        print("--")
    return target

def sample_paths(paths, batch_size = 10):
    '''From a collection of rollouts, this samples a random uniform batch
    Inputs: 
        paths: a list of dictionaries containing the data of a rollout
        batch_size: integer determining batch size to be returned
    
    Returns:
        state1: [batch_size, ob_dim] np.array of states
        action: [batch_size,] np.array of actions
        state2: [batch_size, ob_dim] np.array of states
        reward: [batch_size,] np.array of states
        mask:   [batch_size,ac_dim] np.array of masks
        done:   [batch_size,] binary np.array. 
            A 0 corresponds to a terminal game state, a 1 is a non-terminal game state
    '''
    
    #Make the easy lists
    observation_list = np.concatenate([path['observation'] for path in paths])
    action_list = np.concatenate([path['action'] for path in paths])
    reward_list = np.concatenate([path['reward'] for path in paths])
    mask_list = np.concatenate([path['mask'] for path in paths])

    #Make the done list
    number_of_states = len(observation_list)
    list_of_ones = [1] * number_of_states
    partial_sum =0
    for path in paths:
        partial_sum += len(path['observation'])
        list_of_ones[partial_sum-1] = 0
    done_list = list_of_ones
    

    #Select randomly chosen entries
    indices = np.random.choice(number_of_states, batch_size) 
    state1 = np.array([observation_list[i] for i in indices])
    action = np.array([action_list[i] for i in indices])
    state2 = np.array([observation_list[(i+1) % number_of_states] for i in indices])
    reward = np.array([reward_list[i] for i in indices])
    mask = np.array([mask_list[(i+1) % number_of_states] for i in indices])
    done = np.array([done_list[i] for i in indices])
    
    return state1, action, state2 , reward, mask, done

    

# Running Q-learning

In [45]:
tf.reset_default_graph()
reload(Players)

#Define the placeholders
observation_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32, name = "obs_placeholder")
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32, name = "act_placeholder")

#target place holder is r(s,a) + \gamma \max_a Q(s',a)
target_placeholder = tf.placeholder(shape = [None], dtype = tf.float32, name = "target_placeholder")

#Define the model and loss function
model = TicTacToe_model(observation_placeholder, scope = "Q_learn")
update_op = symbolic_Q_update(model, target_placeholder, action_placeholder)

#Start a session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

#Define the players
player = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = False)
opponent = Players.Random_Player()


#Define the environment
env = TicTacToe.TicTacToe()

#Load current net
temp_file_name = './bot_10_31_q_v2.ckpt'

#Want to duplicate session
#saver = tf.train.Saver()
#saver.restore(sess, temp_file_name)

replay_buffer = []
first_state = None
step = 0
while True:
    step += 1
    
    #Collect rollouts
    paths, _ = batch_rollout(player, opponent, env, max_time_steps = 100)
    
    #Add rollouts to the replay buffer
    if len(replay_buffer) > 100000:
        replay_buffer = paths
    else:
        replay_buffer += paths
    
    #Collect samples from our replay buffer
    states, actions, next_states, rewards, masks, not_end_of_path = sample_paths(replay_buffer, batch_size = 100)
    
    #Compute target values
    target_values = compute_target_values(model, next_states, masks, not_end_of_path, rewards, verbose=False)
    
    #Update the network
    sess.run(update_op, feed_dict= {observation_placeholder : states, action_placeholder : actions, target_placeholder : target_values })
    
    
    #Occasionally, test the model and replace it with a previous iteration
    if step%1000 ==0:
        step = 0
#        print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,0,0],[0,0,0]])] }))
        player.epsilon = 0
        expert_player = Players.Child_Player()
        _, expert_batch_winners = batch_rollout(player, expert_player, env, max_time_steps=1000)
        print(expert_batch_winners)
        batch_percentages = np.array([expert_batch_winners[0], expert_batch_winners[1], expert_batch_winners[2]])*1.0/(expert_batch_winners[0] + expert_batch_winners[1] + expert_batch_winners[2])
        player.epsilon = .2
        print("batch percentages", batch_percentages.tolist())
        opponent = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = True)
        

    


Destroying NN_Player and Session...
Destroying NN_Player and Session...
[[0.7624899  0.62806654 0.72391427 0.2153095  0.78453815 0.29561567
  0.717113   0.36618185 0.62203956]]
Counter({2: 197, 0: 74, 1: 34})
batch percentages [0.24262295081967214, 0.11147540983606558, 0.6459016393442623]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
[[0.8248923  0.7050432  0.7919456  0.44908166 0.8942462  0.42637086
  0.7486932  0.59787965 0.7915474 ]]
Counter({2: 202, 1: 46, 0: 43})
batch percentages [0.14776632302405499, 0.15807560137457044, 0.6941580756013745]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.7777405  0.74678445 0.77774405 0.45682645 0.89168394 0.55582595
  0.76682734 0.75960135 0.7733172 ]]
Counter({2: 178, 0: 55, 1: 46})
batch percentages [0.1971326164874552, 0.16487455197132617, 0.6379928315412187]
dup

Destroying NN_Player and Session...
[[0.36745596 0.35330915 0.41312277 0.31147063 0.42753804 0.43105114
  0.39097655 0.36911488 0.39982116]]
Counter({0: 160, 1: 51, 2: 20})
batch percentages [0.6926406926406926, 0.22077922077922077, 0.08658008658008658]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.49672902 0.48863387 0.5261749  0.31554246 0.5195421  0.4581324
  0.3378563  0.3566513  0.41982222]]
Counter({1: 131, 0: 114, 2: 0})
batch percentages [0.46530612244897956, 0.5346938775510204, 0.0]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.4217869  0.4461708  0.51761687 0.34206176 0.5594008  0.47837067
  0.39228833 0.43003857 0.5128069 ]]
Counter({0: 165, 1: 68, 2: 4})
batch percentages [0.6962025316455697, 0.2869198312236287, 0.016877637130801686]
duplicating session 

INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.42673683 0.39900064 0.46940446 0.36937308 0.43322372 0.37036455
  0.4773574  0.3757844  0.38205755]]
Counter({1: 133, 0: 110, 2: 3})
batch percentages [0.44715447154471544, 0.540650406504065, 0.012195121951219513]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.4994464  0.3890351  0.3974172  0.64789724 0.5147668  0.32707584
  0.5199075  0.43140388 0.46211064]]
Counter({0: 171, 1: 64, 2: 0})
batch percentages [0.7276595744680852, 0.2723404255319149, 0.0]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.3488208  0.405303   0.38759327 0.43685675 0.43414712 0.37083066
  0.37384307 0.30673242 0.4123224 ]]
Counter({0: 168, 1: 65, 2: 2})
batch percentages [0.7148936170212766, 0

INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.434245   0.35993063 0.40026975 0.34267282 0.4485587  0.36563122
  0.4234209  0.39132297 0.4812112 ]]
Counter({0: 123, 1: 121, 2: 3})
batch percentages [0.4979757085020243, 0.4898785425101215, 0.012145748987854251]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.4159931  0.37224007 0.44296825 0.31737828 0.46321404 0.3291011
  0.41769946 0.39057124 0.4751655 ]]
Counter({0: 139, 1: 107, 2: 0})
batch percentages [0.5650406504065041, 0.4349593495934959, 0.0]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.38655496 0.33508825 0.40542603 0.38664198 0.47362816 0.37584043
  0.3952676  0.40814602 0.45322847]]
Counter({0: 160, 1: 76, 2: 0})
batch percentages [0.6779661016949152, 0

INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.39196694 0.34537005 0.3906101  0.31300366 0.4432298  0.3094511
  0.42201805 0.29974616 0.39738727]]
Counter({0: 167, 1: 62, 2: 4})
batch percentages [0.7167381974248928, 0.26609442060085836, 0.017167381974248927]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.4099077  0.40400672 0.41002285 0.32138968 0.46690142 0.3794737
  0.44581378 0.41019917 0.42562997]]
Counter({0: 160, 1: 76, 2: 1})
batch percentages [0.6751054852320675, 0.3206751054852321, 0.004219409282700422]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.48479772 0.3639711  0.45921898 0.32483912 0.4629842  0.44360125
  0.47184622 0.40989852 0.46793008]]
Counter({0: 128, 1: 120, 2: 0})
batch percentages [0.516

INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.41001642 0.40561342 0.43402755 0.39620876 0.48044407 0.40474117
  0.45825005 0.39395738 0.39750528]]
Counter({0: 159, 1: 75, 2: 2})
batch percentages [0.673728813559322, 0.3177966101694915, 0.00847457627118644]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.428038   0.37512362 0.4413458  0.37930298 0.5001781  0.39134037
  0.4505651  0.420133   0.38628447]]
Counter({0: 173, 1: 61, 2: 0})
batch percentages [0.7393162393162394, 0.2606837606837607, 0.0]
duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt
Destroying NN_Player and Session...
[[0.42784035 0.3862059  0.45850456 0.4030651  0.46476638 0.43729687
  0.45949316 0.34751332 0.415424  ]]
Counter({0: 162, 1: 73, 2: 2})
batch percentages [0.6835443037974683, 0.30

KeyboardInterrupt: 

In [None]:
#training against expert policy, to see if this works at all

In [None]:
print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,0,0],[0,0,0]])] }))
print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,1,0],[0,0,-1]])] }))
print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,-1,-1],[0,0,1]])] }))
player.epsilon = 0
expert_player = Players.Random_Player()
_, expert_batch_winners = batch_rollout(player, expert_player, env, max_time_steps=1000)
print(expert_batch_winners)
batch_percentages = np.array([expert_batch_winners[0], expert_batch_winners[1], expert_batch_winners[2]])*1.0/(expert_batch_winners[0] + expert_batch_winners[1] + expert_batch_winners[2])
player.epsilon = .2
print("batch percentages", batch_percentages.tolist())
opponent = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = True)

# Code that Nick might want

In [34]:
###Taken from the bottom of Q-learning training, after 'replay_buffer'

    states, actions, next_states, rewards, masks, not_end_of_path = sample_paths(replay_buffer, batch_size = 100)
    target_values = compute_target_values(model, next_states, masks, not_end_of_path, rewards, verbose=False)
#     for state, neof, act, target in zip(states, not_end_of_path, actions,target_values):
#         if first_state is None:
#             if neof == 0:
#                 first_state = state
#         elif np.max(np.abs(first_state-state))==0:
#             print(state)
#             print("current q values", sess.run(model,feed_dict= {observation_placeholder : [state]}))
#             print("current action", act)
#             print("current target", target)
            
            
#     print(target_values[0])
#    for i, shit in enumerate(zip(target_values, states, actions, next_states, rewards, masks, not_end_of_path)):
#       print(i, shit)
#    rewards_for_average =np.concatenate([rewards_for_average,rewards])
#    test_obs = np.array([[[ 1, 2,  2],[ 0, 2,  2],[ 0,  1,  1]]])
#    test_act = np.array([2])
#    test_targ = np.array([-1.0])
#    sess.run(update_op, feed_dict= {observation_placeholder : test_obs, action_placeholder : test_act, target_placeholder : test_targ })
#    print(sess.run(model, feed_dict= {observation_placeholder : test_obs}) )
#    vals = sess.run(model, feed_dict = {observation_placeholder: states})
#    old = []
    sess.run(update_op, feed_dict= {observation_placeholder : states, action_placeholder : actions, target_placeholder : target_values })
    
    if step%1000 ==0:
        step = 0
        print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,0,0],[0,0,0]])] }))
        print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,1,0],[0,0,-1]])] }))
        print(sess.run(model, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,-1,-1],[0,0,1]])] }))
        player.epsilon = 0
        expert_player = Players.Random_Player()
        _, expert_batch_winners = batch_rollout(player, expert_player, env, max_time_steps=1000)
        print(expert_batch_winners)
        batch_percentages = np.array([expert_batch_winners[0], expert_batch_winners[1], expert_batch_winners[2]])*1.0/(expert_batch_winners[0] + expert_batch_winners[1] + expert_batch_winners[2])
        player.epsilon = .2
        print("batch percentages", batch_percentages.tolist())
        opponent = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = True)
        

    

IndentationError: unexpected indent (<ipython-input-34-15257dded3e9>, line 3)

# Implementing DQN

# Running DQN

In [8]:
tf.reset_default_graph()
reload(Players)

#Define the placeholders
observation_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32, name = "obs_placeholder")
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32, name = "act_placeholder")

#target place holder is r(s,a) + \gamma \max_a Q(s',a)
target_placeholder = tf.placeholder(shape = [None], dtype = tf.float32, name = "target_placeholder")

#Define the model and loss function
model = TicTacToe_model(observation_placeholder, scope = "Q_learn")
target_model = TicTacToe_model(observation_placeholder, scope = "target_network")
update_op = symbolic_Q_update(model, target_placeholder, action_placeholder)

#Start a session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

#Define the players
player = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = False)
opponent = Players.Random_Player()
judge =  Players.NN_Player(target_model, target_model, sess, observation_placeholder, duplicate = False)


#Define the environment
env = TicTacToe.TicTacToe()

#Load current net
temp_file_name = './bot_10_31_q_v2.ckpt'

#Want to duplicate session
#saver = tf.train.Saver()
#saver.restore(sess, temp_file_name)

replay_buffer = []
first_state = None
step = 0

while True:
    step += 1

    #Collect rollouts
    paths, _ = batch_rollout(player, opponent, env, max_time_steps = 100)

    #Add rollouts to the replay buffer
    if len(replay_buffer) > 100000:
        replay_buffer = paths
    else:
        replay_buffer += paths

    #Collect samples from our replay buffer
    states, actions, next_states, rewards, masks, not_end_of_path = sample_paths(replay_buffer, batch_size = 100)

    #Compute target values
    target_values = compute_target_values(target_model, next_states, masks, not_end_of_path, rewards, verbose=False)
    print(target_values)
    #Update the network
    sess.run(update_op, feed_dict= {observation_placeholder : states, action_placeholder : actions, target_placeholder : target_values })

    if steps % 200 ==0:
        judge = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = True)
        #target_model updates to current model
        print("target has updated")


    #Occasionally, test the model and replace it with a previous iteration
    if step%1000 ==0:
        step = 0
        print(sess.run(judge, feed_dict= {observation_placeholder : [np.array([[0,0,0],[0,0,0],[0,0,0]])] }))
        player.epsilon = 0
        expert_player = Players.Child_Player()
        _, expert_batch_winners = batch_rollout(player, expert_player, env, max_time_steps=1000)
        print(expert_batch_winners)
        batch_percentages = np.array([expert_batch_winners[0], expert_batch_winners[1], expert_batch_winners[2]])*1.0/(expert_batch_winners[0] + expert_batch_winners[1] + expert_batch_winners[2])
        player.epsilon = .2
        print("batch percentages", batch_percentages.tolist())
        opponent = Players.NN_Player(model, model, sess, observation_placeholder, duplicate = True)


    

ValueError: Shapes (?,) and (?, 3, 9) are incompatible