In [1]:
import uuid
import time
import pickle
import sys
import gym.spaces
import itertools
import numpy as np
import random
import tensorflow                as tf
import tensorflow.contrib.layers as layers
from collections import namedtuple
import TicTacToe
from collections import Counter
import Players



  from ._conv import register_converters as _register_converters


In [2]:
# Clear the default TensorFlow graph before the helper definitions below, so that
# re-running this cell starts from an empty graph.
tf.reset_default_graph()

def TicTacToe_model(placeholder, scope, reuse=tf.AUTO_REUSE, num_actions = 9):
    """Build the network that scores every TicTacToe action for a batch of boards.

    Args:
        placeholder: input tensor holding the boards; it is flattened per sample
            and cast to float32 before entering the network.
        scope: variable scope under which the dense layers are created.
        reuse: reuse flag forwarded to tf.variable_scope.
        num_actions: width of the output layer.

    Returns:
        A tensor with num_actions un-activated outputs per input sample.
    """
    flat_board = tf.contrib.layers.flatten(placeholder)
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.cast(flat_board, tf.float32)
        # Two identical hidden layers of 64 units.
        # NOTE(review): softmax is an unusual hidden activation -- confirm it is intended.
        for _ in range(2):
            hidden = tf.layers.dense(
                hidden,
                64,
                bias_initializer=tf.zeros_initializer(),
                activation=tf.nn.softmax,
            )
        # The output layer starts with all-zero weights and biases, so every
        # action initially gets the same score.
        scores = tf.layers.dense(
            hidden,
            num_actions,
            kernel_initializer=tf.zeros_initializer(),
            bias_initializer=tf.zeros_initializer(),
            activation=None,
        )
    return scores

    
def sample_action(model, mask_placeholder):
    """Return an op that samples one action per row from the masked policy.

    Args:
        model: tensor of per-action logits.
        mask_placeholder: tensor marking which entries of each row take part
            in the softmax.

    Returns:
        The sample op of a Categorical distribution whose probabilities are
        maskedSoftmax(model, mask_placeholder).
    """
    legal_probs = maskedSoftmax(model, mask_placeholder)
    return tf.distributions.Categorical(probs=legal_probs).sample()
    
    
"""Code from online"""
def maskedSoftmax(logits, mask):
    """
    Masked softmax over dim 1
    :param logits: (N, L)
    :param mask: (N, L)
    :return: probabilities (N, L)
    """
    # Keep only the logits at positions selected by the mask.
    kept_positions = tf.where(mask)
    kept_logits = tf.gather_nd(logits, kept_positions)
    full_shape = tf.cast(tf.shape(logits), tf.int64)

    # Key step: the sparse softmax normalises over the stored entries only, and
    # scattering back into a dense tensor leaves every other entry at zero.
    sparse_probs = tf.sparse_softmax(
        tf.SparseTensor(kept_positions, kept_logits, full_shape)
    )
    dense_probs = tf.scatter_nd(
        sparse_probs.indices, sparse_probs.values, sparse_probs.dense_shape
    )

    # scatter_nd loses the static shape; restore it from the input.
    dense_probs.set_shape(logits.shape)
    return dense_probs


def batch_rollout(player,opponent, env, max_time_steps = 100, exploration_on =False, epsilon =.1):
    """Play whole games until at least max_time_steps player observations are collected.

    Args:
        player, opponent, env, exploration_on, epsilon: forwarded unchanged to
            sample_trajectory.
        max_time_steps: keep sampling games while the number of recorded
            observations is below this value.

    Returns:
        (paths, batch_winners): the list of path dicts returned by
        sample_trajectory, and a Counter keyed 0/1/2 counting
        env.current_winner after each game.
    """
    trajectories = []
    winner_tally = Counter({0: 0, 1: 0, 2: 0})
    steps_collected = 0
    while steps_collected < max_time_steps:
        trajectory = sample_trajectory(player, opponent, env, exploration_on, epsilon)
        trajectories.append(trajectory)
        winner_tally[env.current_winner] += 1
        steps_collected += len(trajectory['observation'])
    return trajectories, winner_tally
    
    
    
def sample_trajectory(player, opponent, env, exploration_on = False, epsilon = .1):
    """Play one full game and record it from player 1's point of view.

    Args:
        player: object with a policy(observations, masks) method; it moves
            whenever env.current_player == 1.
        opponent: object with the same policy interface; it moves otherwise.
        env: environment exposing reset(), current_player, get_observation(p),
            legal_moves(), step(action) -> (done, _) and get_reward(p).
        exploration_on: enable epsilon-greedy exploration for the player.
        epsilon: probability, per player move, of replacing the policy's move
            with a uniformly random legal move. Only used when exploration_on
            is True. (Previously this argument was ignored and every move was
            randomised whenever exploration_on was set.)

    Returns:
        A dict with keys "observation", "reward", "action" and "mask". Each is
        a numpy array with one entry per move made by the player; the reward of
        a move is read just before the player's next move, or after the game
        ends for the last move.
    """
    obs, acs, rewards, masks = [], [], [], []
    env.reset()
    done = False
    player_has_acted = False

    while not done:
        # Observation and legal moves as seen by whoever is about to move.
        ob = env.get_observation(env.current_player)
        legal_moves = env.legal_moves()
        if env.current_player == 1:
            # A reward is the result of a (state, action) pair, so it can only be
            # recorded once the player has already made a move.
            if player_has_acted:
                rewards.append(env.get_reward(1))
            else:
                player_has_acted = True

            if exploration_on and random.random() < epsilon:
                # Explore: uniformly random move among the legal ones.
                legal_options = np.array(legal_moves)
                action = [random.choice(np.nonzero(legal_options)[0])]
            else:
                action = player.policy(np.array([ob]), np.array([legal_moves]))
            obs.append(ob)
            acs.append(action[0])
            masks.append(legal_moves)
        else:
            action = opponent.policy(np.array([ob]), np.array([legal_moves]))
        done, _ = env.step(action[0])

    # The reward for the player's last move is only known once the game is over.
    rewards.append(env.get_reward(1))

    path = {"observation" : np.array(obs, dtype=np.int32),
            "reward" : np.array(rewards, dtype=np.float32),
            "action" : np.array(acs, dtype=np.int32),
            "mask" : np.array(masks, dtype=np.int32)}
    return path

    
    
def sum_of_rewards(paths, gamma = .6):
    """Compute the discounted reward-to-go for every step of every path.

    Args:
        paths: iterable of dicts whose "reward" entry is a numpy array of
            per-step rewards.
        gamma: discount factor applied per step into the future.

    Returns:
        A flat list, in path order then step order, where the entry for step t
        is sum_k gamma**k * reward[t + k].
    """
    returns_to_go = []
    for path in paths:
        rewards = path["reward"]
        for start in range(len(rewards)):
            tail = rewards[start:]
            discounts = np.array([gamma**k for k in range(len(tail))])
            returns_to_go.append(np.sum(tail * discounts))
    return returns_to_go
        
def standardize_advantage(adv_n):
    """Shift advantages to zero mean and scale them to (approximately) unit std.

    A small constant is added to the standard deviation so that a constant
    input does not cause a division by zero.
    """
    centered = adv_n - np.mean(adv_n)
    inverse_scale = 1.0 / (np.std(centered) + .0000001)
    return centered * inverse_scale

def get_log_prob(model, action_placeholder, mask_placeholder):
    """Build the log-probability of the taken actions and the policy entropy.

    The policy is a softmax over the logits restricted to the entries selected
    by mask_placeholder; all other entries get probability zero.

    Args:
        model: tensor of per-action logits.
        action_placeholder: tensor of chosen action indices (one-hot encoded
            here with depth 9).
        mask_placeholder: tensor selecting the entries that take part in the
            softmax.

    Returns:
        (log_prob, entropy): per-row log-probability of the chosen action and
        per-row entropy of the masked distribution.
    """
    num_actions = 9

    kept_positions = tf.where(mask_placeholder)
    kept_logits = tf.gather_nd(model, kept_positions)
    full_shape = tf.cast(tf.shape(model), tf.int64)

    # Key step: the sparse softmax normalises over the stored entries only, and
    # scattering back into a dense tensor leaves every other entry at zero.
    sparse_probs = tf.sparse_softmax(
        tf.SparseTensor(kept_positions, kept_logits, full_shape)
    )
    probs = tf.scatter_nd(
        sparse_probs.indices, sparse_probs.values, sparse_probs.dense_shape
    )
    # The log is taken on the stored values before scattering, so masked-out
    # entries contribute 0 (not log(0)) to the entropy sum below.
    log_probs = tf.scatter_nd(
        sparse_probs.indices, tf.log(sparse_probs.values), sparse_probs.dense_shape
    )

    # Dense equivalent of the masked distribution built above:
    #   softmax(logits) * mask, renormalised along axis 1.

    entropy = -tf.reduce_sum(probs * log_probs, axis=1)

    chosen_one_hot = tf.one_hot(action_placeholder, num_actions)
    chosen_prob = tf.reduce_sum(tf.multiply(probs, chosen_one_hot), axis=1)
    log_prob = tf.log(chosen_prob)
    return log_prob, entropy

def loss_and_update_op(log_prob, entropy, adv_n, entropy_coeff = .1):
    """Build the policy-gradient loss and its Adam update op.

    Args:
        log_prob: per-sample log-probability of the taken action.
        entropy: policy entropy, per sample or already a scalar.
        adv_n: per-sample advantage (or return) weighting each log-probability.
        entropy_coeff: weight of the entropy bonus subtracted from the loss.

    Returns:
        (loss, update_op, optimizer), where loss is a scalar tensor.
    """
    policy_loss = -tf.reduce_mean(log_prob * adv_n)
    # get_log_prob returns one entropy value per sample. Average it so the loss is
    # a true scalar; subtracting the raw vector made the loss a vector, which
    # minimize() implicitly sums, inflating the policy term by the batch size
    # relative to the entropy term.
    mean_entropy = tf.reduce_mean(entropy)
    loss = policy_loss - entropy_coeff * mean_entropy
    optimizer = tf.train.AdamOptimizer(5e-3)
    update_op = optimizer.minimize(loss)
    return loss, update_op, optimizer
    

In [5]:
#Main code for running policy gradient

tf.reset_default_graph()

#define the board, models *symbolically*
observation_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
adv_n_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)
mask_placeholder = tf.placeholder(shape=[None, 9], dtype = tf.int32)


# num_actions has to be passed by keyword: the second positional parameter of
# TicTacToe_model is `scope`, so passing 9 positionally together with scope=...
# raises "TypeError: got multiple values for argument 'scope'".
model = TicTacToe_model(observation_placeholder, scope = "policy_gradient", reuse=tf.AUTO_REUSE, num_actions = 9)
#old_model = TicTacToe_model(board_placeholder, 9, scope = "model-2", reuse=tf.AUTO_REUSE)
model_input_s = sample_action(model, mask_placeholder)

#Define Loss functions *symbolically*
log_prob, entropy = get_log_prob(model, action_placeholder, mask_placeholder)
loss, update_op, optimizer = loss_and_update_op(log_prob, entropy, adv_n_placeholder, entropy_coeff = 0)

#start a session
sess =tf.Session()
sess.run(tf.global_variables_initializer())
#Defines player, opponent
player = Players.NN_Player(model, model_input_s, sess, observation_placeholder, mask_placeholder, duplicate=False)
opponent = Players.Random_Player()

#Loads old player,opponent
# temp_file_name = './bot_10_28_v6.ckpt'

#Want to duplicate session
# saver = tf.train.Saver()
# saver.restore(sess, temp_file_name)


# opponent = Players.NN_Player(model, model_input_s, sess, observation_placeholder, mask_placeholder)




#start an environment
env = TicTacToe.TicTacToe()

number_updates_per_expert_update = 5
number_expert_updates = 1000

for k in range(number_expert_updates):
    print("iteration number", k)

    batch_adv_n = []
    iteration_winners = Counter({0:0,1:0,2:0})

    tic = time.time()
    for i in range(number_updates_per_expert_update):
        # Collect fresh games against the current opponent and take one gradient step.
        paths, batch_winners = batch_rollout(player, opponent, env, max_time_steps=1000)
        iteration_winners += batch_winners

        adv_n = sum_of_rewards(paths)
        batch_adv_n = batch_adv_n + adv_n

        # Flatten the per-game arrays into one batch; row order matches adv_n.
        boards = np.concatenate([path['observation'] for path in paths])
        masks = np.concatenate([path['mask'] for path in paths])
        actions = np.squeeze(np.concatenate([path["action"] for path in paths])).astype(int)

        sess.run(update_op, feed_dict = {mask_placeholder: masks, adv_n_placeholder: adv_n, observation_placeholder: boards , action_placeholder: actions})


    #Unwind win data:
#     print(iteration_winners)
    print("mean adv", np.mean(batch_adv_n))
    print("iteration time", time.time() - tic)
#     print(paths[0])


    # Evaluate against the expert: fraction of games won by player 2 (the expert).
    expert_player = Players.Expert_Player()
    _, expert_batch_winners = batch_rollout(player, expert_player, env, max_time_steps=900, exploration_on =True, )
    player_loss_percentage_vs_expert = expert_batch_winners[2]*1.0/(expert_batch_winners[0] + expert_batch_winners[1] + expert_batch_winners[2])
    print("loss percent vs expert", player_loss_percentage_vs_expert)
    # The next round of training games is played against an NN_Player built from
    # the current model (self-play).
    opponent = Players.NN_Player(model, model_input_s, sess, observation_placeholder,mask_placeholder)
            
    


iteration number 0


KeyboardInterrupt: 

In [4]:
#Save current net
# Relies on `sess` created by the training cell; run that cell first.

# Checkpoint path prefix the session's variables are written to.
temp_file_name = './bot_10_28_v7.ckpt'

#Want to duplicate session
# A Saver built without arguments covers the saveable variables of the current default graph.
saver = tf.train.Saver()
saver.save(sess, temp_file_name)

'./bot_10_28_v6.ckpt'

In [9]:
#Load current net
# Relies on `sess` and on the graph built by the training cell; the variables in
# the checkpoint must match the ones in the current graph.

# Checkpoint path prefix to restore from.
temp_file_name = './bot_10_28_v3.ckpt'

#Want to duplicate session
# Overwrites the values of the variables held by `sess` with the saved ones.
saver = tf.train.Saver()
saver.restore(sess, temp_file_name)

INFO:tensorflow:Restoring parameters from ./bot_10_28_v3.ckpt


In [None]:
#Cell Tests Players against each other

# Re-import Players so that edits to the module are picked up without restarting the kernel.
from importlib import reload
reload(Players)
# `player` moves when env.current_player == 1 (see sample_trajectory); `opponent` moves otherwise.
player = Players.Random_Player()
opponent = Players.Expert_Player()

env = TicTacToe.TicTacToe()
# Tally of env.current_winner over all games.
# NOTE(review): presumably 0 = draw, 1 = player, 2 = opponent -- confirm against TicTacToe.
batch_winners = {0:0,1:0,2:0}
for i in range(10000):
    # Only the side effect on env.current_winner is used; the returned path is discarded.
    path = sample_trajectory(player,opponent,env)
    batch_winners[env.current_winner] += 1
print(batch_winners)
    

{0: 867, 1: 0, 2: 9133}


In [10]:
#Implementing Q-learning....
# Re-import Players so that edits to the module are picked up without restarting the kernel.
from importlib import reload 
reload(Players)

def collect_tuples(model, observation_placeholder, opponent, env, sess, num_tuples =100, exploration =.1):
    """Collect (state, action, reward, next state) transitions with an epsilon-greedy policy.

    Fixes over the earlier draft: the environment is actually reset
    (`env.reset` was referenced but never called), the action sent to the
    environment is the index of the best legal Q-value rather than the whole
    Q-vector, the (done, _) pair returned by env.step is no longer stored as
    the observation, and the reward is requested for the player who moved.

    Args:
        model: tensor producing one score per action for each fed observation.
        observation_placeholder: feed key for the observation batch.
        opponent: currently unused.
            NOTE(review): every move, for both sides, is chosen from `model`;
            confirm whether the opponent should play the other side.
        env: environment exposing reset(), current_player, get_observation(p),
            legal_moves(), step(action) -> (done, _) and get_reward(p).
        sess: session used to evaluate `model`.
        num_tuples: number of transitions to collect.
        exploration: probability of playing a uniformly random legal move
            instead of the greedy one.

    Returns:
        A list of dicts with keys 'state1', 'action', 'state2' and 'reward'.
        NOTE(review): 'reward' is env.get_reward(p) for the player p who made
        the move, read right after the move -- confirm this is the intended
        reward signal.
    """
    tuples = []
    env.reset()
    ob = env.get_observation(env.current_player)
    for _ in range(num_tuples):
        current_state1 = ob
        acting_player = env.current_player
        legal = np.asarray(env.legal_moves(), dtype=bool)

        # Greedy move: best score among the legal moves only.
        q_values = np.asarray(sess.run(model, feed_dict = {observation_placeholder: [ob]})[0], dtype=float)
        action = int(np.argmax(np.where(legal, q_values, -np.inf)))
        if np.random.uniform() < exploration:
            # Explore: uniformly random legal move.
            action = int(np.random.choice(np.flatnonzero(legal)))

        # Apply the move, then read its reward and the resulting state.
        done, _ = env.step(action)
        current_reward = env.get_reward(acting_player)
        current_state2 = env.get_observation(env.current_player)
        tuples.append({'state1': current_state1, 'action': action, 'state2': current_state2, 'reward': current_reward})

        if done:
            # Start a new game for the next transition.
            env.reset()
            ob = env.get_observation(env.current_player)
        else:
            ob = current_state2

    return tuples
    
def arg_max_sample(model, mask_placeholder):
    """Return an op giving, per row, the index of the highest-scoring legal action.

    Illegal entries (mask == 0) are replaced by the lowest representable value
    of the score dtype before the argmax. The previous implementation filled
    them with 0, so an illegal action was selected whenever every legal score
    was <= 0 -- in particular for a freshly initialised network, whose scores
    are all zero.

    Args:
        model: tensor of per-action scores, one row per sample.
        mask_placeholder: tensor of the same shape, non-zero where the action
            is legal.

    Returns:
        An int64 tensor of action indices, one per row.
    """
    legal = tf.cast(mask_placeholder, tf.bool)
    lowest = tf.fill(tf.shape(model), tf.constant(model.dtype.min, dtype=model.dtype))
    masked_scores = tf.where(legal, model, lowest)
    return tf.argmax(masked_scores, 1)

def get_loss_and_optimizer_Q(model, targets_placeholder, action_placeholder):
    """Build the Q-learning regression loss and its Adam update op.

    Args:
        model: tensor of per-action Q-values, one row per sample.
        targets_placeholder: one regression target per sample.
        action_placeholder: index of the action taken in each sample.

    Returns:
        (loss, update_op, optimizer), where loss is the L2 norm of
        Q(s, a) - target over the batch.
    """
    # Pick Q(s, a) of the taken action in every row with a one-hot mask, the same
    # way get_log_prob picks the probability of the taken action. Indexing the
    # tensor with `model[action_placeholder]` does not perform this per-row lookup.
    num_actions = model.get_shape().as_list()[-1]
    if num_actions is None:
        num_actions = tf.shape(model)[-1]
    taken = tf.one_hot(action_placeholder, num_actions, dtype=model.dtype)
    chosen_q = tf.reduce_sum(model * taken, axis=1)

    # Cast so that an integer-typed target placeholder does not break the subtraction.
    diff = tf.subtract(chosen_q, tf.cast(targets_placeholder, model.dtype))
    loss = tf.norm(diff)
    optimizer = tf.train.AdamOptimizer(5e-3)
    update_op = optimizer.minimize(loss)
    return loss, update_op, optimizer

def compute_target_values(arg_max_sample_s , next_state, mask, not_end_of_path, reward, decay = .01):
    """Evaluate `arg_max_sample_s` on the next states and combine it with the rewards.

    Uses the notebook-level `sess`, `observation_placeholder` and
    `mask_placeholder`.

    Args:
        arg_max_sample_s: tensor evaluated on (next_state, mask).
            NOTE(review): callers pass the op from arg_max_sample, which yields
            action indices rather than Q-values -- confirm this is the intended
            bootstrap term.
        next_state: batch fed as observations.
        mask: batch fed as the legal-move mask.
        not_end_of_path: multiplier that zeroes the bootstrap term on terminal steps.
        reward: immediate rewards.
        decay: weight of the bootstrap term.

    Returns:
        reward + decay * not_end_of_path * (evaluated tensor)
    """
    feed = {observation_placeholder: next_state, mask_placeholder: mask}
    bootstrap = sess.run(arg_max_sample_s, feed_dict=feed)
    return reward + decay * not_end_of_path * bootstrap

def sample_paths(paths, batch_size = 10):
    """Draw a random batch of transitions from a list of path dicts.

    All paths are concatenated into one long sequence; batch_size positions are
    drawn with np.random.choice and, for each, the entry at that position and at
    the following position (wrapping around at the end) is looked up.

    Args:
        paths: list of dicts with 'observation', 'action', 'reward' and 'mask'
            arrays.
        batch_size: number of transitions to draw.

    Returns:
        (state1, action, state2, reward, mask, done) as numpy arrays, where
        state2 and mask belong to the following position and done is 0 for the
        last step of a path and 1 otherwise.
    """
    def stacked(key):
        # All paths' entries for `key`, joined end to end.
        return np.concatenate([path[key] for path in paths])

    def pick(source, positions):
        return np.array([source[p] for p in positions])

    observation_list = stacked('observation')
    action_list = stacked('action')
    reward_list = stacked('reward')
    mask_list = stacked('mask')
    total = len(observation_list)

    # 1 everywhere except at the final step of each path, which gets 0.
    not_terminal = [1] * total
    last_index = -1
    for path in paths:
        last_index += len(path['observation'])
        not_terminal[last_index] = 0

    indices = np.random.choice(total, batch_size)
    successors = [(i + 1) % total for i in indices]

    return (
        pick(observation_list, indices),
        pick(action_list, indices),
        pick(observation_list, successors),
        pick(reward_list, indices),
        pick(mask_list, successors),
        pick(not_terminal, indices),
    )

    

In [11]:
tf.reset_default_graph()


#Define any constants
# NOTE(review): gamma is not passed anywhere yet; compute_target_values below runs
# with its own default decay -- confirm which discount is intended.
gamma = .95

#Define the placeholders
observation_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)

#target place holder is r(s,a) + \gamma \max_a Q(s',a)
# One scalar target per sample, so the placeholder is a float vector (it was
# previously declared with the board shape [None, 3, 3] and an integer dtype).
target_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
mask_placeholder = tf.placeholder(shape = [None,9], dtype = tf.int32)



#Define the model and loss function

model = TicTacToe_model(observation_placeholder, scope = "Q_learn", reuse=tf.AUTO_REUSE)
arg_max_sample_s = arg_max_sample(model, mask_placeholder)
# _, update_op, _  = get_loss_and_optimizer_Q(model, target_placeholder, action_placeholder)

#Start a session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

#Define the players
player = Players.NN_Player(model, arg_max_sample_s, sess, observation_placeholder, mask_placeholder)
opponent = Players.Random_Player()
env = TicTacToe.TicTacToe()
#Collect some tuples
# database = []
# database += collect_tuples(model, observation_placeholder, opponent, env, sess)
# 
# ###Training time
# #I have a tuple, state-action-state from somewhere
# sampled_tuple = random.choice(database)
# 
# not_updated_Q = sess.run(player.q_func(), feed_dict = {observation_placeholder: [sampled_tuple['state1']]})
# 
# target_Q = not_updated_Q
# target_Q[0, sampled_tuple['action']] =  sampled_tuple['reward'] 
# target_Q[0, sampled_tuple['action']] += tf.reduce_max(sess.run(player.q_function(), feed_dict = {observation_placeholder: [sampled_tuple['state2']]}), 1)[0]


#NEED: call batch_rollouts(), feed into chris function
# batch_rollout returns (paths, batch_winners); only the paths are needed here.
paths, _ = batch_rollout(player, opponent, env)
# sample_paths returns (state1, action, state2, reward, mask, done) in that order;
# `done` is 0 on the last step of a path and 1 otherwise.
states, actions, next_states, rewards, masks, not_end_of_path = sample_paths(paths, batch_size = 10)



# NOTE(review): arg_max_sample_s yields action indices, not Q-values, so these are
# not yet the r + gamma * max_a Q(s', a) targets described above.
target_values = compute_target_values(arg_max_sample_s, next_states, masks, not_end_of_path, rewards)

print(target_values)
#sess.run(update_op, feed_dict= {observation_placeholder : states, action_placeholder : actions, observation_placeholder : states })


    
    
    


duplicating session to freeze weights for evaluation...
INFO:tensorflow:Restoring parameters from ./to_duplicate.ckpt


NameError: name 'env' is not defined