In [1]:
import uuid
import time
import pickle
import sys
import gym.spaces
import itertools
import numpy as np
import random
import tensorflow                as tf
import tensorflow.contrib.layers as layers
from collections import namedtuple
import TicTacToe

  from ._conv import register_converters as _register_converters


In [11]:
tf.reset_default_graph()

def TicTacToe_model(tile_in, num_actions, scope, reuse=tf.AUTO_REUSE):
    # A model for a TicTacToe q-function
    tile_in = tf.contrib.layers.flatten(tile_in)
    with tf.variable_scope(scope, reuse=reuse):
        out = tile_in
        out = tf.cast(out, tf.float32)
        with tf.variable_scope("hidden_layers"):
            # original architecture
            out = tf.layers.dense(out, 64 , kernel_initializer = tf.ones_initializer(), bias_initializer = tf.zeros_initializer(), activation = tf.nn.relu)
            out = tf.layers.dense(out, 9 , kernel_initializer = tf.zeros_initializer(), bias_initializer = tf.zeros_initializer(), activation = None)
#        with tf.variable_scope("action_value"):
            out = tf.layers.dense(out, num_actions, kernel_initializer = tf.zeros_initializer(), bias_initializer = tf.zeros_initializer(), activation= None)
        return out

    
def update_old_model_initialize_new_model(old_model, new_model, board_placeholder, sess, update_iter):
    old_model = new_model
    old_model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="model" +str(update_iter-1))
    
    new_model = TicTacToe_model(board_placeholder, 9, scope = "model" + str(update_iter))
    
    new_model_vars =  tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="model" +str(update_iter))
    sess.run(tf.variables_initializer(new_model_vars))
    
    for var_old, var_new in zip(old_model_vars, new_model_vars):
        var_new = sess.run(var_old)
#        print(sess.run(var_old))
#        print(sess.run(var_new))
#    print(sess.run(old_model, feed_dict = {board_placeholder: [[[1,0,0],[0,0,0],[0,0,0]]]})[0,:3])
#    print(sess.run(new_model, feed_dict = {board_placeholder: [[[1,0,0],[0,0,0],[0,0,0]]]})[0,:3])
    return old_model, new_model

def update_and_duplicate_old_model():
    old_model = new_model
    
    variables = tf.trainable_variables()
    for var1 in variables:
        if "pred/" in var1.name:
            trained_var = [var2 for var2 in tf.global_variables() if var2.op.name in str.replace(var1.name, "pred/", "")][0]
            value = sess.run(trained_var)
            sess.run(tf.assign(var1, value))


def sample_action(model):
    out = model
    dist = tf.distributions.Categorical(logits=out)
    return dist.sample()
    
    

def sample_trajectory(old_model, new_model, sess, env):
    new_sample = sample_action(new_model)
    old_sample = sample_action(old_model)
    obs, acs, rewards = [], [], []
    seed = np.random.randint(0,2)
    ob = env.reset()
    if seed % 2 == 0:
        while True:
            obs.append(ob)
            action = sess.run(new_sample, feed_dict={board_placeholder: [ob]})
            acs.append(action)
            ob, rew, done, _ = env.step(action[0])
            rewards.append(rew)
            
            if done:
                break
            
            action = sess.run(old_sample, feed_dict={board_placeholder: [ob]})
            ob, rew, done, _ = env.step(action[0])
    else:
        while True:       
            action = sess.run(old_sample, feed_dict={board_placeholder: [ob]})
            ob, rew, done, _ = env.step(action[0])
   
            obs.append(ob)
            action = sess.run(new_sample, feed_dict={board_placeholder: [ob]})
            acs.append(action)
            ob, rew, done, _ = env.step(action[0])
            rewards.append(rew)
            
            if done:
                break
                
    path = {"observation" : np.array(obs, dtype=np.float32), 
                "reward" : np.array(rewards, dtype=np.float32), 
                "action" : np.array(acs, dtype=np.float32)}
#    print(str(len(path["observation"])))
    return path

def sample_trajectories(old_model, new_model, sess, min_timesteps_per_batch, env):
    paths =[]
    timesteps_this_batch = 0
    while True:
        path = sample_trajectory(old_model, new_model, sess, env)
        paths.append(path)
        timesteps_this_batch += len(path['observation'])
        if timesteps_this_batch > min_timesteps_per_batch:
            break
    return paths
    
def sum_of_rewards(paths, gamma = .95): 
    re_n = [path["reward"] for path in paths]
    q_n = []
    for seq_of_rewards in re_n:
        for t in range(len(seq_of_rewards)):
            weighted_sequence = seq_of_rewards[t:] * np.array([gamma**i for i in range(len(seq_of_rewards[t:]))])
            q_n.append(np.sum(weighted_sequence))
    adv_n = q_n
    return adv_n
        
def standardize_advantage(adv_n):
    adv_n = (adv_n - np.mean(adv_n)) 
    adv_n = adv_n * (1.0/(np.std(adv_n)+.0000001))
    return adv_n

def get_log_prob(model, action_placeholder):
    logits = model
    
    log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels= action_placeholder, logits= logits)
    return log_prob

def loss_and_update_op(log_prob,adv_n):
    loss = -tf.reduce_mean(log_prob * adv_n) 
    optimizer = tf.train.AdamOptimizer(5e-3)
    update_op = optimizer.minimize(loss)
    return loss, update_op, optimizer
    

In [10]:
tf.reset_default_graph()

#define the board, models
board_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
adv_n_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)
new_model = TicTacToe_model(board_placeholder, 9, scope = "model-1", reuse=tf.AUTO_REUSE)
old_model = TicTacToe_model(board_placeholder, 9, scope = "model-2", reuse=tf.AUTO_REUSE)

#start a session
sess =tf.Session()

#start an environment
env = TicTacToe.TicTacToe()

number_updates_per_expert_update = 10
number_expert_updates = 4

for k in range(number_expert_updates):
    print("iteration number", k)
    #Define Loss functions *symbolically*
    log_prob = get_log_prob(new_model, action_placeholder)
    loss, update_op, optimizer = loss_and_update_op(log_prob, adv_n_placeholder)

    #Initialize all variables first time only, otherwise update only uptimizer vars
    if k ==0:
        sess.run(tf.global_variables_initializer())
    else:
        sess.run(tf.variables_initializer(optimizer.variables()))
    
    for i in range(number_updates_per_expert_update):
        #Produce some trajectories
#        print("sampling games...")
        paths = sample_trajectories(old_model, new_model, sess, 20, env = env)
#        print("games sampled.")
        adv_n = sum_of_rewards(paths) 
        boards = np.concatenate([path['observation'] for path in paths])
        actions = np.squeeze(np.concatenate([path["action"] for path in paths])).astype(int)
#        print("Last game played", paths)
        
        #Update the model
#        print("updating model...")
#        print(sess.run(new_model, feed_dict = {board_placeholder: [[[1,0,0],[0,0,0],[0,0,0]]]})[0,:3])
        test_vars =  tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="model" +str(k-1))
        for i, var in enumerate(test_vars):
            print(var.name, sess.run(var))

        sess.run(update_op, feed_dict = {adv_n_placeholder: adv_n, board_placeholder: boards , action_placeholder: actions})
#        print("model updated.")
    #if new_model does better...
    old_model, new_model = update_old_model_initialize_new_model(old_model, new_model, board_placeholder, sess, update_iter= k)
    

    
for path in paths:
    print(path['observation'])
    print(path['action'])
    print(path['reward'])
    print("____")
    
    


iteration number 0
model-1/hidden_layers/dense/kernel:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
model-1/hidden_layers/dense/bias:0 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
model-1/hidden_layers/dense/kernel/Adam:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
model-1/hidden_layers/dense/kernel/Adam_1:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0

model-1/hidden_layers/dense/kernel:0 [[-5.47140418e-03  1.85526395e-03 -4.46256017e-03  6.47306116e-03
  -9.61095793e-04  1.39755178e-02  9.44132451e-03 -8.58834479e-04
   6.72891224e-03]
 [-9.28554777e-03 -8.01073294e-03 -1.21091139e-02  8.69608019e-03
   1.00319348e-02  1.10113397e-02  8.01410060e-03 -6.01498876e-03
   1.30914217e-02]
 [-4.16159164e-05  1.40150841e-02 -5.99865336e-03  8.49302672e-03
  -1.27071962e-02  2.24988139e-03  5.97303454e-03  7.45351426e-04
   1.47125265e-02]
 [ 1.09986328e-02  1.46391476e-03  1.07301846e-02  8.77176039e-03
   1.76300760e-04 -1.02054356e-02 -9.91443917e-03  1.40524451e-02
  -1.10960063e-02]
 [ 1.20997615e-02  1.36749037e-02 -7.77016999e-03  1.05286939e-02
  -1.18105933e-02  3.84176220e-03 -1.31815281e-02  1.37659609e-02
   4.23909456e-04]
 [ 1.21891527e-02  5.40396664e-04 -7.32604088e-03  8.45101476e-03
  -8.88631213e-04  6.06816588e-03  1.18640838e-02 -4.89063701e-03
   4.46675066e-03]
 [-7.02895736e-03 -6.26048958e-03 -9.88066569e-03  8.3279

model-1/hidden_layers/dense/kernel:0 [[-0.00841047  0.00573785 -0.00430921  0.00835236 -0.00474965  0.02279524
   0.00977537  0.00032698  0.01149101]
 [-0.01134334 -0.01248701 -0.01723151  0.01083728  0.01193638  0.01815724
   0.01229895 -0.01214369  0.01976364]
 [ 0.00433967  0.01442198 -0.00797851  0.01562427 -0.01940079  0.00672356
   0.01083488 -0.0036924   0.02155218]
 [ 0.01285213  0.00722793  0.01422611  0.00809614 -0.0032447  -0.00913097
  -0.01424363  0.02056006 -0.01438104]
 [ 0.01397079  0.01573802 -0.00993151  0.01599177 -0.01698813  0.00857563
  -0.01451343  0.0195802   0.00179492]
 [ 0.01620169  0.00506065 -0.00799165  0.00997248 -0.00522822  0.008238
   0.01680236 -0.00826923  0.00677611]
 [-0.01125367 -0.00897882 -0.01251833  0.01229517  0.00916571  0.02246267
   0.00052921 -0.00055863  0.01618824]
 [ 0.00108828  0.00399806 -0.00583529  0.0189045  -0.0070812   0.0143847
   0.01843395  0.0085836   0.01994198]
 [ 0.01726153  0.00402232  0.00062225  0.0142158  -0.00638309 

model-1/hidden_layers/dense/kernel:0 [[-0.01449434  0.00774273 -0.0003078   0.00454915 -0.00746609  0.03559996
   0.01040993 -0.00323417  0.01824901]
 [-0.01037604 -0.01936069 -0.01832071  0.00575553  0.01601475  0.02952979
   0.01543283 -0.02131359  0.02930251]
 [ 0.00791305  0.01461758 -0.00776161  0.01628368 -0.02579355  0.01513458
   0.01893414 -0.0125506   0.02833985]
 [ 0.01254034  0.00872058  0.02299547  0.00246988 -0.00584071 -0.00054677
  -0.0183074   0.01545518 -0.00938309]
 [ 0.01794756  0.01650585 -0.0089158   0.01131087 -0.02403789  0.01804
  -0.01740073  0.01650054  0.0093409 ]
 [ 0.01777468  0.01010516 -0.00752157  0.01145754 -0.00888824  0.01041455
   0.01925378 -0.01161715  0.00876176]
 [-0.01767742 -0.01283241 -0.01417697  0.01433354  0.00928742  0.03484359
  -0.00138077 -0.00580054  0.02197457]
 [ 0.00140984  0.00631834 -0.00254858  0.01424395 -0.01171361  0.02415305
   0.02481701  0.00121621  0.02567151]
 [ 0.01842831  0.00722031  0.00385428  0.02340186 -0.01205347 

model0/hidden_layers/dense/bias/Adam_1:0 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
model0/hidden_layers/dense/kernel:0 [[-0.00499997 -0.00499997 -0.00499997 -0.00499999 -0.00499967 -0.00499997
   0.00499998  0.00499999  0.00499999]
 [ 0.00499999 -0.005      -0.005      -0.005      -0.00499998 -0.005
   0.005       0.005       0.005     ]
 [-0.00499997 -0.005       0.00499998 -0.00499999 -0.00499999 -0.005
   0.005       0.005       0.005     ]
 [ 0.00499998 -0.00499999  0.00499999 -0.005      -0.00499999 -0.005
   0.00499999  0.005       0.005     ]
 [ 0.00499999 -0.005      -0.00499999 -0.00499999  0.00499996 -0.005
   0.005       0.005       0.005     ]
 [ 0.0049999  -0.00499999 -0.00499999 -0.005      -0.00499999 -0.005
   0.005       0.005       0.005     ]
 [ 0.00499997  0.00499999  0.00499999 -0.005      -0.005      -0.00499999
   0.00499999  0.00499999  0.005     ]
 [-0.00499996 -0.00499996 -0.00499996 -0.00499996 -0.00499999 -0.00499996
   0.00499999  0.00499998  0.00499999]
 [ 0.          

model0/hidden_layers/dense/kernel:0 [[-0.01434235 -0.01555683  0.00083389 -0.01360935 -0.01223941  0.00420918
  -0.00389267  0.01639089  0.01508349]
 [ 0.00573453 -0.01507034 -0.01061971 -0.01187772 -0.00707358 -0.0069389
   0.00623375  0.01716399  0.01345627]
 [-0.00307023 -0.01488556  0.00213019 -0.01031865 -0.00429098 -0.01028116
   0.00891173  0.01348718  0.01044259]
 [ 0.01310042 -0.01639643  0.00981294 -0.01237359 -0.00799217 -0.00947355
   0.00095349  0.0158625   0.01055371]
 [ 0.00549577 -0.01354572 -0.00565221 -0.01595605  0.00050138 -0.00930755
   0.00728507  0.01697297  0.01408425]
 [ 0.00780721 -0.01529919 -0.01459424 -0.01150626 -0.00220571 -0.01481339
   0.0111314   0.01162653  0.00924043]
 [ 0.0110758  -0.00177571  0.01021483 -0.01234195 -0.01016844 -0.01139966
   0.00694406  0.01830677  0.00845085]
 [-0.00697058 -0.01647641 -0.00514959 -0.00238136 -0.00679656 -0.01274702
  -0.00090787  0.01216387  0.01896127]
 [-0.00035959 -0.01133556  0.00384893 -0.00750002 -0.00178795

model0/hidden_layers/dense/kernel:0 [[-0.01993646 -0.02187427  0.00065123 -0.0158313  -0.01370138  0.00974278
  -0.01196158  0.02429735  0.02263701]
 [ 0.01028527 -0.02220925 -0.01776938 -0.01761106 -0.00527575 -0.00487735
   0.00707737  0.02484749  0.01958379]
 [-0.00112194 -0.02101838 -0.00796163 -0.01514768 -0.00241058 -0.00688249
   0.01312322  0.01674295  0.01748755]
 [ 0.02509106 -0.02369624  0.00396074 -0.01964947 -0.008913   -0.00824549
  -0.00123797  0.02227666  0.01881148]
 [ 0.00958232 -0.01833248 -0.01205415 -0.02396311 -0.0064252  -0.0077745
   0.00667352  0.02509427  0.02143667]
 [ 0.01503412 -0.0263181  -0.01800402 -0.01391011  0.00459686 -0.02398896
   0.0149712   0.01071908  0.01576041]
 [ 0.01443161 -0.00722576  0.00654431 -0.01505212 -0.00886007 -0.01663755
   0.01083878  0.0253114   0.00926578]
 [-0.00112588 -0.02071018 -0.01333062 -0.00815445 -0.00460511 -0.00990024
  -0.00319744  0.01836335  0.02925917]
 [ 0.00587815 -0.01513528 -0.0015872  -0.01645172  0.00057334

model1/hidden_layers/dense/kernel:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
model1/hidden_layers/dense/bias:0 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
model1/hidden_layers/dense/kernel/Adam:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
model1/hidden_layers/dense/kernel/Adam_1:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
mode

model1/hidden_layers/dense/kernel:0 [[-1.20895607e-02 -9.63888410e-03  1.24301817e-02  1.18745966e-02
   6.79057371e-03 -1.20917596e-02  1.26659609e-02 -1.41467955e-02
   8.82482063e-03]
 [-1.38535369e-02  4.97494545e-03  1.04589611e-02  3.78236035e-03
  -1.42856026e-02 -1.08638182e-02  1.45119727e-02 -3.53983208e-03
   5.01558511e-03]
 [ 1.32968407e-02 -1.42580187e-02  1.44478865e-02  1.11380927e-02
  -1.30141405e-02  2.66881939e-03  1.12343971e-02 -6.22539874e-03
  -5.47817256e-03]
 [ 9.52426996e-03 -1.00107035e-02 -2.42208759e-03 -7.93107972e-03
   6.86704786e-03 -4.69005853e-03 -6.00522943e-03  3.68391210e-03
   5.24159521e-04]
 [-1.23467278e-02 -1.09315859e-02 -7.20640132e-03 -1.38384514e-02
  -1.33475866e-02  8.00150214e-04  2.14313902e-03  1.20508969e-02
   1.03157423e-02]
 [ 7.32860062e-05  6.92426274e-03  4.81058378e-05 -1.85578084e-03
   2.97102216e-03 -1.37762818e-02  1.34379305e-02  1.40282586e-02
  -1.86146121e-03]
 [ 1.42343864e-02 -1.33060152e-02 -3.62169696e-03 -6.09501

model1/hidden_layers/dense/kernel:0 [[-1.33596519e-02 -1.20933503e-02  1.94656700e-02  1.59020610e-02
   5.13051776e-03 -1.64407548e-02  1.88080519e-02 -2.22595595e-02
   8.50604568e-03]
 [-1.53835360e-02  8.03709403e-03  1.49354776e-02  1.45839574e-03
  -1.84728839e-02 -1.69475935e-02  2.19205152e-02 -7.74915982e-03
   5.86376106e-03]
 [ 1.97737161e-02 -2.25282162e-02  1.28455283e-02  1.86048578e-02
  -1.20434631e-02  1.47579238e-03  1.76341422e-02 -9.18746367e-03
  -1.15983915e-02]
 [ 7.09557021e-03 -1.15665812e-02  5.01317671e-03 -1.11662857e-02
   4.57753893e-04 -2.65761907e-03 -1.18244556e-03 -3.44548956e-03
  -3.88045004e-03]
 [-1.48732131e-02 -1.60309300e-02 -4.63989563e-05 -1.95581801e-02
  -2.04910915e-02 -1.55272195e-04  9.08631086e-03  1.23908855e-02
   1.34696523e-02]
 [ 6.02459535e-03  4.55798255e-03  4.89005912e-03  1.33252470e-04
   3.81005881e-03 -2.00088564e-02  1.91299915e-02  1.79485530e-02
  -8.14659800e-03]
 [ 2.17676386e-02 -1.69257503e-02 -7.13895820e-03  4.36176

model1/hidden_layers/dense/kernel:0 [[-0.01951454 -0.01392616  0.0309597   0.01434223  0.00734374 -0.02014875
   0.02098251 -0.02904138  0.01369341]
 [-0.00818394  0.01310873  0.01882621 -0.00526773 -0.01617994 -0.02739243
   0.02886615 -0.01270212  0.01576423]
 [ 0.02995753 -0.0307359   0.0100772   0.01686563 -0.00661124 -0.00645047
   0.02164228 -0.00895869 -0.0087837 ]
 [ 0.01367262 -0.0093328   0.01493101 -0.01921138  0.00390131 -0.01042888
   0.00536255 -0.00701523  0.00316867]
 [-0.01107077 -0.02375497  0.01163408 -0.02400701 -0.02498886 -0.00727455
   0.01265392  0.01413264  0.02077212]
 [ 0.01436373  0.00590478  0.01200934 -0.00484153  0.00950698 -0.02914386
   0.02049452  0.02514399 -0.01181795]
 [ 0.03153843 -0.01548462 -0.00823692  0.00215077 -0.00665181 -0.01970714
  -0.02163455  0.01791854 -0.0060542 ]
 [-0.00556074 -0.01800031  0.03538985  0.02418796 -0.00645401 -0.01149015
   0.01040213 -0.02878915  0.00636439]
 [-0.01327317 -0.02103688  0.0323669   0.02719942 -0.0047065

model2/hidden_layers/dense/kernel/Adam_1:0 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
model2/hidden_layers/dense/bias/Adam:0 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
model2/hidden_layers/dense/bias/Adam_1:0 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
model2/hidden_layers/dense/kernel:0 [[ 0.00499704 -0.00499998  0.00499999 -0.00499999 -0.00499999  0.00499999
  -0.00499986  0.00499998  0.00499997]
 [ 0.00499997  0.00499997  0.00499997 -0.00499999  0.00499997 -0.00499997
   0.00499997 -0.00499801  0.00499997]
 [ 0.00499998  0.00499944  0.00499998 -0.005      -0.00499998  0.00499998
   0.00499999  0.00499998  0.00499999]
 [-0.00499998 -0.00499999  0.005      -0.00499999 -0.00499999  0.005
  -0.00499998  0.00499998  0.00499993]
 [ 0.00499998 -0.00499998  0.00499998 -0.00499998 -0.00499999  0.

model2/hidden_layers/dense/kernel:0 [[-0.00041907 -0.00235641  0.00856549 -0.00671605 -0.0136887   0.01238677
   0.0012653   0.00680598  0.01415371]
 [ 0.00743426  0.01276803  0.01125422 -0.00960336  0.00801747 -0.00427995
   0.00098274 -0.01240482  0.00743417]
 [ 0.01416789 -0.00282342  0.00099197 -0.00748162 -0.00779037  0.01324126
   0.01087146  0.00377916  0.00787743]
 [ 0.00108416 -0.00988478  0.01309692 -0.01369655 -0.01291776  0.01045309
  -0.00866273  0.01491258  0.0009766 ]
 [ 0.01418971 -0.01452707  0.0043001  -0.00113057 -0.00670501  0.01347726
  -0.00046513 -0.01342095  0.00601682]
 [-0.00953075 -0.00964209  0.01397802 -0.00521798 -0.00357354  0.0149022
  -0.01306819  0.00510676 -0.00041169]
 [-0.00801529 -0.00861633  0.00887621 -0.00098246 -0.01371506  0.01420052
  -0.01263868 -0.0124652   0.00062426]
 [-0.00711652 -0.01233011  0.01310654 -0.00706429 -0.01098242  0.01124911
  -0.01273944  0.00711049 -0.00767757]
 [-0.00782333 -0.0073605   0.01479456 -0.01383591 -0.01004112

model2/hidden_layers/dense/kernel:0 [[-6.54462073e-03  7.12618418e-03  1.90178417e-02 -1.49932904e-02
  -2.43039411e-02  1.48471817e-02  4.59406991e-03  4.47532814e-03
   1.87821332e-02]
 [ 1.40844630e-02  2.08772831e-02  2.00796705e-02 -1.40231885e-02
   1.10696144e-02 -1.33247394e-02 -6.59138709e-03 -2.03542728e-02
   7.36302231e-03]
 [ 2.70746239e-02 -1.42671894e-02 -9.45318304e-03 -9.02738329e-03
  -1.15840882e-02  1.65780652e-02  1.79297868e-02  8.80729780e-03
   1.17383394e-02]
 [ 7.82552175e-03 -6.73473626e-03  2.01528501e-02 -2.63545830e-02
  -1.71372015e-02  1.20367398e-02 -1.44492406e-02  2.71922834e-02
   4.02301084e-05]
 [ 2.31353045e-02 -1.30388048e-02  4.73375432e-03 -2.90826941e-03
  -9.29472130e-03  1.79742873e-02 -3.81277059e-03 -1.64605156e-02
   2.93633016e-03]
 [-9.59223136e-03 -1.39134750e-03  2.68663447e-02 -1.28636863e-02
  -3.06279142e-03  1.60071366e-02 -1.99492406e-02  9.37641039e-03
   2.71840999e-03]
 [-4.19193646e-03 -8.56217463e-04  1.77314188e-02  7.49147

model2/hidden_layers/dense/kernel:0 [[-0.00998244  0.01334693  0.02477573 -0.02124174 -0.02902993  0.01738287
   0.00744761  0.0015406   0.01957531]
 [ 0.01794741  0.02458662  0.02667038 -0.0195113   0.01555348 -0.0200894
  -0.00852854 -0.02566896  0.01028668]
 [ 0.03541353 -0.01224713 -0.01650419 -0.01016164 -0.00924331  0.01217023
   0.02250813  0.00949363  0.0150954 ]
 [ 0.00872383 -0.0019917   0.02077531 -0.03407544 -0.01656753  0.01140156
  -0.01535987  0.03015216 -0.00069133]
 [ 0.02504766 -0.0088637   0.00334402 -0.00570688 -0.011103    0.02222493
  -0.00364544 -0.01763664 -0.00078837]
 [-0.01013089  0.00545554  0.02910868 -0.01743043  0.00018613  0.01213041
  -0.02039333  0.00765796  0.00342781]
 [-0.002942    0.00461711  0.02226458 -0.00386501 -0.02169289  0.02380429
  -0.02507933 -0.02158522 -0.0062136 ]
 [ 0.00035239 -0.00247631  0.0216504  -0.01911452 -0.00375326  0.01654453
  -0.023732   -0.00029681 -0.0069688 ]
 [-0.00041641  0.00914734  0.02809445 -0.03480198 -0.01750209

In [None]:
print(paths[0]['observation'])

In [21]:
[(i,a.name, a.shape) for i, a  in enumerate(tf.trainable_variables())]

[(0,
  'model-1/hidden_layers/dense/kernel:0',
  TensorShape([Dimension(9), Dimension(64)])),
 (1, 'model-1/hidden_layers/dense/bias:0', TensorShape([Dimension(64)])),
 (2,
  'model-1/hidden_layers/dense_1/kernel:0',
  TensorShape([Dimension(64), Dimension(9)])),
 (3, 'model-1/hidden_layers/dense_1/bias:0', TensorShape([Dimension(9)])),
 (4,
  'model-1/action_value/dense/kernel:0',
  TensorShape([Dimension(9), Dimension(9)])),
 (5, 'model-1/action_value/dense/bias:0', TensorShape([Dimension(9)])),
 (6,
  'model-2/hidden_layers/dense/kernel:0',
  TensorShape([Dimension(9), Dimension(64)])),
 (7, 'model-2/hidden_layers/dense/bias:0', TensorShape([Dimension(64)])),
 (8,
  'model-2/hidden_layers/dense_1/kernel:0',
  TensorShape([Dimension(64), Dimension(9)])),
 (9, 'model-2/hidden_layers/dense_1/bias:0', TensorShape([Dimension(9)])),
 (10,
  'model-2/action_value/dense/kernel:0',
  TensorShape([Dimension(9), Dimension(9)])),
 (11, 'model-2/action_value/dense/bias:0', TensorShape([Dimension

In [23]:
sess.run(tf.trainable_variables()[16].name)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [12]:
tf.reset_default_graph()

#define the board, models
board_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
adv_n_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)
new_model = TicTacToe_model(board_placeholder, 9, scope = "model-1", reuse=tf.AUTO_REUSE)
old_model = TicTacToe_model(board_placeholder, 9, scope = "model-2", reuse=tf.AUTO_REUSE)

#start a session
sess =tf.Session()

#start an environment
env = TicTacToe.TicTacToe()

number_updates_per_expert_update = 10
number_expert_updates = 4

#Define Loss functions *symbolically*
log_prob = get_log_prob(new_model, action_placeholder)
loss, update_op, optimizer = loss_and_update_op(log_prob, adv_n_placeholder)

#Initialize all variables first time only, otherwise update only uptimizer vars
sess.run(tf.global_variables_initializer())

for i in range(20):
        #Produce some trajectories
        paths = sample_trajectories(old_model, new_model, sess, 20, env = env)
        adv_n = sum_of_rewards(paths) 
        boards = np.concatenate([path['observation'] for path in paths])
        actions = np.squeeze(np.concatenate([path["action"] for path in paths])).astype(int)
        
        #Update the model
        loss_current = sess.run([update_op,loss], feed_dict = {adv_n_placeholder: adv_n, board_placeholder: boards , action_placeholder: actions})

        print(loss_current)

[None, 5.6600657]
[None, 0.8210618]
[None, -3.4809644]
[None, -9.115552]
[None, -2.9741018]
[None, -0.87519014]
[None, -3.5369027]
[None, -10.586003]
[None, 4.2861843]
[None, -6.081344]
[None, 1.3459963]
[None, -3.325778]
[None, 6.2557]
[None, -21.44413]
[None, -4.992508]
[None, -9.91697]
[None, -1.078487]
[None, -10.848509]
[None, -7.0113597]
[None, -2.5709517]


In [None]:
width_low_tower  = 24
height_low_tower = 77

n = width_low_tower
k = height_low_tower

withinLevels  = 0
betweenLevels = 0
for j in range(1,n):
    withinLevels += 2*j*(j-1)
    betweenLevels += 4*j*j
    
pyramid = withinLevels + betweenLevels

lower_tower_within = 2*k*n*(n-1)
lower_tower_between = (k-1)*n*n

lower_tower = lower_tower_within + lower_tower_between

    
print(pyramid+lower_tower)
print(withinLevels + lower_tower_within)