In [None]:
import uuid
import time
import pickle
import sys
import gym.spaces
import itertools
import numpy as np
import random
import tensorflow                as tf
import tensorflow.contrib.layers as layers
from collections import namedtuple
import TicTacToe

In [None]:
# Clear the default TensorFlow graph before (re)defining the helpers below.
# The functions in this cell only add ops to the graph when they are called.
tf.reset_default_graph()

def TicTacToe_model(placeholder, num_actions, scope, reuse=tf.AUTO_REUSE):
    """Build a two-layer dense network that scores TicTacToe moves.

    Args:
        placeholder: Batched board tensor. It is flattened to one feature row
            per batch entry and cast to float32 before the dense layers.
        num_actions: Width of the output layer (one score per action).
        scope: Name of the variable scope that owns the layer variables.
        reuse: Reuse flag forwarded to ``tf.variable_scope``.

    Returns:
        Tensor with ``num_actions`` un-activated scores per batch entry. In
        this file the scores are consumed as logits by ``sample_action`` and
        ``get_log_prob``.
    """
    placeholder = tf.contrib.layers.flatten(placeholder)
    with tf.variable_scope(scope, reuse=reuse):
        features = tf.cast(placeholder, tf.float32)
        # Kernels must NOT start at zero: with all-zero kernels the hidden
        # activations are 0 and the gradient flowing back through the output
        # kernel is 0, so every kernel gradient vanishes and only the output
        # bias could ever be trained. Glorot initialisation breaks that
        # symmetry; biases may safely stay at zero.
        hidden = tf.layers.dense(
            features,
            64,
            kernel_initializer=tf.glorot_uniform_initializer(),
            bias_initializer=tf.zeros_initializer(),
            activation=tf.nn.relu,
        )
        # Output width follows `num_actions` instead of a hard-coded 9.
        out = tf.layers.dense(
            hidden,
            num_actions,
            kernel_initializer=tf.glorot_uniform_initializer(),
            bias_initializer=tf.zeros_initializer(),
            activation=None,
        )
    return out

    
def update_old_model_initialize_new_model(old_model, new_model, board_placeholder, sess, update_iter):
    """Promote the trained model to "old" and create a new model that starts as its copy.

    The model passed as `new_model` is expected to live in the variable scope
    ``"model" + str(update_iter - 1)``; the freshly built one is placed in
    ``"model" + str(update_iter)``.

    Args:
        old_model: Ignored; kept for interface compatibility (it is replaced
            by `new_model`).
        new_model: Output tensor of the model that has just been trained.
        board_placeholder: Board input tensor the new network is built on.
        sess: Session used to initialise the new variables and run the copy.
        update_iter: Index used to derive the old and new scope names.

    Returns:
        Tuple ``(old_model, new_model)``: the previously trained output tensor
        and the output tensor of the newly created network.

    Raises:
        ValueError: If the two scopes do not hold the same number of
            trainable variables, so a one-to-one copy is impossible.
    """
    # The model trained so far becomes the opponent.
    old_model = new_model

    # `get_collection` filters by name *prefix*; the trailing "/" keeps
    # e.g. "model1" from also matching variables under "model10".
    old_scope = "model" + str(update_iter - 1) + "/"
    new_scope = "model" + str(update_iter) + "/"

    new_model = TicTacToe_model(board_placeholder, 9, scope="model" + str(update_iter))

    # Only the trainable (layer) variables are paired up for the copy, so
    # optimizer slot variables created under the old scope cannot get mixed in.
    old_model_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=old_scope)
    new_model_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=new_scope)
    if len(old_model_vars) != len(new_model_vars):
        raise ValueError(
            "cannot copy weights: %d variables under %r but %d under %r"
            % (len(old_model_vars), old_scope, len(new_model_vars), new_scope)
        )

    sess.run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=new_scope)))

    # Actually write the old weights into the new variables. Rebinding a
    # Python name to `sess.run(var_old)` would leave the graph untouched.
    copy_ops = [tf.assign(var_new, var_old) for var_old, var_new in zip(old_model_vars, new_model_vars)]
    sess.run(copy_ops)

    return old_model, new_model

def update_and_duplicate_old_model():
    """Copy values into every trainable variable whose name contains "pred/".

    For each such variable, the source is the first global variable whose op
    name occurs inside the target's name once "pred/" has been removed.
    Uses the module-level `sess` and `new_model`.
    """
    # NOTE(review): this binds a function-local name only; the module-level
    # `old_model` is not modified by it.
    old_model = new_model

    for target in tf.trainable_variables():
        if "pred/" not in target.name:
            continue
        stripped_name = target.name.replace("pred/", "")
        candidates = [source for source in tf.global_variables() if source.op.name in stripped_name]
        # An IndexError here means no global variable matched the stripped name.
        current_value = sess.run(candidates[0])
        sess.run(tf.assign(target, current_value))


def sample_action(model):
    """Return an op that samples from a categorical distribution whose logits are `model`."""
    return tf.distributions.Categorical(logits=model).sample()
    
    

def sample_trajectory(old_model, new_model, sess, env):
    """Play one game of `new_model` against `old_model`, recording the learner's side.

    A coin flip decides which model moves first. Only the boards seen, the
    actions taken and the rewards received by `new_model` are recorded; the
    moves of `old_model` merely advance the environment.

    Boards are fed through the module-level `board_placeholder`.

    Args:
        old_model: Logits tensor of the opponent policy.
        new_model: Logits tensor of the policy being trained.
        sess: Session used to evaluate the sampling ops.
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns a 4-tuple ``(observation, reward, done, info)``.

    Returns:
        dict with float32 arrays under "observation", "reward" and "action",
        one entry per move made by `new_model`.
    """
    # NOTE: sample_action builds new graph ops, so every call to this
    # function adds two sampling ops to the default graph.
    new_sample = sample_action(new_model)
    old_sample = sample_action(old_model)
    obs, acs, rewards = [], [], []

    # Even draw: the learner opens the game; odd draw: the opponent does.
    seed = np.random.randint(0, 2)
    opponent_to_move = seed % 2 != 0

    ob = env.reset()
    done = False
    # `done` is checked after EVERY move. Previously it was ignored after the
    # opponent's move, so the loop kept stepping an already finished game.
    while not done:
        if opponent_to_move:
            action = sess.run(old_sample, feed_dict={board_placeholder: [ob]})
            # NOTE(review): the opponent's reward is discarded, as before.
            # If the environment signals a loss for the learner here, it is
            # not credited to the learner's last action - confirm the
            # environment's reward convention before changing this.
            ob, _, done, _ = env.step(action[0])
        else:
            obs.append(ob)
            action = sess.run(new_sample, feed_dict={board_placeholder: [ob]})
            acs.append(action)
            ob, rew, done, _ = env.step(action[0])
            rewards.append(rew)
        opponent_to_move = not opponent_to_move

    path = {"observation" : np.array(obs, dtype=np.float32),
            "reward" : np.array(rewards, dtype=np.float32),
            "action" : np.array(acs, dtype=np.float32)}
    return path

def sample_trajectories(old_model, new_model, sess, min_timesteps_per_batch, env):
    """Collect whole trajectories until the batch holds more than the requested timesteps.

    At least one trajectory is always sampled. Sampling stops as soon as the
    number of recorded observations strictly exceeds `min_timesteps_per_batch`.

    Returns:
        List of path dicts as produced by ``sample_trajectory``.
    """
    batch = [sample_trajectory(old_model, new_model, sess, env)]
    collected_steps = len(batch[0]['observation'])
    while not (collected_steps > min_timesteps_per_batch):
        episode = sample_trajectory(old_model, new_model, sess, env)
        batch.append(episode)
        collected_steps += len(episode['observation'])
    return batch
    
def sum_of_rewards(paths, gamma = .95):
    """Compute the discounted reward-to-go for every timestep of every path.

    For a path with rewards ``r_0 .. r_{T-1}`` the value at step ``t`` is
    ``sum_i gamma**i * r_{t+i}``. Values of all paths are concatenated in
    path order.

    Args:
        paths: Iterable of dicts, each holding a reward sequence under "reward".
        gamma: Discount factor.

    Returns:
        Flat Python list with one value per timestep.
    """
    all_rewards = [path["reward"] for path in paths]
    q_n = []
    for rewards in all_rewards:
        horizon = len(rewards)
        # Discount table for the longest tail; shorter tails reuse a prefix.
        discounts = np.array([gamma**i for i in range(horizon)])
        q_n.extend(
            np.sum(rewards[t:] * discounts[:horizon - t])
            for t in range(horizon)
        )
    return q_n
        
def standardize_advantage(adv_n):
    """Shift `adv_n` to zero mean and scale it to (approximately) unit standard deviation.

    A small constant is added to the standard deviation so that a constant
    input does not cause a division by zero.
    """
    centered = adv_n - np.mean(adv_n)
    scale = 1.0 / (np.std(centered) + .0000001)
    return centered * scale

def get_log_prob(model, action_placeholder):
    """Return the log-probability of each given action under the softmax of `model`'s logits.

    Computed as the negated sparse softmax cross-entropy between the action
    labels and the logits.
    """
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=action_placeholder,
        logits=model,
    )
    return -cross_entropy

def loss_and_update_op(log_prob, adv_n, learning_rate=5e-3):
    """Build the policy-gradient loss and an Adam training op for it.

    Args:
        log_prob: Tensor of log-probabilities of the taken actions.
        adv_n: Tensor of per-step weights (advantages / returns) multiplied
            element-wise with `log_prob`.
        learning_rate: Step size handed to the Adam optimizer. Defaults to the
            previously hard-coded value of 5e-3.

    Returns:
        Tuple ``(loss, update_op, optimizer)``: the scalar loss tensor, the op
        that applies one optimisation step, and the optimizer instance (its
        variables need initialising before `update_op` is run).
    """
    # Negated because the optimizer minimises, while the weighted
    # log-probability is the quantity to be increased.
    loss = -tf.reduce_mean(log_prob * adv_n)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_op = optimizer.minimize(loss)
    return loss, update_op, optimizer
    

In [None]:
# Self-play training: the learner ("new" model) is trained against a fixed
# opponent ("old" model); after each round of updates the learner becomes the
# opponent and a new learner is created.
tf.reset_default_graph()

# Define the board/advantage/action inputs and the two initial models.
# The scope names matter: update_old_model_initialize_new_model looks the
# trained model up under "model" + str(update_iter - 1), which for the first
# call (update_iter = 0) is exactly "model-1".
board_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
adv_n_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)
new_model = TicTacToe_model(board_placeholder, 9, scope = "model-1", reuse=tf.AUTO_REUSE)
old_model = TicTacToe_model(board_placeholder, 9, scope = "model-2", reuse=tf.AUTO_REUSE)

# Start a session
sess =tf.Session()

# Start an environment
env = TicTacToe.TicTacToe()

# Gradient steps per opponent refresh, and number of opponent refreshes.
number_updates_per_expert_update = 10
number_expert_updates = 4

for k in range(number_expert_updates):
    print("iteration number", k)
    # Define the loss *symbolically*. This is rebuilt every round because
    # `new_model` refers to a different network after each refresh, so a new
    # optimizer (with its own variables) is created each time.
    log_prob = get_log_prob(new_model, action_placeholder)
    loss, update_op, optimizer = loss_and_update_op(log_prob, adv_n_placeholder)

    # Initialize all variables the first time only; afterwards initialize only
    # the new optimizer's variables so trained model weights are not reset.
    if k ==0:
        sess.run(tf.global_variables_initializer())
    else:
        sess.run(tf.variables_initializer(optimizer.variables()))
    
    for i in range(number_updates_per_expert_update):
        # Produce some trajectories (sampling stops once more than 20 learner
        # timesteps have been collected).
#        print("sampling games...")
        paths = sample_trajectories(old_model, new_model, sess, 20, env = env)
#        print("games sampled.")
        adv_n = sum_of_rewards(paths) 
        # Flatten the per-path arrays into one batch; actions are stored as
        # float arrays with a trailing axis, hence the squeeze and int cast.
        boards = np.concatenate([path['observation'] for path in paths])
        actions = np.squeeze(np.concatenate([path["action"] for path in paths])).astype(int)
#        print("Last game played", paths)
        
        # Update the model
#        print("updating model...")
#        print(sess.run(new_model, feed_dict = {board_placeholder: [[[1,0,0],[0,0,0],[0,0,0]]]})[0,:3])
        # Debug output: largest entry of each trainable variable under scope
        # "model" + str(k - 1), i.e. the scope of the model trained this round.
        test_vars =  tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="model" +str(k-1))
        for var in test_vars:
            print(var.name, np.max(sess.run(var)))
            
#        for i, var in enumerate(test_vars):
#            print(var.name, sess.run(var))

        sess.run(update_op, feed_dict = {adv_n_placeholder: adv_n, board_placeholder: boards , action_placeholder: actions})
#        print("model updated.")
    # TODO: only promote new_model if it does better than old_model.
    # Currently the promotion is unconditional.
    old_model, new_model = update_old_model_initialize_new_model(old_model, new_model, board_placeholder, sess, update_iter= k)
    

    
# Dump the last sampled batch of games for inspection.
for path in paths:
    print(path['observation'])
    print(path['action'])
    print(path['reward'])
    print("____")
    
    


In [None]:
# Show the observations recorded for the first trajectory in `paths`.
print(paths[0]['observation'])

In [None]:
# Pair every trainable variable's name with its current value in the session.
[(variable.name, sess.run(variable)) for variable in tf.trainable_variables()]



In [None]:
# Fetch by tensor name: the name string of the trainable variable at index 16
# is passed to sess.run as the fetch.
# NOTE(review): the index 16 is a hard-coded position in the trainable-variables
# list; which variable it refers to depends on which cells were run before.
sess.run(tf.trainable_variables()[16].name)

In [None]:
# Simplified training run: the learner is trained for a fixed number of steps
# against an opponent that is never refreshed in this cell.
tf.reset_default_graph()

# Define the board/advantage/action inputs and the two models.
board_placeholder = tf.placeholder(shape = [None, 3,3], dtype = tf.int32)
adv_n_placeholder = tf.placeholder(shape = [None], dtype = tf.float32)
action_placeholder = tf.placeholder(shape = [None], dtype = tf.int32)
new_model = TicTacToe_model(board_placeholder, 9, scope = "model-1", reuse=tf.AUTO_REUSE)
old_model = TicTacToe_model(board_placeholder, 9, scope = "model-2", reuse=tf.AUTO_REUSE)

# Start a session
sess =tf.Session()

# Start an environment
env = TicTacToe.TicTacToe()

# NOTE: these two settings are defined but not used anywhere in this cell.
number_updates_per_expert_update = 10
number_expert_updates = 4

# Define the loss *symbolically* (built once, since new_model never changes here).
log_prob = get_log_prob(new_model, action_placeholder)
loss, update_op, optimizer = loss_and_update_op(log_prob, adv_n_placeholder)

# Initialize all variables (model weights and optimizer variables).
sess.run(tf.global_variables_initializer())

for i in range(20):
        # Produce some trajectories (sampling stops once more than 20 learner
        # timesteps have been collected).
        paths = sample_trajectories(old_model, new_model, sess, 20, env = env)
        adv_n = sum_of_rewards(paths) 
        # Flatten the per-path arrays into one batch for the update.
        boards = np.concatenate([path['observation'] for path in paths])
        actions = np.squeeze(np.concatenate([path["action"] for path in paths])).astype(int)
        
        # Update the model. Both the train op and the loss are fetched, so
        # `loss_current` is a two-element list: [result of update_op, loss value].
        loss_current = sess.run([update_op,loss], feed_dict = {adv_n_placeholder: adv_n, board_placeholder: boards , action_placeholder: actions})

        print(loss_current)

In [None]:
# Counting exercise for a structure made of a lower tower with a pyramid part.
# NOTE(review): the meaning of the individual terms is inferred from the
# variable names only - confirm against the problem statement.
width_low_tower, height_low_tower = 24, 77

n, k = width_low_tower, height_low_tower

# Pyramid part: accumulate the per-level terms for level sizes 1 .. n-1.
withinLevels = sum(2 * j * (j - 1) for j in range(1, n))
betweenLevels = sum(4 * j * j for j in range(1, n))
pyramid = withinLevels + betweenLevels

# Lower tower part: k levels of size n, with k-1 gaps between them.
lower_tower_within = 2 * k * n * (n - 1)
lower_tower_between = (k - 1) * n * n
lower_tower = lower_tower_within + lower_tower_between

# Grand total, followed by the within-level contributions alone.
print(pyramid + lower_tower)
print(withinLevels + lower_tower_within)