In [1]:
%load_ext autoreload
%autoreload 2

import os, shutil, random
import numpy as np
import gym
import tensorflow as tf
from tensorflow.python import debug as tf_debug

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
env = gym.make('CartPole-v1')
print(env.observation_space)
print(env.action_space)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Box(4,)
Discrete(2)


In [6]:
from utils import show_graph

def dense_layers(inputs, units_list, trainable):
    net = inputs
    for n_units in units_list:
        net = tf.layers.dense(net, n_units, trainable=trainable)
        net = tf.nn.relu(net)
    return net

def make_critic(observation: tf.Tensor, action: tf.Tensor,
                observation_units_list=[64], action_units_list=[], combined_units_list=[64],
                trainable=True):
    obs_net = dense_layers(observation, units_list=observation_units_list, trainable=trainable)
    act_net = dense_layers(action, units_list=action_units_list, trainable=trainable)
    
    net = tf.concat([obs_net, act_net], axis=1)
    net = dense_layers(net, units_list=combined_units_list, trainable=trainable)
    
    output = tf.layers.dense(net, units=1, trainable=trainable)
    return tf.identity(output, name='q')

def copy_all_vars(scope_origin: str, scope_target: str):
    origin_vars = tf.global_variables(scope=scope_origin)
    target_vars = tf.global_variables(scope=scope_target)
    var_pair_list = zip(origin_vars, target_vars)
    assign_ops = [tf.assign(t, o, name='assign_' + str(i)) for i, (o, t) in enumerate(var_pair_list)]
    return tf.group(assign_ops)

def update_target_vars(scope_origin: str, scope_target: str, tau: float = 0.001):
    origin_vars = tf.global_variables(scope=scope_origin)
    target_vars = tf.global_variables(scope=scope_target)
    var_pair_list = zip(origin_vars, target_vars)
    assign_ops = [tf.assign(
        t, tf.add(tf.multiply(tau, o, name='new_' + str(i)),
                  tf.multiply((1 - tau), t, name='old_' + str(i)), name='add_' + str(i)),
        name='assign_' + str(i)) for i, (o, t) in enumerate(var_pair_list)]
    return tf.group(assign_ops)

observation_size = env.observation_space.shape[0]
action_size = env.action_space.n

gamma = 0.99
tau = 0.01
learning_rate = 0.01
random_action_prob = 0.1
replay_buffer_size = 10**5

hidden_units = [64]

tf_graph = tf.Graph()
with tf_graph.as_default():
    obs_ph = tf.placeholder(dtype=tf.float32, shape=[None, observation_size], name='observation')
    action_ph = tf.placeholder(dtype=tf.int32, shape=[None], name='action')
    obs_next_ph = tf.placeholder(dtype=tf.float32, shape=[None, observation_size], name='observation_next')
    reward_ph = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')

    with tf.variable_scope('live'):
        net = dense_layers(obs_ph, units_list=hidden_units, trainable=True)
        action_values = tf.layers.dense(net, units=action_size, activation=None, name='action_values')
        print(action_values.shape)
        state_value = tf.reduce_max(input_tensor=action_values, axis=-1, name='state_value')
        q = tf.reduce_sum(action_values * tf.one_hot(action_ph, action_size), axis=1)
        print(tf.one_hot(action_ph, action_size).shape)
        print(q.shape)

    with tf.variable_scope('target'):
        target_net = dense_layers(obs_next_ph, units_list=hidden_units, trainable=False)
        target_action_values = tf.layers.dense(target_net, units=action_size, activation=None, name='action_values')
        target_state_value = tf.reduce_max(input_tensor=target_action_values, axis=-1, name='state_value')
        print(target_state_value.shape)
        
    with tf.variable_scope('update_target'):
        init_target = copy_all_vars(scope_origin='live', scope_target='target')
        update_target = update_target_vars(scope_origin='live', scope_target='target', tau=tau)
            
    with tf.variable_scope('loss'):
        y = reward_ph + gamma * target_state_value
        print(y.shape)
        loss = tf.losses.mean_squared_error(predictions=q, labels=y)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    show_graph(tf_graph.as_graph_def())

(?, 2)
(?, 2)
(?,)
(?,)
(?,)


In [11]:
n_episode = 100
batch_size = 32

with tf_graph.as_default():
    with tf.Session(graph=tf_graph).as_default():
        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        sess.run(init_target)
        
        replay_buffer = []
        replay_buffer_cur = 0
        
        for episode_i in range(n_episode):
            obs_0 = env.reset()
            done = False
            step_i = 0
            while not done:
                if random.random() <= random_action_prob:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(action_values.eval(feed_dict={obs_ph: obs_0.reshape(1, -1)})[0])

                obs_1, reward, done, _info = env.step(action)
                step_i += 1
                
                experience = {
                    'obs_0': obs_0,
                    'action': action,
                    'reward': reward,
                    'obs_1': obs_1,
                    'done': done
                }
                if len(replay_buffer) >= batch_size:
                    replay_buffer[replay_buffer_cur] = experience
                    replay_buffer_cur = (replay_buffer_cur + 1) % len(replay_buffer)
                    
                    # train
                    sample = random.sample(replay_buffer, batch_size)
                    
                    t_obs_0 = np.array([x['obs_0'] for x in sample])
                    t_action = np.array([x['action'] for x in sample])
                    t_reward = np.array([x['reward'] for x in sample])
                    t_obs_1 = np.array([x['obs_1'] for x in sample])
                    t_done = np.array([[x['done']] for x in sample])
                    
                    loss_val, _ = sess.run([loss, optimizer], feed_dict={
                        obs_ph: t_obs_0,
                        action_ph: t_action,
                        reward_ph: t_reward,
                        obs_next_ph: t_obs_1
                    })
                    
                    print('episode {}, step {}, loss {}'.format(episode_i, step_i, loss_val))
                else:
                    replay_buffer.append(experience)

episode 3, step 4, loss 1.468094825744629
episode 3, step 5, loss 1.143183708190918
episode 3, step 6, loss 0.8575460314750671
episode 3, step 7, loss 0.6122267246246338
episode 3, step 8, loss 0.42828530073165894
episode 3, step 9, loss 0.2693699300289154
episode 4, step 1, loss 0.15917091071605682
episode 4, step 2, loss 0.09221531450748444
episode 4, step 3, loss 0.05802810937166214
episode 4, step 4, loss 0.04487692937254906
episode 4, step 5, loss 0.06384976208209991
episode 4, step 6, loss 0.09990427643060684
episode 4, step 7, loss 0.13563016057014465
episode 4, step 8, loss 0.15879923105239868
episode 4, step 9, loss 0.16365136206150055
episode 4, step 10, loss 0.14963479340076447
episode 5, step 1, loss 0.10998297482728958
episode 5, step 2, loss 0.07503823935985565
episode 5, step 3, loss 0.04499347507953644
episode 5, step 4, loss 0.02280079945921898
episode 5, step 5, loss 0.023054219782352448
episode 5, step 6, loss 0.030216790735721588
episode 5, step 7, loss 0.0394435301

episode 28, step 1, loss 0.00024963179021142423
episode 28, step 2, loss 0.00022691965568810701
episode 28, step 3, loss 0.00021004470181651413
episode 28, step 4, loss 0.00011124712909804657
episode 28, step 5, loss 0.00012095327110728249
episode 28, step 6, loss 0.0001343142648693174
episode 28, step 7, loss 0.00014328252291306853
episode 28, step 8, loss 0.00016335287364199758
episode 28, step 9, loss 0.0001740228763082996
episode 28, step 10, loss 0.00019244402938056737
episode 28, step 11, loss 0.000220976144191809
episode 29, step 1, loss 0.00021377204393502325
episode 29, step 2, loss 0.0002153970708604902
episode 29, step 3, loss 0.0002233768900623545
episode 29, step 4, loss 0.0002184841432608664
episode 29, step 5, loss 0.00021288519201334566
episode 29, step 6, loss 0.00020218425197526813
episode 29, step 7, loss 0.00018786387227009982
episode 29, step 8, loss 0.00017784690135158598
episode 29, step 9, loss 0.00016816082643344998
episode 29, step 10, loss 0.00016051827697083

episode 53, step 4, loss 3.250251756981015e-05
episode 53, step 5, loss 3.05527173622977e-05
episode 53, step 6, loss 2.9874225219828077e-05
episode 53, step 7, loss 2.9824808734701946e-05
episode 53, step 8, loss 2.9214170353952795e-05
episode 53, step 9, loss 2.7745601983042434e-05
episode 53, step 10, loss 2.7655634767143056e-05
episode 54, step 1, loss 3.575101800379343e-05
episode 54, step 2, loss 4.3085721699753776e-05
episode 54, step 3, loss 4.151812390773557e-05
episode 54, step 4, loss 3.722785913851112e-05
episode 54, step 5, loss 3.058060246985406e-05
episode 54, step 6, loss 2.606742418720387e-05
episode 54, step 7, loss 2.8869024390587583e-05
episode 54, step 8, loss 4.214273940306157e-05
episode 54, step 9, loss 5.687164230039343e-05
episode 55, step 1, loss 5.638487346004695e-05
episode 55, step 2, loss 5.099574264022522e-05
episode 55, step 3, loss 4.788933802046813e-05
episode 55, step 4, loss 5.5520016758237034e-05
episode 55, step 5, loss 5.835077172378078e-05
episo

episode 77, step 19, loss 0.0001031174324452877
episode 77, step 20, loss 0.00010206007573287934
episode 77, step 21, loss 9.428898192709312e-05
episode 77, step 22, loss 9.354752546641976e-05
episode 77, step 23, loss 9.30861133383587e-05
episode 77, step 24, loss 9.11674287635833e-05
episode 77, step 25, loss 8.927732415031642e-05
episode 78, step 1, loss 8.871994214132428e-05
episode 78, step 2, loss 8.893287304090336e-05
episode 78, step 3, loss 8.562956645619124e-05
episode 78, step 4, loss 8.167512714862823e-05
episode 78, step 5, loss 8.318686741404235e-05
episode 78, step 6, loss 8.107088069664314e-05
episode 78, step 7, loss 7.240510603878647e-05
episode 78, step 8, loss 7.289194036275148e-05
episode 78, step 9, loss 6.164560181787238e-05
episode 78, step 10, loss 6.297065556282178e-05
episode 79, step 1, loss 5.341229552868754e-05
episode 79, step 2, loss 5.268822133075446e-05
episode 79, step 3, loss 5.132318619871512e-05
episode 79, step 4, loss 4.9815258535090834e-05
episo

In [5]:
tf.Session().run(tf.one_hot([1,2], 3))

array([[0., 1., 0.],
       [0., 0., 1.]], dtype=float32)