In [1]:
from memory import MemoryBuffer
from env import make_cart_pole, make_lunar_lander_c
from policies import VPGTrainer
import tensorflow as tf
from tensorflow.layers import dense
from utils import reshape_train_var
import numpy as np

In [2]:
class VAPGTrainer():
    def __init__(self, in_op, out_op, v_out_op, act_type='discrete', sess=None):
        """
        Create a wrapper for RL networks for easy training using advantage based
        Vanilla Policy Gradient.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture
            v_out_op (tf.Variable): Value output of the architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        """

        if not sess:
            self.renew_sess()
        
        self.in_op = in_op
        self.out_op = out_op
        self.v_out_op = v_out_op
        
        if act_type in ('discrete', 'd'):
            self.train = self._create_discrete_trainer()
            self.act_type = 'discrete'
        elif act_type in ('continuous', 'c'):
            self.train = self._create_continuous_trainer()
            self.act_type = 'continuous'
        else:
            raise TypeError('act_type must be \'discrete\' or \'continuous\'')
        
    def renew_sess(self):
        """
        Starts a new internal Tensorflow session
        """
        self.sess = tf.Session()
        
    def end_sess(self):
        """
        Ends the internal Tensorflow session if it exists
        """
        if self.sess:
            self.sess.close()
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for advantage based vanilla policy training with a discrete action space
        """
        self.act_holders = tf.placeholder(tf.int64, shape=[None])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])
        
        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float64)
        self.log_probs = tf.log(self.out_op)
        
        self.advantages = self.reward_holders - self.v_out_op
        
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.log_probs, axis=1)
        self.actor_loss = -tf.reduce_mean(self.resp_acts * self.advantages)
        
        self.optimizer = optimizer
        self.actor_update = self.optimizer.minimize(self.actor_loss)
        
        with tf.control_dependencies([self.actor_update]):
            self.value_loss = tf.reduce_mean(tf.square(self.v_out_op - self.reward_holders))
            self.value_update = self.optimizer.minimize(self.value_loss)
        
        update_func = lambda train_data: self.sess.run([self.actor_update, self.value_update], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                            self.act_holders: reshape_train_var(train_data[:, 1]),
                                                            self.reward_holders: train_data[:, 2]})
        
        self.sess.run(tf.global_variables_initializer())
        
        return update_func
        
    def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for advantage based vanilla policy training with a continuous action space
        """
        self.act_holders = tf.placeholder(tf.float64, shape=[None, self.out_op.shape[1].value])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])
        
        self.advantages = self.reward_holders - self.v_out_op
        
        self.log_probs = tf.log(self.out_op)
        
#         self.act_means = tf.reduce_mean(self.log_probs, axis=1)
#         print()
#         self.actor_loss = -tf.reduce_mean(self.act_means * self.advantages)
        self.actor_loss = -tf.reduce_mean(tf.reduce_sum(self.log_probs * self.advantages, axis=1))
        self.optimizer = optimizer
        
        self.actor_update = self.optimizer.minimize(self.actor_loss)
        
        with tf.control_dependencies([self.actor_update]):
            self.value_loss = tf.reduce_mean(tf.square(self.v_out_op - self.reward_holders))
            self.value_update = self.optimizer.minimize(self.value_loss)
        
        update_func = lambda train_data: self.sess.run([self.actor_update, self.value_update], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                            self.act_holders: reshape_train_var(train_data[:, 1]), # vstack
                                                            self.reward_holders: train_data[:, 2]})
        
        self.sess.run(tf.global_variables_initializer())
        
        return update_func
    
    def _gen_discrete_act(self, obs):
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: [obs]})
        act = np.random.choice(list(range(len(act_probs[0]))), p=act_probs[0])
        
        
        return act
    
    def _gen_continuous_act(self, obs):
        act_vect = self.sess.run(self.out_op, feed_dict={self.in_op: [obs]})[0]
        
        # TODO: Add gaussian noise to action vector
        act_vect = [a + np.random.normal(0., 0.1) for a in act_vect]
        
        return np.array(act_vect)
        
    def gen_act(self, obs):
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)
        
    def train(self, obs, rewards, acts):
        raise RuntimeError('The train method was not properly created')

In [3]:
# Lunar Lander continuous

env = make_lunar_lander_c()

obs = tf.placeholder(tf.float64, shape=[None, 8])
dense1 = dense(obs, 32, activation=tf.tanh)
dense2 = dense(dense1, 32, activation=tf.tanh)
act_probs = dense(dense2, 4)
softmax_probs = tf.nn.softmax(act_probs)

v_dense1 = dense(obs, 32, activation=tf.tanh)
v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
value = dense(v_dense2, 1)

network = VAPGTrainer(obs, softmax_probs, value, act_type='c')

  result = entry_point.load(False)


In [4]:
# # Cart pole network

# env = make_cart_pole()

# obs = tf.placeholder(tf.float64, shape=[None, 4])
# dense1 = dense(obs, 32, activation=tf.tanh)
# dense2 = dense(dense1, 32, activation=tf.tanh)
# act_probs = dense(dense2, 2)
# softmax_probs = tf.nn.softmax(act_probs)

# v_dense1 = dense(obs, 32, activation=tf.tanh)
# v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
# value = dense(v_dense2, 1)

# network = VAPGTrainer(obs, softmax_probs, value, act_type='discrete')

In [5]:
n_episodes = 1000000
max_steps = 200
update_freq = 64
print_freq = 1

mb = MemoryBuffer()

In [6]:
all_rewards = []

for episode in range(n_episodes):
    ep_reward = 0
    
    mb.start_rollout()
    obs = env.reset()
    for step in range(max_steps):
        act = network.gen_act(obs)
        
        obs_next, rew, d, _ = env.step(act)
        ep_reward += rew
        
        mb.record(obs, act, rew)
        obs = obs_next
        
        if d:
            break
            
    all_rewards.append(ep_reward)
            
    if episode % update_freq == 0 and episode != 0:
        network.train(mb.to_data())
        
        if episode % (update_freq * print_freq) == 0:
            print(f'Update #{episode // update_freq}, Reward: {np.mean(all_rewards[-update_freq*print_freq:])}')
        

InvalidArgumentError: Incompatible shapes: [8333,4] vs. [8333,8333]
	 [[node mul (defined at <ipython-input-2-e93be3cdb682>:90)  = Mul[T=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Log, sub)]]

Caused by op 'mul', defined at:
  File "/home/ejmejm/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ejmejm/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/home/ejmejm/anaconda3/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/home/ejmejm/anaconda3/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/home/ejmejm/anaconda3/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 370, in dispatch_queue
    yield self.process_one()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 346, in wrapper
    runner = Runner(result, future, yielded)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1080, in __init__
    self.run()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-f2de320a981d>", line 15, in <module>
    network = VAPGTrainer(obs, softmax_probs, value, act_type='c')
  File "<ipython-input-2-e93be3cdb682>", line 27, in __init__
    self.train = self._create_continuous_trainer()
  File "<ipython-input-2-e93be3cdb682>", line 90, in _create_continuous_trainer
    self.actor_loss = -tf.reduce_mean(tf.reduce_sum(self.log_probs * self.advantages, axis=1))
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 866, in binary_op_wrapper
    return func(x, y, name=name)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1131, in _mul_dispatch
    return gen_math_ops.mul(x, y, name=name)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 5042, in mul
    "Mul", x=x, y=y, name=name)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Incompatible shapes: [8333,4] vs. [8333,8333]
	 [[node mul (defined at <ipython-input-2-e93be3cdb682>:90)  = Mul[T=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Log, sub)]]
