In [1]:
import tensorflow as tf
from tensorflow.layers import dense, flatten
import gym
import numpy as np
from rewards import discount_rewards
from memory import MemoryBuffer

In [2]:
class RLNetwork():
    def __init__(self, in_op, out_op, act_type='discrete', sess=None):
        """
        Create a wrapper for RL networks for easy training.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        """

        if not sess:
            self.renew_sess()
        
        self.in_op = in_op
        self.out_op = out_op
        
        if act_type in ('discrete', 'd'):
            self.train = self._create_discrete_trainer()
            self.act_type = 'discrete'
        elif act_type == ('continuous', 'c'):
            self.train = self._create_continuous_trainer()
            self.act_type = 'continuous'
        else:
            raise('act_type must be \'discrete\' or \'continuous\'')
        
    def renew_sess(self):
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
    def end_sess(self):
        self.sess.close()
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        self.act_holders = tf.placeholder(tf.int64, shape=[None])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])
        
        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float64)
        self.log_probs = tf.log(self.out_op)
        
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.log_probs, axis=1)
        self.loss = -tf.reduce_mean(self.resp_acts * self.reward_holders)
        
        self.optimizer = optimizer
        self.update = self.optimizer.minimize(self.loss)
        
        update_func = lambda train_data: self.sess.run(self.update, 
                                                       feed_dict={self.in_op: np.vstack(train_data[:, 0]),
                                                            self.act_holders: train_data[:, 1],
                                                            self.reward_holders: train_data[:, 2]})
        
        return update_func
        
    def _create_continuous_trainer(self):
        raise('Unimplemented')
        
    def _gen_discrete_act(self, obs):
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: obs})
        act = np.random.choice(list(range(len(act_probs)+1)), p=act_probs[0])
        
        return act
    
    def _gen_continuous_act(self, obs):
        raise('Unimplemented')
        
    def gen_act(self, obs):
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)
        
    def train(self, obs, rewards, acts):
        raise('The train method was not properly created')

In [3]:
def make_cart_pole():
    return gym.make("CartPole-v1")

In [4]:
obs = tf.placeholder(tf.float64, shape=[None, 4])
dense1 = dense(obs, 128, activation=tf.tanh)
dense2 = dense(dense1, 128, activation=tf.tanh)
act_probs = dense(dense2, 2)
softmax_probs = tf.nn.softmax(act_probs)

network = RLNetwork(obs, softmax_probs)

In [5]:
n_episodes = 10000
max_steps = 200
update_freq = 200 # In episodes

mb = MemoryBuffer(update_freq)
env = make_cart_pole()

  result = entry_point.load(False)


In [6]:
import time
total_steps = 0
total_episodes = 0
all_rewards = []

for episode in range(n_episodes):
    obs = env.reset()
    
    episode_reward = 0
    mb.start_rollout()
    for step in range(max_steps):
        act = network.gen_act([obs])

        # env.render()
        # time.sleep(0.02)
        obs_next, rew, d, _ = env.step(act)
        episode_reward += rew
        
        mb.record(obs, act, rew)
        obs = obs_next
        
        total_steps += 1
        if d:
            break
    # print(episode_reward)
    all_rewards.append(episode_reward)
    total_episodes += 1
            
    if total_episodes % 100 == 0:
        print('Recent Reward:', np.mean(all_rewards[-100:]))
        print('Total Episodes:', total_episodes)
        print('Total Steps:', total_steps)
        print('\n-----------')
        
        train_data = mb.to_data()
        network.train(train_data)

Recent Reward: 18.82
Total Episodes: 100
Total Steps: 1882

-----------


FailedPreconditionError: Attempting to use uninitialized value beta2_power
	 [[node beta2_power/read (defined at <ipython-input-2-ca7f3a3c2e1a>:47)  = Identity[T=DT_FLOAT, _class=["loc:@Adam/Assign_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](beta2_power)]]

Caused by op 'beta2_power/read', defined at:
  File "/home/ejmejm/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ejmejm/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/home/ejmejm/anaconda3/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/home/ejmejm/anaconda3/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/home/ejmejm/anaconda3/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-86681618539b>", line 7, in <module>
    network = RLNetwork(obs, softmax_probs)
  File "<ipython-input-2-ca7f3a3c2e1a>", line 21, in __init__
    self.train = self._create_discrete_trainer()
  File "<ipython-input-2-ca7f3a3c2e1a>", line 47, in _create_discrete_trainer
    self.update = self.optimizer.minimize(self.loss)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 410, in minimize
    name=name)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 593, in apply_gradients
    self._create_slots(var_list)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/adam.py", line 131, in _create_slots
    colocate_with=first_var)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 814, in _create_non_slot_variable
    v = variable_scope.variable(initial_value, name=name, trainable=False)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 183, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 146, in _variable_v1_call
    aggregation=aggregation)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 125, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2444, in default_variable_creator
    expected_shape=expected_shape, import_scope=import_scope)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 187, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 1329, in __init__
    constraint=constraint)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 1491, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 81, in identity
    return gen_array_ops.identity(input, name=name)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3454, in identity
    "Identity", input=input, name=name)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value beta2_power
	 [[node beta2_power/read (defined at <ipython-input-2-ca7f3a3c2e1a>:47)  = Identity[T=DT_FLOAT, _class=["loc:@Adam/Assign_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](beta2_power)]]
