In [1]:
from memory import MemoryBuffer
from policies import PPOTrainer
from utils import gaussian_likelihood, reshape_train_var
import tensorflow as tf
from tensorflow.layers import dense, conv2d, max_pooling2d, flatten
import numpy as np
import time
import gym

In [2]:
class IMTrainer():
    def __init__(self, in_op, out_op, v_out_op, act_type='discrete', sess=None):
        """
        Create a wrapper for RL networks for easy training.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        """

        if not sess:
            self.renew_sess()
        
        self.in_op = in_op
        self.out_op = out_op
        self.v_out_op = v_out_op
        
        if act_type in ('discrete', 'd'):
            self.train = self._create_discrete_trainer()
            self.act_type = 'discrete'
        elif act_type in ('continuous', 'c'):
            self.train = self._create_continuous_trainer()
            self.act_type = 'continuous'
        else:
            raise TypeError('act_type must be \'discrete\' or \'continuous\'')
        
    def renew_sess(self):
        """
        Starts a new internal Tensorflow session
        """
        self.sess = tf.Session()
        
    def end_sess(self):
        """
        Ends the internal Tensorflow session if it exists
        """
        if self.sess:
            self.sess.close()
            
    def _create_ICM(self):
        feature_dim = 12
        r_i_scale = 0.5
        
        # Create placeholder
        self.obs_next = tf.placeholder(tf.float32, shape=self.in_op.shape)
        
        # Observation feature encoder
        with tf.variable_scope('feature_encoder'):
            self.f_obs = dense(self.in_op, feature_dim, activation=tf.nn.tanh, name='fe_dense')
            
        with tf.variable_scope('feature_encoder', reuse=True):
            self.f_obs_next = dense(self.obs_next, feature_dim, activation=tf.nn.tanh, name='fe_dense')
            
        # State predictor forward model
        self.state_act_pair = tf.concat([self.act_holders, self.f_obs], axis=1)
        self.sp_dense = dense(self.state_act_pair, 32, activation=tf.nn.tanh)
        self.f_obs_next_hat = dense(self.sp_dense, feature_dim, activation=tf.nn.tanh)
        
        # Inverse model (predicting action)
        self.state_state_pair = tf.concat([self.f_obs, self.f_obs_next], axis=1)
        self.act_hat = dense(self.state_state_pair, self.out_op.shape[1])
        
        # Calculating intrinsic reward
        self.obs_diff = self.f_obs_next_hat - self.f_obs_next
        self.r_i = tf.reduce_mean(r_i_scale * self.obs_diff ** 2)
        
        # Calculating losses
        self.loss_i = 0.5 * tf.reduce_mean((self.act_hat - self.act_holders) ** 2)
        self.loss_f = 0.5 * tf.reduce_mean(self.obs_diff ** 2)
        
        print(self.f_obs, self.f_obs_next, self.sp_dense, self.f_obs_next_hat,
              self.act_hat, self.obs_diff, self.r_i, self.loss_i,
              self.loss_f)
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a discrete action space
        """
#         self.act_holders = tf.placeholder(tf.int32, shape=[None])
#         self.reward_holders = tf.placeholder(tf.float32, shape=[None])
        
#         self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float32)
#         self.log_probs = tf.log(self.out_op)
        
#         self.advantages = self.reward_holders - tf.squeeze(self.v_out_op)
        
#         self.resp_acts = tf.reduce_sum(self.act_masks *  self.log_probs, axis=1)
#         self.loss = -tf.reduce_mean(self.resp_acts * self.advantages)
        
#         self.optimizer = optimizer
#         self.actor_update = self.optimizer.minimize(self.loss)
        
#         with tf.control_dependencies([self.actor_update]):
#             self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
#             self.value_update = self.optimizer.minimize(self.value_loss)
        
#         update_func = lambda train_data: self.sess.run([self.actor_update, self.value_update], 
#                                                        feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
#                                                             self.act_holders: reshape_train_var(train_data[:, 1]),
#                                                             self.reward_holders: train_data[:, 2]})
        
#         self.sess.run(tf.global_variables_initializer())
        
#         return update_func
        
    def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a continuous action space
        """
        
        
        ################################################################
        
        self.act_holders = tf.placeholder(tf.float32, shape=[None, self.out_op.shape[1].value])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])
        
        self._create_ICM() ##################### build off here
        
        self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value), dtype=tf.float32)
        self.out_act = self.out_op + tf.random_normal(tf.shape(self.out_op), dtype=tf.float32) * self.std
        
        self.log_probs = gaussian_likelihood(self.act_holders, self.out_op, self.std)
        
        self.advantages = self.reward_holders - tf.squeeze(self.v_out_op)
        
        self.actor_loss = -tf.reduce_mean(self.log_probs * self.reward_holders)
        
        
        
        self.optimizer = optimizer
        self.actor_update = self.optimizer.minimize(self.actor_loss)
        
        with tf.control_dependencies([self.actor_update]):
            self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.v_out_op)))
            self.value_update = self.optimizer.minimize(self.value_loss)
        
        update_func = lambda train_data: self.sess.run([self.actor_update, self.value_update], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                            self.act_holders: reshape_train_var(train_data[:, 1]),
                                                            self.reward_holders: train_data[:, 2]})
        
        self.sess.run(tf.global_variables_initializer())
        
        return update_func
        
    def _gen_discrete_act(self, obs):
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: [obs]})
        act = np.random.choice(list(range(len(act_probs[0]))), p=act_probs)
        
        return act
    
    def _gen_continuous_act(self, obs):
        act_vect = self.sess.run(self.out_act, feed_dict={self.in_op: [obs]})[0]
        
        return np.array(act_vect)
        
    def gen_act(self, obs):
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)
        
    def train(self, obs, rewards, acts):
        raise RuntimeError('The train method was not properly created')

In [3]:
env = gym.make('BipedalWalker-v2')

obs = tf.placeholder(tf.float32, shape=[None]+list(env.observation_space.shape))
dense1 = dense(obs, 32, activation=tf.tanh)
dense2 = dense(dense1, 32, activation=tf.tanh)
act_probs = dense(dense2, env.action_space.shape[0])

v_dense1 = dense(obs, 32, activation=tf.tanh)
v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
value = dense(v_dense2, 1)

network = IMTrainer(obs, act_probs, value, act_type='c')

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Tensor("feature_encoder/fe_dense/Tanh:0", shape=(?, 12), dtype=float32) Tensor("feature_encoder_1/fe_dense/Tanh:0", shape=(?, 12), dtype=float32) Tensor("dense_6/Tanh:0", shape=(?, 32), dtype=float32) Tensor("dense_7/Tanh:0", shape=(?, 12), dtype=float32) Tensor("dense_8/BiasAdd:0", shape=(?, 4), dtype=float32) Tensor("sub:0", shape=(?, 12), dtype=float32) Tensor("Mean:0", shape=(), dtype=float32) Tensor("mul_1:0", shape=(), dtype=float32) Tensor("mul_2:0", shape=(), dtype=float32)


In [4]:
n_episodes = 1000000
max_steps = 200
update_freq = 64
print_freq = 1

mb = MemoryBuffer()

In [6]:
all_rewards = []

for episode in range(n_episodes):
    ep_reward = 0
    
    mb.start_rollout()
    obs = env.reset()
    for step in range(max_steps):
        obs = obs.squeeze()
        act = network.gen_act(obs)
        
        obs_next, rew, d, _ = env.step(act)
        ep_reward += rew
        
        if True:
            env.render()
            time.sleep(0.02)
        
        mb.record(obs, act, rew)
        obs = obs_next
        
        if d:
            break
            
    all_rewards.append(ep_reward)
            
    if episode % update_freq == 0 and episode != 0:
        network.train(mb.to_data())
        
        if episode % (update_freq * print_freq) == 0:
            print(f'Update #{episode // update_freq}, Reward: {np.mean(all_rewards[-update_freq*print_freq:])}')

KeyboardInterrupt: 