In [1]:
from memory import MemoryBuffer
from env import make_cart_pole, make_lunar_lander_c
from policies import VPGTrainer
import tensorflow as tf
from tensorflow.layers import dense
from utils import reshape_train_var
import numpy as np

In [7]:
class VAPGTrainer():
    """
    Wrapper that trains a policy network and a value network with
    advantage-based Vanilla Policy Gradient (TensorFlow 1.x graph mode).

    After construction, ``self.train`` is a callable taking one ``train_data``
    array and running one actor update followed by one value update, and
    ``gen_act`` samples an action for a single observation.

    The placeholders built here are ``tf.float64``, so the supplied network
    ops are expected to be float64 as well.
    """

    def __init__(self, in_op, out_op, v_out_op, act_type='discrete', sess=None):
        """
        Create a wrapper for RL networks for easy training using advantage based
        Vanilla Policy Gradient.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture. For 'discrete'
                               it is used as per-action probabilities; for
                               'continuous' it is used as the mean of a Normal
                               distribution.
            v_out_op (tf.Variable): Value output of the architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space ('d' / 'c' are
                               accepted as shorthands)
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        Raises:
            TypeError: if act_type is not one of the accepted values.

        Note: building the trainer runs tf.global_variables_initializer() in the
        session, which (re)initializes every global variable in the graph.
        """
        # A caller-supplied session must actually be stored; previously it was
        # dropped and self.sess stayed undefined whenever `sess` was given.
        if sess is None:
            self.renew_sess()
        else:
            self.sess = sess

        self.in_op = in_op
        self.out_op = out_op
        self.v_out_op = v_out_op

        # The instance attribute `train` assigned here shadows the placeholder
        # `train` method defined at the bottom of the class.
        if act_type in ('discrete', 'd'):
            self.train = self._create_discrete_trainer()
            self.act_type = 'discrete'
        elif act_type in ('continuous', 'c'):
            self.train = self._create_continuous_trainer()
            self.act_type = 'continuous'
        else:
            raise TypeError('act_type must be \'discrete\' or \'continuous\'')

    def renew_sess(self):
        """
        Starts a new internal Tensorflow session
        """
        self.sess = tf.Session()

    def end_sess(self):
        """
        Ends the internal Tensorflow session if it exists
        """
        # getattr guards the case where no session was ever attached.
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()

    def _finalize_trainer(self, values, optimizer):
        """
        Shared tail of both trainer builders: creates the actor and value
        update ops, initializes variables and returns the update function.

        Expects self.actor_loss, self.act_holders and self.reward_holders to
        be set already.

        Args:
            values (tf.Tensor): value predictions flattened to shape [batch]
            optimizer (tf.train.Optimizer or None): optimizer for both updates;
                a fresh AdamOptimizer is created when None
        Returns:
            function(train_data) -> result of sess.run on both update ops
        """
        # Create the optimizer per trainer instead of sharing one default
        # instance that was built once at class-definition time.
        self.optimizer = optimizer if optimizer is not None else tf.train.AdamOptimizer()
        self.actor_update = self.optimizer.minimize(self.actor_loss)

        # The value update is ordered after the actor update.
        with tf.control_dependencies([self.actor_update]):
            # `values` and reward_holders are both [batch]; subtracting the raw
            # [batch, 1] value output would broadcast to [batch, batch].
            self.value_loss = tf.reduce_mean(tf.square(values - self.reward_holders))
            self.value_update = self.optimizer.minimize(self.value_loss)

        def update_func(train_data):
            # Column 0 is fed as observations, column 1 as actions and column 2
            # as the reward targets. reshape_train_var is a project helper;
            # presumably it stacks per-step entries into a batch array - confirm.
            return self.sess.run(
                [self.actor_update, self.value_update],
                feed_dict={
                    self.in_op: reshape_train_var(train_data[:, 0]),
                    self.act_holders: reshape_train_var(train_data[:, 1]),
                    self.reward_holders: train_data[:, 2],
                })

        self.sess.run(tf.global_variables_initializer())

        return update_func

    def _create_discrete_trainer(self, optimizer=None):
        """
        Creates a function for advantage based vanilla policy training with a discrete action space

        Args:
            optimizer (tf.train.Optimizer or None): optimizer to use; defaults
                to a new tf.train.AdamOptimizer()
        Returns:
            function(train_data) performing one actor and one value update
        """
        self.act_holders = tf.placeholder(tf.int64, shape=[None])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])

        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float64)
        # Clip before the log: a probability of exactly 0 gives log = -inf, and
        # -inf * 0 from the one-hot mask would turn the loss into NaN.
        self.log_probs = tf.log(tf.clip_by_value(self.out_op, 1e-10, 1.0))

        # Flatten the value head to [batch] so it lines up with reward_holders.
        values = tf.reshape(self.v_out_op, [-1])
        self.advantages = self.reward_holders - values

        self.resp_acts = tf.reduce_sum(self.act_masks * self.log_probs, axis=1)
        # stop_gradient: minimize() differentiates w.r.t. every trainable
        # variable, so without it the actor loss would also update the value
        # network through the advantage term.
        self.actor_loss = -tf.reduce_mean(self.resp_acts * tf.stop_gradient(self.advantages))

        return self._finalize_trainer(values, optimizer)

    def _create_continuous_trainer(self, optimizer=None):
        """
        Creates a function for advantage based vanilla policy training with a continuous action space

        The policy is a diagonal Normal whose mean is out_op and whose log
        standard deviation is a trainable variable initialized to -0.5.

        Args:
            optimizer (tf.train.Optimizer or None): optimizer to use; defaults
                to a new tf.train.AdamOptimizer()
        Returns:
            function(train_data) performing one actor and one value update
        """
        act_dim = self.out_op.shape[1].value

        self.act_holders = tf.placeholder(tf.float64, shape=[None, act_dim])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])

        # Flatten the value head to [batch] so it lines up with reward_holders.
        values = tf.reshape(self.v_out_op, [-1])
        self.advantages = self.reward_holders - values

        self.log_std = tf.Variable(-0.5 * np.ones(act_dim, dtype=np.float64))
        self.sigma = tf.exp(self.log_std)

        self.normal_dist = tf.distributions.Normal(self.out_op, self.sigma)

        # Public sample() instead of the private _sample_n(); sample(1) keeps
        # the same leading sample axis of size 1. Samples are clipped to [-1, 1].
        self.action = tf.clip_by_value(self.normal_dist.sample(1), -1, 1)

        self.log_probs = tf.reduce_sum(self.normal_dist.log_prob(self.act_holders), axis=1)

        # stop_gradient keeps the actor loss from training the value network.
        self.actor_loss = -tf.reduce_mean(self.log_probs * tf.stop_gradient(self.advantages))

        return self._finalize_trainer(values, optimizer)

    def _gen_discrete_act(self, obs):
        """
        Samples an action index for one observation from the probabilities
        produced by out_op.
        """
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: [obs]})[0]
        return np.random.choice(len(act_probs), p=act_probs)

    def _gen_continuous_act(self, obs):
        """
        Samples a clipped action vector for one observation from the Normal
        policy; size-1 axes of the sampled tensor are squeezed away.
        """
        act_vect = np.squeeze(self.sess.run(self.action, feed_dict={self.in_op: [obs]}))
        return np.array(act_vect)

    def gen_act(self, obs):
        """
        Generates an action for a single observation using the sampler that
        matches this trainer's act_type.
        """
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)

    def train(self, obs, rewards, acts):
        """
        Placeholder that is replaced in __init__ by the generated update
        function (which takes a single train_data argument); reaching this
        body means the trainer was not created.
        """
        raise RuntimeError('The train method was not properly created')

In [8]:
# Lunar Lander (continuous actions): environment, policy network and value network.

env = make_lunar_lander_c()

# Batched observation input with 8 features per observation.
obs = tf.placeholder(dtype=tf.float64, shape=(None, 8))

# Policy network: one tanh hidden layer followed by a linear output layer.
# Despite its name, with act_type='c' VAPGTrainer uses `act_probs` as the mean
# of a Normal distribution rather than as probabilities.
# NOTE(review): 4 outputs - confirm this matches the action dimension of the
# environment returned by make_lunar_lander_c().
dense1 = dense(inputs=obs, units=16, activation=tf.tanh)
act_probs = dense(inputs=dense1, units=4)

# Value network: its own tanh hidden layer and a single linear output.
v_dense1 = dense(inputs=obs, units=16, activation=tf.tanh)
value = dense(inputs=v_dense1, units=1)

network = VAPGTrainer(in_op=obs, out_op=act_probs, v_out_op=value, act_type='c')

In [9]:
# # Cart pole network

# env = make_cart_pole()

# obs = tf.placeholder(tf.float64, shape=[None, 4])
# dense1 = dense(obs, 32, activation=tf.tanh)
# dense2 = dense(dense1, 32, activation=tf.tanh)
# act_probs = dense(dense2, 2)
# softmax_probs = tf.nn.softmax(act_probs)

# v_dense1 = dense(obs, 32, activation=tf.tanh)
# v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
# value = dense(v_dense2, 1)

# network = VAPGTrainer(obs, softmax_probs, value, act_type='discrete')

In [10]:
# Settings read by the training loop.
n_episodes = 1_000_000  # number of episodes the loop iterates over
max_steps = 200         # cap on environment steps within one episode
update_freq = 32        # a network update runs every `update_freq` episodes
print_freq = 3          # progress is printed every `print_freq` updates

# Buffer that the training loop records rollouts into.
mb = MemoryBuffer()

In [6]:
# Total reward of every finished episode, in order.
all_rewards = []

for episode in range(n_episodes):
    mb.start_rollout()
    obs = env.reset()
    ep_reward = 0

    # Roll out one episode, stopping early when the environment reports done.
    for step in range(max_steps):
        act = network.gen_act(obs)
        next_obs, reward, done, _info = env.step(act)

        ep_reward += reward
        mb.record(obs, act, reward)
        obs = next_obs

        if done:
            break

    all_rewards.append(ep_reward)

    # Updates only happen on non-zero multiples of update_freq.
    if episode == 0 or episode % update_freq != 0:
        continue

    network.train(mb.to_data())

    # Report the mean reward over the most recent reporting window.
    if episode % (update_freq * print_freq) == 0:
        print(f'Update #{episode // update_freq}, Reward: {np.mean(all_rewards[-update_freq*print_freq:])}')
        

Loss: -807.4621237118484
Update #1, Reward: -482.13777790530344
Loss: -832.4088810949711
Update #2, Reward: -500.418709379878
Loss: -799.1134741909927
Update #3, Reward: -478.40054026605486
Loss: -729.4549424244128
Update #4, Reward: -422.32167007613
Loss: -895.8702003982927
Update #5, Reward: -541.4297672474186
Loss: -961.0200941597064
Update #6, Reward: -565.1601799313671
Loss: -786.9334582986298
Update #7, Reward: -488.0100722048958
Loss: -899.8912704004105
Update #8, Reward: -544.6758150211481
Loss: -879.3487030315688
Update #9, Reward: -540.1681828182042
Loss: -886.5295548314149
Update #10, Reward: -531.4022775747721
Loss: -868.266701004879
Update #11, Reward: -511.76389905558193
Loss: -848.5976701527817
Update #12, Reward: -503.9750143651065
Loss: -882.2177037358612
Update #13, Reward: -547.4975374801754
Loss: -889.1576951129504
Update #14, Reward: -506.91444632528487



KeyboardInterrupt

