In [1]:
import tensorflow as tf
from tensorflow.layers import dense, flatten, conv2d, max_pooling2d
from utils import reshape_train_var
import gym
import numpy as np
from rewards import discount_rewards
from memory import MemoryBuffer
from policies import VPGTrainer
from env import *

In [54]:
class PPOTrainer():
    def __init__(self, in_op, out_op, value_out_op, act_type='discrete', sess=None, clip_val=0.2):
        """
        Create a wrapper for RL networks for easy training.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        """

        if not sess:
            self.renew_sess()
        
        self.in_op = in_op
        self.out_op = out_op
        self.value_out_op = value_out_op
        self._prev_weights = None
        self.clip_val = clip_val
        
        if act_type in ('discrete', 'd'):
            self.train = self._create_discrete_trainer()
            self.act_type = 'discrete'
        elif act_type in ('continuous', 'c'):
            self.train = self._create_continuous_trainer()
            self.act_type = 'continuous'
        else:
            raise TypeError('act_type must be \'discrete\' or \'continuous\'')
        
    def renew_sess(self):
        """
        Starts a new internal Tensorflow session
        """
        self.sess = tf.Session()
        
    def end_sess(self):
        """
        Ends the internal Tensorflow session if it exists
        """
        if self.sess:
            self.sess.close()
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a discrete action space
        """
        self.act_holders = tf.placeholder(tf.int64, shape=[None])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])
        
        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float64)
        self.log_probs = tf.log(self.out_op)
        
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.log_probs, axis=1)
        self.actor_loss = -tf.reduce_mean(self.resp_acts * self.reward_holders)
        
        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - self.value_out_op))
        
        self.optimizer = optimizer
        self.actor_update = self.optimizer.minimize(self.actor_loss)
        
        with tf.control_dependencies([self.actor_update]):
            self.value_update = self.optimizer.minimize(self.value_loss)
            
        with tf.control_dependencies([self.out_op]):
            self.tmp_weights = self.sess.trainable_variables()
            
            with tf.control_dependencies([self.tmp_weights]):
                self.relapse_weights = self.assign(tf.trainable_variables(), self._prev_weights)
                
        with tf.control_dependencies([self.out_op]):
            
            with tf.control_dependencies([self.tmp_weights]):
                self.assign(tf.trainable_variables(), self._prev_weights)
        
        
        def update_func(train_data): 
            
            
            self.sess.run([self.actor_update, self.value_update], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                            self.act_holders: reshape_train_var(train_data[:, 1]),
                                                            self.reward_holders: train_data[:, 2]})
            self.sess.run([self.actor_update, self.value_update], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                            self.act_holders: reshape_train_var(train_data[:, 1]),
                                                            self.reward_holders: train_data[:, 2]})
        
        self.sess.run(tf.global_variables_initializer())
        self._prev_weights = self.sess.run(tf.trainable_variables())
        
        return update_func
        
    def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a continuous action space
        """
        self.act_holders = tf.placeholder(tf.float64, shape=[None, self.out_op.shape[1].value])
        self.reward_holders = tf.placeholder(tf.float64, shape=[None])
        
        self.optimizer = optimizer
        
        # First passthrough
        with tf.control_dependencies([self.out_op]):
            self.get_weights = tf.trainable_variables()
            
            with tf.control_dependencies([self.get_weights]):
                self.relapse_weights = self.assign(tf.trainable_variables(), self._prev_weights)
        
        # Second passthrough
        self.advantages = self.value_out_op - self.reward_holders
        
        with tf.control_dependencies([self.advantages]):
            self.restore_weights = self.assign(tf.trainable_variables(), self._curr_weights)
            
#             self.log_probs = tf.log(self.out_op)
#             self.act_means = tf.reduce_mean(self.log_probs, axis=1)

            self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - self.value_out_op))
            
            
            self.old_prob_holders = tf.placeholder(dtype=tf.float64, shape=out_op.shape)
            
            self.policy_ratio = tf.reduce_mean(self.out_op / self.old_prob_holders, axis=1)
            self.clipped_ratio = tf.clip_by_value(self.policy_ratio, 1 - self.clip_val, 1 + self.clip_val)
            
            self.min_loss = tf.minimum(self.policy_ratio * self.advantages, self.clipped_ratio * self.advantages)
            self.actor_loss = tf.reduce_mean(self.min_loss)
            # self.actor_loss = -tf.reduce_mean(self.act_means * self.advantages)
            
            self.actor_update = self.optimizer.minimize(self.actor_loss)
            
            with tf.control_dependencies([self.actor_update]):
                self.value_update = self.optimizer.minimize(self.value_loss)
        
        def update_func(train_data): 
            self.old_probs, self._curr_weights, _ = self.sess.run([self.out_op, self.get_weights, self.relapse_weights], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0])})
            
            self.sess.run([self.actor_update, self.value_update], 
                                                       feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                                            self.act_holders: reshape_train_var(train_data[:, 1]),
                                                            self.reward_holders: train_data[:, 2]})
        
        self.sess.run(tf.global_variables_initializer())
        self._prev_weights = self.sess.run(tf.trainable_variables())
        
        return update_func
        
    def _gen_discrete_act(self, obs):
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: obs})
        act = np.random.choice(list(range(len(act_probs)+1)), p=act_probs[0])
        
        return act
    
    def _gen_continuous_act(self, obs):
        act_vect = self.sess.run(self.out_op, feed_dict={self.in_op: obs})[0]
        
        # TODO: Add gaussian noise to action vector
        act_vect = [a + np.random.normal(0., 0.1) for a in act_vect]
        
        return np.array(act_vect)
        
    def gen_act(self, obs):
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)
        
    def train(self, obs, rewards, acts):
        raise RuntimeError('The train method was not properly created')

In [55]:
n_episodes = 10000
max_steps = 500
update_freq = 50 # In episodes

mb = MemoryBuffer(update_freq)
env = make_lunar_lander_c()

In [56]:
# Cart pole network

# obs = tf.placeholder(tf.float64, shape=[None, 4])
# dense1 = dense(obs, 128, activation=tf.tanh)
# dense2 = dense(dense1, 128, activation=tf.tanh)
# act_probs = dense(dense2, 2)
# softmax_probs = tf.nn.softmax(act_probs)

# v_dense1 = dense(obs, 128, activation=tf.tanh)
# v_dense2 = dense(v_dense1, 128, activation=tf.tanh)
# value = dense(v_dense2, 1)

# network = PPOTrainer(obs, softmax_probs, value, act_type='discrete')

In [57]:
# Car race network

# obs = tf.placeholder(tf.float64, shape=np.concatenate([[None], env.observation_space.shape]))
# conv1 = conv2d(obs, 16, [5, 5], padding='valid', activation=tf.tanh)
# maxpool1 = max_pooling2d(conv1, 4, 4)
# conv2 = conv2d(maxpool1, 32, [3, 3], padding='valid', activation=tf.tanh)
# maxpool2 = max_pooling2d(conv2, 2, 2)
# flatten1 = flatten(maxpool2)
# act_vects = dense(flatten1, env.action_space.shape[0])

# network = VanillaPolicyTrainer(obs, act_vects, act_type='continuous')

In [58]:
# Lunar lander discrete network

# obs = tf.placeholder(tf.float64, shape=[None, env.observation_space.shape[0]])
# dense1 = dense(obs, 64, activation=tf.nn.tanh)
# dense2 = dense(dense1, 64, activation=tf.nn.tanh)
# act_probs = dense(dense2, 2)
# softmax_probs = tf.nn.softmax(act_probs)

# network = VanillaPolicyTrainer(obs, softmax_probs, act_type='d')

In [59]:
# Lunar lander continuous network

obs = tf.placeholder(tf.float64, shape=[None, env.observation_space.shape[0]])
dense1 = dense(obs, 10, activation=tf.nn.tanh)
dense2 = dense(dense1, 10, activation=tf.nn.tanh)
act_vects = dense(dense2, 2)

v_dense1 = dense(obs, 10, activation=tf.nn.tanh)
v_dense2 = dense(v_dense1, 10, activation=tf.nn.tanh)
value = dense(v_dense2, 1)

network = PPOTrainer(obs, act_vects, value, act_type='c')

TypeError: Tensor objects are only iterable when eager execution is enabled. To iterate over this tensor use tf.map_fn.

In [41]:
import time
total_steps = 0
total_episodes = 0
all_rewards = []

for episode in range(n_episodes):
    obs = env.reset()
    
    episode_reward = 0
    mb.start_rollout()
    for step in range(max_steps):
        act = network.gen_act([obs])
        act[0] = np.clip(act[0], -1, 1)
        act[1] = np.clip(act[1], -1, 1)
#         act[2] = np.clip(act[2], 0, 1)
#         env.render()
#         time.sleep(0.02)
#         if np.random.random() <= 0.2:
#             act = env.action_space.sample()
        
        obs_next, rew, d, _ = env.step(act)
        episode_reward += rew
        
        mb.record(obs, act, rew)
        obs = obs_next
        
        total_steps += 1
        if d:
            break
            
    all_rewards.append(episode_reward)
    total_episodes += 1
            
    if total_episodes % update_freq * 5 == 0:
        print('Recent Reward:', np.mean(all_rewards[-update_freq * 5:]))
        
        train_data = mb.to_data()
        network.train(train_data)

Recent Reward: -576.7554287286149
Recent Reward: -579.9663330899035
Recent Reward: -430.59564907803673
Recent Reward: -355.21105200430515
Recent Reward: -311.02865202378706
Recent Reward: -222.950159908275
Recent Reward: -133.44655920243636
Recent Reward: -132.83831706899028
Recent Reward: -131.05552193750475
Recent Reward: -128.85525413879603
Recent Reward: -128.20683982397378
Recent Reward: -128.31167982041856
Recent Reward: -129.12467872923864
Recent Reward: -131.13521395416862
Recent Reward: -134.44709984321574


KeyboardInterrupt: 