In [1]:
from memory import MemoryBuffer
from policies import PPOTrainer
from utils import gaussian_likelihood, reshape_train_var
import tensorflow as tf
from tensorflow.layers import dense, conv2d, max_pooling2d, flatten
import numpy as np
import time
import gym
from env import CartPoleEnv

In [5]:
class PPOTrainer():
    def __init__(self, in_op, out_op, value_out_op, act_type='discrete', sess=None, clip_val=0.2, ppo_iters=80,
                 target_kl=0.01, v_coef=1., entropy_coef=0.01):
        """
        Create a wrapper for RL networks for easy training.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        """

        if not sess:
            self.renew_sess()
        
        self.in_op = in_op
        self.out_op = out_op
        self.value_out_op = value_out_op
        self._prev_weights = None
        self.clip_val = clip_val
        self.ppo_iters = ppo_iters
        self.target_kl = target_kl
        self.v_coef = v_coef
        self.entropy_coef = entropy_coef
        
        if act_type in ('discrete', 'd'):
            self.train = self._create_discrete_trainer()
            self.act_type = 'discrete'
        elif act_type in ('continuous', 'c'):
            self.train = self._create_continuous_trainer()
            self.act_type = 'continuous'
        else:
            raise TypeError('act_type must be \'discrete\' or \'continuous\'')
        
    def renew_sess(self):
        """
        Starts a new internal Tensorflow session
        """
        self.sess = tf.Session()
        
    def end_sess(self):
        """
        Ends the internal Tensorflow session if it exists
        """
        if self.sess:
            self.sess.close()
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a discrete action space
        """
        # First passthrough
        
        self.act_holders = tf.placeholder(tf.int32, shape=[None])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])
        
        self.act_masks = tf.one_hot(self.act_holders, self.out_op.shape[1].value, dtype=tf.float32)
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.out_op, axis=1)
        
        self.advantages = self.reward_holders - tf.squeeze(self.value_out_op)
        
        # Second passthrough
        
        self.advatange_holders = tf.placeholder(dtype=tf.float32, shape=self.advantages.shape)
        self.old_prob_holders = tf.placeholder(dtype=tf.float32, shape=self.resp_acts.shape)
 
        self.policy_ratio = self.resp_acts / self.old_prob_holders
        self.clipped_ratio = tf.clip_by_value(self.policy_ratio, 1 - self.clip_val, 1 + self.clip_val)

        self.min_loss = tf.minimum(self.policy_ratio * self.advatange_holders, self.clipped_ratio * self.advatange_holders)
        
        self.optimizer = tf.train.AdamOptimizer()

        # Actor update
        
        self.kl_divergence = tf.reduce_mean(tf.log(self.old_prob_holders) - tf.log(self.resp_acts))
        self.actor_loss = -tf.reduce_mean(self.min_loss)
        self.actor_update = self.optimizer.minimize(self.actor_loss)

        # Value update
        
        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.value_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)
        
        # Combined update
        
        self.combined_loss = self.actor_loss + 1. * self.value_loss
        self.combined_update = self.optimizer.minimize(self.combined_loss)
        
        def update_func(train_data):
            self.old_probs, self.old_advantages = self.sess.run([self.resp_acts, self.advantages], 
                                    feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                               self.act_holders: train_data[:, 1],
                                               self.reward_holders: train_data[:, 2]})
        
            for i in range(self.ppo_iters):
                kl_div, _ = self.sess.run([self.kl_divergence, self.combined_update], 
                               feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                    self.act_holders: reshape_train_var(train_data[:, 1]),
                                    self.reward_holders: train_data[:, 2],
                                    self.old_prob_holders: self.old_probs,
                                    self.advatange_holders: self.old_advantages})
                if kl_div > 1.5 * self.target_kl:
                    break

        self.sess.run(tf.global_variables_initializer())
        
        return update_func
        
    def _create_continuous_trainer(self):
        """
        Creates a function for vanilla policy training with a continuous action space
        """
        # First passthrough
        
        self.act_holders = tf.placeholder(tf.float32, shape=[None, self.out_op.shape[1].value])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])
        
        self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value), dtype=tf.float32)
        self.out_act = self.out_op + tf.random_normal(tf.shape(self.out_op), dtype=tf.float32) * self.std
        
        self.log_probs = gaussian_likelihood(self.act_holders, self.out_op, self.std)
        
        self.advantages = self.reward_holders - tf.squeeze(self.value_out_op)
        
        # Second passthrough
        
        self.advatange_holders = tf.placeholder(dtype=tf.float32, shape=self.advantages.shape)
        self.old_prob_holders = tf.placeholder(dtype=tf.float32, shape=self.log_probs.shape)
 
        self.policy_ratio = tf.exp(self.log_probs - self.old_prob_holders)
        self.clipped_ratio = tf.clip_by_value(self.policy_ratio, 1 - self.clip_val, 1 + self.clip_val)

        self.min_loss = tf.minimum(self.policy_ratio * self.advatange_holders, self.clipped_ratio * self.advatange_holders)
        
        self.optimizer = tf.train.AdamOptimizer()

        # Actor update
        
        self.kl_divergence = tf.reduce_mean(self.old_prob_holders - self.log_probs)
        self.actor_loss = -tf.reduce_mean(self.min_loss)
        self.actor_update = self.optimizer.minimize(self.actor_loss)

        # Value update
        
        self.value_loss = tf.reduce_mean(tf.square(self.reward_holders - tf.squeeze(self.value_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)
        
        # Combined update
        
        self.entropy = -0.5 * tf.reduce_mean(tf.log(2 * np.pi * np.e * self.std))
        self.combined_loss = self.actor_loss + self.v_coef * self.value_loss + self.entropy_coef * self.entropy
        self.combined_update = self.optimizer.minimize(self.combined_loss)
        
        def update_func(train_data):
            self.old_probs, self.old_advantages = self.sess.run([self.log_probs, self.advantages], 
                                    feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                               self.act_holders: reshape_train_var(train_data[:, 1]),
                                               self.reward_holders: train_data[:, 2]})
            
            for i in range(self.ppo_iters):
                kl_div, _ = self.sess.run([self.kl_divergence, self.combined_update], 
                               feed_dict={self.in_op: reshape_train_var(train_data[:, 0]),
                                    self.act_holders: reshape_train_var(train_data[:, 1]),
                                    self.reward_holders: train_data[:, 2],
                                    self.old_prob_holders: self.old_probs,
                                    self.advatange_holders: self.old_advantages})
                if kl_div > 1.5 * self.target_kl:
                    break
            
            return kl_div, self.sess.run(self.entropy)

        self.sess.run(tf.global_variables_initializer())
        
        return update_func
        
    def _gen_discrete_act(self, obs):
        act_probs = self.sess.run(self.out_op, feed_dict={self.in_op: [obs]})
        act = np.random.choice(list(range(len(act_probs[0]))), p=act_probs[0])
        
        return act
    
    def _gen_continuous_act(self, obs):
        act_vect = self.sess.run(self.out_act, feed_dict={self.in_op: [obs]})[0]
        
        return np.array(act_vect)
        
    def gen_act(self, obs):
        if self.act_type == 'discrete':
            return self._gen_discrete_act(obs)
        else:
            return self._gen_continuous_act(obs)
        
    def train(self, obs, rewards, acts):
        raise RuntimeError('The train method was not properly created')

In [6]:
# env = gym.make('CartPole-v1')
# # env = gym.make('BipedalWalker-v2')

# obs = tf.placeholder(tf.float32, shape=[None]+list(env.observation_space.shape))
# dense1 = dense(obs, 32, activation=tf.tanh)
# dense2 = dense(dense1, 32, activation=tf.tanh)
# act_probs = dense(dense2, 2, activation=tf.nn.softmax)

# v_dense1 = dense(obs, 32, activation=tf.tanh)
# v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
# value = dense(v_dense2, 1)

# network = PPOTrainer(obs, act_probs, value, act_type='d')

In [7]:
# env = CartPoleEnv()
# # env = gym.make('BipedalWalker-v2')

# obs = tf.placeholder(tf.float32, shape=[None]+list(env.observation_space.shape))
# dense1 = dense(obs, 32, activation=tf.tanh)
# dense2 = dense(dense1, 32, activation=tf.tanh)
# act_probs = dense(dense2, env.action_space.shape[0])

# v_dense1 = dense(obs, 32, activation=tf.tanh)
# v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
# value = dense(v_dense2, 1)

# network = PPOTrainer(obs, act_probs, value, act_type='c')

In [15]:
env = gym.make('BipedalWalker-v2')

obs = tf.placeholder(tf.float32, shape=[None]+list(env.observation_space.shape))
dense1 = dense(obs, 32, activation=tf.tanh)
dense2 = dense(dense1, 32, activation=tf.tanh)
act_probs = dense(dense2, env.action_space.shape[0], activation=tf.tanh)

v_dense1 = dense(obs, 32, activation=tf.tanh)
v_dense2 = dense(v_dense1, 32, activation=tf.tanh)
value = dense(v_dense2, 1)

network = PPOTrainer(obs, act_probs, value, act_type='c', entropy_coef= 0.1)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [16]:
n_episodes = 1000000
max_steps = 200
update_freq = 16
print_freq = 4

mb = MemoryBuffer()

In [None]:
all_rewards = []

for episode in range(n_episodes):
    ep_reward = 0
    
    mb.start_rollout()
    obs = env.reset()
    for step in range(max_steps):
        obs = obs.squeeze()
        act = network.gen_act(obs)
        
        obs = obs / 5.
        a_o = act
        act = np.clip(act, -1, 1)
        for i in obs:
            if abs(i) > 1:
                print('----------')
                print(i)
        
        obs_next, rew, d, _ = env.step(act)
        ep_reward += rew
        
        if False:
            env.render()
            time.sleep(0.02)
        
        mb.record(obs, act, rew)
        obs = obs_next
        
        if d:
            break
            
    all_rewards.append(ep_reward)
            
    if episode % update_freq == 0 and episode != 0:
        kl_div, ent = network.train(mb.to_data())
        
        if episode % (update_freq * print_freq) == 0:
            print(f'Update #{episode // update_freq}, Reward: {np.mean(all_rewards[-update_freq*print_freq:])}')
            print(a_o)
            print(ent)
            print()

Update #4, Reward: -43.43938043580168
[-0.4255836  -0.5417275   0.30097774  0.90512437]
-1.143805

Update #8, Reward: -19.216157109723902
[ 0.9951526  -0.09123999  0.3803486   0.42581218]
-1.1277367

Update #12, Reward: -28.522169967619295
[-0.870984    0.7447529   0.47475326 -0.19751024]
-1.1302333

Update #16, Reward: -38.8993607310927
[-1.5737464  -0.20089933 -1.4391295  -1.3277129 ]
-1.1358172

Update #20, Reward: -76.92704975397344
[-0.4982914   0.04332612 -0.13924053 -0.11839139]
-1.1523395

Update #24, Reward: -76.88723179886644
[-0.60915834 -1.3590945   0.44240493  0.57904136]
-1.1756155

Update #28, Reward: -76.71226502661817
[ 1.0316777   0.25532207  0.32458642 -1.2453634 ]
-1.2102191

Update #32, Reward: -65.05719565095742
[-1.0413121   0.15279686  0.27340907  0.1524845 ]
-1.209454

Update #36, Reward: -25.08276429144662
[-0.25311917 -0.2934596   1.1656295  -0.2686175 ]
-1.1983747

Update #40, Reward: -41.84326305195559
[ 0.290689   -0.87865627  0.60204154  0.9393899 ]
-1.20

Update #332, Reward: -39.69723026910215
[-0.635688   0.9715442  1.319759  -1.1028706]
-0.98227537

Update #336, Reward: -38.2094051750145
[-0.8268071  1.120253   1.5296865 -0.9812632]
-0.8993496

Update #340, Reward: -32.8101100571488
[-1.3112996   0.34696788  1.469564   -0.8463721 ]
-0.8687394

Update #344, Reward: -36.11554465579997
[-0.80090475  0.85038245  0.5737298  -1.044363  ]
-0.842275

Update #348, Reward: -29.293584867583924
[-1.1665761  0.6547328  1.2842175 -0.917302 ]
-0.8172947

Update #352, Reward: -25.07329644443478
[-0.758932   0.4005286  1.0086255 -1.111101 ]
-0.7864876

Update #356, Reward: -36.31051630171578
[-0.62096405  0.9888075   1.1354481  -1.011633  ]
-0.75763166

Update #360, Reward: -28.148349136669967
[-1.1937474   1.2579644   0.55164766 -1.1891911 ]
-0.64112353

Update #364, Reward: -23.990607219165604
[-0.7601409   0.62794894  1.1065482  -1.0461428 ]
-0.61114883

Update #368, Reward: -21.47941712962483
[-0.9740763  0.8893909  0.9211183 -0.7676903]
-0.63479

Update #656, Reward: -19.781262063211813
[-1.007101    0.9455691   0.9929968  -0.85025483]
0.12674642

Update #660, Reward: -19.574309165507113
[-0.9829064  0.9831375  1.0195187 -1.1171627]
0.14429712

Update #664, Reward: -19.633779329566863
[-0.9754566   1.0227672   0.9125076  -0.95613253]
0.16477662

Update #668, Reward: -19.781675769953765
[-1.0008862   1.0136127   1.0215783  -0.92012054]
0.17876133

Update #672, Reward: -19.737308259461834
[-1.0238682   1.0211111   0.96695036 -1.0611358 ]
0.19279085

Update #676, Reward: -19.699350631717873
[-0.96512765  1.0586448   0.927963   -1.0317165 ]
0.16528895

Update #680, Reward: -19.662520158055443
[-1.0124571   1.0058789   1.0352784  -0.96609783]
0.19005863

Update #684, Reward: -19.72233126094994
[-1.0013945  1.0215665  1.0040346 -1.0608859]
0.21379054

Update #688, Reward: -19.616905251829312
[-1.0075969   0.98703486  1.0174781  -1.0275595 ]
0.22563228

Update #692, Reward: -19.65186852603746
[-1.0133933   1.0167439   0.97778165 -1.01