In [6]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm
import os

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [7]:
class ContinuousActor(tf.keras.Model):
    """Actor that outputs a policy directly"""
    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int,
        std_init = -2
    ):
        super().__init__()
        self.means = layers.Dense(num_actions)
        # self.stds = tf.Variable(tf.ones(num_actions))*std_init#lambda x: tf.zeros(num_actions)
        # self.stds = layers.Dense(num_actions, activation='relu')
        # tf.ones(num_actions) * 0.25
    def call(self, inputs: tf.Tensor):
        means = self.means(inputs)
        # stds = self.stds(inputs)
        # stds = tf.clip_by_value(stds, 1.0e-3, 1)
        return means
        # return tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = tf.exp(self.stds))


class ActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.common = layers.Dense(num_hidden_units, activation = 'relu')
        # outputs scale, location params for mvn
        self.actor = ContinuousActor(num_actions, num_hidden_units) #layers.Dense(num_actions)
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        x = self.common(inputs)
        return self.actor(x), self.critic(x)

class sepActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.actor1 = layers.Dense(num_hidden_units, activation = 'relu')
        # outputs scale, location params for mvn
        self.actor2 = ContinuousActor(num_actions, num_hidden_units) 
        self.critic1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.critic2 = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        a = self.actor1(inputs)
        c = self.critic1(inputs)
        return self.actor2(a), self.critic2(c)


In [8]:
class Actor(tf.keras.Model):

    def __init__(
        self,
        num_actions,
        num_hidden_units
        ):
        super().__init__()
        self.fc1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.fc2 = layers.Dense(num_actions)

    def call(self, x):
        x = self.fc1(x)
        return self.fc2(x)

class Critic(tf.keras.Model):

    def __init__(
        self,
        num_hidden_units
        ):
        super().__init__()
        self.fc1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.fc2 = layers.Dense(1)

    def call(self, x):
        x = self.fc1(x)
        return self.fc2(x)

class FFNetwork(tf.keras.Model):

    def __init__(
        self,
        num_hidden_units: int,
        num_outputs: int
        ):
        super().__init__()
        self.fc1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.fc2 = layers.Dense(num_outputs)

    def call(self, x):
        x = self.fc1(x)
        return self.fc2(x)

In [11]:
class PPO:

    def __init__(
        self,
        env_name: str,
        gamma: float,
        lam: float,
        ent_coef: float,
        vf_coef: float,
        clip: float,
        timesteps_per_batch: int,
        max_timesteps_per_episode: int,
        n_updates_per_iteration: int,
        actor: tf.keras.Model,
        optimizer_actor: tf.keras.optimizers.Optimizer,
        critic: tf.keras.Model,
        optimizer_critic: tf.keras.optimizers.Optimizer,
        summary_writer
        ):

        # env
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.num_actions = self.env.action_space.shape[0]
        
        # learning params
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.clip = clip

        # rollout params
        self.timesteps_per_batch = timesteps_per_batch
        self.max_timesteps_per_episode = max_timesteps_per_episode
        self.n_updates_per_iteration = n_updates_per_iteration

        # model 
        self.actor = actor
        self.optimizer_actor = optimizer_actor
        self.critic = critic
        self.optimizer_critic = optimizer_critic

        # other params
        self.stds = tf.ones(self.num_actions) * 0.5 # TODO: make parameter
        self.summary_writer = summary_writer
        self.seed = 42 # TODO: make parameter

    def learn(self, total_timesteps):
        t_so_far = 0
        while t_so_far < total_timesteps:
            batch_states, batch_actions, batch_log_probs, batch_rewards, batch_lens, batch_dones, batch_values = self.rollout()

            A_k = self.compute_gae(batch_rewards, batch_values, batch_dones)#batch_returns - V
            V, _, _ = self.evaluate(batch_states, batch_actions)
            batch_returns = A_k + V

            # normalise advantages
            A_k = (A_k - tf.reduce_mean(A_k)) / (tf.math.reduce_std(A_k) + 1.0e-10)

            ## set up loss records
            running_actor_loss = 0
            running_critic_loss = 0
            for _ in range(self.n_updates_per_iteration):
                with tf.GradientTape(persistent=True) as tape:
                    V, curr_log_probs, entropy = self.evaluate(batch_states, batch_actions)
                    # this is the start of our computation graph?
                    ratios = tf.exp(curr_log_probs - tf.squeeze(batch_log_probs))

                    # calc surrogate losses
                    surr1 = ratios * A_k
                    surr2 = tf.clip_by_value(ratios, 1 - self.clip, 1 + self.clip) * A_k

                    # get actor / critic losses 
                    entropy_loss = tf.reduce_mean(entropy)
                    actor_loss = tf.reduce_mean(-tf.math.minimum(surr1, surr2)) - self.ent_coef * entropy_loss
                    critic_loss = self.vf_coef * tf.reduce_mean((V - batch_returns)**2)

                # backprop actor loss
                actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
                self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

                # backprop critic loss
                critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
                self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
                
                # for logging
                running_actor_loss += tf.stop_gradient(actor_loss)
                running_critic_loss += tf.stop_gradient(critic_loss)            
            
            t_so_far += np.sum(batch_lens)
            with self.summary_writer.as_default():
                tf.summary.scalar('actor_loss', running_actor_loss / self.n_updates_per_iteration, step = t_so_far)
                tf.summary.scalar('critic_loss', running_critic_loss / self.n_updates_per_iteration, step = t_so_far)
                tf.summary.scalar('returns', tf.reduce_mean(batch_returns), step = t_so_far)
                tf.summary.scalar('rewards', tf.reduce_mean([np.sum(i) for i in batch_rewards]), step = t_so_far)


    def rollout(self):
        """
        Runs a single episode in the environment and accumulates data
        """

        # store results
        batch_states = []
        batch_actions = []
        batch_log_probs = []
        batch_rewards = []
        batch_returns = []
        batch_lens = []
        batch_values = []
        batch_dones = []

        # num timesteps so far
        t = 0

        while t < self.timesteps_per_batch:
            ep_rewards = []
            ep_values = []
            ep_dones = []

            state, _ = self.env.reset()
            done = False

            for ep_t in range(self.max_timesteps_per_episode):

                # increment timesteps
                t += 1

                # collect observations
                batch_states.append(state)
                
                action, log_prob = self.get_action(state)
                value = self.critic(tf.expand_dims(state, 0))
                state, reward, truncated, terminated, _ = self.env.step(action)
                done = truncated or terminated

                # collect reward, action and log prob
                ep_rewards.append(reward)
                ep_values.append(tf.reshape(value, [-1]))
                ep_dones.append(done)

                batch_actions.append(action)
                batch_log_probs.append(log_prob)
                

                if done:
                    break
            
            # collect episode length and rewards
            batch_lens.append(ep_t + 1)
            batch_rewards.append(ep_rewards)
            batch_values.append(ep_values)
            batch_dones.append(ep_dones)

        # convert to tensors
        batch_states = tf.convert_to_tensor(batch_states, dtype = tf.float32)
        batch_actions = tf.convert_to_tensor(batch_actions, dtype = tf.float32)
        batch_log_probs = tf.convert_to_tensor(batch_log_probs, dtype = tf.float32)

        # batch_returns = self.compute_returns(batch_rewards)

        return batch_states, batch_actions, batch_log_probs, batch_rewards, batch_lens, batch_dones, batch_values


    def get_action(self, state):

        means = self.actor(tf.expand_dims(state, 0))
        policy = tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = self.stds)

        # TODO: squash / scale action (do I need to?)
        action = policy.sample()[0]
        log_prob = policy.log_prob(action)

        return action, log_prob

    def compute_returns(self, rewards):
        batch_returns = []
        for ep_rewards in reversed((rewards)):

            discounted_reward = 0 # reward so far

            for reward in reversed(ep_rewards):
                discounted_reward = reward + self.gamma * discounted_reward
                batch_returns.insert(0, discounted_reward)
            
        # convert to tensor
        batch_returns = tf.convert_to_tensor(batch_returns, dtype = tf.float32)

        return batch_returns


    def compute_gae(self, rewards, values, dones):
        batch_advantages = []

        for ep_rewards, ep_values, ep_dones in zip(rewards, values, dones):
            advantages = []
            last_advantage = 0

            for t in reversed(range(len(ep_rewards))):
                if t + 1 < len(ep_rewards):
                    delta = ep_rewards[t] + self.gamma * ep_values[t+1] * (1 - ep_dones[t+1]) - ep_values[t]
                else:
                    delta = ep_rewards[t] - ep_values[t]

                advantage = delta + self.gamma * self.lam * (1 - ep_dones[t]) * last_advantage
                last_advantage = advantage
                advantages.insert(0, advantage)

            batch_advantages.extend(advantages)
        
        return tf.squeeze(tf.convert_to_tensor(batch_advantages, dtype = tf.float32))
    
    def evaluate(self, batch_states, batch_actions):
        V = tf.squeeze(self.critic(batch_states))
        mean = self.actor(batch_states)
        policy = tfp.distributions.MultivariateNormalDiag(mean, self.stds)

        log_probs = policy.log_prob(batch_actions)

        return V, log_probs, policy.entropy()

    def test(self, num_episodes):
        env = gym.make(self.env_name, render_mode = 'human')
        for i in range(num_episodes):
            state, _ = env.reset(seed = seed)
            done = False
            step = 0
            while not done:
                action, _ = self.get_action(state)
                next_state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                state = next_state

                step += 1
        env.close()

def get_next_run(log_dir):
    next_run = max([0]+[int(j) for j in [i.split('_')[-1] for i in os.listdir(log_dir)] if j.isdigit()]) + 1
    return log_dir + f'/run_{next_run}'

In [12]:
ENV = 'LunarLanderContinuous-v2'#'Pendulum-v1'#

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
# env = gym.make(ENV)
eps = np.finfo(np.float32).eps.item()

optimizer_actor = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
optimizer_critic = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
num_actions = env.action_space.shape[0]
num_hidden_units = 64

actor = Actor(num_actions, num_hidden_units)
critic = Critic(num_hidden_units)


log_dir = get_next_run('./logs') 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)


agent = PPO(        
    env_name = ENV,
    gamma = 0.999,
    lam = 0.98,
    ent_coef = 0.01,
    vf_coef = 0.4,
    clip = 0.2,
    timesteps_per_batch = 800,
    max_timesteps_per_episode = 200,
    n_updates_per_iteration = 4,
    actor = actor,
    optimizer_actor = optimizer_actor,
    critic = critic,
    optimizer_critic = optimizer_critic,
    summary_writer= summary_writer)


Saving logs to:  ./logs/run_207


In [13]:
agent.learn(240000)

In [176]:
agent.test(5)

In [213]:
class LSTMPPO:

    def __init__(
        self,
        env_name: str,
        gamma: float,
        lam: float,
        ent_coef: float,
        vf_coef: float,
        clip: float,
        window_size: int,
        timesteps_per_batch: int,
        max_timesteps_per_episode: int,
        n_updates_per_iteration: int,
        actor: tf.keras.Model,
        optimizer_actor: tf.keras.optimizers.Optimizer,
        critic: tf.keras.Model,
        optimizer_critic: tf.keras.optimizers.Optimizer,
        summary_writer
        ):

        # env
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.num_actions = self.env.action_space.shape[0]
        
        # learning params
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.clip = clip
        self.window_size = window_size

        # rollout params
        self.timesteps_per_batch = timesteps_per_batch
        self.max_timesteps_per_episode = max_timesteps_per_episode
        self.n_updates_per_iteration = n_updates_per_iteration

        # model 
        self.actor = actor
        self.optimizer_actor = optimizer_actor
        self.critic = critic
        self.optimizer_critic = optimizer_critic

        # other params
        self.stds = tf.ones(self.num_actions) * 0.5
        self.summary_writer = summary_writer

    def learn(self, total_timesteps):
        t_so_far = 0
        while t_so_far < total_timesteps:
            batch_states, batch_actions, batch_log_probs, batch_rewards, batch_lens, batch_dones, batch_values = self.rollout()

            A_k = self.compute_gae(batch_rewards, batch_values, batch_dones)#batch_returns - V
            V, _, _ = self.evaluate(batch_states, batch_actions)
            batch_returns = A_k + V

            # normalise advantages
            A_k = (A_k - tf.reduce_mean(A_k)) / (tf.math.reduce_std(A_k) + 1.0e-10)

            ## set up loss records
            running_actor_loss = 0
            running_critic_loss = 0
            for _ in range(self.n_updates_per_iteration):
                with tf.GradientTape(persistent=True) as tape:
                    V, curr_log_probs, entropy = self.evaluate(batch_states, batch_actions)
                    
                    ratios = tf.exp(curr_log_probs - tf.squeeze(batch_log_probs))

                    # calc surrogate losses
                    surr1 = ratios * A_k
                    surr2 = tf.clip_by_value(ratios, 1 - self.clip, 1 + self.clip) * A_k

                    # get actor / critic losses 
                    entropy_loss = tf.reduce_mean(entropy)
                    actor_loss = tf.reduce_mean(-tf.math.minimum(surr1, surr2)) - self.ent_coef * entropy_loss
                    critic_loss = self.vf_coef * tf.reduce_mean((V - batch_returns)**2)

                # backprop actor loss
                actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
                self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

                # backprop critic loss
                critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
                self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
                
                # for logging
                running_actor_loss += tf.stop_gradient(actor_loss)
                running_critic_loss += tf.stop_gradient(critic_loss)            
            
            t_so_far += np.sum(batch_lens)
            with self.summary_writer.as_default():
                tf.summary.scalar('actor_loss', running_actor_loss / self.n_updates_per_iteration, step = t_so_far)
                tf.summary.scalar('critic_loss', running_critic_loss / self.n_updates_per_iteration, step = t_so_far)
                tf.summary.scalar('returns', tf.reduce_mean(batch_returns), step = t_so_far)
                tf.summary.scalar('rewards', tf.reduce_mean([np.sum(i) for i in batch_rewards]), step = t_so_far)


    def rollout(self):
        """
        Runs a single episode in the environment and accumulates data
        """

        # store results
        batch_states = []
        batch_actions = []
        batch_log_probs = []
        batch_rewards = []
        batch_returns = []
        batch_lens = []
        batch_values = []
        batch_dones = []

        # num timesteps so far
        t = 0

        while t < self.timesteps_per_batch:
            ep_rewards = []
            ep_values = []
            ep_dones = []

            state, _ = self.env.reset()
            done = False

            for ep_t in range(self.max_timesteps_per_episode):

                # increment timesteps
                t += 1

                # collect observations
                batch_states.append(state)
                
                # pad states for use in lstm
                padded = self.pad_input(batch_states, ep_t + 1)
                action, log_prob = self.get_action(padded)
                value = self.critic(tf.expand_dims(padded, 0)) # may need to adjust dims?
                state, reward, truncated, terminated, _ = self.env.step(action)
                done = truncated or terminated

                # collect reward, action and log prob
                ep_rewards.append(reward)
                ep_values.append(tf.reshape(value, [-1]))
                ep_dones.append(done)

                batch_actions.append(action)
                batch_log_probs.append(log_prob)
                

                if done:
                    break
            
            # collect episode length and rewards
            batch_lens.append(ep_t + 1)
            batch_rewards.append(ep_rewards)
            batch_values.append(ep_values)
            batch_dones.append(ep_dones)

        # convert to tensors
        batch_states = tf.convert_to_tensor(batch_states, dtype = tf.float32)
        batch_actions = tf.convert_to_tensor(batch_actions, dtype = tf.float32)
        batch_log_probs = tf.convert_to_tensor(batch_log_probs, dtype = tf.float32)

        # batch_returns = self.compute_returns(batch_rewards)

        return batch_states, batch_actions, batch_log_probs, batch_rewards, batch_lens, batch_dones, batch_values
        
    def pad_input(self, batch_states, num):
        ### TODO: DEBUG - why does this come up with an empty batch_state?
        print(num)
        print(batch_states)
        # get the last 'window_size' states
        to_pad = batch_states[(num - self.window_size):num]
        print(tf.shape(to_pad))
        print(to_pad)

        # get size of padding - 0 if we have enough
        pad_size = np.max([0, window_size-tf.shape(to_pad)[0]], axis = 0)
        
        # get the padded vals
        padded = tf.pad(to_pad, [[0,pad_size],[0,0]])

        return padded

    def get_action(self, state):

        means = self.actor(tf.expand_dims(state, 0))
        policy = tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = self.stds)

        # TODO: squash / scale action (do I need to?)
        action = policy.sample()[0]
        log_prob = policy.log_prob(action)

        return action, log_prob

    def compute_returns(self, rewards):
        batch_returns = []
        for ep_rewards in reversed((rewards)):

            discounted_reward = 0 # reward so far

            for reward in reversed(ep_rewards):
                discounted_reward = reward + self.gamma * discounted_reward
                batch_returns.insert(0, discounted_reward)
            
        # convert to tensor
        batch_returns = tf.convert_to_tensor(batch_returns, dtype = tf.float32)

        return batch_returns


    def compute_gae(self, rewards, values, dones):
        batch_advantages = []

        for ep_rewards, ep_values, ep_dones in zip(rewards, values, dones):
            advantages = []
            last_advantage = 0

            for t in reversed(range(len(ep_rewards))):
                if t + 1 < len(ep_rewards):
                    delta = ep_rewards[t] + self.gamma * ep_values[t+1] * (1 - ep_dones[t+1]) - ep_values[t]
                else:
                    delta = ep_rewards[t] - ep_values[t]

                advantage = delta + self.gamma * self.lam * (1 - ep_dones[t]) * last_advantage
                last_advantage = advantage
                advantages.insert(0, advantage)

            batch_advantages.extend(advantages)
        
        return tf.squeeze(tf.convert_to_tensor(batch_advantages, dtype = tf.float32))

    def prepare_batch_for_lstm(self, batch_states):
        """wraps states into lstm sequence format"""

        initial_padded_states = tf.convert_to_tensor(
            [tf.pad(states[:(i+1)][::-1], [[0,np.max([0, self.window_size-(i+1)], axis = 0)],[0,0]]) for i in tf.range(0, self.window_size)]
            )
        remaining_states = tf.convert_to_tensor(
            [states[(i-self.window_size):i] for i in tf.range(self.window_size, tf.shape(states)[0])]
            )
        combined_states = tf.concat(
            [initial_padded_states, remaining_states],
            axis = 0)

        return combined_states
    
    def evaluate(self, batch_states, batch_actions):

        lstm_batch_states = self.prepare_batch_for_lstm(batch_states)
        V = tf.squeeze(self.critic(lstm_batch_states))
        mean = self.actor(lstm_batch_states)
        policy = tfp.distributions.MultivariateNormalDiag(mean, self.stds)

        log_probs = policy.log_prob(batch_actions)

        return V, log_probs, policy.entropy()

    def test(self, num_episodes):
        env = gym.make(self.env_name, render_mode = 'human')
        for i in range(num_episodes):
            state, _ = env.reset(seed = seed)
            done = False
            step = 0
            while not done:
                action, _ = self.get_action(state)
                next_state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                state = next_state

                step += 1
        env.close()

In [204]:
class LSTMNet(tf.keras.Model):

    def __init__(self, num_hidden_units, num_outputs):
        super().__init__()
        self.lstm = layers.LSTM(num_hidden_units)
        self.fc = layers.Dense(num_outputs)

    def call(self, x):
        x = self.lstm(x)
        return self.fc(x)

In [198]:
ENV = 'LunarLanderContinuous-v2'#'Pendulum-v1'#

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
env = gym.make(ENV)
eps = np.finfo(np.float32).eps.item()

optimizer_actor = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
optimizer_critic = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
num_actions = env.action_space.shape[0]
num_hidden_units = 64

actor = LSTMNet(num_hidden_units, num_actions)#Actor(num_actions, num_hidden_units)
critic = LSTMNet(num_hidden_units, 1)

log_dir = get_next_run('./logs') 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)





Saving logs to:  ./logs/run_206


In [214]:
agent = LSTMPPO(        
    env_name = ENV,
    gamma = 0.999,
    lam = 0.98,
    ent_coef = 0.01,
    vf_coef = 0.4,
    clip = 0.2,
    window_size = 4,
    timesteps_per_batch = 40,
    max_timesteps_per_episode = 20,
    n_updates_per_iteration = 2,
    actor = actor,
    optimizer_actor = optimizer_actor,
    critic = critic,
    optimizer_critic = optimizer_critic,
    summary_writer= summary_writer)
    
agent.learn(400)

1
[array([ 0.00163717,  1.4129504 ,  0.165805  ,  0.09023532, -0.0018902 ,
       -0.03755725,  0.        ,  0.        ], dtype=float32)]
tf.Tensor([1 8], shape=(2,), dtype=int32)
[array([ 0.00163717,  1.4129504 ,  0.165805  ,  0.09023532, -0.0018902 ,
       -0.03755725,  0.        ,  0.        ], dtype=float32)]
2
[array([ 0.00163717,  1.4129504 ,  0.165805  ,  0.09023532, -0.0018902 ,
       -0.03755725,  0.        ,  0.        ], dtype=float32), array([ 0.0033493 ,  1.4144068 ,  0.17500368,  0.06472376, -0.00563567,
       -0.07491585,  0.        ,  0.        ], dtype=float32)]
tf.Tensor([2 8], shape=(2,), dtype=int32)
[array([ 0.00163717,  1.4129504 ,  0.165805  ,  0.09023532, -0.0018902 ,
       -0.03755725,  0.        ,  0.        ], dtype=float32), array([ 0.0033493 ,  1.4144068 ,  0.17500368,  0.06472376, -0.00563567,
       -0.07491585,  0.        ,  0.        ], dtype=float32)]
3
[array([ 0.00163717,  1.4129504 ,  0.165805  ,  0.09023532, -0.0018902 ,
       -0.03755725,  0.

InvalidArgumentError: {{function_node __wrapped__Pad_device_/job:localhost/replica:0/task:0/device:CPU:0}} The first dimension of paddings must be the rank of inputs[2,2] [0] [Op:Pad]

In [182]:
batch_states, batch_actions, batch_log_probs, batch_rewards, batch_lens, batch_values, batch_dones = agent.rollout()

In [None]:
    def _prepare_batch_for_lstm(self, batch_states):
        """wraps states into lstm sequence format"""
        ## need the initial k 

        initial_padded_states = tf.convert_to_tensor([tf.pad(states[:(i+1)][::-1], [[0,np.max([0, self.window_size-(i+1)], axis = 0)],[0,0]]) for i in tf.range(0, self.window_size)])
        remaining_states = tf.convert_to_tensor([states[(i-self.window_size):i] for i in tf.range(self.window_size, tf.shape(states)[0])])
        combined_states = tf.concat([initial_padded_states, remaining_states], axis = 0)

        return combined_states

In [196]:
initial_padded_states = tf.convert_to_tensor([tf.pad(batch_states[:(i+1)][::-1], [[0,np.max([0, window_size-(i+1)], axis = 0)],[0,0]]) for i in tf.range(0, window_size)])
remaining_states = tf.convert_to_tensor([batch_states[(i-window_size):i] for i in tf.range(window_size, tf.shape(batch_states)[0])])
combined_states = tf.concat([initial_padded_states, remaining_states], axis = 0)

combined_states

<tf.Tensor: shape=(400, 4, 8), dtype=float32, numpy=
array([[[-0.00742321,  1.4109877 , -0.75190264, ...,  0.17031702,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.01479359,  1.4111564 , -0.7458402 , ...,  0.17382638,
          0.        ,  0.        ],
        [-0.00742321,  1.4109877 , -0.75190264, ...,  0.17031702,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[-0.02225256,  1.4115062 , -0.75249255, ...,  0.13054022,
          0.        ,  0.        ],
        [-0.01479359,  1.

In [None]:
initial_padded_states = tf.convert_to_tensor()

In [190]:
window_size = 4
for i in range(1, 10):
    # print(i, batch_states[:i])
    to_pad = batch_states[(i - window_size):i]

    # get size of padding - 0 if we have enough
    pad_size = np.max([0, window_size-tf.shape(to_pad)[0]], axis = 0)
    print(pad_size)
    
    # get the padded vals
    padded = tf.pad(to_pad, [[0,pad_size],[0,0]])
    # padded = tf.expand_dims(padded, 0)

4
4
4
0
0
0
0
0
0


In [191]:
padded

<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[-0.04458275,  1.4089558 , -0.7465483 , -0.06464802,  0.04220025,
         0.10646494,  0.        ,  0.        ],
       [-0.05202141,  1.4081697 , -0.7492876 , -0.03510208,  0.04756383,
         0.10728167,  0.        ,  0.        ],
       [-0.05937738,  1.4068006 , -0.73888695, -0.06095807,  0.05082559,
         0.06524126,  0.        ,  0.        ],
       [-0.06678429,  1.4048368 , -0.7452685 , -0.08743688,  0.05536009,
         0.09069827,  0.        ,  0.        ]], dtype=float32)>

In [159]:
V, _ = agent.evaluate(batch_states, batch_actions)

A_k = agent.compute_gae(batch_rewards, batch_dones, batch_values)

In [160]:
A_k

<tf.Tensor: shape=(1646, 1), dtype=float32, numpy=
array([[-102.52281],
       [-102.54475],
       [-104.07944],
       ...,
       [-121.56047],
       [-110.15443],
       [ -99.06322]], dtype=float32)>

In [164]:
tf.reduce_mean(tf.pow(tf.squeeze(A_k) + V - V, 2))

<tf.Tensor: shape=(), dtype=float32, numpy=30457.723>

In [157]:
tf.reduce_mean(tf.squeeze(A_k) + V - V)

<tf.Tensor: shape=(), dtype=float32, numpy=-1.1761721e+20>

In [153]:
batch_returns

<tf.Tensor: shape=(187,), dtype=float32, numpy=
array([ -35.858765,  -36.00548 ,  -36.13461 ,  -35.78525 ,  -35.179222,
        -34.822395,  -34.400253,  -34.627316,  -36.89905 ,  -36.64887 ,
        -35.568336,  -35.437412,  -35.054546,  -36.05531 ,  -37.551857,
        -38.17431 ,  -38.348904,  -38.044655,  -37.409275,  -36.980873,
        -36.51348 ,  -36.969887,  -36.618206,  -36.243736,  -35.851246,
        -36.930576,  -36.50202 ,  -36.06474 ,  -36.534584,  -37.81839 ,
        -37.37968 ,  -38.743984,  -40.301453,  -41.512123,  -41.2096  ,
        -40.90464 ,  -40.600693,  -43.584766,  -45.699337,  -47.365635,
        -47.571312,  -47.368546,  -47.170036,  -49.245472,  -50.742073,
        -50.734814,  -52.331966,  -53.164597,  -53.96962 ,  -55.591103,
        -55.516636,  -56.25157 ,  -58.49578 ,  -59.12402 ,  -59.894974,
        -60.31041 ,  -63.66343 ,  -66.03956 ,  -67.37953 ,  -66.11544 ,
        -65.71556 ,  -66.781815,  -65.89396 ,  -66.83498 ,  -66.22196 ,
        -68.4855

In [107]:
env = gym.make(ENV, render_mode='human')
for i in range(30):
    state, _ = env.reset()
env.close()

In [104]:
env.render_mode = 'human'
env.render()
state, _ = env.reset()
env.close()

AttributeError: property 'render_mode' of 'TimeLimit' object has no setter

In [61]:
batch_states, batch_actions, batch_log_probs, batch_returns, batch_lens = agent.rollout()

### TODO: Where does the gradient tape start for tf? here?
V, _ = agent.evaluate(batch_states, batch_actions)

A_k = batch_returns - V

# normalise advantages
A_k = (A_k - tf.reduce_mean(A_k)) / (tf.math.reduce_std(A_k) + 1.0e-10)

for _ in range(1):
    _, curr_log_probs = agent.evaluate(batch_states, batch_actions)
    # this is the start of our computation graph?
    ratios = tf.exp(curr_log_probs - tf.squeeze(batch_log_probs))

    # calc surrogate losses
    surr1 = ratios * A_k
    surr2 = tf.clip_by_value(ratios, 1 - agent.clip, 1 + agent.clip) * A_k

    # actor_loss = tf.math.minimum(surr1, surr2)

In [68]:
tf.reduce_mean(-tf.math.minimum(surr1, surr2))

<tf.Tensor: shape=(), dtype=float32, numpy=-8.606018e-08>

In [55]:
ratios

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1.        , 1.0000002 , 0.99999976, 1.        , 1.        ,
       1.        , 1.        , 1.0000002 , 1.        , 1.        ,
       1.        , 1.0000005 , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.0000002 , 1.0000002 , 1.        ],
      dtype=float32)>