In [1]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm
import os

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [2]:
# Load the TensorBoard IPython extension so the summaries written under ./logs
# can be inspected from inside the notebook.
%load_ext tensorboard

In [3]:
class ContinuousActor(tf.keras.Model):
    """Gaussian policy head that turns features into a diagonal multivariate normal.

    Two parallel dense layers produce, per action dimension, a mean (linear
    activation) and a log-scale (ReLU activation). The log-scale is clipped to
    ``[1e-3, 1]`` and then exponentiated, so every standard deviation handed to
    the distribution lies in ``[exp(1e-3), e]``.

    Args:
        num_actions: Number of action dimensions (event size of the distribution).
        num_hidden_units: Accepted but not used by this head.

    Note:
        Earlier revisions of this cell carried commented-out alternatives
        (tanh-activated means, a constant zero / 0.25 scale); they are not active.
    """
    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        # Unbounded mean for each action dimension.
        self.means = layers.Dense(num_actions)
        # Non-negative raw log-scale for each action dimension.
        self.stds = layers.Dense(num_actions, activation='relu')

    def call(self, inputs: tf.Tensor):
        """Return the action distribution for a batch of feature vectors."""
        loc = self.means(inputs)
        log_scale = tf.clip_by_value(self.stds(inputs), 1.0e-3, 1)
        scale = tf.exp(log_scale)
        return tfp.distributions.MultivariateNormalDiag(loc=loc, scale_diag=scale)


class ActorCritic(tf.keras.Model):
    """Actor and critic heads sharing one ReLU hidden layer.

    Args:
        num_actions: Number of action dimensions passed to the actor head.
        num_hidden_units: Width of the shared dense layer.
    """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        # Shared feature extractor feeding both heads.
        self.common = layers.Dense(num_hidden_units, activation='relu')
        # Policy head: returns a distribution (location and scale of an MVN).
        self.actor = ContinuousActor(num_actions, num_hidden_units)
        # Value head: one scalar estimate per input row.
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        """Return ``(policy_distribution, value_estimate)`` for ``inputs``."""
        features = self.common(inputs)
        policy = self.actor(features)
        value = self.critic(features)
        return policy, value




In [4]:
class Agent():
    """Episodic advantage actor-critic agent for tanh-squashed continuous actions.

    The agent samples from the distribution returned by ``model``, squashes the
    sample with ``tanh``, buffers one episode of transitions and, on ``update``,
    takes a single gradient step on the combined actor / critic / entropy loss
    built from discounted Monte-Carlo returns.

    Args:
        gamma: Discount rate applied to future rewards.
        entropy_coef: Weight of the entropy bonus in the loss.
        vf_coef: Weight of the critic (value-function) loss.
        model: Callable Keras model returning ``(policy_distribution, values)``.
        optimizer: Optimizer the model is compiled with.
        summary_writer: ``tf.summary`` writer that receives training scalars.
        debug: When true, the policy mean / stddev of every sampled action is logged.
    """

    # Divide-by-zero guard used when standardising returns. Defined here so the
    # class does not depend on a notebook-level global created in a later cell.
    _EPS = np.finfo(np.float32).eps.item()
    # Stored actions are clipped to +/- this bound before atanh so that an action
    # which saturated to exactly +/-1 in float32 does not map to +/-inf.
    _ACTION_CLIP = 1.0 - 1.0e-6

    def __init__(
        self,
        gamma: float,
        entropy_coef: float,
        vf_coef: float,
        model: tf.keras.Model,
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer,
        debug = False):

        # discount rate
        self.gamma = tf.constant(gamma)
        self.entropy_coef = tf.constant(entropy_coef)
        self.vf_coef = tf.constant(vf_coef)
        self.model = model
        self.optimizer = optimizer

        # creates a dictionary of tensor arrays to write to
        self.memory = self._init_memory()
        self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.NONE)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

        # switch for analysis
        self.DEBUG = debug
        if self.DEBUG:
            # global step counter for the per-action debug summaries
            self.debug_val = 0

    def _init_memory(self):
        """Return a fresh, empty per-episode buffer (one TensorArray per field)."""
        return {
                'action' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'next_state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'action_probs': tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'values' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'rewards' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'done' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        }

    def _get_action(self, state):
        """Run the model on one state; return ``(tanh(sample), policy, value)``."""
        policy, value = self.model(
                tf.expand_dims(tf.constant(state, tf.float32), 0)
            )
        # drop the batch dimension that was added for the forward pass
        action = policy.sample()[0]

        return tf.tanh(action), policy, value

    def get_action(self, state, step):
        """Sample a squashed action for ``state``.

        ``step`` is accepted for interface compatibility and is not used.
        """
        action, policy, _ = self._get_action(state)

        if self.DEBUG:
            means = policy.mean()[0]
            stds = policy.stddev()[0]
            with self.summary_writer.as_default():
                # One scalar per action dimension (tags mean1, mean2, ..., std1, std2, ...),
                # so action spaces of any size are supported.
                for dim in range(means.shape[-1]):
                    tf.summary.scalar(f'mean{dim + 1}', means[dim], step = self.debug_val)
                for dim in range(stds.shape[-1]):
                    tf.summary.scalar(f'std{dim + 1}', stds[dim], step = self.debug_val)

            self.debug_val += 1

        return action

    def _remember(self, key, step, value, dtype):
        """Write ``value`` at index ``step`` of the ``key`` buffer.

        ``TensorArray.write`` returns the array to use from then on, so the
        result is stored back instead of being discarded.
        """
        self.memory[key] = self.memory[key].write(step, tf.constant(value, dtype))

    def log(self, action, state, next_state, reward, done, step):
        """
        Logs results into memory - not all used necessarily
        """
        self._remember('action', step, action, tf.float32)
        self._remember('state', step, state, tf.float32)
        self._remember('next_state', step, next_state, tf.float32)
        self._remember('rewards', step, reward, tf.float32)
        self._remember('done', step, done, tf.int32)

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute the discounted return for every step of an episode.

        Args:
            rewards: 1-D tensor of per-step rewards in chronological order.
            standardize: When true, shift and scale the returns to zero mean
                and unit standard deviation.

        Returns:
            1-D float32 tensor with one return per step.
        """
        n = tf.shape(rewards)[0]
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # start at last reward and then accumulate reward sums into returns array
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            # discounted_sum is 0 for the last reward (first element in this loop)
            discounted_sum = reward + self.gamma * discounted_sum
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1] # reverse order back to original

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) /
                        (tf.math.reduce_std(returns) + self._EPS))

        return returns

    def update(self, episode: int):
        """Take one gradient step on the buffered episode, then clear the buffer.

        Args:
            episode: Episode index, used as the step of the logged summaries.
        """
        states = self.memory['state'].stack()
        rewards = self.memory['rewards'].stack()
        actions = self.memory['action'].stack()
        returns = self.get_expected_return(rewards = rewards, standardize=True)

        with tf.GradientTape() as tape:

            policy, values = self.model(states)

            # calculate the loss values
            loss = self.compute_loss(
                actions,
                policy,
                values,
                returns,
                episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()

    def compute_loss(
        self,
        actions: tf.Tensor,
        policy: "tfp.distributions.Distribution",
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Computes combined actor-critic loss.

        Args:
            actions: Squashed actions that were executed, one row per step.
            policy: Distribution over pre-tanh actions returned by the model.
            values: Critic output with a trailing singleton dimension.
            returns: Discounted returns, one per step.
            episode: Episode index, used as the step of the logged summaries.

        Returns:
            Scalar loss (mean over the episode of actor + critic + entropy terms).
        """
        # Squeeze only the trailing value dimension so a one-step episode keeps rank 1.
        values = tf.squeeze(values, axis=-1)
        advantage = returns - values

        # Huber is symmetric in its arguments; they are passed as (y_true, y_pred).
        critic_loss = self.vf_coef * self.loss(returns, values)

        entropy_loss = -self.entropy_coef * policy.entropy()

        # The stored actions are a = tanh(u) with u sampled from `policy`, so the
        # density of `a` follows from the change of variables
        #     log pi(a) = log N(u) - sum_i log(1 - a_i^2),   u = atanh(a).
        # The Gaussian term must be evaluated at u (not at a), and the Jacobian
        # term is summed over action dimensions *before* being combined with the
        # per-step log-probability, so nothing is broadcast across dimensions.
        squashed = tf.clip_by_value(actions, -self._ACTION_CLIP, self._ACTION_CLIP)
        pre_tanh = tf.math.atanh(squashed)
        log_jacobian = tf.reduce_sum(
            tf.math.log(1.0 - tf.square(squashed) + 1.0e-10), axis=-1)
        log_probs = policy.log_prob(pre_tanh) - log_jacobian

        # stopping gradient for advantage dramatically improves stability!
        actor_loss = -log_probs * tf.stop_gradient(advantage)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

def get_next_run(log_dir):
    """Return the path of the next numbered run directory under ``log_dir``.

    Every entry of ``log_dir`` is inspected; the text after the last ``_`` in
    its name counts as a run number when it consists only of decimal digits.
    The result is ``log_dir + '/run_<n>'`` where ``n`` is one more than the
    largest run number found (``1`` when there is none). The directory itself
    is not created.

    Args:
        log_dir: Directory that holds the run sub-directories. A directory that
            does not exist yet is treated as empty instead of raising.

    Returns:
        The path string for the next run.
    """
    try:
        entries = os.listdir(log_dir)
    except FileNotFoundError:
        entries = []

    suffixes = (entry.split('_')[-1] for entry in entries)
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    run_numbers = [int(suffix) for suffix in suffixes if suffix.isdecimal()]
    next_run = max(run_numbers, default=0) + 1
    return log_dir + f'/run_{next_run}'

def train(agent, env, num_episodes, seed):
    """Run ``num_episodes`` training episodes, updating the agent after each one.

    Args:
        agent: Object providing ``get_action(state, step)``,
            ``log(action, state, next_state, reward, done, step)`` and
            ``update(episode)``.
        env: Gymnasium-style environment (``reset(seed=...)`` / ``step(action)``).
        num_episodes: Number of episodes to run.
        seed: Seed handed to the environment on the first reset only.
    """
    for episode in range(num_episodes):
        # Seed once: passing the same seed to every reset would restart the
        # environment's RNG each time and replay an identical scenario in every episode.
        state, _ = env.reset(seed = seed if episode == 0 else None)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, _ = env.step(np.array(action))
            done = terminated or truncated
            agent.log(action, state, next_state, reward, done, step)
            state = next_state

            step += 1
        agent.update(episode)

def test(agent, env, num_episodes, seed):

    for i in range(num_episodes):
        state, _ = env.reset(seed = seed)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            done = terminated or truncated
            state = next_state

            step += 1
    env.close()


In [5]:
## create tensorboard logs
# Root directory for TensorBoard output; get_next_run() derives a run path inside it.
LOGS = './logs'
# exist_ok makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(LOGS, exist_ok=True)

In [6]:
# --- Feed-forward actor-critic: configuration and training run ---
# Seed TensorFlow and NumPy before the model is built.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
env = gym.make('LunarLanderContinuous-v2')
# Machine epsilon for float32, kept as a notebook-level name.
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
# One policy output per dimension of the environment's action space.
num_actions = env.action_space.shape[0]
num_hidden_units = 256
model = ActorCritic(num_actions, num_hidden_units)

# Each run writes its summaries to a new run_<n> path under LOGS.
log_dir = get_next_run(LOGS) 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
# Positional arguments: gamma=0.99, entropy_coef=0.0, vf_coef=0.4.
agent = Agent(0.99, 0.0, 0.4, model, optimizer, summary_writer, debug=True)

NUM_EPISODES=600
train(agent, env, NUM_EPISODES, seed)


Saving logs to:  ./logs/run_143


In [123]:
# Exploratory cell: wrap the current environment in NormalizeObservation and
# display the observation returned by one reset.
# NOTE(review): this rebinds the global `env`, so re-running the cell wraps the
# already-wrapped environment again; the agents in this notebook are trained on
# environments created with gym.make() directly.
from gymnasium.wrappers import NormalizeObservation
env = NormalizeObservation(env)
state, _ = env.reset()
state

array([-4.84584044e-05,  8.14826722e-03, -4.40632277e-03, -2.24742149e-03,
        5.62189279e-05,  1.10504436e-03,  0.00000000e+00,  0.00000000e+00])

In [7]:
# Roll out the trained feed-forward agent for a few episodes in an environment
# created with render_mode='human'; test() closes the environment afterwards.
NUM_EPISODES=5
env = gym.make('LunarLanderContinuous-v2', render_mode='human')
test(agent, env, NUM_EPISODES, seed)

## RNN / sequence model
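This section builds an actor-critic agent whose shared layer is an LSTM, so the policy is fed a short window of recent states rather than a single observation.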

In [8]:
# Scratch model holding a single LSTM layer. `Sequential` takes a *list* of
# layers; passing a bare layer only works on Keras versions that special-case it.
lstm = tf.keras.Sequential(
    [layers.LSTM(128)]
)



In [119]:
class ContinuousActor(tf.keras.Model):
    """Gaussian policy head with a learned mean and a fixed unit standard deviation.

    The log-scale is a constant zero vector, so ``scale_diag`` is ``exp(0) == 1``
    for every action dimension regardless of the input.

    NOTE: this class reuses the name of the ``ContinuousActor`` defined earlier
    in the notebook; once this cell has run, any model built afterwards picks up
    this definition.

    Args:
        num_actions: Number of action dimensions (event size of the distribution).
        num_hidden_units: Accepted but not used by this head.
    """
    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        # Unbounded mean for each action dimension.
        self.means = layers.Dense(num_actions)
        # Input-independent log-scale: always a zero vector of length num_actions.
        self.stds = lambda _features: tf.zeros(num_actions)

    def call(self, inputs: tf.Tensor):
        """Return the action distribution for a batch of feature vectors."""
        loc = self.means(inputs)
        log_scale = self.stds(inputs)
        scale = tf.exp(log_scale)
        return tfp.distributions.MultivariateNormalDiag(loc=loc, scale_diag=scale)


class RecurrentActorCritic(tf.keras.Model):
    """Actor and critic heads on top of a shared LSTM layer.

    Args:
        num_actions: Number of action dimensions passed to the actor head.
        num_hidden_units: Number of units in the shared LSTM layer.
    """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        # Shared sequence encoder feeding both heads.
        self.common = layers.LSTM(num_hidden_units)
        # Policy head: returns a distribution (location and scale of an MVN).
        self.actor = ContinuousActor(num_actions, num_hidden_units)
        # Value head: one scalar estimate per input sequence.
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        """Return ``(policy_distribution, value_estimate)`` for a batch of sequences."""
        encoded = self.common(inputs)
        policy = self.actor(encoded)
        value = self.critic(encoded)
        return policy, value




In [193]:
class RecurrentAgent():
    """Episodic advantage actor-critic agent that feeds a window of states to the model.

    For step ``t`` the model input is the last ``window_size`` states ending at
    (and including) state ``t``, oldest first, left-padded with zero rows when
    fewer states exist. The same layout is used when acting (``_pad_input``)
    and when training (``_prepare_states_for_lstm``), so the policy is trained
    on exactly the inputs it acted on.

    Args:
        gamma: Discount rate applied to future rewards.
        entropy_coef: Weight of the entropy bonus in the loss.
        vf_coef: Weight of the critic (value-function) loss.
        window_size: Number of time steps in every model input sequence.
        model: Callable Keras model returning ``(policy_distribution, values)``
            for an input of shape ``[batch, window_size, state_dim]``.
        optimizer: Optimizer the model is compiled with.
        summary_writer: ``tf.summary`` writer that receives training scalars.
        debug: When true, the policy mean / stddev of every sampled action is logged.
    """

    # Divide-by-zero guard used when standardising returns. Defined here so the
    # class does not depend on a notebook-level global created in another cell.
    _EPS = np.finfo(np.float32).eps.item()
    # Stored actions are clipped to +/- this bound before atanh so that an action
    # which saturated to exactly +/-1 in float32 does not map to +/-inf.
    _ACTION_CLIP = 1.0 - 1.0e-6

    def __init__(
        self,
        gamma: float,
        entropy_coef: float,
        vf_coef: float,
        window_size: int,
        model: tf.keras.Model,
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer,
        debug = False):

        # discount rate
        self.gamma = tf.constant(gamma)
        self.entropy_coef = tf.constant(entropy_coef)
        self.vf_coef = tf.constant(vf_coef)
        self.window_size = tf.constant(window_size)
        self.model = model
        self.optimizer = optimizer

        # creates a dictionary of tensor arrays to write to
        self.memory = self._init_memory()
        self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.NONE)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

        # switch for analysis
        self.DEBUG = debug
        if self.DEBUG:
            # global step counter for the per-action debug summaries
            self.debug_val = 0

    def _init_memory(self):
        """Return a fresh, empty per-episode buffer (one TensorArray per field)."""
        return {
                'action' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'next_state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'action_probs': tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'values' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'rewards' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'done' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        }

    def _pad_input(self, new_state):
        """Build the model input for acting on ``new_state``.

        Combines up to ``window_size - 1`` most recently logged states with
        ``new_state`` (oldest first, ``new_state`` last) and left-pads with zero
        rows to exactly ``window_size`` steps.

        Returns:
            Tensor of shape ``[1, window_size, state_dim]``.
        """
        window = int(self.window_size)
        current = tf.expand_dims(tf.constant(new_state, tf.float32), 0)

        n_logged = int(self.memory['state'].size())
        if n_logged > 0 and window > 1:
            # keep only as much history as fits next to the current state
            first = max(0, n_logged - (window - 1))
            history = self.memory['state'].gather(list(range(first, n_logged)))
            sequence = tf.concat([history, current], axis=0)
        else:
            sequence = current

        # zero rows go in front so the current state is always the last time step
        missing = window - int(sequence.shape[0])
        padded = tf.pad(sequence, [[missing, 0], [0, 0]])

        return tf.expand_dims(padded, 0)

    def _get_action(self, state):
        """Run the model on the window ending at ``state``; return ``(tanh(sample), policy, value)``."""
        padded = self._pad_input(state)

        policy, value = self.model(padded)
        # drop the batch dimension that was added for the forward pass
        action = policy.sample()[0]

        return tf.tanh(action), policy, value

    def get_action(self, state, step):
        """Sample a squashed action for ``state``.

        ``step`` is accepted for interface compatibility and is not used.
        """
        action, policy, _ = self._get_action(state)

        if self.DEBUG:
            means = policy.mean()[0]
            stds = policy.stddev()[0]
            with self.summary_writer.as_default():
                # One scalar per action dimension (tags mean1, mean2, ..., std1, std2, ...),
                # so action spaces of any size are supported.
                for dim in range(means.shape[-1]):
                    tf.summary.scalar(f'mean{dim + 1}', means[dim], step = self.debug_val)
                for dim in range(stds.shape[-1]):
                    tf.summary.scalar(f'std{dim + 1}', stds[dim], step = self.debug_val)

            self.debug_val += 1

        return action

    def _remember(self, key, step, value, dtype):
        """Write ``value`` at index ``step`` of the ``key`` buffer.

        ``TensorArray.write`` returns the array to use from then on, so the
        result is stored back instead of being discarded.
        """
        self.memory[key] = self.memory[key].write(step, tf.constant(value, dtype))

    def log(self, action, state, next_state, reward, done, step):
        """
        Logs results into memory - not all used necessarily
        """
        self._remember('action', step, action, tf.float32)
        self._remember('state', step, state, tf.float32)
        self._remember('next_state', step, next_state, tf.float32)
        self._remember('rewards', step, reward, tf.float32)
        self._remember('done', step, done, tf.int32)

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute the discounted return for every step of an episode.

        Args:
            rewards: 1-D tensor of per-step rewards in chronological order.
            standardize: When true, shift and scale the returns to zero mean
                and unit standard deviation.

        Returns:
            1-D float32 tensor with one return per step.
        """
        n = tf.shape(rewards)[0]
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # start at last reward and then accumulate reward sums into returns array
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            # discounted_sum is 0 for the last reward (first element in this loop)
            discounted_sum = reward + self.gamma * discounted_sum
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1] # reverse order back to original

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) /
                        (tf.math.reduce_std(returns) + self._EPS))

        return returns

    def _prepare_states_for_lstm(self, states):
        """Turn an episode of states into one input window per step.

        Row ``t`` of the result holds ``states[t - window_size + 1 : t + 1]``
        (oldest first, state ``t`` last), left-padded with zero rows for the
        first steps of the episode - the same layout ``_pad_input`` produces
        when acting. Episodes shorter than ``window_size`` are handled as well.

        Args:
            states: Tensor of shape ``[num_steps, state_dim]`` in chronological order.

        Returns:
            Tensor of shape ``[num_steps, window_size, state_dim]``.
        """
        window = int(self.window_size)
        n_steps = tf.shape(states)[0]

        # After left-padding with window-1 zero rows, the window that ends at
        # step t is simply rows t .. t+window-1 of the padded tensor.
        padded = tf.pad(states, [[window - 1, 0], [0, 0]])
        offsets = tf.range(n_steps)[:, tf.newaxis] + tf.range(window)[tf.newaxis, :]

        return tf.gather(padded, offsets)

    def update(self, episode: int):
        """Take one gradient step on the buffered episode, then clear the buffer.

        Args:
            episode: Episode index, used as the step of the logged summaries.
        """
        states = self._prepare_states_for_lstm(self.memory['state'].stack())
        rewards = self.memory['rewards'].stack()
        actions = self.memory['action'].stack()
        returns = self.get_expected_return(rewards = rewards, standardize=True)

        with tf.GradientTape() as tape:

            policy, values = self.model(states)

            # calculate the loss values
            loss = self.compute_loss(
                actions,
                policy,
                values,
                returns,
                episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()

    def compute_loss(
        self,
        actions: tf.Tensor,
        policy: "tfp.distributions.Distribution",
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Computes combined actor-critic loss.

        Args:
            actions: Squashed actions that were executed, one row per step.
            policy: Distribution over pre-tanh actions returned by the model.
            values: Critic output with a trailing singleton dimension.
            returns: Discounted returns, one per step.
            episode: Episode index, used as the step of the logged summaries.

        Returns:
            Scalar loss (mean over the episode of actor + critic + entropy terms).
        """
        # Squeeze only the trailing value dimension so a one-step episode keeps rank 1.
        values = tf.squeeze(values, axis=-1)
        advantage = returns - values

        # Huber is symmetric in its arguments; they are passed as (y_true, y_pred).
        critic_loss = self.vf_coef * self.loss(returns, values)

        entropy_loss = -self.entropy_coef * policy.entropy()

        # The stored actions are a = tanh(u) with u sampled from `policy`, so the
        # density of `a` follows from the change of variables
        #     log pi(a) = log N(u) - sum_i log(1 - a_i^2),   u = atanh(a).
        # The Gaussian term must be evaluated at u (not at a), and the Jacobian
        # term is summed over action dimensions *before* being combined with the
        # per-step log-probability, so nothing is broadcast across dimensions.
        squashed = tf.clip_by_value(actions, -self._ACTION_CLIP, self._ACTION_CLIP)
        pre_tanh = tf.math.atanh(squashed)
        log_jacobian = tf.reduce_sum(
            tf.math.log(1.0 - tf.square(squashed) + 1.0e-10), axis=-1)
        log_probs = policy.log_prob(pre_tanh) - log_jacobian

        # stopping gradient for advantage dramatically improves stability!
        actor_loss = -log_probs * tf.stop_gradient(advantage)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

In [194]:
# --- Recurrent (LSTM) actor-critic: configuration and training run ---
# This cell rebinds env, optimizer, model, log_dir and summary_writer from the
# feed-forward run above; the recurrent agent is stored separately as agent2.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
env = gym.make('LunarLanderContinuous-v2')
# Machine epsilon for float32, kept as a notebook-level name.
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
# One policy output per dimension of the environment's action space.
num_actions = env.action_space.shape[0]
num_hidden_units = 256
model = RecurrentActorCritic(num_actions, num_hidden_units)

# Each run writes its summaries to a new run_<n> path under LOGS.
log_dir = get_next_run(LOGS) 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
# Positional arguments: gamma=0.99, entropy_coef=0.0, vf_coef=0.4, window_size=8.
agent2 = RecurrentAgent(0.99, 0.0, 0.4, 8, model, optimizer, summary_writer, debug=True)

NUM_EPISODES=100
train(agent2, env, NUM_EPISODES, seed)

Saving logs to:  ./logs/run_153
