## RL in tensorflow
Some experiments to learn how to use tensorflow to do RL.

We start with this demo: https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic

Also tried this guy: https://github.com/philtabor/Actor-Critic-Methods-Paper-To-Code/blob/master/ActorCritic/tf2/agent.py 

This example was useful too for calculating the loss: https://blog.tensorflow.org/2018/07/deep-reinforcement-learning-keras-eager-execution.html

Ultimately it seemed that using tfp.distributions.Categorical might have been slightly less stable than the blog above? But it seems that a key mistake that I made here was not using the _actual_ actions. Instead I was re-sampling from the probability distribution. This made a big difference. the softmax_cross_entropy_with_logits_v2 seems to help a lot.

Appears to be quite unstable some times - can just collapse into using a single action and achieve very bad performance. Not sure why.

Another troublesome occurrence is when the actor loss and the critic loss tend to diverge, with the actor loss growing faster than the critic loss. The critic loss looks to try to catch the actor loss. Results in the total loss to diverge.

Have tried to avoid this by setting entropy regularisation high.

In [1]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm
import os

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [2]:
%load_ext tensorboard

In [46]:
class ActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        
        # self.critic1 = layers.Dense(num_hidden_units, activation = 'relu')
        # self.actor1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.common = layers.Dense(num_hidden_units, activation = 'relu')
        self.actor = layers.Dense(num_actions,
        #  activation = 'softmax'
        )
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        # c = self.critic1(inputs)
        # a = self.actor1(inputs)
        x = self.common(inputs)
        return self.actor(x), self.critic(x)

In [214]:
class Agent():

    def __init__(
        self, 
        gamma: float, 
        entropy_coef: float,
        model: tf.keras.Model, 
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer):

        # discount rate
        self.gamma = tf.constant(gamma)
        self.entropy_coef = tf.constant(entropy_coef)
        self.model = model
        self.optimizer = optimizer

        # creates a dictionary of tensor arrays to write to
        self.memory = self._init_memory()
        # self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

    def _init_memory(self):
        return {
                'action' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True),
                'state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'next_state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'action_probs': tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'values' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'rewards' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'done' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        }

    
    def _get_action(self, state):

        # state = tf.expand_dims(tf.constant(state, tf.float32), 0)
        action_logits, value = self.model(
                tf.expand_dims(tf.constant(state, tf.float32), 0)
            )
        action_probs = tfp.distributions.Categorical(logits=action_logits)
        action = action_probs.sample()[0]
        # action_probs = tf.nn.softmax(action_logits)
        # action = np.random.choice(2, p=action_probs.numpy()[0])

        return action, action_probs, value
    
    def get_action(self, state, step):

        # run the model
        action, _, _ = self._get_action(state)

        return action

    def log(self, action, state, next_state, reward, done, step):

        """
        Logs results into memory - not all used necessarily
        """

        self.memory['action'].write(
            step, 
            tf.constant(action, tf.int32)
            ).mark_used()

        self.memory['state'].write(
            step, 
            tf.constant(state, tf.float32)
            ).mark_used()

        self.memory['next_state'].write(
            step, 
            tf.constant(next_state, tf.float32)
            ).mark_used()

        self.memory['rewards'].write(
            step, 
            tf.constant(reward, tf.float32)
            ).mark_used()

        self.memory['done'].write(
            step, 
            tf.constant(done, tf.int32)
            ).mark_used()

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute expected returns"""

        n = tf.shape(rewards)[0] 
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # start at last reward and then accumulate reward sums into returns array
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            discounted_sum = reward + self.gamma * discounted_sum # discounted_sum= 0 for last reward (i.e. first element in loop)
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1] # reverse order back to original

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) / 
                        (tf.math.reduce_std(returns) + eps))
        
        return returns #tf.expand_dims(returns, 1)

    def update(self, episode: int):

        states = tf.expand_dims(self.memory['state'].gather([i for i in tf.range(self.memory['state'].size())]), 1)
        rewards = self.memory['rewards'].gather([i for i in tf.range(self.memory['rewards'].size())])
        actions = self.memory['action'].gather([i for i in tf.range(self.memory['action'].size())])
        # tf.one_hot(
        #     self.memory['action'].gather([i for i in tf.range(self.memory['action'].size())]),
        #     2,
        #     dtype = tf.int32
        # )
        returns = self.get_expected_return(rewards = rewards, standardize=True)

        with tf.GradientTape() as tape:

            action_logits, values = self.model(states)

            # calculate the loss values
            loss = self.compute_loss(
                actions, 
                action_logits, 
                values,
                returns,
                episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        # grads = [tf.clip_by_norm(g, 1) for g in grads]
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()
        
    def compute_loss(
        self,
        actions: tf.Tensor,
        action_logits: tf.Tensor,
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Computes combined actor-critic loss"""

        advantage = returns - tf.squeeze(values)

        critic_loss = tf.math.square(advantage)
        # critic_loss = self.loss(values, returns)
        action_probs = tfp.distributions.Categorical(logits=action_logits)

        # action_probs = tf.squeeze(tf.nn.softmax(action_logits))

        entropy_loss = -self.entropy_coef * action_probs.entropy()
        #tf.reduce_sum(action_probs * tf.math.log(action_probs + eps), axis = 1)

        actor_loss = -action_probs.log_prob(actions) * advantage
        #tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels=actions, logits = tf.squeeze(action_logits)) * advantage

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

def get_next_run(log_dir):
    next_run = max([0]+[int(j) for j in [i.split('_')[-1] for i in os.listdir(log_dir)] if j.isdigit()]) + 1
    return log_dir + f'/run_{next_run}'

def train(agent, env, num_episodes, seed):
    
    for i in range(num_episodes):
        state, _ = env.reset(seed = seed)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            done = terminated or truncated
            agent.log(action, state, next_state, reward, done, step)
            state = next_state

            step += 1
        agent.update(i)

def test(agent, env, num_episodes, seed):

    for i in range(num_episodes):
        state, _ = env.reset(seed = seed)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            done = terminated or truncated
            state = next_state

            step += 1
    env.close()


In [215]:

## create tensorboard logs
LOGS = './logs'
if not os.path.exists(LOGS):
    os.mkdir(LOGS)

In [216]:
env = gym.make('CartPole-v1')

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
num_actions = env.action_space.n #.shape[0] for continuous
num_hidden_units = 256
model = ActorCritic(num_actions, num_hidden_units)

log_dir = get_next_run(LOGS) 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
agent = Agent(0.99, 0.5, model, optimizer, summary_writer)

NUM_EPISODES=300
train(agent, env, NUM_EPISODES, seed)


Saving logs to:  ./logs/run_102


In [158]:

env = gym.make('CartPole-v1', render_mode='human')
test(agent, env, 10, 42)

In [192]:
# state, _ = env.reset()
state = np.random.rand(10,1,4)

action, action_probs, value = agent._get_action(state)

In [211]:
dist = tfp.distributions.Categorical(logits= tf.constant([1.,3.]))
dist.prob(tf.constant([0,1]))

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.11920291, 0.880797  ], dtype=float32)>

In [213]:
p1 = tf.nn.softmax(tf.constant([1., 3.]))
print(p1)
print(tf.reduce_sum(-tf.math.log(p1) * p1, axis=0))

tf.Tensor([0.11920291 0.880797  ], shape=(2,), dtype=float32)
tf.Tensor(0.36533388, shape=(), dtype=float32)


In [212]:
dist.entropy()

<tf.Tensor: shape=(), dtype=float32, numpy=0.36533386>

In [194]:
action_probs.logits

<tf.Tensor: shape=(1, 10, 1, 2), dtype=float32, numpy=
array([[[[ -603.4104  ,   603.86115 ]],

        [[-2137.5083  ,  2136.7122  ]],

        [[-1716.2794  ,  1715.7451  ]],

        [[ -940.38794 ,   940.67737 ]],

        [[ -401.54602 ,   402.21472 ]],

        [[-1815.1519  ,  1814.3895  ]],

        [[  -80.182304,    81.40192 ]],

        [[-1866.38    ,  1865.4476  ]],

        [[-1512.7196  ,  1511.8994  ]],

        [[-2159.2698  ,  2158.3687  ]]]], dtype=float32)>

In [195]:
probs1 = tf.nn.softmax(action_probs.logits)
probs2 = action_probs.prob(action)

In [196]:
probs1

<tf.Tensor: shape=(1, 10, 1, 2), dtype=float32, numpy=
array([[[[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]],

        [[0., 1.]]]], dtype=float32)>

In [197]:
probs2

<tf.Tensor: shape=(1, 10, 1), dtype=float32, numpy=
array([[[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]]], dtype=float32)>