## RL in tensorflow
Some experiments to learn how to do reinforcement learning (RL) with TensorFlow.

We start with this demo: https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic

An alternative reference implementation to try: https://github.com/philtabor/Actor-Critic-Methods-Paper-To-Code/blob/master/ActorCritic/tf2/agent.py


In [1]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm
import os

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [2]:
%load_ext tensorboard

In [46]:
class ActorCritic(tf.keras.Model):
    """Actor and critic heads sharing a single hidden layer."""

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        """Create the shared trunk and the two output heads.

        Args:
            num_actions: width of the actor head (one output per action).
            num_hidden_units: width of the shared ReLU layer.
        """
        super().__init__()

        # Shared feature extractor feeding both heads.
        self.common = layers.Dense(units=num_hidden_units, activation="relu")
        # No activation on the actor head: it emits raw logits, and callers
        # apply the softmax themselves.
        self.actor = layers.Dense(units=num_actions)
        # Single linear unit for the state-value estimate.
        self.critic = layers.Dense(units=1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Return ``(action_logits, value)`` for a batch of states."""
        hidden = self.common(inputs)
        action_logits = self.actor(hidden)
        value = self.critic(hidden)
        return action_logits, value

In [155]:
class Agent():
    """Advantage actor-critic agent that learns from one full episode at a time.

    Transitions are buffered with ``log`` while an episode runs. ``update``
    then converts the buffered rewards into discounted returns, takes a single
    gradient step on the combined actor/critic/entropy loss and clears the
    buffer.
    """

    def __init__(
        self, 
        gamma: float, 
        entropy_coef: float,
        model: tf.keras.Model, 
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer):
        """Store hyper-parameters, model and optimizer, and create an empty buffer.

        Args:
            gamma: discount factor applied to future rewards.
            entropy_coef: weight of the entropy bonus in the loss.
            model: network returning ``(action_logits, value)`` for a batch of states.
            optimizer: optimizer used for the gradient step in ``update``.
            summary_writer: ``tf.summary`` writer that receives the scalar logs.
        """
        # discount rate
        self.gamma = tf.constant(gamma)
        self.entropy_coef = tf.constant(entropy_coef)
        self.model = model
        self.optimizer = optimizer

        # Guard against division by zero when standardising returns. Kept on
        # the instance so the class does not depend on a notebook-level global.
        self._eps = np.finfo(np.float32).eps.item()

        # dictionary of tensor arrays, one per logged quantity
        self.memory = self._init_memory()

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

    def _init_memory(self):
        """Return a fresh dict of empty, growable ``tf.TensorArray`` buffers."""
        dtypes = {
            'action': tf.int32,
            'state': tf.float32,
            'next_state': tf.float32,
            'action_probs': tf.float32,
            'values': tf.float32,
            'rewards': tf.float32,
            'done': tf.int32,
        }
        return {
            key: tf.TensorArray(dtype=dtype, size=0, dynamic_size=True)
            for key, dtype in dtypes.items()
        }

    def _get_action(self, state):
        """Run the model on a single state and sample an action.

        Returns:
            ``(action, action_probs, value)`` where ``action`` is drawn with
            ``np.random.choice`` from the softmax of the actor logits.
        """
        batch = tf.expand_dims(tf.constant(state, tf.float32), 0)
        action_logits, value = self.model(batch)

        action_probs = tf.nn.softmax(action_logits)
        probs = action_probs.numpy()[0]
        # The number of actions comes from the actor head instead of being
        # hard-coded to 2, so the agent is not tied to two-action environments.
        action = np.random.choice(len(probs), p=probs)

        return action, action_probs, value

    def get_action(self, state, step):
        """Sample an action for ``state``.

        ``step`` is accepted for interface compatibility and is not used.
        """
        action, _, _ = self._get_action(state)
        return action

    def log(self, action, state, next_state, reward, done, step):
        """Write one transition into the episode buffer at index ``step``.

        Not every logged field is consumed by ``update``.
        """
        entries = {
            'action': tf.constant(action, tf.int32),
            'state': tf.constant(state, tf.float32),
            'next_state': tf.constant(next_state, tf.float32),
            'rewards': tf.constant(reward, tf.float32),
            # int(...) so a Python or NumPy boolean flag is stored as 0/1.
            'done': tf.constant(int(done), tf.int32),
        }
        for key, value in entries.items():
            # TensorArray.write returns the array to use from now on; keep it
            # rather than relying on in-place mutation.
            self.memory[key] = self.memory[key].write(step, value)

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute the discounted return for every step of an episode.

        Args:
            rewards: 1-D sequence of per-step rewards in time order.
            standardize: when True, shift and scale the returns to zero mean
                and unit standard deviation.

        Returns:
            1-D float32 tensor with one return per step.
        """
        rewards = tf.cast(rewards, tf.float32)
        n = tf.shape(rewards)[0]
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # Walk the episode backwards, accumulating the discounted sum.
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            # discounted_sum is 0 for the last reward (first loop iteration)
            discounted_sum = reward + self.gamma * discounted_sum
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1]  # back to time order

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) /
                       (tf.math.reduce_std(returns) + self._eps))

        return returns

    def update(self, episode: int):
        """Take one gradient step on the buffered episode, then clear the buffer.

        Args:
            episode: episode index, used as the ``step`` of the summaries.
        """
        # Nothing logged: there is nothing to learn from, just reset.
        if int(self.memory['state'].size()) == 0:
            self.memory = self._init_memory()
            return

        states = self.memory['state'].stack()
        rewards = self.memory['rewards'].stack()
        action_ids = self.memory['action'].stack()
        returns = self.get_expected_return(rewards=rewards, standardize=True)

        with tf.GradientTape() as tape:
            # The forward pass must happen inside the tape for gradients to flow.
            action_logits, values = self.model(states)

            # One-hot depth follows the actor head width rather than a literal 2.
            actions = tf.one_hot(
                action_ids,
                action_logits.shape[-1],
                dtype=tf.int32
            )

            loss = self.compute_loss(
                actions,
                action_logits,
                values,
                returns,
                episode)

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()

    def compute_loss(
        self,
        actions: tf.Tensor,
        action_logits: tf.Tensor,
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Compute the combined actor-critic loss for one episode.

        Args:
            actions: one-hot encoding of the actions taken, one row per step.
            action_logits: actor logits, one row per step.
            values: critic estimates, one per step.
            returns: target returns, one per step.
            episode: episode index, used as the ``step`` of the summaries.

        Returns:
            Scalar tensor: mean over steps of actor + critic + entropy terms.
        """
        # Flatten explicitly instead of an axis-less squeeze, so a one-step
        # episode keeps its batch axis and (T, 1, A) inputs still work.
        num_actions = tf.shape(action_logits)[-1]
        logits = tf.reshape(action_logits, [-1, num_actions])
        labels = tf.reshape(tf.cast(actions, logits.dtype), [-1, num_actions])
        values = tf.reshape(values, [-1])
        returns = tf.reshape(tf.cast(returns, values.dtype), [-1])

        advantage = returns - values

        # The critic regresses its value estimate towards the observed return.
        critic_loss = tf.math.square(advantage)

        log_probs = tf.nn.log_softmax(logits)
        probs = tf.nn.softmax(logits)

        # Entropy H = -sum(p * log p) >= 0. It enters the loss with a negative
        # sign so that minimising the loss *increases* entropy (exploration
        # bonus); a positive sign would penalise exploration instead.
        entropy = -tf.reduce_sum(probs * log_probs, axis=-1)
        entropy_loss = -self.entropy_coef * entropy

        # Policy-gradient term: -log pi(a|s) * advantage. The advantage is a
        # fixed weight here, so stop_gradient keeps the actor loss from
        # back-propagating into the critic.
        neg_log_prob = -tf.reduce_sum(labels * log_probs, axis=-1)
        actor_loss = neg_log_prob * tf.stop_gradient(advantage)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

def get_next_run(log_dir):
    """Return the path of the next numbered run directory inside ``log_dir``.

    Every entry of ``log_dir`` whose text after the last underscore is a
    decimal number counts as an existing run; the result uses the highest such
    number plus one. A missing or empty ``log_dir`` yields ``run_1``.

    Args:
        log_dir: directory that holds the ``run_<n>`` sub-directories.

    Returns:
        ``log_dir`` joined with ``run_<n>``. The path is not created.
    """
    # A missing log directory simply means no runs have been recorded yet.
    entries = os.listdir(log_dir) if os.path.isdir(log_dir) else []
    suffixes = (entry.split('_')[-1] for entry in entries)
    # isdecimal() only accepts strings that int() can parse.
    run_numbers = [int(suffix) for suffix in suffixes if suffix.isdecimal()]
    next_run = max(run_numbers, default=0) + 1
    return os.path.join(log_dir, f'run_{next_run}')

def train(agent, env, num_episodes, seed):
    """Run ``num_episodes`` training episodes, updating the agent after each one.

    Args:
        agent: object providing ``get_action(state, step)``,
            ``log(action, state, next_state, reward, done, step)`` and
            ``update(episode)``.
        env: Gymnasium-style environment (``reset`` returning ``(obs, info)``,
            ``step`` returning a 5-tuple).
        num_episodes: number of episodes to run.
        seed: seed passed to the first ``env.reset`` call.
    """
    for episode in range(num_episodes):
        # Seed only the first reset. Passing the same seed on every reset
        # restarts the environment's RNG, so every episode would begin from
        # the identical initial state.
        state, _ = env.reset(seed=seed if episode == 0 else None)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            # An episode ends on a terminal state or on truncation.
            done = terminated or truncated
            agent.log(action, state, next_state, reward, done, step)
            state = next_state

            step += 1
        agent.update(episode)

def test(agent, env, num_episodes, seed):

    for i in range(num_episodes):
        state, _ = env.reset(seed = seed)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            done = terminated or truncated
            state = next_state

            step += 1
    env.close()


In [156]:

# Directory that holds one sub-directory of TensorBoard logs per run.
LOGS = './logs'
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(LOGS, exist_ok=True)

In [157]:
# Build the environment, model and agent, then start a training run.
env = gym.make('CartPole-v1')

# Seed TensorFlow and NumPy; the same value is also handed to train() below.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Machine epsilon for float32. Agent.get_expected_return and
# Agent.compute_loss read this name as a global, so it must be defined
# before train() runs.
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
num_actions = env.action_space.n #.shape[0] for continuous
num_hidden_units = 256
model = ActorCritic(num_actions, num_hidden_units)

# Each run writes its summaries to a fresh ./logs/run_<n> directory.
log_dir = get_next_run(LOGS) 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
# Positional arguments: gamma=0.99, entropy_coef=0.01.
agent = Agent(0.99, 0.01, model, optimizer, summary_writer)

# The recorded output of this cell ends in a KeyboardInterrupt, i.e. the run
# was stopped by hand.
NUM_EPISODES=600
train(agent, env, NUM_EPISODES, seed)


Saving logs to:  ./logs/run_97


KeyboardInterrupt: 

In [89]:
# Scratch check of how the op treats labels. These labels have shape (3, 1)
# against logits of shape (3, 2). The recorded result (1.63, 1.39, 0.0) is
# consistent with each label being broadcast across both classes, not with it
# being read as a class index.
tf.nn.softmax_cross_entropy_with_logits(labels = [[1],[1],[0]], logits = tf.constant([[1., 2.], [2., 2.], [3., 5.]]))

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1.6265233, 1.3862944, 0.       ], dtype=float32)>

In [38]:
def entropy(p):
    """Return p*ln(p) + (1-p)*ln(1-p), the negative entropy of a two-outcome distribution."""
    q = 1 - p
    return p*np.log(p) + q*np.log(q)

# Tabulate the value across a sweep of probabilities.
for prob in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
    print(entropy(prob))

-0.3250829733914482
-0.5004024235381879
-0.6108643020548935
-0.6730116670092565
-0.6931471805599453
-0.6730116670092565
-0.6108643020548935
-0.5004024235381879
-0.3250829733914482


In [134]:
# Scratch: fresh environment plus a forward pass of the current agent's
# network on random inputs, to inspect the output shapes in the cells below.
env = gym.make('CartPole-v1')
state, _ = env.reset()

# np.squeeze drops the singleton axis, so the model receives a (10, 4) array.
action_logits, values = agent.model(np.squeeze(np.random.rand(10, 1, 4)))

In [136]:
# Scratch: confirm that summing p*log(p) over axis 1 leaves one value per row.
action_probs = tf.nn.softmax(action_logits)
print(tf.shape(action_probs))
# Note: this is sum(p*log p), i.e. the negative of the entropy, and it rebinds
# the name `entropy` that another cell (In [38]) binds to a function.
entropy = tf.reduce_sum(action_probs * tf.math.log(action_probs), axis = 1)
print(tf.shape(entropy))

tf.Tensor([10  2], shape=(2,), dtype=int32)
tf.Tensor([10], shape=(1,), dtype=int32)


In [141]:
# Scratch: shape of the critic output once singleton dimensions are squeezed away.
tf.shape(tf.squeeze(values))

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([10])>

In [125]:
# Scratch: int32 vector of two ones followed by eight zeros, used as dummy action ids.
tf.cast(tf.concat([tf.ones(2), tf.zeros(8)], axis= 0), tf.int32)

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0])>

In [127]:
# Scratch: per-row cross-entropy between one-hot dummy actions and the logits.
# `probs` is computed but not used by the call below.
probs = tf.nn.softmax(action_logits)

# One-hot (depth 2, float32) encoding of the ids [1, 1, 0, 0, 0, 0, 0, 0, 0, 0].
actions = tf.one_hot(
    tf.cast(
        tf.concat([tf.ones(2), tf.zeros(8)], axis= 0),
    tf.int32),
    2, dtype = tf.float32)

tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels = actions, logits = action_logits)


<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([0.7570526 , 0.7531439 , 0.6654516 , 0.7044107 , 0.63360876,
       0.6805754 , 0.6382811 , 0.64661634, 0.64507943, 0.6910991 ],
      dtype=float32)>

In [106]:
# Scratch: the same ones/zeros vector before the int32 cast (float32, per the recorded output).
tf.concat([tf.ones(2), tf.zeros(8)], axis= 0)

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [59]:
# Scratch: sample an action from hand-written logits and step the environment with it.
lgits = tf.constant([[1.,2.],[3., 7.]])
action_probs = tf.nn.softmax(lgits)
# Only the first row of probabilities is used for sampling.
action = np.random.choice(2, p=action_probs.numpy()[0])
# Relies on `env` having been created and reset by an earlier cell.
env.step(np.array(action))

(array([ 0.00432208, -0.14846805,  0.04684547,  0.2773148 ], dtype=float32),
 1.0,
 False,
 False,
 {})

In [60]:
# Scratch: display the current value of `action`.
action

0

In [58]:
# Scratch: display the softmax probabilities as a NumPy array.
action_probs.numpy()

array([[0.26894143, 0.7310586 ],
       [0.01798621, 0.98201376]], dtype=float32)

In [26]:
# NOTE(review): .sample() is not a tf.Tensor method. The recorded output
# presumably comes from a kernel state in which `action_probs` was a
# tensorflow_probability distribution; that definition is not in the cells
# shown here, so this cell may fail on a fresh top-to-bottom run.
action_probs.sample()[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])>

In [9]:
def woop(p: tfp.distributions.Distribution) -> None:
    """Print a fixed message; the annotation on ``p`` is not checked at run time."""
    return print("p is a dist!")

# Both calls print the message (see the recorded output), including the one
# given a plain list: annotations do not validate arguments.
woop(action_probs)
woop([1,2,3])

p is a dist!
p is a dist!


In [10]:
# NOTE(review): presumably `action_probs` was a tensorflow_probability
# distribution when this ran (tf.Tensor has no .entropy()); confirm before
# relying on this cell.
action_probs.entropy()


<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.6931472], dtype=float32)>

In [12]:
# NOTE(review): presumably `action_probs` was a tensorflow_probability
# distribution when this ran (tf.Tensor has no .sample()); confirm before
# relying on this cell.
action_probs.sample()

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>

In [15]:
# Scratch: read every state currently stored in the agent's episode buffer.
agent.memory['state'].gather([i for i in tf.range(agent.memory['state'].size())])

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ]],
      dtype=float32)>

In [12]:
# Scratch: display the current value of `values`.
values

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.46400774]], dtype=float32)>

In [11]:
# NOTE(review): presumably `action_probs` was a tensorflow_probability
# distribution when this ran (tf.Tensor has no .prob()); confirm before
# relying on this cell.
action_probs.prob(action)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9212814], dtype=float32)>

In [16]:
# Longer training run with a smaller learning rate.
env = gym.make('CartPole-v1')

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Machine epsilon for float32, used to guard divisions and logs.
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.Adam(learning_rate=3.0e-4)
num_actions = env.action_space.n  # use .shape[0] for continuous action spaces
num_hidden_units = 256
model = ActorCritic(num_actions, num_hidden_units)

log_dir = get_next_run(LOGS)
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
# Agent.__init__ takes (gamma, entropy_coef, model, optimizer, summary_writer).
# The previous call omitted entropy_coef, which shifts every later argument
# and raises a TypeError. 0.01 matches the other training cell.
agent = Agent(0.99, 0.01, model, optimizer, summary_writer)

NUM_EPISODES=3000
train(agent, env, NUM_EPISODES, seed)

Saving logs to:  ./logs/run_63


KeyboardInterrupt: 

In [30]:

# Watch the current agent: re-create the environment with on-screen rendering
# and run 100 evaluation episodes with seed 42 (no learning happens in test()).
env = gym.make('CartPole-v1', render_mode='human')
test(agent, env, 100, 42)