## RL in tensorflow
Some experiments for learning how to do reinforcement learning (RL) with TensorFlow.

We start with this demo: https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic

Another implementation to try as a reference: https://github.com/philtabor/Actor-Critic-Methods-Paper-To-Code/blob/master/ActorCritic/tf2/agent.py


In [1]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [25]:
%load_ext tensorboard

In [2]:
class ActorCritic(tf.keras.Model):
    """Two-headed network: one shared hidden layer feeding a policy head and a value head."""

    def __init__(self, num_actions: int, num_hidden_units: int):
        """Build the layers.

        Args:
            num_actions: width of the softmax policy head.
            num_hidden_units: width of the shared hidden layer.
        """
        super().__init__()

        # Shared ReLU layer whose output both heads consume.
        self.common = layers.Dense(units=num_hidden_units, activation='relu')
        # Policy head: a softmax over the actions.
        self.actor = layers.Dense(units=num_actions, activation='softmax')
        # Value head: a single linear output.
        self.critic = layers.Dense(units=1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Return `(actor output, critic output)` for `inputs`."""
        features = self.common(inputs)
        policy = self.actor(features)
        value = self.critic(features)
        return policy, value

In [105]:
class Agent():
    """Episodic actor-critic agent.

    Usage per environment step: call `get_action(state, step)` and then
    `log(state, next_state, reward, done, step)` with the same `step`.
    At the end of the episode call `update(episode)`, which performs one
    gradient step on the whole episode and then clears the episode memory.

    The implementation assumes eager execution (it converts TensorArray
    sizes to Python ints).
    """

    def __init__(
        self, 
        gamma: float, 
        model: tf.keras.Model, 
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer):
        """Store the hyper-parameters and prepare an empty episode memory.

        Args:
            gamma: discount factor applied to future rewards.
            model: network returning `(action_probs, value)` for a batch of states.
            optimizer: optimizer the model is compiled with.
            summary_writer: `tf.summary` writer that receives the loss and
                reward scalars.
        """
        self.gamma = tf.constant(gamma)
        self.model = model
        self.optimizer = optimizer
        self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.SUM)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

        self._reset_memory()

    def _reset_memory(self):
        """Replace the episode memory with fresh, empty TensorArrays."""

        def new_array(dtype):
            return tf.TensorArray(dtype=dtype, size=0, dynamic_size=True)

        self.memory = {
                'state' : new_array(tf.float32),
                'next_state' : new_array(tf.float32),
                'actions' : new_array(tf.int32),
                # 'action_probs' and 'values' are never written: both are
                # recomputed under the gradient tape in update(). The keys are
                # kept so existing readers of `memory` do not break.
                'action_probs': new_array(tf.float32),
                'values' : new_array(tf.float32),
                'rewards' : new_array(tf.float32),
                'done' : new_array(tf.int32)
        }

    def _remember(self, key, step, value):
        """Write `value` at index `step` of the memory array named `key`."""
        # TensorArray.write returns the updated array; keep that handle
        # instead of discarding it.
        self.memory[key] = self.memory[key].write(step, value)

    def _get_action(self, state):
        """Run the model on a single state and sample an action.

        Returns:
            `(action, action_probs, value)`: the sampled action (scalar), and
            the model outputs for the one-element batch built from `state`.
        """
        action_probs, value = self.model(
                tf.expand_dims(tf.constant(state, tf.float32), 0)
            )
        # sample() has one entry per batch row; the batch has a single row.
        action = tfp.distributions.Categorical(probs = action_probs).sample()[0]

        return action, action_probs, value

    def get_action(self, state, step):
        """Sample an action for `state` and record it at index `step`.

        The action is stored so that update() can evaluate the probability
        of the action that was actually taken.
        """
        action, _, _ = self._get_action(state)
        self._remember('actions', step, tf.cast(action, tf.int32))

        return action

    def log(self, state, next_state, reward, done, step):
        """Record one environment transition at index `step`."""
        self._remember('state', step, tf.constant(state, tf.float32))
        self._remember('next_state', step, tf.constant(next_state, tf.float32))
        self._remember('rewards', step, tf.constant(reward, tf.float32))
        # Cast rather than construct an int32 constant directly, so a Python
        # or NumPy bool is accepted.
        self._remember('done', step, tf.cast(done, tf.int32))

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        gamma: float,
        standardize: bool = True) -> tf.Tensor:
        """Compute the discounted return at every step of an episode.

        Args:
            rewards: 1-D tensor of per-step rewards in chronological order.
            gamma: discount factor.
            standardize: if True, shift and scale the returns to zero mean
                and unit standard deviation.

        Returns:
            Tensor of shape `(len(rewards), 1)`.
        """
        n = tf.shape(rewards)[0] 
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # Walk the rewards from last to first, accumulating the discounted sum
        # (which is 0 before the final reward is added).
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            discounted_sum = rewards[i] + gamma * discounted_sum
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        # Flip back to chronological order.
        returns = returns.stack()[::-1]

        if standardize:
            # Defined locally so the method does not rely on a module global;
            # keeps the division finite when all returns are equal.
            eps = np.finfo(np.float32).eps.item()
            returns = ((returns - tf.reduce_mean(returns)) / 
                        (tf.math.reduce_std(returns) + eps))
        
        return tf.expand_dims(returns, 1)

    def update(self, episode: int):
        """Run one gradient step on the recorded episode, then clear the memory.

        Args:
            episode: episode index, used as the step of the logged scalars.

        Raises:
            ValueError: if the number of recorded actions differs from the
                number of recorded transitions.
        """
        num_steps = int(self.memory['state'].size())
        if num_steps == 0:
            # Nothing was logged; there is nothing to learn from.
            self._reset_memory()
            return
        if int(self.memory['actions'].size()) != num_steps:
            raise ValueError(
                'get_action() and log() must each be called once per step '
                'before update()')

        states = self.memory['state'].stack()
        actions = self.memory['actions'].stack()
        rewards = self.memory['rewards'].stack()
        returns = self.get_expected_return(rewards = rewards, gamma = self.gamma)

        with tf.GradientTape() as tape:
            # Re-run the model on the whole episode inside the tape so the
            # loss is differentiable w.r.t. the model parameters.
            action_probs, values = self.model(states)
            # Per row, pick the probability of the action that was taken.
            taken_probs = tf.gather(action_probs, actions, batch_dims=1)
            loss = self.compute_loss(
                tf.expand_dims(taken_probs, 1), values, returns, episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)

        # apply gradients to this agent's model (not a module-level one)
        self.model.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # Start the next episode with an empty memory so stale steps from a
        # longer previous episode cannot leak into it.
        self._reset_memory()
        
    def compute_loss(
        self,
        action_probs: tf.Tensor,
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Compute the combined actor-critic loss and log both parts.

        Args:
            action_probs: probability of the taken action at each step.
            values: critic estimates, same shape as `returns`.
            returns: target returns.
            episode: step index for the logged scalars.

        Returns:
            Scalar tensor: actor loss plus critic (Huber) loss.
        """
        advantage = returns - values

        action_log_probs = tf.math.log(action_probs)
        actor_loss = -tf.reduce_sum(action_log_probs * advantage)

        critic_loss = self.loss(values, returns)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', actor_loss, step = episode)
            tf.summary.scalar('critic_loss', critic_loss, step = episode)

        return actor_loss + critic_loss

def get_next_run(log_dir):
    """Return the path `<log_dir>/run_<k>` for the next unused run number.

    Each entry of `log_dir` is split on '_' and its last piece is inspected;
    pieces made only of digits count as existing run numbers. `k` is one more
    than the largest of them, or 1 when there are none.
    """
    highest = 0
    for entry in os.listdir(log_dir):
        suffix = entry.split('_')[-1]
        if suffix.isdigit():
            highest = max(highest, int(suffix))
    return log_dir + '/run_' + str(highest + 1)



In [106]:
import os
## create tensorboard logs
LOGS = './logs'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the gap between checking for the directory and creating it.
os.makedirs(LOGS, exist_ok=True)

In [107]:
# Run numbers already present in LOGS (entries whose last '_'-separated piece is all digits).
[int(suffix) for suffix in (name.split('_')[-1] for name in os.listdir(LOGS)) if suffix.isdigit()]

[1, 2, 3, 4]

In [108]:
# Quick check: the directory path the next run would log to.
get_next_run(LOGS)

'./logs/run_5'

In [109]:
env = gym.make('CartPole-v1')

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Machine epsilon for float32: a small constant for numerically safe division.
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
num_actions = env.action_space.n #.shape[0] for continuous
num_hidden_units = 128
model = ActorCritic(num_actions, num_hidden_units)

log_dir = get_next_run(LOGS)
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
agent = Agent(0.99, model, optimizer, summary_writer)

NUM_EPISODES=200

for episode in range(NUM_EPISODES):
    # Seed the environment only on the first reset. Re-seeding on every
    # reset would restart its random generator and make every episode
    # begin from the same initial state.
    if episode == 0:
        state, _ = env.reset(seed = seed)
    else:
        state, _ = env.reset()
    step = 0
    done = False

    while not done:
        action = agent.get_action(state, step)
        next_state, reward, terminated, truncated, info = env.step(int(action))
        done = terminated or truncated
        agent.log(state, next_state, reward, done, step)

        # Advance the observation; without this the agent would keep acting
        # on the initial state for the whole episode.
        state = next_state
        step +=1

    agent.update(episode)

# Release the environment's resources once training is finished.
env.close()



Saving logs to:  ./logs/run_5
