# SAC agents

> Soft Actor-Critic (SAC) based agents

In [None]:
#| default_exp agents.rl.sac

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

import logging

# set logging level to INFO
logging.basicConfig(level=logging.INFO)

from abc import ABC, abstractmethod
from typing import Union, Optional, List, Tuple
import numpy as np
import os

from ddopnew.envs.base import BaseEnvironment
from ddopnew.agents.rl.mushroom_rl import MushroomBaseAgent
from ddopnew.utils import MDPInfo, Parameter
from ddopnew.obsprocessors import FlattenTimeDimNumpy
from ddopnew.RL_approximators import MLPStateAction, MLPActor
from ddopnew.postprocessors import ClipAction

from ddopnew.dataloaders.base import BaseDataLoader

from mushroom_rl.algorithms.actor_critic.deep_actor_critic import SAC

import torch
import torch.nn.functional as F
from torchsummary import summary

import time

In [None]:
#| export

class SACBaseAgent(MushroomBaseAgent):

    """
    Soft Actor-Critic (SAC) agent wrapping the `SAC` algorithm of mushroom_rl.

    The constructor builds the parameter dictionaries for two actor networks
    (mu and sigma, both `MLPActor`) and one critic network (`MLPStateAction`,
    output shape `(1,)`), creates the mushroom_rl `SAC` object and stores it in
    `self.agent`. A `FlattenTimeDimNumpy` processor is always appended to the
    given `obsprocessors` because the networks expect a flat observation.

    The SAC hyperparameters (`batch_size`, `initial_replay_size`,
    `max_replay_size`, `warmup_transitions`, `tau`, `lr_alpha`,
    `use_log_alpha_loss`, `log_std_min`, `log_std_max`, `target_entropy`) are
    forwarded unchanged to mushroom_rl's `SAC`.
    """

    # Forwarded as the `dropout` argument of every network parameter dict.
    # Defined on the class so that `self.dropout` (read in `__init__`) always
    # resolves, as it does on SACAgent. Keep it True for mushroom_rl; if
    # dropout is not desired, set drop_prob=0.0 instead.
    dropout = True

    def __init__(self, 
                environment_info: MDPInfo,

                learning_rate_actor: float = 3e-4,
                learning_rate_critic: float | None = None, # If None, then it is set to learning_rate_actor
                initial_replay_size: int = 64,
                max_replay_size: int = 50000,
                batch_size: int = 64,
                hidden_layers: List = None, # if None, then default is [64, 64]
                activation: str = "relu", # "relu", "sigmoid", "tanh", "leakyrelu", "elu"
                warmup_transitions: int = 100,
                lr_alpha: float = 3e-4,
                tau: float = 0.005,
                log_std_min: float = -20.0,
                log_std_max: float = 2.0,
                use_log_alpha_loss: bool = False,
                target_entropy: float | None = None,

                drop_prob: float = 0.0,
                batch_norm: bool = False,
                init_method: str = "xavier_uniform", # "xavier_uniform", "xavier_normal", "he_normal", "he_uniform", "normal", "uniform"

                optimizer: str = "Adam", # "Adam" or "SGD" or "RMSprop"  
                loss: str = "MSE", # currently only MSE is supported     
                obsprocessors: list | None = None,      # default: []
                device: str = "cpu", # "cuda" or "cpu"
                agent_name: str | None = "SAC",
                ):

        # The standard SAC agent needs a 2D input, so we need to flatten the time dimension
        flatten_time_dim_processor = FlattenTimeDimNumpy(allow_2d=True, batch_dim_included=False)
        obsprocessors = (obsprocessors or []) + [flatten_time_dim_processor]

        use_cuda = self.set_device(device)

        hidden_layers = hidden_layers or [64, 64]
        self.warmup_training_steps = initial_replay_size

        OptimizerClass = self.get_optimizer_class(optimizer)
        # Only fall back when no value was given: an explicit 0.0 must be kept
        if learning_rate_critic is None:
            learning_rate_critic = learning_rate_actor
        lossfunction = self.get_loss_function(loss)

        actor_input_shape = self.get_input_shape(environment_info.observation_space)
        actor_output_shape = environment_info.action_space.shape
        critic_input_shape = (actor_input_shape[0] + actor_output_shape[0],) # check how this works when RNN and mixed agents are used

        # Architecture settings shared by the mu, sigma and critic networks
        network_kwargs = dict(
            hidden_layers=hidden_layers,
            activation=activation,
            drop_prob=drop_prob,
            batch_norm=batch_norm,
            init_method=init_method,

            use_cuda=use_cuda,
            dropout=self.dropout,
        )

        # mu and sigma use the same architecture but get separate dict objects
        actor_mu_params = dict(
            network=MLPActor,
            input_shape=actor_input_shape,
            output_shape=actor_output_shape,
            **network_kwargs,
        )

        actor_sigma_params = dict(
            network=MLPActor,
            input_shape=actor_input_shape,
            output_shape=actor_output_shape,
            **network_kwargs,
        )

        actor_optimizer = {
            'class': OptimizerClass,
            'params': {'lr': learning_rate_actor},
        }

        critic_params = dict(
            network=MLPStateAction,
            optimizer={
                'class': OptimizerClass,
                'params': {'lr': learning_rate_critic},
            },
            loss=lossfunction,
            input_shape=critic_input_shape,
            output_shape=(1,),
            **network_kwargs,
        )

        self.agent = SAC(
            mdp_info=environment_info,
            actor_mu_params=actor_mu_params,
            actor_sigma_params=actor_sigma_params,
            actor_optimizer=actor_optimizer,
            critic_params=critic_params,
            batch_size=batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            warmup_transitions=warmup_transitions,
            tau=tau,
            lr_alpha=lr_alpha,
            use_log_alpha_loss=use_log_alpha_loss,
            log_std_min=log_std_min,
            log_std_max=log_std_max,
            target_entropy=target_entropy,
            critic_fit_params=None
        )

        super().__init__(
            environment_info=environment_info,
            obsprocessors=obsprocessors,
            device=device,
            agent_name=agent_name
        )

        # NOTE(review): self.actor / self.critic are not assigned in this class;
        # presumably the base-class __init__ sets them via get_network_list - confirm.
        logging.info("Actor network (mu network):")
        if logging.getLogger().isEnabledFor(logging.INFO):
            summary(self.actor, input_size=actor_input_shape)
            # short pause between the two summaries
            time.sleep(.2)

        logging.info("Critic network:")
        if logging.getLogger().isEnabledFor(logging.INFO):
            summary(self.critic, input_size=[actor_input_shape, actor_output_shape])

    def get_network_list(self, set_actor_critic_attributes: bool = True):
        """ Get the list of networks in the agent for the save and load functions
        Get the actor for the predict function in eval mode

        The list holds every network of the critic ensemble, followed by the
        mu network and the sigma network of the policy. If
        `set_actor_critic_attributes` is True, the tuple
        `(networks, actor, critic)` is returned, where `actor` is the mu
        network and `critic` is the first network of the critic ensemble;
        otherwise only `networks` is returned. """

        ensemble_critic = self.agent._critic_approximator._impl.model
        actor = self.agent.policy._mu_approximator._impl.model.network
        sigma = self.agent.policy._sigma_approximator._impl.model.network

        networks = [model.network for model in ensemble_critic]
        networks.extend([actor, sigma])

        critic = ensemble_critic[0].network

        if set_actor_critic_attributes:
            return networks, actor, critic
        else:
            return networks

    def predict_(self, observation: np.ndarray) -> np.ndarray: #
        """ Do one forward pass of the model directly and return the prediction.
        Apply tanh as implemented for the SAC actor in mushroom_rl

        The output of the mu network is squashed with tanh and then mapped to
        the action range with the `_delta_a` (scale) and `_central_a` (offset)
        attributes of the mushroom_rl policy. The result is returned as a
        numpy array. """

        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            observation = torch.tensor(observation, dtype=torch.float32).to(self.device)
            action = self.actor.forward(observation)
            action = torch.tanh(action)
            action = action * self.agent.policy._delta_a + self.agent.policy._central_a

        return action.cpu().detach().numpy()

In [None]:
#| export

class SACAgent(MushroomBaseAgent):

    """
    Soft Actor-Critic (SAC) agent wrapping the `SAC` algorithm of mushroom_rl.

    The constructor builds the parameter dictionaries for two actor networks
    (mu and sigma, both `MLPActor`) and one critic network (`MLPStateAction`,
    output shape `(1,)`), creates the mushroom_rl `SAC` object and stores it in
    `self.agent`. A `FlattenTimeDimNumpy` processor is always appended to the
    given `obsprocessors` because the networks expect a flat observation.

    The SAC hyperparameters (`batch_size`, `initial_replay_size`,
    `max_replay_size`, `warmup_transitions`, `tau`, `lr_alpha`,
    `use_log_alpha_loss`, `log_std_min`, `log_std_max`, `target_entropy`) are
    forwarded unchanged to mushroom_rl's `SAC`.
    """

    # Forwarded as the `dropout` argument of every network parameter dict.
    # Always keep it True for mushroom_rl; if dropout is not desired, set
    # drop_prob=0.0 instead.
    dropout = True

    def __init__(self, 
                environment_info: MDPInfo,

                learning_rate_actor: float = 3e-4,
                learning_rate_critic: float | None = None, # If None, then it is set to learning_rate_actor
                initial_replay_size: int = 64,
                max_replay_size: int = 50000,
                batch_size: int = 64,
                hidden_layers: List = None, # if None, then default is [64, 64]
                activation: str = "relu", # "relu", "sigmoid", "tanh", "leakyrelu", "elu"
                warmup_transitions: int = 100,
                lr_alpha: float = 3e-4,
                tau: float = 0.005,
                log_std_min: float = -20.0,
                log_std_max: float = 2.0,
                use_log_alpha_loss: bool = False,
                target_entropy: float | None = None,

                drop_prob: float = 0.0,
                batch_norm: bool = False,
                init_method: str = "xavier_uniform", # "xavier_uniform", "xavier_normal", "he_normal", "he_uniform", "normal", "uniform"

                optimizer: str = "Adam", # "Adam" or "SGD" or "RMSprop"  
                loss: str = "MSE", # currently only MSE is supported     
                obsprocessors: list | None = None,      # default: []
                device: str = "cpu", # "cuda" or "cpu"
                agent_name: str | None = "SAC",
                ):

        # The standard SAC agent needs a 2D input, so we need to flatten the time dimension
        flatten_time_dim_processor = FlattenTimeDimNumpy(allow_2d=True, batch_dim_included=False)
        obsprocessors = (obsprocessors or []) + [flatten_time_dim_processor]

        use_cuda = self.set_device(device)

        hidden_layers = hidden_layers or [64, 64]
        self.warmup_training_steps = initial_replay_size

        OptimizerClass = self.get_optimizer_class(optimizer)
        # Only fall back when no value was given: an explicit 0.0 must be kept
        if learning_rate_critic is None:
            learning_rate_critic = learning_rate_actor
        lossfunction = self.get_loss_function(loss)

        actor_input_shape = self.get_input_shape(environment_info.observation_space)
        actor_output_shape = environment_info.action_space.shape
        critic_input_shape = (actor_input_shape[0] + actor_output_shape[0],) # check how this works when RNN and mixed agents are used

        # Architecture settings shared by the mu, sigma and critic networks
        network_kwargs = dict(
            hidden_layers=hidden_layers,
            activation=activation,
            drop_prob=drop_prob,
            batch_norm=batch_norm,
            init_method=init_method,

            use_cuda=use_cuda,
            dropout=self.dropout,
        )

        # mu and sigma use the same architecture but get separate dict objects
        actor_mu_params = dict(
            network=MLPActor,
            input_shape=actor_input_shape,
            output_shape=actor_output_shape,
            **network_kwargs,
        )

        actor_sigma_params = dict(
            network=MLPActor,
            input_shape=actor_input_shape,
            output_shape=actor_output_shape,
            **network_kwargs,
        )

        actor_optimizer = {
            'class': OptimizerClass,
            'params': {'lr': learning_rate_actor},
        }

        critic_params = dict(
            network=MLPStateAction,
            optimizer={
                'class': OptimizerClass,
                'params': {'lr': learning_rate_critic},
            },
            loss=lossfunction,
            input_shape=critic_input_shape,
            output_shape=(1,),
            **network_kwargs,
        )

        self.agent = SAC(
            mdp_info=environment_info,
            actor_mu_params=actor_mu_params,
            actor_sigma_params=actor_sigma_params,
            actor_optimizer=actor_optimizer,
            critic_params=critic_params,
            batch_size=batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            warmup_transitions=warmup_transitions,
            tau=tau,
            lr_alpha=lr_alpha,
            use_log_alpha_loss=use_log_alpha_loss,
            log_std_min=log_std_min,
            log_std_max=log_std_max,
            target_entropy=target_entropy,
            critic_fit_params=None
        )

        super().__init__(
            environment_info=environment_info,
            obsprocessors=obsprocessors,
            device=device,
            agent_name=agent_name
        )

        # NOTE(review): self.actor / self.critic are not assigned in this class;
        # presumably the base-class __init__ sets them via get_network_list - confirm.
        logging.info("Actor network (mu network):")
        if logging.getLogger().isEnabledFor(logging.INFO):
            summary(self.actor, input_size=actor_input_shape)
            # short pause between the two summaries
            time.sleep(.2)

        logging.info("Critic network:")
        if logging.getLogger().isEnabledFor(logging.INFO):
            summary(self.critic, input_size=[actor_input_shape, actor_output_shape])

    def get_network_list(self, set_actor_critic_attributes: bool = True):
        """ Get the list of networks in the agent for the save and load functions
        Get the actor for the predict function in eval mode

        The list holds every network of the critic ensemble, followed by the
        mu network and the sigma network of the policy. If
        `set_actor_critic_attributes` is True, the tuple
        `(networks, actor, critic)` is returned, where `actor` is the mu
        network and `critic` is the first network of the critic ensemble;
        otherwise only `networks` is returned. """

        ensemble_critic = self.agent._critic_approximator._impl.model
        actor = self.agent.policy._mu_approximator._impl.model.network
        sigma = self.agent.policy._sigma_approximator._impl.model.network

        networks = [model.network for model in ensemble_critic]
        networks.extend([actor, sigma])

        critic = ensemble_critic[0].network

        if set_actor_critic_attributes:
            return networks, actor, critic
        else:
            return networks

    def predict_(self, observation: np.ndarray) -> np.ndarray: #
        """ Do one forward pass of the model directly and return the prediction.
        Apply tanh as implemented for the SAC actor in mushroom_rl

        The output of the mu network is squashed with tanh and then mapped to
        the action range with the `_delta_a` (scale) and `_central_a` (offset)
        attributes of the mushroom_rl policy. The result is returned as a
        numpy array. """

        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            observation = torch.tensor(observation, dtype=torch.float32).to(self.device)
            action = self.actor.forward(observation)
            action = torch.tanh(action)
            action = action * self.agent.policy._delta_a + self.agent.policy._central_a

        return action.cpu().detach().numpy()

In [None]:
from ddopnew.envs.inventory import NewsvendorEnv
from ddopnew.dataloaders.tabular import XYDataLoader
from ddopnew.experiment_functions import run_experiment, test_agent

# Usage example: build a SACAgent for a newsvendor environment on synthetic
# data and evaluate it on the test split.

val_index_start = 8000 #90_000
test_index_start = 9000 #100_000

# Two standard-normal features; the target is the first feature.
# (An earlier noisy linear target was overwritten by this assignment before
# it was ever used, so it has been removed.)
X = np.random.standard_normal((10000, 2))
Y = X[:,0].reshape(-1, 1)
# truncate Y at 0:
Y = np.maximum(Y, 0)
# normalize Y max to 1
Y = Y/np.max(Y)

print(np.max(Y))

print(X.shape, Y.shape)

clip_action = ClipAction(0., 1.)

dataloader = XYDataLoader(X, Y, val_index_start, test_index_start, lag_window_params =  {'lag_window': 0, 'include_y': False, 'pre_calc': True})

environment = NewsvendorEnv(
    dataloader = dataloader,
    underage_cost = 0.42857,
    overage_cost = 1.0,
    gamma = 0.999,
    horizon_train = 365,
    q_bound_high = 1.0,
    q_bound_low = -0.1,
    postprocessors = [clip_action],
)



agent = SACAgent(environment.mdp_info,
                obsprocessors = None,      # default: []
                device="cpu", # "cuda" or "cpu"
)

# Evaluate the untrained agent on the test split
environment.test()
agent.eval()

R, J = test_agent(agent, environment)

print(R, J)

environment.train()
agent.train()
environment.print=False

# run_experiment(agent, environment, n_epochs=50, n_steps=1000, run_id = "test", save_best=True, print_freq=1) # fit agent via run_experiment function

# Evaluate again on the test split (training above is commented out)
environment.test()
agent.eval()

R, J = test_agent(agent, environment)

print(R, J)

1.0
(10000, 2) (10000, 1)


  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
INFO:root:Actor network (mu network):


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]             192
              ReLU-2                   [-1, 64]               0
           Dropout-3                   [-1, 64]               0
            Linear-4                   [-1, 64]           4,160
              ReLU-5                   [-1, 64]               0
           Dropout-6                   [-1, 64]               0
            Linear-7                    [-1, 1]              65
          Identity-8                    [-1, 1]               0
Total params: 4,417
Trainable params: 4,417
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
----------------------------------------------------------------


INFO:root:Critic network:


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]             256
              ReLU-2                   [-1, 64]               0
           Dropout-3                   [-1, 64]               0
            Linear-4                   [-1, 64]           4,160
              ReLU-5                   [-1, 64]               0
           Dropout-6                   [-1, 64]               0
            Linear-7                    [-1, 1]              65
          Identity-8                    [-1, 1]               0
Total params: 4,481
Trainable params: 4,481
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
----------------------------------------------------------------
-118.82040518889363 -73.82324158885496
-118.82

In [None]:
#| hide
# Export the notebook cells marked with `#| export` to the library module
import nbdev

nbdev.nbdev_export()