# Actor-Critic Training Pipeline

In [1]:

#the usual suspects for importing
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np


#general imports
from air_hockey_challenge.framework import AgentBase
from air_hockey_challenge.framework.air_hockey_challenge_wrapper import AirHockeyChallengeWrapper
from air_hockey_challenge.framework.challenge_core import ChallengeCore


#approximator, optimizer, and policy imports
from mushroom_rl.policy.gaussian_policy import GaussianPolicy
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.utils.optimizers import AdaptiveOptimizer
from mushroom_rl.policy import OrnsteinUhlenbeckPolicy


#challenge core imports
import time
from mushroom_rl.core import Core
from air_hockey_challenge.framework.challenge_core import ChallengeCore
from air_hockey_challenge.framework.agent_base import AgentBase

#actor critic algorithm
from mushroom_rl.algorithms.actor_critic import DDPG, TD3

In [2]:
# Report the version of the `python3` executable found on the shell PATH.
!python3 --version

Python 3.10.11


In [3]:
# Locate the `jupyter` executable.
# NOTE(review): `which` is given two names here (`jupyter` and `lab`); the
# recorded output shows only a path for `jupyter`.
!which jupyter lab

/home/andang/anaconda3/envs/eecs298/bin/jupyter


In [4]:
# Check whether PyTorch can use CUDA; the network configurations further down
# in this notebook are created with use_cuda=True.
torch.cuda.is_available()

True

## Defining the neural networks for Actor-Critic

### Notes for networks
- Actor Network:
    - takes in observation
    - outputs action
- Critic Network:
    - takes in action and observation
    - outputs a value (critic value shape 1)

In [5]:
class ActionGenerator(nn.Module):
    '''
        Fully connected actor network: takes in an observation, outputs an action.

        The model is a stack of `num_layers` hidden linear layers of width
        `layer_width` followed by one output linear layer. `activation` is
        applied after every linear layer, including the output layer.

        Arguments:
        ----------
            input_dim: (m,) where m is length of the observation
            output_dim: (k,) where k is length of the action
            use_cuda: accepted but not used by this class
            dropout: accepted but not used by this class
            activation: module applied after each linear layer; the same
                instance is reused at every position in the stack
            num_layers: number of hidden linear layers (default 20)
            layer_width: number of units in each hidden layer (default 10)

        Raises:
        -------
            ValueError: if num_layers or layer_width is smaller than 1
    '''
    def __init__(self, input_dim, output_dim, use_cuda = False, dropout=False, activation = nn.LeakyReLU(0.1),
                 num_layers=20, layer_width=10):
        super().__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1, got {}".format(num_layers))
        if layer_width < 1:
            raise ValueError("layer_width must be at least 1, got {}".format(layer_width))

        # Shapes arrive as tuples; only the first entry is used.
        input_dim = input_dim[0]
        output_dim = output_dim[0]

        layers = [nn.Linear(input_dim, layer_width), activation]
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(layer_width, layer_width))
            layers.append(activation)
        # The activation is applied to the output layer as well.
        layers.append(nn.Linear(layer_width, output_dim))
        layers.append(activation)

        self.model = nn.Sequential(*layers)

    def forward(self, obs):
        # Cast to float32 before running the stack.
        return self.model(obs.float())

class CriticNetwork(nn.Module):
    '''
        Fully connected critic network: takes in a state and an action,
        outputs a value.

        Arguments:
        ----------
            input_shape: (m,) where m is length of input
            output_shape: (m,n,..) where it represents shape of output
            **kwargs: `num_layers` (default 20) and `layer_width` (default 10)
                are read from here; any other keyword is ignored
    '''
    def __init__(self, input_shape, output_shape,**kwargs):
        super().__init__()

        n_inputs = input_shape[0]

        # Collapse the requested output shape into one flat feature count.
        n_outputs = 1
        for extent in output_shape:
            n_outputs *= extent

        self.output_shape = output_shape

        # One activation instance, reused after every linear layer.
        nonlinearity = nn.LeakyReLU(0.1)

        depth = kwargs.get('num_layers', 20)
        width = kwargs.get('layer_width', 10)

        # The input layer is always built, so there is at least one hidden
        # width in the chain even when depth is below 1.
        sizes = [n_inputs] + [width] * max(depth, 1) + [n_outputs]

        stack = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nonlinearity)

        self.model = nn.Sequential(*stack)

    def forward(self, state, action):
        # Join state and action along the feature axis before scoring.
        joint = torch.cat((state.float(), action.float()), dim=1)
        scores = self.model(joint.float())

        # The result is returned transposed.
        return scores.T

### Customized ChallengeCore module for training

In [6]:
class CustomChallengeCore(Core):
    """
    Core variant that reshapes the agent's flat action into two rows and
    passes the rows selected by ``action_idx`` to the environment.

    Args:
        *args: forwarded unchanged to ``Core.__init__``.
        action_idx: index (or indices) applied to the first axis of the
            reshaped action; ``None`` or an empty selector falls back to
            ``[0, 1]``, i.e. both rows.
        **kwargs: forwarded unchanged to ``Core.__init__``.

    """
    def __init__(self, *args, action_idx=None, **kwargs):
        # Test for None/empty explicitly: a plain truthiness test would
        # discard a valid selector such as 0 and would raise on a
        # multi-element NumPy array.
        if action_idx is None or np.size(action_idx) == 0:
            action_idx = [0, 1]
        self.action_idx = action_idx
        super().__init__(*args, **kwargs)

    def _step(self, render):
        """
        Single step.

        Args:
            render (bool):
                whether to render or not.

        Returns:
            A tuple containing the previous state, the action sampled by the
            agent, the reward obtained, the reached state, the absorbing flag
            of the reached state and the last step flag; and the step info
            dictionary returned by the environment, extended with a
            "computation_time" entry.

        """
        start_time = time.time()
        action = self.agent.draw_action(self._state)

        # Split the flat action into two equally sized rows. The row length is
        # inferred so the same code works for any even-sized action vector
        # (a 6-element action still becomes shape (2, 3)).
        action_reshape = action.reshape(2, -1)
        end_time = time.time()
        next_state, reward, absorbing, step_info = self.mdp.step(action_reshape[self.action_idx])
        # Time spent drawing and reshaping the action, in seconds.
        step_info["computation_time"] = (end_time - start_time)

        self._episode_steps += 1

        if render:
            self.mdp.render()

        # The episode ends when the horizon is reached or the state is absorbing.
        last = not (
                self._episode_steps < self.mdp.info.horizon and not absorbing)

        state = self._state
        next_state = self._preprocess(next_state.copy())
        self._state = next_state

        # The stored transition keeps the agent's original (un-reshaped) action.
        return (state, action, reward, next_state, absorbing, last), step_info

## Setting up Training Pipeline

In [11]:

#define reward function
def custom_reward_function(base_env, state, action, next_state, absorbing, eps=1e-6):
    '''
        Reward computed from selected entries of the current and next state.

        The value is the sum of:
            - (state[0] - state[6]) * 2
            - 2 / |state[1] - state[7]|
            - absorbing * 3
            - (next_state[0] - next_state[6]) * 4
            - 1 / |next_state[0] - next_state[6]|

        Both absolute differences are clamped to at least `eps` before they
        are inverted, so the reward stays finite (and no ZeroDivisionError is
        raised) when the two compared entries are equal.

        NOTE(review): the first inverse term compares indices 1 and 7 while
        the second compares indices 0 and 6 -- confirm this is intended.

        Arguments:
        ----------
            base_env: not used
            state: indexable state before the step
            action: not used
            next_state: indexable state after the step
            absorbing: absorbing flag of the reached state
            eps: smallest magnitude allowed for the inverted differences
    '''
    gap_now = max(abs(state[1] - state[7]), eps)
    gap_next = max(abs(next_state[0] - next_state[6]), eps)

    reward_value = (state[0] - state[6]) * 2 + (1 / gap_now) * 2 + absorbing * 3 + \
            (next_state[0] - next_state[6]) * 4 + (1 / gap_next)
    return reward_value


# Environment, created with the custom reward function defined above.
mdp = AirHockeyChallengeWrapper(
    env="3dof-defend",
    action_type="position-velocity",
    interpolation_order=3,
    custom_reward_function=custom_reward_function,
    debug=True,
)


# Exploration policy and its parameters.
policy_class = OrnsteinUhlenbeckPolicy
policy_params = {
    'sigma': 0.2 * np.ones(1),
    'theta': 0.15,
    'dt': 0.01,
}


# Actor network: 12 inputs -> 6 outputs.
actor_params = {
    'network': ActionGenerator,
    'input_shape': (12,),
    'output_shape': (6,),
    'loss': F.smooth_l1_loss,
    'use_cuda': True,
}
actor_optimizer = dict([('class', torch.optim.Adam),
                        ('params', dict(lr=0.1))])


# Critic network: its input is the actor's input and output sizes combined.
critic_input_shape = 12 + 6

critic_params = {
    'network': CriticNetwork,
    'input_shape': (critic_input_shape,),
    'optimizer': dict([('class', optim.Adam), ('params', dict(lr=0.1))]),
    'output_shape': (1,),
    'loss': F.smooth_l1_loss,
    'use_cuda': True,
}

# Everything the algorithm constructor receives.
algorithm_params = dict(
    mdp_info=mdp.info,
    policy_class=policy_class,
    policy_params=policy_params,
    actor_params=actor_params,
    actor_optimizer=actor_optimizer,
    critic_params=critic_params,
    batch_size=16,
    initial_replay_size=500,
    max_replay_size=5000,
    tau=0.001,  # tau = soft update coefficient
)


ddpg = DDPG(**algorithm_params)
# Snapshot of the policy weights taken before any learning happens.
old_weights = ddpg.policy.get_weights()

core = CustomChallengeCore(ddpg, mdp)

# render=True lets us visualize what's going on.
# NOTE(review): n_episodes and n_episodes_per_fit are both 100 -- presumably
# this yields a single fit at the end of the run; confirm that is intended.
core.learn(n_episodes=100, n_episodes_per_fit=100, render=True)

                                                                                                                                                                                                                                                                                                                                                                          

In [10]:
# Display the policy's weights as returned by get_weights().
# NOTE(review): this cell is numbered In [10] while the training cell above is
# In [11], so the recorded output may predate that training run -- re-run the
# notebook in order to confirm.
ddpg.policy.get_weights()

array([-0.17055152,  0.07392445, -0.21482342, ...,  0.13484232,
       -0.27873698,  0.05045644], dtype=float32)