# Setting up training with codebase

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

### Example dummy agent.
- This agent holds its current joint position (with zero velocity).
- It doesn't do anything else.
- It is used to show how to set up an agent.

In [2]:
import numpy as np
from air_hockey_challenge.framework import AgentBase


def build_agent(env_info, **kwargs):
    """
    Factory entry point: construct and return the agent that controls the environment.

    Args:
        env_info (dict): The environment information
        kwargs (any): Additional settings, forwarded unchanged to the agent constructor
    Returns:
         (AgentBase) A freshly constructed DummyAgent
    """
    agent = DummyAgent(env_info, **kwargs)
    return agent

'''
    We will compute the action which is desired pos & vel.
'''

class DummyAgent(AgentBase):
    """
    Minimal example agent: commands the currently observed joint position
    with zero velocity, i.e. it does not try to move.
    """

    def __init__(self, env_info, **kwargs):
        """
        Args:
            env_info (dict): The environment information, forwarded to AgentBase
            kwargs (any): Extra settings, forwarded to AgentBase
        """
        super().__init__(env_info, **kwargs)
        # Bookkeeping flags; they are (re)set here and in reset() but are not
        # read by draw_action in this version of the agent.
        self.new_start = True
        self.hold_position = None

    def reset(self):
        """Restore the bookkeeping attributes to their initial values."""
        self.new_start = True
        self.hold_position = None

    def draw_action(self, obs):
        """
        Build the action for the given observation.

        Args:
            obs: The observation; joint positions are extracted from it with
                self.get_joint_pos
        Returns:
            The joint positions stacked (np.vstack) on top of an equally
            shaped array of zero velocities
        """
        # Other helpers available for breaking down the observation:
        # self.get_joint_vel(obs), self.get_puck_pos(obs)

        # Fix: the parameter is named `obs`; the previous code read the
        # undefined name `observation` and raised NameError on every call.
        joint_pos = self.get_joint_pos(obs)
        velocity = np.zeros_like(joint_pos)
        return np.vstack([joint_pos, velocity])

### Set up a PyTorch neural network to convert observation space to action space
### Requirements
- Set up a policy gradient method
- Set up a policy
    - have a neural network (this is what we are learning)
    - set up a TorchApproximator (connects Torch with MushroomRL)
    - plug the approximator into a parametric policy
- Plug the policy into the policy gradient method
- Train

In [3]:
#super stupid forward pass neural network
class ActionGenerator(nn.Module):
    """
    Plain feed-forward network mapping an observation vector to an action vector.

    The network is a stack of ``num_layers`` Linear layers producing
    ``layer_width`` features each, followed by one output Linear layer; every
    Linear layer (including the output layer) is followed by ``activation``.

    Args:
        input_dim (int or tuple): Number of input features, or a shape tuple
            whose last entry is the number of input features
        output_dim (int or tuple): Number of output features, or a shape tuple
            whose last entry is the number of output features
        use_cuda (bool): Accepted for interface compatibility; not used here
        dropout (bool): Accepted for interface compatibility; not used here
        activation (nn.Module): Activation module inserted after every Linear
            layer; the same instance is reused at every position
        num_layers (int): Number of Linear layers of width ``layer_width``
            placed before the output layer (must be >= 1)
        layer_width (int): Number of features of the hidden layers

    Raises:
        ValueError: If ``num_layers`` is smaller than 1 or a shape tuple is empty
    """

    def __init__(self, input_dim, output_dim, use_cuda=False, dropout=False,
                 activation=nn.LeakyReLU(0.1), num_layers=20, layer_width=10):
        super().__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1")

        n_inputs = self._feature_count(input_dim)
        n_outputs = self._feature_count(output_dim)

        # Layer sizes: input -> width -> ... -> width -> output
        sizes = [n_inputs] + [layer_width] * num_layers + [n_outputs]

        layers = []
        for n_in, n_out in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(n_in, n_out))
            # NOTE(review): the activation is also applied after the output
            # layer, so the network output passes through it; confirm intended.
            layers.append(activation)

        self.model = nn.Sequential(*layers)

    @staticmethod
    def _feature_count(shape):
        """Return the feature count for an int or for a shape tuple/list (last entry)."""
        if isinstance(shape, (tuple, list)):
            if len(shape) == 0:
                raise ValueError("shape must not be empty")
            return int(shape[-1])
        return int(shape)

    def forward(self, obs):
        """
        Run the network on ``obs`` (cast to float32 first) and return the result.
        """
        return self.model(obs.float())

In [4]:
# Smoke test: build the network for 8 inputs / 6 outputs and print the output
# shape produced for a single all-zero observation.
network = ActionGenerator(input_dim=8, output_dim=6)
print(network(torch.zeros((1, 8))).shape)

torch.Size([1, 6])


### Set up the DeepDummy agent

In [5]:
from mushroom_rl.algorithms.policy_search import REINFORCE
from mushroom_rl.policy.deterministic_policy import DeterministicPolicy
from mushroom_rl.policy.gaussian_policy import GaussianPolicy
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.utils.optimizers import AdaptiveOptimizer



In [6]:
#setting this up

import numpy as np
from air_hockey_challenge.framework.air_hockey_challenge_wrapper import AirHockeyChallengeWrapper
from air_hockey_challenge.framework.challenge_core import ChallengeCore, CustomChallengeCore
from mushroom_rl.algorithms.policy_search import REINFORCE


def custom_reward_function(base_env, state, action, next_state, absorbing):
    """
    Placeholder reward: every transition gets the same fixed reward.

    All arguments are accepted but ignored.

    Returns:
        (int) The constant 9000
    """
    constant_reward = 9000  # its over 9000
    return constant_reward

# Build the "3dof-hit" environment with position-velocity actions and the
# constant custom reward defined above.
mdp = AirHockeyChallengeWrapper(env="3dof-hit", action_type="position-velocity", interpolation_order=3, custom_reward_function=custom_reward_function, debug=True)

# Wrap the ActionGenerator network class so MushroomRL can use it.
# NOTE(review): 12 and 6 are hard-coded; presumably the observation and action
# dimensions of this environment -- TODO confirm against mdp.info.
# No optimizer is passed here (the Adam line is left commented out).
approximator = TorchApproximator(input_shape=12,
                                 output_shape=6,
                                 network=ActionGenerator,
                                 loss=F.smooth_l1_loss,
                                 # optimizer={'class': optim.Adam, 'params': {'lr': .001}},
                                 use_cuda=False)
# Gaussian policy built from the approximator and a 6x6 identity matrix.
# NOTE(review): the second argument is a torch tensor; confirm GaussianPolicy
# accepts it (a numpy array such as np.eye(6) may be what is expected).
policy = GaussianPolicy(approximator,torch.eye(6))
# Snapshot of the weights before training, compared against later in the notebook.
old_weights = policy.get_weights()

# Arguments for REINFORCE: environment info, the policy, and an adaptive optimizer.
algorithm_params = {'mdp_info': mdp.info,
                    'policy': policy,
                    'optimizer':  AdaptiveOptimizer(eps=0.01)
                   }


reinforce = REINFORCE(**algorithm_params)

# Core object that runs the REINFORCE agent in the environment.
core = CustomChallengeCore(reinforce, mdp)

# Train for 10 episodes; with n_episodes_per_fit=10 this is presumably a single
# fit after all 10 episodes -- TODO confirm.
core.learn(n_episodes=10, n_episodes_per_fit=10, render=False) #render allows us to visualize what's going on

                                                                                                                                                                                  

In [7]:
# Element-wise difference between the policy's current weights and the
# snapshot (old_weights) taken before core.learn was called.
policy.get_weights() - old_weights

array([ 0.        ,  0.        ,  0.        , ..., -0.00195193,
       -0.01931422, -0.02257159], dtype=float32)

In [8]:
# Display the policy's current weight vector.
policy.get_weights()

array([ 0.17344297, -0.28199044,  0.20390825, ..., -0.1298237 ,
        0.00962436,  0.11651352], dtype=float32)