# Actor-Critic spike

Plan:
- Spike Actor-Critic
- Quick test on a Gym env
- Update requirements.txt

## Imports & setup

### Essential tools

In [11]:
# Generic setup
from typing import Tuple

### Examine Gym environments

In [27]:
%%capture
# Print everything in Gym's environment registry. The %%capture cell magic at
# the top of this cell captures the printed output instead of displaying it.
from gym import envs
print(envs.registry.all())

In [28]:
import gym
# Environment used below to inspect the spec and to size the network.
env = gym.make("CartPole-v1")

# Reproducible gym environments: fix this environment's random seed.
env.seed(0)

[0]

In [22]:
# Check environment details:
# (max episode steps, reward threshold, action space, observation space)
# Reference values for (max_episode_steps, reward_threshold):
# CartPole-v0 is 200, 195.0
# CartPole-v1 is 500, 475.0
# NOTE(review): the recorded output below shows (200, 195.0), i.e. the
# CartPole-v0 figures, although `env` is created as "CartPole-v1" in this
# notebook - presumably left over from an earlier run; re-run to confirm.
env.spec.max_episode_steps, env.spec.reward_threshold, env.action_space, env.observation_space

(200,
 195.0,
 Discrete(2),
 Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32))

### Import PyTorch

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import distributions

# Check for CUDA: use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducible results: seed PyTorch's default random number generator
torch.manual_seed(0)

<torch._C.Generator at 0x11e708090>

## Actor-Critic base class

In [71]:
class ActorCritic(nn.Module):
    """
    A base Actor-Critic class

    Holds two independent networks that share no weights:
    - the actor maps observations to a categorical distribution over actions
    - the critic maps observations to a single state-value estimate each

    Both networks currently use one hidden layer of 10 units with ReLU.
    """
    def __init__(self, num_inputs: int, num_outputs: int, hidden_layer_config: tuple = (10, 10), activation_func=F.relu) -> None:
        """
        Build the actor and critic networks.

        Args:
            num_inputs: size of the observation space
            num_outputs: no. of possible actions
            hidden_layer_config: widths of consecutive hidden layers. It is
                only used to build `self.hidden_layers`; the actor and critic
                below do not read it yet.
            activation_func: activation function to use between layers, in
                F.* format. It is stored on the instance but not applied by
                `forward` yet (the actor and critic use nn.ReLU modules).
                https://discuss.pytorch.org/t/whats-the-difference-between-nn-relu-vs-f-relu/27599
        """
        super().__init__()

        self.num_inputs: int = num_inputs
        self.num_outputs: int = num_outputs
        # https://deepai.org/machine-learning-glossary-and-terms/hidden-layer-machine-learning
        self.hidden_layer_config: tuple = hidden_layer_config
        self.activation_func = activation_func

        # https://towardsdatascience.com/pytorch-layer-dimensions-what-sizes-should-they-be-and-why-4265a41e01fd
        # Format of nn:
        # https://pytorch.org/docs/stable/generated/torch.nn.Module.html

        # One Linear layer per consecutive pair of widths in the config.
        # NOTE: these layers are registered as parameters but `forward` does
        # not route anything through them yet.
        self.hidden_layers = nn.ModuleList(
            [
                nn.Linear(in_width, out_width)
                for in_width, out_width in zip(hidden_layer_config, hidden_layer_config[1:])
            ]
        )

        # TODO: replace the fixed-size actor/critic below with a configurable
        # trunk (input layer -> self.hidden_layers -> policy head + value head)
        # driven by hidden_layer_config and activation_func. A LogSoftmax
        # policy head is one option (pg126, Deep Reinforcement Learning in
        # Action).

        # Softmax output: Actor needs to return probabilities for each available action.
        # The softmax runs over dim 1, so the input must carry a leading batch
        # dimension: (batch, num_inputs).
        self.actor = nn.Sequential(
            nn.Linear(self.num_inputs, 10),
            nn.ReLU(),
            nn.Linear(10, self.num_outputs),
            nn.Softmax(dim=1)
        )

        # Critic
        # The output is expected to be a single number because
        # it's an approximation of state value.
        # https://stackoverflow.com/questions/55405961/why-does-sigmoid-function-outperform-tanh-and-softmax-in-this-case
        self.critic = nn.Sequential(
            nn.Linear(self.num_inputs, 10),
            nn.ReLU(),
            nn.Linear(10, 1)
        )

    def forward(self, state: torch.Tensor) -> Tuple[distributions.Categorical, torch.Tensor]:
        """
        Evaluate the actor and the critic on a batch of states.

        Args:
            state: float32 tensor of shape (batch, num_inputs), on the same
                device as the model.

        Returns:
            A tuple `(action_probability_dist, state_values)`: a Categorical
            distribution built from the actor's probabilities, and the
            critic's output of shape (batch, 1).

        Raises:
            AssertionError: if `state` is not a float32 tensor.
        """
        # Assert this state is a Tensor of floats, or else.
        # Check the dtype instead of isinstance(state, torch.FloatTensor):
        # that legacy type only matches CPU tensors, so it rejected float
        # states that had been moved to a CUDA device.
        assert isinstance(state, torch.Tensor) and state.dtype == torch.float32, (
            "state must be a float32 torch.Tensor"
        )

        # https://www.kite.com/python/docs/torch.distributions.Categorical
        # Example
        # >>> m = Categorical(torch.tensor([ 0.01, 0.01, 0.97, 0.01 ]))
        # >>> m.sample()  # heavily in favour of tensor(2)
        action_probability_dist = distributions.Categorical(self.actor(state))
        state_values = self.critic(state)
        return action_probability_dist, state_values


In [72]:
# Size the network from the environment's spaces:
# first dimension of the observation space in, number of actions out.
NUM_OBSERVATIONS: int = env.observation_space.shape[0]  # input
NUM_ACTIONS: int = env.action_space.n  # output

# Build the model, then move it to the selected device.
ac = ActorCritic(num_inputs=NUM_OBSERVATIONS, num_outputs=NUM_ACTIONS)
ac = ac.to(device)
print(ac)
# print(vars(ac))

ActorCritic(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=10, out_features=10, bias=True)
  )
  (actor): Sequential(
    (0): Linear(in_features=4, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=2, bias=True)
    (3): Softmax(dim=1)
  )
  (critic): Sequential(
    (0): Linear(in_features=4, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=1, bias=True)
  )
)


In [73]:
# One episode
def do_one_episode(env: gym.Env):
    """
    Run a single episode in `env`, sampling every action from the
    module-level `ac` model, and return the total reward collected.

    Args:
        env: environment whose `reset()` returns the initial state and whose
            `step(action)` returns `(next_state, reward, done, info)`.

    Returns:
        The sum of the rewards received until `done` is set.
    """
    state = env.reset()
    done = False
    total_reward = 0

    # This is a pure rollout: nothing is back-propagated and the critic's
    # value is discarded, so skip building autograd graphs.
    with torch.no_grad():
        while not done:
            # Add a leading batch dimension: the model expects (batch, obs).
            batched_state = torch.unsqueeze(torch.FloatTensor(state), 0).to(device)
            probability_dist, _state_value = ac(batched_state)
            # The batch holds one state, so the sample is a one-element tensor.
            action_to_take = probability_dist.sample().item()
            state, reward, done, _info = env.step(action_to_take)
            total_reward += reward
    return total_reward

# Test for 10 episodes
cartpole = gym.make("CartPole-v1")
# This is a fresh environment instance, so it needs its own seed; without it
# the episodes start from unseeded random states on every run.
cartpole.seed(0)
for i in range(10):
    reward = do_one_episode(cartpole)
    print(reward)

20.0
13.0
45.0
15.0
10.0
19.0
52.0
19.0
47.0
41.0
