# Actor-Critic spike

Plan:
- Spike Actor-Critic
- Quick test on a Gym env
- Update requirements.txt

## Imports & setup

### Essential tools

In [9]:
# Generic setup
# from typing import Tuple

### Examine Gym environments

In [10]:
%%capture
# This cell runs under %%capture, so the registry listing printed here is
# swallowed rather than rendered in the notebook.
from gym import envs
print(envs.registry.all())

In [11]:
import gym
# Environment whose observation/action spaces size the model further down.
env = gym.make("CartPole-v1")

# Reproducible gym environments
# (seed() is the cell's last expression, so its return value is displayed).
env.seed(0)

[0]

In [12]:
# Check environment details. The displayed tuple is
# (max_episode_steps, reward_threshold, action_space, observation_space).
# The first two fields for the two CartPole versions:
# CartPole-v0 is 200, 195.0
# CartPole-v1 is 500, 475.0
env.spec.max_episode_steps, env.spec.reward_threshold, env.action_space, env.observation_space

(500,
 475.0,
 Discrete(2),
 Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32))

### Import PyTorch

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

# Check for CUDA: use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducible results: seed PyTorch's random number generator.
# (Last expression of the cell, so the returned generator is displayed.)
torch.manual_seed(0)

<torch._C.Generator at 0x131347210>

## Actor-Critic base class

In [14]:
from actor_critic import ActorCritic

In [15]:
# Parameters based on environment:
NUM_OBSERVATIONS: int = env.observation_space.shape[0] # input
NUM_ACTIONS: int = env.action_space.n # output
# NUM_ACTIONS, NUM_OBSERVATIONS

# Build the model and move it to the selected device.
# NOTE(review): four hidden sizes (12, 12, 15, 20) are passed, but the printed
# model shows a single 12-unit hidden layer in both the actor and the critic --
# it looks like only the first entry is used. Confirm against
# actor_critic.ActorCritic.
ac = ActorCritic(NUM_OBSERVATIONS, NUM_ACTIONS, (12, 12, 15, 20)).to(device)
print(ac)
# print(vars(ac))

ActorCritic(
  (actor): Sequential(
    (0): Linear(in_features=4, out_features=12, bias=True)
    (1): ReLU()
    (2): Linear(in_features=12, out_features=2, bias=True)
    (3): Softmax(dim=1)
  )
  (critic): Sequential(
    (0): Linear(in_features=4, out_features=12, bias=True)
    (1): ReLU()
    (2): Linear(in_features=12, out_features=1, bias=True)
  )
)


In [16]:
# One episode
def do_one_episode(env: gym.Env):
    """Play one episode in `env`, sampling every action from the `ac` model.

    Relies on two notebook-level names: `ac`, which is called on a batch
    holding a single observation and whose first return value must provide
    `.sample()`, and `device`, where the observation tensor is placed.
    Nothing is learned here, so the rollout runs with autograd disabled.

    Args:
        env: Gym environment using the classic API, i.e. `reset()` returns the
            observation and `step()` returns `(obs, reward, done, info)`.

    Returns:
        The sum of the rewards collected until the environment reports `done`.
    """
    state = env.reset()
    done = False
    total_reward = 0

    # Inference only: do not record autograd graphs for the forward passes.
    with torch.no_grad():
        while not done:
            # Add a leading batch dimension; the model is fed one observation at a time.
            batch = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
            # The second return value of the model is not needed for a plain rollout.
            probability_dist, _ = ac(batch)
            action_to_take = probability_dist.sample()
            # .item() turns the one-element action tensor into a plain Python number.
            state, reward, done, _ = env.step(action_to_take.item())
            total_reward += reward
    return total_reward

# Test for 10 episodes
cartpole = gym.make("CartPole-v1")
# Seed this environment as well: it is a fresh instance, separate from the
# seeded `env`, so without this its initial states are not reproducible and the
# rewards printed below would differ between runs.
cartpole.seed(0)
for i in range(10):
    reward = do_one_episode(cartpole)
    print(reward)

18.0
20.0
9.0
24.0
22.0
48.0
11.0
24.0
20.0
12.0
