In [22]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from copy import deepcopy
import gym

In [12]:
# Env setup

def test():
    env = gym.make('CartPole-v1')
    env.reset()
    for _ in range(100):
        # env.render()  - Does not work in jupyter?
        obs, reward, done, info = env.step(env.action_space.sample()) # take a random action
        print(reward)
    env.close()
test()

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


  logger.warn(


In [19]:
def eval(env, model=None):
    terminal_steps = []
    max_steps = 1000
    iterations = 100
    for i in range(iterations):
        env.reset()
        for step in range(max_steps):
            if model:
                action = model.get_action()
                obs, reward, done, info = env.step(action.cpu().numpy())
            else:
                obs, reward, done, info = env.step(env.action_space.sample()) # take a random action
            if reward == 0 or step == max_steps - 1:
                terminal_steps.append(step)
                break
    print(f"Mean reward: {np.mean(terminal_steps)}")

In [20]:
def test_eval():
    env = gym.make('CartPole-v1')
    eval(env)
    env.close()
test_eval()

Mean reward: 22.48


In [45]:
# NN Architecture
# - ActorCritic
# - Shared weights vs not
class ActorCritic(torch.nn.Module):
    def __init__(self, input_shape, output_shape, hidden_units, layers):
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(input_shape, hidden_units),
            nn.ReLU()
        )
        for i in range(layers):
            self.actor.append(nn.Linear(hidden_units, hidden_units))
            self.actor.append(nn.ReLU())

        self.critic = deepcopy(self.actor)
        self.critic.append(nn.Linear(hidden_units, 1))

        self.actor.append(nn.Linear(hidden_units, output_shape))
        self.actor.append(nn.Tanh())  # Why does actor need Tanh?


    def forward(self, x):
        values = self.critic(x)
        logits = self.actor(x)  # TODO: Do logits always result from Tanh?
        return values, logits

    def get_action(self, x, action=None):
        logits = self.actor(x)

        # TODO: Continuous vs discrete actions
        # As in, are logits probabilities or distributions?

        # Discrete action sampling:
        m = torch.distributions.Categorical(logits=logits)
        if action == None:
            action = m.sample()

        return action, m.log_prob(action)

    def get_value(self, x):
        values = self.critic(x)
        return values

In [53]:
# Test ActorCritic

ac = ActorCritic(8, 2, 128, 2)
print(ac)
ac.get_action(torch.tensor([1,2,3,4,5,6,7,8], dtype=torch.float))

ActorCritic(
  (actor): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=2, bias=True)
    (7): Tanh()
  )
  (critic): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=1, bias=True)
  )
)


(tensor(1), tensor(-0.7129, grad_fn=<SqueezeBackward1>))

In [None]:
# Taken from cleanrl and ML-Collective
config = {
    'description': 'cleanrl',
    'seed': 1,
    'torch_deterministic': True,
    'cuda': True,
    'std_init': 0.05,
    'env_id': 'LunarLanderContinuous-v2',
    'num_workers': 8,  # rank (seed) / envs / N
    'num_epochs': 10, # K number of
    'num_iterations': 30, # number of times we collect a dataset or no. of update loops (300k-2mil total timesteps)
    'max_timesteps': 2048, # T
    'epsilon': 0.2,  # clipping radius
    'lr': 3e-4,
    'gamma' : 0.99,
    'batch_size' : 512,
    'eval_actors': 4,  # not used
    'clip_value_loss': True,
    'gae': True,
    'gae_lambda': 0.95,
    'advantage_norm': True,
    'max_grad_norm': 0.5
}

In [4]:
# PPO
class PPO():

    def __init__(self, config):
        self.config = config
        self.env = gym.make('CartPole-v1')
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.agent = ActorCritic(self.observation_space, self.action_space, hidden_units=128, layers=2)
        self.optimizer = optim.Adam(self.agent.parameters(), lr=config["lr"], eps=1e-5)

    def train(self):

        # Get config values (TODO: fetch from config)
        num_envs = 1
        iterations = 2
        buffer_size = 100
        steps_per_env = 100
        device = torch.device("cuda" if torch.cuda.is_available() and config["device"] == "cuda" else "cpu")

        # Set up buffers
        obs = torch.zeros((steps_per_env, num_envs) + self.observation_space).to(device)
        actions = torch.zeros((steps_per_env, num_envs) + self.action_space.shape).to(device)
        logprobs = torch.zeros((steps_per_env, num_envs)).to(device)
        rewards = torch.zeros((steps_per_env, num_envs)).to(device)
        dones = torch.zeros((steps_per_env, num_envs)).to(device)
        values = torch.zeros((steps_per_env, num_envs)).to(device)

        env = 0  # TODO: Implement multiple envs
        for i in range(iterations):
            done = torch.tensor(1.0).to(device)  # This is just to silence warning about possibly undefined variable
            observation = torch.tensor(env.reset()).to(device)  # Get initial observations
            for step in range(steps_per_env):
                # Collect experience
                obs[step, env] = observation
                with torch.no_grad():
                    action = self.agent.get_action(observation)
                    value = self.agent.get_value(observation)
                observation, reward, done, info = env.step(action.cpu().numpy())
                observation = torch.tensor(observation).to(device)
                actions[step, env] = action
                rewards[step, env] = torch.tensor(reward).to(device)
                dones[step, env] = torch.tensor(done).to(device)
                if done:
                    # TODO: does env self-reset? Seems to do so on clean-rl
                    observation = torch.tensor(env.reset()).to(device)

            # Calculate advantages (future rewards - critic predictions --> critic loss)
            with torch.no_grad():  # TODO: Why no grad here?
                advantages = torch.zeros_like(rewards).to(device)
                final_done = torch.tensor(done).to(device)
                final_value = value
                for t in reversed(range(steps_per_env)):
                    if t == steps_per_env - 1:
                        next_not_done = 1.0 - final_done
                        next_value = final_value
                    else:
                        next_not_done = 1.0 - dones[t+1, env]
                        next_value = values[t+1, env]
                    advantages[t, env] = rewards[t] + self.config["gamma"] * next_not_done * next_value
                advantages = advantages - values  # Critic loss

            # Flatten batches (get rid of env-index) - after calculating advantages, the order no longer matters
            b_obs = obs.reshape((-1,) + self.observation_space)
            b_logprobs = logprobs.reshape(-1)
            b_actions = actions.reshape((-1,) + self.action_space)
            b_advantages = advantages.reshape(-1)
            # b_returns = returns.reshape(-1)
            b_values = values.reshape(-1)

            # Optimize
            b_indices = np.arange(config["batch_size"])
            for epoch in config["num_epoch"]:
                np.random.shuffle(b_indices)

            # Evaluate maybe?
        # Final evaluate maybe?
        env.close()


In [5]:

# Train


