In [1]:
import os
import random
import time
import sys
import re
from dataclasses import dataclass
import numpy as np
import torch
import torch as t
import gym
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from gym.spaces import Discrete
from einops import rearrange

from utils import make_env, ppo_parse_args
import tests


  if not hasattr(tensorboard, "__version__") or LooseVersion(


In [2]:
import argparse
import os
import random
import time
import sys
from distutils.util import strtobool
from dataclasses import dataclass
from typing import Optional
import numpy as np
import torch
import torch as t
import gym
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from gym.spaces import Discrete
from typing import Any, List, Optional, Union, Tuple, Iterable
from einops import rearrange
from utils import ppo_parse_args, make_env
import part4_dqn_solution

# True when this module is being executed as the entry point rather than imported.
MAIN = __name__ == "__main__"
# True when the basename of sys.argv[0] contains "ipykernel_launcher".
# NOTE(review): that presumably means "running inside a Jupyter/IPython kernel", which is
# the opposite of what the name suggests -- confirm before relying on the name.
RUNNING_FROM_FILE = "ipykernel_launcher" in os.path.basename(sys.argv[0])


In [3]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Orthogonally initialise a layer's weight and fill its bias with a constant.

    Args:
        layer (nn.Module): Module exposing ``weight`` and ``bias``; modified in place.
        std (float, optional): Gain handed to the orthogonal initialiser.
            Defaults to np.sqrt(2).
        bias_const (float, optional): Value written into every bias entry.
            Defaults to 0.0.

    Returns:
        nn.Module: The very same ``layer`` object, after in-place initialisation.
    """
    init = t.nn.init
    init.orthogonal_(layer.weight, gain=std)
    init.constant_(layer.bias, val=bias_const)
    return layer


class Agent(nn.Module):
    """Holds two independent MLPs: an actor producing action logits and a critic
    producing a single scalar per observation.

    Both networks share the same layout (two 64-unit Tanh hidden layers) and are
    initialised through ``layer_init``; each output layer uses ``std=0.01``.
    """

    critic: nn.Sequential
    actor: nn.Sequential

    def __init__(self, envs: gym.vector.SyncVectorEnv):
        """Size both networks from the vector env's single observation/action spaces.

        Args:
            envs (gym.vector.SyncVectorEnv): Vectorised environment. Its single
                observation shape must hold exactly one element (``.item()`` is
                called on it) and its single action space must expose ``n``.
        """
        super().__init__()
        self.obs_shape = envs.single_observation_space.shape
        self.n_obs = np.array(self.obs_shape).item()
        self.n_actions = envs.single_action_space.n

        # The actor is built first, then the critic, so parameters are created and
        # initialised in a fixed order.
        self.actor = self._build_mlp(self.n_obs, self.n_actions)
        self.critic = self._build_mlp(self.n_obs, 1)

    @staticmethod
    def _build_mlp(in_features, out_features, hidden=64) -> nn.Sequential:
        """Build the Linear-Tanh-Linear-Tanh-Linear stack shared by actor and critic."""
        return nn.Sequential(
            layer_init(nn.Linear(in_features, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, hidden)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden, out_features), std=0.01),
        )


In [4]:
@t.inference_mode()
def compute_advantages(
    next_value: t.Tensor,
    next_done: t.Tensor,
    rewards: t.Tensor,
    values: t.Tensor,
    dones: t.Tensor,
    device: t.device,
    gamma: float,
    gae_lambda: float,
) -> t.Tensor:
    """Compute advantages using Generalized Advantage Estimation.

    next_value: shape (1, env) - represents V(s_{t+1}) which is needed for the last advantage term
    next_done: shape (env,)
    rewards: shape (t, env)
    values: shape (t, env)
    dones: shape (t, env)

    Return: shape (t, env)
    """
    T = values.shape[0]
    next_values = torch.concat([values[1:], next_value])
    next_dones = torch.concat([dones[1:], next_done.unsqueeze(0)])
    deltas = rewards + gamma * next_values * (1.0 - next_dones) - values

    advantages = deltas.clone().to(device)
    for t in reversed(range(1, T)):
        advantages[t - 1] = (
            deltas[t - 1] + gamma * gae_lambda * (1.0 - dones[t]) * advantages[t]
        )
    return advantages


# Run the provided check for compute_advantages only when both the MAIN and
# RUNNING_FROM_FILE flags are truthy.
if MAIN and RUNNING_FROM_FILE:
    tests.test_compute_advantages(compute_advantages)


In [5]:
@dataclass
class Minibatch:
    """One shuffled slice of a rollout, as produced by ``make_minibatches``.

    Every field is indexed with the same index array, so row ``i`` of each tensor
    refers to the same (step, env) sample.
    """

    # Observations for the selected samples.
    obs: t.Tensor
    # Log-probabilities of the taken actions, as recorded during the rollout.
    logprobs: t.Tensor
    # Actions taken during the rollout.
    actions: t.Tensor
    # Advantage estimates for the selected samples.
    advantages: t.Tensor
    # Computed in make_minibatches as advantages + values.
    returns: t.Tensor
    # Critic values recorded during the rollout.
    values: t.Tensor


def minibatch_indexes(batch_size: int, minibatch_size: int) -> list[np.ndarray]:
    """Randomly partition ``range(batch_size)`` into equally sized index arrays.

    The result has ``batch_size // minibatch_size`` entries, each an array of
    ``minibatch_size`` indexes; every index in the batch appears exactly once.
    ``batch_size`` must be an exact multiple of ``minibatch_size``.
    """
    minibatch_count, leftover = divmod(batch_size, minibatch_size)
    assert leftover == 0

    shuffled = np.random.permutation(batch_size)
    grid = rearrange(
        shuffled,
        "(minibatch_count minibatch_size) -> minibatch_count minibatch_size",
        minibatch_count=minibatch_count,
    )
    # Iterating a 2-D array yields its rows, one per minibatch.
    return [row for row in grid]


# Run the provided check for minibatch_indexes only when both the MAIN and
# RUNNING_FROM_FILE flags are truthy.
if MAIN and RUNNING_FROM_FILE:
    tests.test_minibatch_indexes(minibatch_indexes)


def make_minibatches(
    obs: t.Tensor,
    logprobs: t.Tensor,
    actions: t.Tensor,
    advantages: t.Tensor,
    values: t.Tensor,
    obs_shape: tuple,
    action_shape: tuple,
    batch_size: int,
    minibatch_size: int,
) -> list[Minibatch]:
    """Merge the step and environment dimensions into a single batch dimension,
    then shuffle and cut the result into minibatches.

    Returns are derived here as ``advantages + values``. ``obs_shape`` and
    ``action_shape`` are the per-sample trailing shapes kept after flattening.
    """
    returns = advantages + values

    # Pair each tensor with the per-sample shape it keeps, in Minibatch field order.
    fields = (
        (obs, obs_shape),
        (logprobs, ()),
        (actions, action_shape),
        (advantages, ()),
        (returns, ()),
        (values, ()),
    )
    flattened = [tensor.reshape((-1,) + tail) for tensor, tail in fields]

    minibatches = []
    for index in minibatch_indexes(batch_size, minibatch_size):
        minibatches.append(Minibatch(*[flat[index] for flat in flattened]))
    return minibatches


All tests in `test_minibatch_indexes` passed.


In [6]:
def calc_policy_loss(
    probs: Categorical,
    mb_action: t.Tensor,
    mb_advantages: t.Tensor,
    mb_logprobs: t.Tensor,
    clip_coef: float,
) -> t.Tensor:
    """Clipped surrogate objective, to be maximised (gradient ascent).

    Args:
        probs (Categorical): current policy distribution over actions, one row per
            minibatch sample.
        mb_action (t.Tensor): actions whose log-probability is evaluated under
            ``probs``.
        mb_advantages (t.Tensor): advantage estimates; normalised inside this
            function using the minibatch mean and standard deviation.
        mb_logprobs (t.Tensor): log-probabilities of ``mb_action`` under the
            previous policy parameters.
        clip_coef (float): clipping range, denoted by epsilon in Eq 7.

    Returns:
        t.Tensor: mean over the minibatch of the elementwise minimum of the
            unclipped and clipped objectives.
    """
    # Probability ratio new/old, formed as a difference of log-probabilities.
    new_logprobs = probs.log_prob(mb_action)
    ratio = (new_logprobs - mb_logprobs).exp()

    # Standardise the advantages within the minibatch.
    centred = mb_advantages - mb_advantages.mean()
    norm_adv = centred / mb_advantages.std()

    unclipped = ratio * norm_adv
    clipped = ratio.clamp(1 - clip_coef, 1 + clip_coef) * norm_adv
    return t.minimum(unclipped, clipped).mean()


# Run the provided check for calc_policy_loss only when both the MAIN and
# RUNNING_FROM_FILE flags are truthy.
if MAIN and RUNNING_FROM_FILE:
    tests.test_calc_policy_loss(calc_policy_loss)


All tests in `test_calc_policy_loss` passed.


In [7]:
def calc_value_function_loss(
    critic: nn.Sequential, mb_obs: t.Tensor, mb_returns: t.Tensor, v_coef: float
) -> t.Tensor:
    """Computes the value function portion of the loss function

    Args:
        critic (nn.Sequential): Critic network
        mb_obs (t.Tensor): Observations from minibatch
        mb_returns (t.Tensor): Observed returns
        v_coef (float): the coefficient for the value loss, which weights its
            contribution to the overall loss. Denoted by c_1 in the paper.

    Returns:
        t.Tensor: Loss for the critic network, i.e. v_coef * mean squared error / 2.
    """
    preds = critic(mb_obs)
    # A critic ending in Linear(..., 1) yields (minibatch, 1) while the returns are
    # (minibatch,). Subtracting those directly would broadcast to
    # (minibatch, minibatch) and average over every prediction/return pairing, so
    # drop the trailing singleton dimension first. Equal shapes are left untouched.
    if preds.ndim == mb_returns.ndim + 1 and preds.shape[-1] == 1:
        preds = preds.squeeze(-1)
    loss = ((preds - mb_returns) ** 2).mean() / 2
    return v_coef * loss


# Run the provided check for calc_value_function_loss only when both the MAIN and
# RUNNING_FROM_FILE flags are truthy.
if MAIN and RUNNING_FROM_FILE:
    tests.test_calc_value_function_loss(calc_value_function_loss)


All tests in `test_calc_value_function_loss` passed!


In [8]:
def calc_entropy_loss(probs: Categorical, ent_coef: float):
    """Entropy bonus term: the minibatch-mean entropy scaled by ``ent_coef``.

    Args:
        probs (Categorical): policy distribution over actions, one row per
            minibatch sample.
        ent_coef (float): the coefficient for the entropy loss, which weights its
            contribution to the overall loss. Denoted by c_2 in the paper.
    """
    per_sample_entropy = probs.entropy()
    mean_entropy = per_sample_entropy.mean()
    return ent_coef * mean_entropy


# Run the provided check for calc_entropy_loss only when both the MAIN and
# RUNNING_FROM_FILE flags are truthy.
if MAIN and RUNNING_FROM_FILE:
    tests.test_calc_entropy_loss(calc_entropy_loss)


In [9]:
class PPOScheduler:
    """Linear learning-rate schedule that writes directly into an optimizer's
    parameter groups.
    """

    def __init__(self, optimizer, initial_lr: float, end_lr: float, num_updates: int):
        """Store the schedule endpoints and reset the call counter.

        Args:
            optimizer (torch.optim.Optimizer): An Optimizer whose "lr" will be scheduled
            initial_lr (float): Learning rate at zero calls to ``step``
            end_lr (float): Learning rate reached after ``num_updates`` calls
            num_updates (int): Number of ``step`` calls spanning the schedule
        """
        self.n_step_calls = 0
        self.num_updates = num_updates
        self.end_lr = end_lr
        self.initial_lr = initial_lr
        self.optimizer = optimizer

    def step(self):
        """Advance the schedule by one call and set every param group's "lr" to the
        linearly interpolated value; after num_updates calls it equals end_lr."""
        self.n_step_calls += 1
        fraction_done = self.n_step_calls / self.num_updates
        new_lr = self.initial_lr + fraction_done * (self.end_lr - self.initial_lr)
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = new_lr


def make_optimizer(
    agent: Agent, num_updates: int, initial_lr: float, end_lr: float
) -> tuple[optim.Adam, PPOScheduler]:
    """Build an Adam optimizer over the agent's parameters plus its linear scheduler.

    The optimizer is created with ``maximize=True``, so stepping it performs
    gradient ascent on whatever objective is back-propagated.

    Args:
        agent (Agent): Agent network
        num_updates (int): Number of scheduler steps spanning the decay
        initial_lr (float): Initial learning rate
        end_lr (float): Final learning rate

    Returns:
        tuple[optim.Adam, PPOScheduler]: Optimizer and scheduler objects
    """
    adam = optim.Adam(
        params=agent.parameters(),
        lr=initial_lr,
        eps=1e-5,
        maximize=True,
    )
    return adam, PPOScheduler(adam, initial_lr, end_lr, num_updates)


In [None]:
@dataclass
class PPOArgs:
    """Hyperparameters and run settings consumed by ``train_ppo``."""

    # Experiment name: the base file name with its extension removed. splitext drops
    # only a real suffix, whereas str.rstrip(".py") strips any trailing run of the
    # characters '.', 'p', 'y' and would mangle names such as "happy.py".
    exp_name: str = os.path.splitext(os.path.basename("ppo_sol"))[0]
    seed: int = 1
    torch_deterministic: bool = True
    cuda: bool = True
    track: bool = True
    wandb_project_name: str = "Curt-PPOCart"
    # None is the default, so the annotation must admit it.
    wandb_entity: Optional[str] = None
    capture_video: bool = True
    env_id: str = "CartPole-v1"
    total_timesteps: int = 500000
    learning_rate: float = 0.00025
    num_envs: int = 4
    num_steps: int = 128
    gamma: float = 0.99
    gae_lambda: float = 0.95
    num_minibatches: int = 4
    update_epochs: int = 4
    clip_coef: float = 0.2
    ent_coef: float = 0.01
    vf_coef: float = 0.5
    max_grad_norm: float = 0.5
    # NOTE(review): these two are plain constants, not derived values. The defaults
    # equal num_envs * num_steps and batch_size // num_minibatches; if the fields above
    # are changed, these presumably need to be updated to match -- confirm.
    batch_size: int = 512
    minibatch_size: int = 128


def train_ppo(args: PPOArgs):
    """Train an actor-critic agent with PPO on a vectorised discrete-action env.

    Each update collects ``args.num_steps`` steps from ``args.num_envs`` environments,
    computes GAE advantages, then runs ``args.update_epochs`` passes of shuffled
    minibatch gradient ascent on ``policy_loss - value_loss + entropy_loss``.
    Metrics go to a TensorBoard ``SummaryWriter`` and, when ``args.track`` is set,
    to Weights & Biases.

    Args:
        args (PPOArgs): Hyperparameters and run settings.
    """

    # Set up tracking and reporting
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % "\n".join([f"|{key}|{value}|" for (key, value) in vars(args).items()]),
    )

    # Set initial settings
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environments, action shapes and agent
    envs = gym.vector.SyncVectorEnv(
        [
            make_env(args.env_id, args.seed + i, i, args.capture_video, run_name)
            for i in range(args.num_envs)
        ]
    )
    action_shape = envs.single_action_space.shape
    assert action_shape is not None
    assert isinstance(
        envs.single_action_space, Discrete
    ), "only discrete action space is supported"
    agent = Agent(envs).to(device)

    # Set up training loop components and values to be used for optimization
    num_updates = args.total_timesteps // args.batch_size
    (optimizer, scheduler) = make_optimizer(agent, num_updates, args.learning_rate, 0.0)
    obs = torch.zeros(
        (args.num_steps, args.num_envs) + envs.single_observation_space.shape
    ).to(device)
    actions = torch.zeros((args.num_steps, args.num_envs) + action_shape).to(device)
    logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
    rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
    dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
    values = torch.zeros((args.num_steps, args.num_envs)).to(device)
    global_step = 0
    old_approx_kl = 0.0
    approx_kl = 0.0
    value_loss = t.tensor(0.0)
    policy_loss = t.tensor(0.0)
    entropy_loss = t.tensor(0.0)
    clipfracs = []
    info = []

    # Initialize observations
    start_time = time.time()
    next_obs = torch.Tensor(envs.reset()).to(device)
    next_done = torch.zeros(args.num_envs).to(device)

    # Begin training loop
    for _ in range(num_updates):

        # Rollout loop
        for i in range(0, args.num_steps):
            # One step of the vector env advances every sub-environment, so the
            # global step counter grows by num_envs. Without this increment every
            # metric would be logged at step 0 and SPS would always read 0.
            global_step += args.num_envs

            # Rollout calculations
            obs[i] = next_obs
            dones[i] = next_done

            with t.inference_mode():
                next_values = agent.critic(next_obs).flatten()
                logits = agent.actor(next_obs)
            probs = Categorical(logits=logits)
            action = probs.sample()
            logprob = probs.log_prob(action)
            next_obs, reward, done, info = envs.step(action.cpu().numpy())

            rewards[i] = t.from_numpy(reward)
            actions[i] = action
            logprobs[i] = logprob
            values[i] = next_values

            next_obs = t.from_numpy(next_obs).to(device)
            next_done = t.from_numpy(done).float().to(device)

            # Reporting: only the first finished episode found in this step is logged.
            for item in info:
                if "episode" in item.keys():
                    print(
                        f"global_step={global_step}, episodic_return={item['episode']['r']}"
                    )
                    writer.add_scalar(
                        "charts/episodic_return", item["episode"]["r"], global_step
                    )
                    writer.add_scalar(
                        "charts/episodic_length", item["episode"]["l"], global_step
                    )
                    break

        # Get the next value and advantages. The bootstrap value is only a target,
        # so it is computed without building an autograd graph.
        with t.no_grad():
            next_value = rearrange(agent.critic(next_obs), "env 1 -> 1 env")
        advantages = compute_advantages(
            next_value,
            next_done,
            rewards,
            values,
            dones,
            device,
            args.gamma,
            args.gae_lambda,
        )
        clipfracs.clear()

        # Learning loop
        for _ in range(args.update_epochs):
            minibatches = make_minibatches(
                obs,
                logprobs,
                actions,
                advantages,
                values,
                envs.single_observation_space.shape,
                action_shape,
                args.batch_size,
                args.minibatch_size,
            )
            for mb in minibatches:
                # Compute the combined objective on the minibatch and step the
                # optimizer (not the scheduler), clipping the global gradient norm.
                logits = agent.actor(mb.obs)
                probs = Categorical(logits=logits)
                policy_loss = calc_policy_loss(probs, mb.actions, mb.advantages, mb.logprobs, args.clip_coef)
                value_loss = calc_value_function_loss(agent.critic, mb.obs, mb.returns, args.vf_coef)
                entropy_loss = calc_entropy_loss(probs, args.ent_coef)
                # The optimizer maximises, so the value loss enters with a minus sign.
                loss = policy_loss - value_loss + entropy_loss
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

        # Step scheduler, calculate metrics (from the last minibatch of the last epoch)
        scheduler.step()
        (y_pred, y_true) = (mb.values.cpu().numpy(), mb.returns.cpu().numpy())
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
        with torch.no_grad():
            newlogprob: t.Tensor = probs.log_prob(mb.actions)
            logratio = newlogprob - mb.logprobs
            ratio = logratio.exp()
            old_approx_kl = (-logratio).mean().item()
            approx_kl = (ratio - 1 - logratio).mean().item()
            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

        # Add metrics to reporting
        writer.add_scalar(
            "charts/learning_rate", optimizer.param_groups[0]["lr"], global_step
        )
        writer.add_scalar("losses/value_loss", value_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl, global_step)
        writer.add_scalar("losses/approx_kl", approx_kl, global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        writer.add_scalar("losses/explained_variance", explained_var, global_step)
        writer.add_scalar(
            "charts/SPS", int(global_step / (time.time() - start_time)), global_step
        )
        if global_step % 10 == 0:
            print(
                "steps per second (SPS):", int(global_step / (time.time() - start_time))
            )
        # wandb is only imported when tracking is enabled; calling it unconditionally
        # would raise UnboundLocalError for args.track == False.
        if args.track:
            wandb.log({
                "value_loss": value_loss.item(), "policy_loss": policy_loss.item(),
                "entropy": entropy_loss.item(), "old_approx_kl": old_approx_kl,
                "approx_kl": approx_kl, "clipfrac": np.mean(clipfracs),
                "explained_variance": explained_var})
    envs.close()
    writer.close()


# if MAIN:
#     if "ipykernel_launcher" in os.path.basename(sys.argv[0]):
#         filename = globals().get("__file__", "<filename of this script>")
#         print(
#             f"Try running this file from the command line instead: python {os.path.basename(filename)} --help"
#         )
#         args = PPOArgs()
#     else:
#         args = ppo_parse_args()
#     train_ppo(args)


In [None]:
from gym.envs.classic_control.cartpole import CartPoleEnv
import gym
from gym import logger, spaces
from gym.error import DependencyNotInstalled
import math

class EasyCart(CartPoleEnv):
    """CartPoleEnv subclass whose ``step`` is the hook for customising the transition."""

    def step(self, action):
        """Advance the parent environment by one step and return its transition.

        Args:
            action: Action forwarded unchanged to ``CartPoleEnv.step``.

        Returns:
            tuple: ``(obs, rew, done, info)`` as produced by the parent class.
        """
        (obs, rew, done, info) = super().step(action)
        # TODO: apply the intended reward/observation shaping here. Until then the
        # parent transition is passed through unchanged; returning nothing would hand
        # callers None instead of the 4-tuple they unpack.
        return (obs, rew, done, info)

# Register EasyCart under the id "EasyCart-v0" with max_episode_steps=500 so it can be
# created by id.
gym.envs.registration.register(id="EasyCart-v0", entry_point=EasyCart, max_episode_steps=500)

  logger.warn(f"Overriding environment {id}")


In [14]:
# Launch a training run with the default PPOArgs. With the defaults, track=True, so
# train_ppo calls wandb.init.
args = PPOArgs()
train_ppo(args)

  from IPython.core.display import HTML, display  # type: ignore


[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


  from IPython.core.display import display


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668982865909735, max=1.0…

Thread SenderThread:
Traceback (most recent call last):
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/internal/internal_util.py", line 50, in run
    self._run()
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/internal/internal_util.py", line 101, in _run
    self._process(record)
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/internal/internal.py", line 308, in _process
    self._sm.send(record)
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/internal/sender.py", line 305, in send
    send_handler(record)
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/internal/sender.py", line 319, in send_request
    send_handler(record)
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/internal/sender.py", line 491, in send_request_defer
    transition_state()
  File "/home/curtt

Problem at: /tmp/ipykernel_1416327/3643760415.py 35 train_ppo


Traceback (most recent call last):
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 1078, in init
    run = wi.init()
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 697, in init
    result = handle.wait(
  File "/home/curttigges/miniconda3/envs/arena/lib/python3.9/site-packages/wandb/sdk/lib/mailbox.py", line 259, in wait
    raise MailboxError("transport failed")
wandb.errors.MailboxError: transport failed
[34m[1mwandb[0m: [32m[41mERROR[0m Abnormal program exit


Exception: problem