In [1]:
import os
import random
import time
import sys
import re
from dataclasses import dataclass
import numpy as np
import torch
import torch as t
import gym
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from gym.spaces import Discrete
from einops import rearrange

from utils import make_env, ppo_parse_args
import tests

from collections import OrderedDict
import wandb

  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _HISTOGRAMPROTO = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORSHAPEPROTO_DIM = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.EnumValueDescriptor(
  _DATATYPE = _descriptor.EnumDescriptor(
  _descriptor.FieldDescriptor(
  _SERIALIZEDDTYPE = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _RESOURCEHANDLEPROTO_DTYPEANDSHAPE = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORPROTO = _descriptor.Descriptor(


In [2]:
import argparse
import os
import random
import time
import sys
from distutils.util import strtobool
from dataclasses import dataclass
from typing import Optional
import numpy as np
import torch
import torch as t
import gym
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from gym.spaces import Discrete
from typing import Any, List, Optional, Union, Tuple, Iterable
from einops import rearrange
from utils import ppo_parse_args, make_env
import solutions

MAIN = __name__ == "__main__"
RUNNING_FROM_FILE = "ipykernel_launcher" in os.path.basename(sys.argv[0])

In [3]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    t.nn.init.orthogonal_(layer.weight, std)
    t.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    critic: nn.Sequential
    actor: nn.Sequential

    num_actions = 34789017348903127489

    def __init__(self, envs: gym.vector.SyncVectorEnv):
        super().__init__()
        obs_shape = envs.single_observation_space.shape[0] # should this be multi
        num_actions = envs.single_action_space.n
        self.actor = nn.Sequential(OrderedDict([
            ("linear1", layer_init(nn.Linear(obs_shape, 64))),
            ("tanh", nn.Tanh()),
            ("linear2", layer_init(nn.Linear(64, 64))),
            ("tanh", nn.Tanh()),
            ("linear2", layer_init(nn.Linear(64, num_actions), std=np.sqrt(0.01))),
        ]))

        self.critic = nn.Sequential(OrderedDict([
            ("linear1", layer_init(nn.Linear(obs_shape, 64))),
            ("tanh", nn.Tanh()),
            ("linear2", layer_init(nn.Linear(64, 64))),
            ("tanh", nn.Tanh()),
            ("linear2", layer_init(nn.Linear(64, 1), std=np.sqrt(1))),
        ]))
        

In [4]:
@t.inference_mode()
def compute_advantages(
    next_value: t.Tensor,
    next_done: t.Tensor,
    rewards: t.Tensor,
    values: t.Tensor,
    dones: t.Tensor,
    device: t.device,
    gamma: float,
    gae_lambda: float,
) -> t.Tensor:
    '''Compute advantages using Generalized Advantage Estimation.

    next_value: shape (1, env) - represents V(s_{t+1}) which is needed for the last advantage term
    next_done: shape (env,)
    rewards: shape (t, env)
    values: shape (t, env)
    dones: shape (t, env)

    Return: shape (t, env)
    '''
    dim_t, n_envs = dones.shape
    valuesplus = values.clone()
    valuesplus[0:-1] = values[1:]
    valuesplus[-1] = next_value
    donesplus = dones.clone()
    donesplus[0:-1] = dones[1:]
    donesplus[-1] = next_done

    T = values.shape[0]

    deltas = rewards + gamma * valuesplus * (1.0 - donesplus) - values
    advantages = deltas.clone().to(device)
    # non-vectorized implementation
    for i in reversed(range(1, T)):
        advantages[i-1] = deltas[i-1] + gamma * gae_lambda * (1.0 - dones[i]) * advantages[i]
    return advantages

if MAIN and RUNNING_FROM_FILE:
    tests.test_compute_advantages(compute_advantages)

In [5]:
@dataclass
class Minibatch:
    obs: t.Tensor
    logprobs: t.Tensor
    actions: t.Tensor
    advantages: t.Tensor
    returns: t.Tensor
    values: t.Tensor

def minibatch_indexes(batch_size: int, minibatch_size: int) -> list[np.ndarray]:
    '''Return a list of length (batch_size // minibatch_size) where each element is an array of indexes into the batch.

    Each index should appear exactly once.
    '''
    assert batch_size % minibatch_size == 0
    # size = batch_size // minibatch_size
    # indexes = np.arange(batch_size)
    # np.random.shuffle(indexes)
    # return [indexes[i * minibatch_size : (i + 1) * minibatch_size] for i in range(size)]
    indices = np.random.permutation(batch_size)
    indices = rearrange(indices, "(mb_num mb_size) -> mb_num mb_size", mb_size=minibatch_size)
    return list(indices)

if MAIN and RUNNING_FROM_FILE:
    tests.test_minibatch_indexes(minibatch_indexes)

def make_minibatches(
    obs: t.Tensor,
    logprobs: t.Tensor,
    actions: t.Tensor,
    advantages: t.Tensor,
    values: t.Tensor,
    obs_shape: tuple,
    action_shape: tuple,
    batch_size: int,
    minibatch_size: int,
) -> list[Minibatch]:
    '''Flatten the environment and steps dimension into one batch dimension, then shuffle and split into minibatches.'''
    idxss = minibatch_indexes(batch_size, minibatch_size)
    returns = advantages + values
    return [Minibatch(
            obs=obs[:, idxs].reshape(-1, *obs_shape),
            logprobs=logprobs[:, idxs].reshape(-1),
            actions=actions[:, idxs].reshape(-1, *action_shape),
            advantages=advantages[:, idxs].reshape(-1),
            returns=returns[:, idxs].reshape(-1),
            values=values[:, idxs].reshape(-1),
        )
    for idxs in idxss]
    # data = (obs, logprobs, actions, advantages, returns, values)
    # shapes = (obs_shape, (), action_shape, (), (), ())
    # return [
    #     Minibatch(*[d.reshape((-1,) + s)[ind] for d, s in zip(data, shapes)])
    #     for ind in minibatch_indexes(batch_size, minibatch_size)
    # ]

All tests in `test_minibatch_indexes` passed.


In [6]:
def calc_policy_loss(
    probs: Categorical, mb_action: t.Tensor, mb_advantages: t.Tensor, mb_logprobs: t.Tensor, clip_coef: float
) -> t.Tensor:
    '''Return the policy loss, suitable for maximisation with gradient ascent.

    probs: a distribution containing the actor's unnormalized logits of shape (minibatch, num_actions)

    clip_coef: amount of clipping, denoted by epsilon in Eq 7.

    normalize: if true, normalize mb_advantages to have mean 0, variance 1
    '''
    rttheta = t.exp(probs.log_prob(mb_action) - mb_logprobs)
    mb_advantages = (mb_advantages - mb_advantages.mean()) / mb_advantages.std()

    return t.min(
        rttheta * mb_advantages,
        rttheta.clip(1 - clip_coef, 1 + clip_coef) * mb_advantages
    ).mean()
    
if MAIN and RUNNING_FROM_FILE:
    tests.test_calc_policy_loss(calc_policy_loss)

All tests in `test_calc_policy_loss` passed.


In [7]:
def calc_value_function_loss(critic: nn.Sequential, mb_obs: t.Tensor, mb_returns: t.Tensor, v_coef: float) -> t.Tensor:
    '''Compute the value function portion of the loss function.

    v_coef: the coefficient for the value loss, which weights its contribution to the overall loss. Denoted by c_1 in the paper.
    '''
    return 0.5 * v_coef * nn.MSELoss()(critic(mb_obs), mb_returns)

if MAIN and RUNNING_FROM_FILE:
    tests.test_calc_value_function_loss(calc_value_function_loss)

All tests in `test_calc_value_function_loss` passed!


In [8]:
def calc_entropy_loss(probs: Categorical, ent_coef: float):
    '''Return the entropy loss term.

    ent_coef: the coefficient for the entropy loss, which weights its contribution to the overall loss. Denoted by c_2 in the paper.
    '''
    return ent_coef * probs.entropy().mean()

if MAIN and RUNNING_FROM_FILE:
    tests.test_calc_entropy_loss(calc_entropy_loss)

In [9]:
class PPOScheduler:
    def __init__(self, optimizer, initial_lr: float, end_lr: float, num_updates: int):
        self.optimizer = optimizer
        self.initial_lr = initial_lr
        self.end_lr = end_lr
        self.num_updates = num_updates
        self.n_step_calls = 0

    def step(self):
        '''Implement linear learning rate decay so that after num_updates calls to step, the learning rate is end_lr.'''
        if self.n_step_calls > self.num_updates:
            return self.end_lr
        else:
            return self.initial_lr + (self.end_lr - self.initial_lr) * self.n_step_calls / self.num_updates

def make_optimizer(agent: Agent, num_updates: int, initial_lr: float, end_lr: float) -> tuple[optim.Adam, PPOScheduler]:
    '''Return an appropriately configured Adam with its attached scheduler.'''
    optimizer = optim.Adam(agent.parameters(), lr=initial_lr, maximize=True)
    return optimizer, PPOScheduler(optimizer, initial_lr, end_lr, num_updates)

In [10]:
__file__ = 'ppo.py'

@dataclass
class PPOArgs:
    exp_name: str = os.path.basename(__file__).rstrip(".py")
    seed: int = 1
    torch_deterministic: bool = True
    cuda: bool = True
    track: bool = True
    wandb_project_name: str = "PPOCart"
    wandb_entity: str = None
    capture_video: bool = True
    env_id: str = "CartPole-v1"
    total_timesteps: int = 500000
    learning_rate: float = 0.00025
    num_envs: int = 4
    num_steps: int = 128
    gamma: float = 0.99
    gae_lambda: float = 0.95
    num_minibatches: int = 4
    update_epochs: int = 4
    clip_coef: float = 0.2
    ent_coef: float = 0.01
    vf_coef: float = 0.5
    max_grad_norm: float = 0.5
    batch_size: int = 512
    minibatch_size: int = 128

def train_ppo(args: PPOArgs):
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % "\n".join([f"|{key}|{value}|" for (key, value) in vars(args).items()]),
    )
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
    envs = gym.vector.SyncVectorEnv(
        [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
    )
    action_shape = envs.single_action_space.shape
    assert action_shape is not None
    assert isinstance(envs.single_action_space, Discrete), "only discrete action space is supported"
    agent = Agent(envs).to(device)
    num_updates = args.total_timesteps // args.batch_size
    (optimizer, scheduler) = make_optimizer(agent, num_updates, args.learning_rate, 0.0)
    obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
    actions = torch.zeros((args.num_steps, args.num_envs) + action_shape).to(device)
    logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
    rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
    dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
    values = torch.zeros((args.num_steps, args.num_envs)).to(device)
    global_step = 0
    old_approx_kl = 0.0
    approx_kl = 0.0
    value_loss = t.tensor(0.0)
    policy_loss = t.tensor(0.0)
    entropy_loss = t.tensor(0.0)
    clipfracs = []
    info = []
    start_time = time.time()
    next_obs = torch.Tensor(envs.reset()).to(device)
    next_done = torch.zeros(args.num_envs).to(device)
    for _ in range(num_updates):
        for i in range(0, args.num_steps):
            "YOUR CODE: Rollout phase (see detail #1)"
            obs[i] = next_obs
            dones[i] = next_done

            with t.inference_mode():
                next_values = agent.critic(next_obs).flatten()
                logits = agent.actor(next_obs)
            
            probs = Categorical(logits=logits)
            action = probs.sample()
            logprob = probs.log_prob(action)

            next_obs, reward, done, info = envs.step(action.cpu().numpy())
            rewards[i] = t.from_numpy(reward).to(device)
            actions[i] = action
            logprobs[i] = logprob
            values[i] = next_values

            next_obs = t.from_numpy(next_obs).to(device)
            next_done = t.from_numpy(done).to(device)

            for item in info:
                if "episode" in item.keys():
                    print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                    writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                    writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                    break
        next_value = rearrange(agent.critic(next_obs), "env 1 -> 1 env")
        advantages = compute_advantages(
            next_value, next_done, rewards, values, dones, device, args.gamma, args.gae_lambda
        )
        clipfracs.clear()
        for _ in range(args.update_epochs):
            minibatches = make_minibatches(
                obs,
                logprobs,
                actions,
                advantages,
                values,
                envs.single_observation_space.shape,
                action_shape,
                args.batch_size,
                args.minibatch_size,
            )
            for mb in minibatches:
                "YOUR CODE: compute loss on the minibatch and step the optimizer (not the scheduler). Do detail #11 (global gradient clipping) here using nn.utils.clip_grad_norm_."
                logits = agent.actor(mb.obs)
                probs = Categorical(logits=logits)
                policy_loss = calc_policy_loss(probs, mb.actions, mb.advantages, mb.logprobs, args.clip_coef)
                value_loss = calc_value_function_loss(agent.critic, mb.obs, mb.returns, args.vf_coef)
                entropy_loss = calc_entropy_loss(probs, args.ent_coef)
                loss = policy_loss - value_loss + entropy_loss
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()
        scheduler.step()
        (y_pred, y_true) = (mb.values.cpu().numpy(), mb.returns.cpu().numpy())
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
        with torch.no_grad():
            newlogprob: t.Tensor = probs.log_prob(mb.actions)
            logratio = newlogprob - mb.logprobs
            ratio = logratio.exp()
            old_approx_kl = (-logratio).mean().item()
            approx_kl = (ratio - 1 - logratio).mean().item()
            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
        writer.add_scalar("losses/value_loss", value_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl, global_step)
        writer.add_scalar("losses/approx_kl", approx_kl, global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        writer.add_scalar("losses/explained_variance", explained_var, global_step)
        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
        wandb.log({
            "totalloss": value_loss.item() + policy_loss.item() + entropy_loss.item(),
            "lr": optimizer.param_groups[0]["lr"], 
            "value_loss": value_loss.item(), 
            "policy_loss": policy_loss.item(), 
            "entropy": entropy_loss.item(), 
            "old_approx_kl": old_approx_kl, 
            "approx_kl": approx_kl, 
            "clipfrac": np.mean(clipfracs), 
            "explained_variance": explained_var, 
            "SPS": int(global_step / (time.time() - start_time))
            })
        if global_step % 10 == 0:
            print("steps per second (SPS):", int(global_step / (time.time() - start_time)))
    envs.close()
    writer.close()
    wandb.finish()

In [15]:
from gym.envs.classic_control.cartpole import CartPoleEnv
import gym
from gym import logger, spaces
from gym.error import DependencyNotInstalled
import math

class EasyCart(CartPoleEnv):
    def step(self, action):
        (obs, rew, done, info) = super().step(action)
        pos, vel, angle, ang_vel = obs
        c = 1
        new_reward = c * (pos + 2.4) + c * (angle + 0.20943951) + c * (vel + 4) + c * (ang_vel + 3.5)
        
        return (obs, new_reward, done, info)

gym.envs.registration.register(id="EasyCart-v0", entry_point=EasyCart, max_episode_steps=500)

if MAIN:
    if "ipykernel_launcher" in os.path.basename(sys.argv[0]):
        filename = globals().get("__file__", "<filename of this script>")
        print(f"Try running this file from the command line instead: python {os.path.basename(filename)} --help")
        args = PPOArgs(env_id="EasyCart-v0")
    else:
        args = ppo_parse_args()
    train_ppo(args)

  logger.warn(f"Overriding environment {id}")


Try running this file from the command line instead: python ppo.py --help


  from IPython.core.display import HTML, display  # type: ignore


  logger.deprecation(
  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)
  logger.deprecation(


global_step=0, episodic_return=146.31619262695312
global_step=0, episodic_return=144.62844848632812
global_step=0, episodic_return=236.1435089111328
global_step=0, episodic_return=155.6774444580078
global_step=0, episodic_return=176.78118896484375
global_step=0, episodic_return=96.0130615234375
global_step=0, episodic_return=229.56918334960938
global_step=0, episodic_return=219.7485809326172
global_step=0, episodic_return=280.26953125
global_step=0, episodic_return=105.78245544433594
global_step=0, episodic_return=487.1505126953125
global_step=0, episodic_return=290.8472900390625
global_step=0, episodic_return=565.37451171875
global_step=0, episodic_return=287.2403564453125
global_step=0, episodic_return=177.53341674804688
global_step=0, episodic_return=116.99507141113281
global_step=0, episodic_return=157.27439880371094
global_step=0, episodic_return=228.69142150878906
global_step=0, episodic_return=127.18999481201172
global_step=0, episodic_return=328.47821044921875
steps per second 

  return F.mse_loss(input, target, reduction=self.reduction)
  logger.deprecation(


global_step=0, episodic_return=472.943603515625
global_step=0, episodic_return=308.0131530761719
global_step=0, episodic_return=137.77005004882812
global_step=0, episodic_return=146.37335205078125
global_step=0, episodic_return=154.67567443847656
global_step=0, episodic_return=146.1490936279297
global_step=0, episodic_return=268.321044921875
global_step=0, episodic_return=201.74942016601562
global_step=0, episodic_return=167.06451416015625
global_step=0, episodic_return=116.29856872558594
global_step=0, episodic_return=156.74771118164062
steps per second (SPS): 0
global_step=0, episodic_return=478.10064697265625
global_step=0, episodic_return=249.18885803222656
global_step=0, episodic_return=428.4019470214844
global_step=0, episodic_return=354.6907958984375
global_step=0, episodic_return=195.9595184326172
global_step=0, episodic_return=176.30679321289062
global_step=0, episodic_return=413.56182861328125
global_step=0, episodic_return=209.7801513671875
global_step=0, episodic_return=397

  return F.mse_loss(input, target, reduction=self.reduction)


global_step=0, episodic_return=136.84251403808594
global_step=0, episodic_return=187.31646728515625
steps per second (SPS): 0
global_step=0, episodic_return=464.4496154785156
global_step=0, episodic_return=394.742431640625
global_step=0, episodic_return=441.273193359375
global_step=0, episodic_return=672.4449462890625
global_step=0, episodic_return=312.2405090332031
global_step=0, episodic_return=396.10430908203125
global_step=0, episodic_return=457.9420471191406
global_step=0, episodic_return=177.88482666015625
global_step=0, episodic_return=244.57919311523438
global_step=0, episodic_return=241.17510986328125
global_step=0, episodic_return=352.39385986328125
global_step=0, episodic_return=323.78973388671875
global_step=0, episodic_return=116.91065979003906
global_step=0, episodic_return=167.56248474121094
global_step=0, episodic_return=268.0293273925781
global_step=0, episodic_return=857.9696044921875
steps per second (SPS): 0
global_step=0, episodic_return=285.7243957519531
global_st

  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)


global_step=0, episodic_return=378.68255615234375
global_step=0, episodic_return=446.95001220703125
global_step=0, episodic_return=411.523193359375
global_step=0, episodic_return=588.63232421875
global_step=0, episodic_return=708.7506713867188
steps per second (SPS): 0
global_step=0, episodic_return=339.54168701171875
global_step=0, episodic_return=336.8694763183594
global_step=0, episodic_return=962.6343994140625
global_step=0, episodic_return=185.35301208496094
global_step=0, episodic_return=674.0247802734375
global_step=0, episodic_return=314.81427001953125
global_step=0, episodic_return=210.60060119628906
global_step=0, episodic_return=662.3192138671875
global_step=0, episodic_return=415.79150390625
global_step=0, episodic_return=392.7398986816406
global_step=0, episodic_return=297.9765625
global_step=0, episodic_return=247.88609313964844
steps per second (SPS): 0
global_step=0, episodic_return=96.62042999267578
global_step=0, episodic_return=569.010986328125
global_step=0, episodi

  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)


global_step=0, episodic_return=279.7634582519531
steps per second (SPS): 0
global_step=0, episodic_return=1547.5714111328125
global_step=0, episodic_return=2015.06591796875
global_step=0, episodic_return=322.50982666015625
global_step=0, episodic_return=2100.587646484375
steps per second (SPS): 0
global_step=0, episodic_return=590.7235107421875
global_step=0, episodic_return=2314.148681640625
global_step=0, episodic_return=1086.5272216796875
global_step=0, episodic_return=2019.83740234375
global_step=0, episodic_return=1059.9483642578125
steps per second (SPS): 0
global_step=0, episodic_return=709.5706787109375
global_step=0, episodic_return=1730.841064453125
global_step=0, episodic_return=1182.268310546875
steps per second (SPS): 0
global_step=0, episodic_return=1030.614990234375
global_step=0, episodic_return=1352.6773681640625
global_step=0, episodic_return=298.1153869628906
global_step=0, episodic_return=616.2450561523438
global_step=0, episodic_return=2674.908447265625
steps per s

  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)


global_step=0, episodic_return=1857.632080078125
global_step=0, episodic_return=2102.709228515625
steps per second (SPS): 0
global_step=0, episodic_return=2372.949462890625
global_step=0, episodic_return=1324.762939453125
global_step=0, episodic_return=3778.369873046875
global_step=0, episodic_return=2087.55322265625
steps per second (SPS): 0
global_step=0, episodic_return=1609.59033203125
steps per second (SPS): 0
global_step=0, episodic_return=2266.474853515625
global_step=0, episodic_return=2131.064453125
global_step=0, episodic_return=2785.2919921875
global_step=0, episodic_return=1489.8214111328125
steps per second (SPS): 0
steps per second (SPS): 0
global_step=0, episodic_return=2667.66455078125
global_step=0, episodic_return=2933.4541015625
global_step=0, episodic_return=2238.919677734375
steps per second (SPS): 0
global_step=0, episodic_return=2765.916259765625
global_step=0, episodic_return=1698.296142578125
global_step=0, episodic_return=438.44677734375
global_step=0, episodi

  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)


global_step=0, episodic_return=1396.913330078125
global_step=0, episodic_return=1950.2967529296875
global_step=0, episodic_return=1560.7646484375
steps per second (SPS): 0
global_step=0, episodic_return=1993.4893798828125
global_step=0, episodic_return=1577.7047119140625
global_step=0, episodic_return=1951.7042236328125
global_step=0, episodic_return=1944.8883056640625
steps per second (SPS): 0
global_step=0, episodic_return=1649.141845703125
global_step=0, episodic_return=1663.8997802734375
global_step=0, episodic_return=1666.9305419921875
global_step=0, episodic_return=1793.119140625
steps per second (SPS): 0
global_step=0, episodic_return=1544.3951416015625
global_step=0, episodic_return=1511.7825927734375
global_step=0, episodic_return=1679.620361328125
global_step=0, episodic_return=1482.6910400390625
steps per second (SPS): 0
global_step=0, episodic_return=1700.217041015625
global_step=0, episodic_return=1617.311279296875
steps per second (SPS): 0
global_step=0, episodic_return=1

  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)


steps per second (SPS): 0
global_step=0, episodic_return=1650.619140625
global_step=0, episodic_return=698.0452270507812
global_step=0, episodic_return=1511.3895263671875
global_step=0, episodic_return=1443.4486083984375
steps per second (SPS): 0
global_step=0, episodic_return=1261.760009765625
global_step=0, episodic_return=1558.221923828125
global_step=0, episodic_return=1380.08251953125
global_step=0, episodic_return=1431.6412353515625
steps per second (SPS): 0
global_step=0, episodic_return=1537.492431640625
global_step=0, episodic_return=1707.28662109375
global_step=0, episodic_return=1642.6610107421875
global_step=0, episodic_return=1741.7760009765625
steps per second (SPS): 0
global_step=0, episodic_return=1596.966796875
global_step=0, episodic_return=2137.66357421875
global_step=0, episodic_return=1723.467041015625
steps per second (SPS): 0
global_step=0, episodic_return=1688.114501953125
global_step=0, episodic_return=1685.10693359375
global_step=0, episodic_return=1582.891601

  logger.deprecation(
  return F.mse_loss(input, target, reduction=self.reduction)


steps per second (SPS): 0
global_step=0, episodic_return=2358.358154296875
global_step=0, episodic_return=2308.66943359375
global_step=0, episodic_return=2139.838134765625
global_step=0, episodic_return=2003.735107421875
steps per second (SPS): 0
global_step=0, episodic_return=1829.77197265625
global_step=0, episodic_return=2094.26171875
global_step=0, episodic_return=1735.6611328125
global_step=0, episodic_return=2107.653076171875
steps per second (SPS): 0
global_step=0, episodic_return=1751.96533203125
global_step=0, episodic_return=1729.1920166015625
global_step=0, episodic_return=1943.6728515625
steps per second (SPS): 0
global_step=0, episodic_return=1930.537353515625
global_step=0, episodic_return=1886.2796630859375
global_step=0, episodic_return=1785.4769287109375
steps per second (SPS): 0
global_step=0, episodic_return=2021.111572265625
global_step=0, episodic_return=2555.97314453125
steps per second (SPS): 0
global_step=0, episodic_return=1908.05615234375
global_step=0, episod

  from IPython.core.display import HTML, display  # type: ignore


0,1
SPS,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
approx_kl,▁▁▁▂▁▁▂▁▁▁▂▁▃▂▂▃▂▁▁▁▁▁▁█▁▃▁▃▁▁▁▄▃▁▃▁▅▁▁▂
charts/SPS,▁
charts/episodic_length,▁
charts/episodic_return,▁
charts/learning_rate,▁
clipfrac,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
entropy,█▆▅▄▄▄▄▃▄▃▄▃▃▂▃▃▃▂▃▂▃▂▃▂▁▂▂▁▂▁▂▂▂▃▃▂▂▂▃▂
explained_variance,▃▃▃▃▃▅▄█▂▁▃▂▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
global_step,▁

0,1
SPS,0.0
approx_kl,0.00077
charts/SPS,0.0
charts/episodic_length,41.0
charts/episodic_return,440.87018
charts/learning_rate,0.00025
clipfrac,0.0
entropy,0.00539
explained_variance,0.00228
global_step,0.0
