In [1]:
import os
import sys

# Device identifier naming the first CUDA GPU. It is not referenced in the
# cells shown here — presumably consumed by the training code further down
# (TODO confirm); a CPU-only machine would need "cpu" instead.
device = "cuda:0"

# Setup: add the Selenium Chrome driver to PATH and set the MINIWOB_BASE_URL environment variable to the directory containing the HTML task files

In [2]:
from miniwob.environment import MiniWoBEnvironment
from miniwob.screenshot import pil_to_numpy_array

# MiniWoB tasks to train on; one environment is created per entry.
task_name = [
    'login-user',
    'email-inbox-forward',
    'email-inbox-delete',
    'email-inbox-important'
]
# Number of tasks: a non-list `task_name` counts as a single task.
num_task = len(task_name) if type(task_name) is list else 1
# Location of the MiniWoB HTML task files; None if the variable is not set.
base_url = os.environ.get('MINIWOB_BASE_URL')
print('BASE URL:', base_url)

BASE URL: file:///mnt/akostin/home/akostin/prjs/miniwob-plusplus/html/


In [3]:
# NOTE(review): NH, X_MAX/Y_MAX, DOM_MAX_EDGES and STEP_PRICE are not referenced
# in the cells visible here; their descriptions below are assumptions to confirm.
NH=128  # presumably a hidden size
X_MAX, Y_MAX = 160, 210  # presumably the task screen extent in pixels
# Token budget used when tokenizing the text of one DOM element.
DOM_TEXT_MAX_TOKENS = 8
# Token budget used when tokenizing the CSS classes of one DOM element.
DOM_CLASSES_MAX_TOKENS = 8
# Token budget used when tokenizing the whole goal utterance at once.
GOAL_MAX_TOKENS = 32
# Number of goal-word slots; also the number of "type this word" actions.
GOAL_MAX_WORDS = 20
# Token budget used when tokenizing a single goal word.
GOAL_WORD_MAX_TOKENS = 8
DOM_MAX_EDGES = 7  # presumably a limit for a graph representation of the DOM
# Number of DOM element slots; also the number of "click element" actions.
DOM_MAX_VERTICES = 17
# Magnitude of the bonus/penalty for reaching a new/already seen DOM state.
EXPLORE_REWARD = 0.05
STEP_PRICE = 0.1  # presumably the per-step penalty (the wrapper hard-codes 0.1)
# Names of the policy heads; used as dict keys for actions and log-probs.
HEADS_KEYS = ["dom", "token", "master"]
# Output width of the text encoder (BertEncoder.output_size).
H_bert = 64

# Gym miniwob wrapper

In [4]:
import gym
import torch
from miniwob.action import MiniWoBElementClick, MiniWoBType
from transformers import BertTokenizer
import numpy as np


class MiniWobClickElementWrapper(gym.Wrapper):
    """Expose a multi-instance MiniWoB environment as batched torch tensors.

    Observations are a dict of tensors with one row per browser instance:

    * ``goal_state``         -- token ids of the whole goal utterance,
    * ``goal_words``         -- token ids of every goal word separately,
    * ``dom_text_tokens``    -- token ids of the text of each kept DOM element,
    * ``dom_classes_tokens`` -- token ids of the classes of each kept element,
    * ``dom_simple_feats``   -- (focused, has value, tampered) per element,
      ``-1`` in padding slots,
    * ``action_mask``        -- ``0`` for a selectable action, ``1`` for a
      masked one (padding slots, or typing while no empty input is focused).

    Actions are integers in ``[0, DOM_MAX_VERTICES + GOAL_MAX_WORDS)``: values
    below ``DOM_MAX_VERTICES`` click the DOM element stored in that slot, the
    remaining ones type the goal word with index ``action - DOM_MAX_VERTICES``.
    """

    def __init__(self, env):
        """Wrap ``env`` and allocate the per-instance bookkeeping lists."""
        super().__init__(env)
        self.env = env
        bert_model = 'bert-base-cased'

        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        vocab_size = self.tokenizer.vocab_size
        n_actions = DOM_MAX_VERTICES + GOAL_MAX_WORDS
        # Screenshot ("img_state"), DOM tag ids and the goal-word count were
        # experimented with as extra observations and are currently disabled.
        self.observation_space = gym.spaces.Dict({
            "goal_state": gym.spaces.Box(0, vocab_size, (GOAL_MAX_TOKENS, ), dtype=np.int64),
            "dom_simple_feats": gym.spaces.Box(-1, 1, (DOM_MAX_VERTICES, 3), dtype=np.float32),
            "dom_text_tokens": gym.spaces.Box(0, vocab_size,
                                              (DOM_MAX_VERTICES, DOM_TEXT_MAX_TOKENS),
                                              dtype=np.int64),
            "dom_classes_tokens": gym.spaces.Box(0, vocab_size,
                                                 (DOM_MAX_VERTICES, DOM_CLASSES_MAX_TOKENS),
                                                 dtype=np.int64),
            "action_mask": gym.spaces.Box(0, 1, (n_actions,), dtype=np.int64),
            "goal_words": gym.spaces.Box(0, vocab_size,
                                         (GOAL_MAX_WORDS, GOAL_WORD_MAX_TOKENS),
                                         dtype=np.int64)})

        # Number of actions sent to each instance in the current episode.
        self.n_steps = [0 for _ in range(self.num_instances)]
        # (focused, has value) states already visited in the current episode.
        self.past_states = [list() for _ in range(self.num_instances)]
        # (focused, has value) state of the latest raw observation.
        self.curr_states = [torch.zeros((DOM_MAX_VERTICES, 2)) for _ in range(self.num_instances)]
        # DOM elements behind the click slots of the latest observation.
        self.inst_verts = [None for _ in range(self.num_instances)]
        # Goal words behind the type slots of the latest observation.
        self.text_tokens = [None for _ in range(self.num_instances)]
        self.dom_keys = [key for key in self.observation_space if key.startswith("dom")] + ["action_mask"]
        self.action_space = gym.spaces.Discrete(n_actions)

    def dom_filter(self, ob):
        """Return at most ``DOM_MAX_VERTICES`` leaf elements that have a ``ref``."""
        leaves = [e for e in ob.dom_elements if e.ref is not None and e.is_leaf]
        return leaves[:DOM_MAX_VERTICES]

    def clean_classes(self, text):
        """Turn a class attribute into tokenizer-friendly text."""
        return text.replace(' ', '; ').replace('-', ' ').replace('_', ' ')

    def _encode_text(self, text, max_length):
        """Tokenize ``text`` into a ``(1, max_length)`` id tensor without special tokens."""
        return self.tokenizer(text,
                              padding='max_length',
                              max_length=max_length,
                              truncation=True,
                              return_tensors="pt",
                              add_special_tokens=False)["input_ids"]

    @staticmethod
    def _elem_simple_feats(elem):
        """Return the (focused, has value, tampered) feature vector of ``elem``."""
        return torch.tensor([elem.focused, bool(elem.value), elem.tampered],
                            dtype=torch.float32)

    def _ob_to_dom(self, ob, i):
        """Build the DOM tensors and the action mask for instance ``i``.

        Also records the elements and goal words that the action slots of this
        observation refer to, so that ``_to_miniwob_actions`` can resolve them.
        A ``None`` observation yields padding tensors with every action masked.
        """
        if ob is None:
            self.inst_verts[i] = None
            self.text_tokens[i] = None
            return {
                "dom_simple_feats": -torch.ones((1, DOM_MAX_VERTICES, 3), dtype=torch.float32),
                "dom_text_tokens": torch.zeros((1, DOM_MAX_VERTICES, DOM_TEXT_MAX_TOKENS), dtype=torch.int64),
                "dom_classes_tokens": torch.zeros((1, DOM_MAX_VERTICES, DOM_CLASSES_MAX_TOKENS), dtype=torch.int64),
                "action_mask": torch.ones((1, DOM_MAX_VERTICES + GOAL_MAX_WORDS), dtype=torch.int64)
            }

        verts = self.dom_filter(ob)
        self.inst_verts[i] = verts
        self.text_tokens[i] = ob.tokens

        tokens = []
        class_tokens = []
        feats = []
        available_dom_click = []
        for elem in verts:
            tokens.append(self._encode_text(elem.text if elem.text else "none",
                                            DOM_TEXT_MAX_TOKENS))
            classes_text = self.clean_classes(elem.classes if elem.classes else "none")
            class_tokens.append(self._encode_text(classes_text, DOM_CLASSES_MAX_TOKENS))
            feats.append(self._elem_simple_feats(elem))
            available_dom_click.append(0 if elem.is_leaf else 1)

        # Unused element slots are padded and masked out.
        delta_v = DOM_MAX_VERTICES - len(tokens)
        available_dom_click += [1 for _ in range(delta_v)]

        # Only the first GOAL_MAX_WORDS goal words have an action slot; clamping
        # keeps the mask length fixed even for unusually long goals.
        n_words = min(len(self.text_tokens[i]), GOAL_MAX_WORDS)
        if self.typeble_elem_focused(i) and not self.elem_have_value(i):
            available_type_tokens = [0 for _ in range(n_words)] + \
                                    [1 for _ in range(GOAL_MAX_WORDS - n_words)]
        else:
            available_type_tokens = [1 for _ in range(GOAL_MAX_WORDS)]
        available_actions = available_dom_click + available_type_tokens

        tokens += [torch.zeros((1, DOM_TEXT_MAX_TOKENS), dtype=torch.int64) for _ in range(delta_v)]
        class_tokens += [torch.zeros((1, DOM_CLASSES_MAX_TOKENS), dtype=torch.int64) for _ in range(delta_v)]
        feats += [-torch.ones((3,), dtype=torch.float32) for _ in range(delta_v)]

        return {
            "dom_text_tokens": torch.cat(tokens, dim=0).unsqueeze(0),
            "dom_classes_tokens": torch.cat(class_tokens, dim=0).unsqueeze(0),
            "dom_simple_feats": torch.stack(feats, dim=0).unsqueeze(0),
            "action_mask": torch.tensor(available_actions).unsqueeze(0)
        }

    def typeble_elem_focused(self, i):
        """Return True if the focused element of instance ``i`` looks like an input.

        The check is done on the string representation of the element.
        """
        focused_elem = self.n_element_focused(i)
        if focused_elem is None:
            return False
        focused_repr = str(self.inst_verts[i][focused_elem])
        return ") value=" in focused_repr and "input" in focused_repr

    def elem_have_value(self, i):
        """Return True if the focused element of instance ``i`` already has a value."""
        focused_elem = self.n_element_focused(i)
        if focused_elem is None:
            return False
        return bool(self.curr_states[i][focused_elem, 1])

    def n_element_focused(self, i):
        """Return the slot index of the first focused element, or None."""
        focus_state = self.curr_states[i][:, 0]
        focus = torch.where(focus_state == 1)[0]
        if len(focus) > 0:
            return int(focus[0])
        return None

    def _ob_to_token(self, ob, i):
        """Tokenize the whole goal into a ``(1, GOAL_MAX_TOKENS)`` id tensor."""
        max_length = GOAL_MAX_TOKENS
        if ob is None:
            # int64 so the placeholder matches the tokenizer output dtype.
            return torch.zeros(1, max_length, dtype=torch.int64)
        return self.tokenizer(ob.tokens,
                              padding='max_length',
                              max_length=max_length, truncation=True,
                              return_tensors="pt",
                              is_split_into_words=True)['input_ids']

    def _ob_to_goal_words(self, ob):
        """Tokenize each goal word into a ``(1, GOAL_MAX_WORDS, GOAL_WORD_MAX_TOKENS)`` tensor."""
        max_length = GOAL_WORD_MAX_TOKENS
        if ob is None:
            return torch.zeros(1, GOAL_MAX_WORDS, GOAL_WORD_MAX_TOKENS, dtype=torch.int64)
        # Words beyond GOAL_MAX_WORDS have no slot; dropping them keeps the
        # output shape fixed.
        words = [self._encode_text(t, max_length) for t in list(ob.tokens)[:GOAL_MAX_WORDS]]
        delta = GOAL_MAX_WORDS - len(words)
        # int64 padding so that concatenation does not promote the ids to float.
        words += [torch.zeros(1, max_length, dtype=torch.int64) for _ in range(delta)]
        return torch.cat(words, dim=0).unsqueeze(0)

    def _ob_to_image(self, ob):
        """Return the screenshot resized to 150x150 as a ``(1, 150, 150, 3)`` tensor."""
        if ob is None:
            return torch.zeros(1, 150, 150, 3)
        return torch.tensor(pil_to_numpy_array(ob.screenshot.resize([150, 150]))).unsqueeze(0)

    def _to_miniwob_actions(self, actions):
        """Translate per-instance action indices into MiniWoB action objects.

        Finished instances, instances without a recorded observation and
        ``None`` actions map to ``None``. Every real action increments the
        step counter of its instance.
        """
        miniwob_actions = []

        for i in range(self.num_instances):
            if self.instances[i].get_metadata()['done'] or self.inst_verts[i] is None:
                miniwob_actions.append(None)
                continue

            v_n = actions[i]
            if v_n is None:
                miniwob_actions.append(v_n)
            elif v_n < DOM_MAX_VERTICES:
                element = self.inst_verts[i][v_n]
                miniwob_actions.append(MiniWoBElementClick(element))
                self.n_steps[i] += 1
            else:
                text = self.text_tokens[i][v_n - DOM_MAX_VERTICES]
                miniwob_actions.append(MiniWoBType(text))
                self.n_steps[i] += 1
        return miniwob_actions

    def get_explore_reward(self, obs):
        """Return ``+EXPLORE_REWARD`` for unseen DOM states, ``-EXPLORE_REWARD`` otherwise.

        A state is the (focused, has value) part of ``dom_simple_feats``; it is
        compared against the states stored for the current episode.
        """
        explore_rewards = []
        for i, states in enumerate(self.past_states):
            t = obs["dom_simple_feats"][i][:, :-1]
            seen_before = any(bool(torch.all(t == pt)) for pt in states)
            explore_rewards.append(-EXPLORE_REWARD if seen_before else EXPLORE_REWARD)
        return explore_rewards

    def observation(self, obs):
        """Convert a list of raw MiniWoB observations into the batched tensor dict."""
        goals = torch.cat([self._ob_to_token(obs[i], i) for i in range(len(obs))], dim=0)
        goal_words = torch.cat([self._ob_to_goal_words(ob) for ob in obs], dim=0)
        doms = [self._ob_to_dom(obs[i], i) for i in range(len(obs))]
        doms_states = {key: torch.cat([dom[key] for dom in doms], dim=0) for key in self.dom_keys}
        return {'goal_state': goals,
                "goal_words": goal_words,
                **doms_states}

    def _ob_to_simple_feats(self, ob):
        """Return the padded ``(1, DOM_MAX_VERTICES, 3)`` simple-feature tensor of ``ob``."""
        if ob is None:
            return -torch.ones((1, DOM_MAX_VERTICES, 3), dtype=torch.float32)
        feats = [self._elem_simple_feats(elem) for elem in self.dom_filter(ob)]
        delta_v = DOM_MAX_VERTICES - len(feats)
        feats += [-torch.ones((3,), dtype=torch.float32) for _ in range(delta_v)]
        return torch.stack(feats, dim=0).unsqueeze(0)

    def get_rewards(self, rewards):
        """Discount positive rewards by the number of steps taken.

        Up to 4 steps the reward is ``1 - 0.1 * n_steps``; from 5 steps on it
        is ``0.5 * e ** (5 - n_steps)``. Non-positive rewards pass through.
        """
        augmented_rewards = list()
        for i, reward in enumerate(rewards):
            if reward > 0:
                if self.n_steps[i] < 5:
                    # NOTE(review): 0.1 equals the module-level STEP_PRICE;
                    # presumably the same quantity -- confirm before unifying.
                    r = 1 - self.n_steps[i] * 0.1
                else:
                    r = 0.5 * (np.e ** (-self.n_steps[i] + 5))
                augmented_rewards.append(r)
            else:
                augmented_rewards.append(reward)
        return augmented_rewards

    def update_curr_states(self, obs):
        """Store the (focused, has value) state of every raw observation."""
        feats = torch.cat([self._ob_to_simple_feats(ob) for ob in obs], dim=0)
        for i, _ in enumerate(self.past_states):
            self.curr_states[i] = feats[i][:, :-1]

    def update_past_states(self, obs):
        """Append the (focused, has value) state of ``obs`` to the visited list."""
        for i, _ in enumerate(self.past_states):
            s = obs["dom_simple_feats"][i][:, :-1]
            self.past_states[i].append(s)

    def reset(self):
        """Reset all instances and return the first batched observation."""
        raw_obs = self.env.reset(record_screenshots=True)
        # A reset starts fresh episodes everywhere, so drop per-episode state.
        self.n_steps = [0 for _ in range(self.num_instances)]
        self.past_states = [list() for _ in range(self.num_instances)]
        # The action mask is derived from curr_states, so it has to describe
        # this observation rather than whatever was seen before the reset.
        self.update_curr_states(raw_obs)
        obs = self.observation(raw_obs)
        self.update_past_states(obs)
        return obs

    def step(self, actions):
        """Apply one action per instance.

        Returns the batched observation, the shaped rewards and the done flags
        as column tensors, and the per-instance info list (``infos['n']``) with
        an added ``'exploration_reward'`` entry. Finished instances are
        restarted immediately and their per-episode state is cleared.
        """
        miniwob_actions = self._to_miniwob_actions(actions)
        obs, rewards, dones, infos = self.env.step(miniwob_actions)
        # Must precede observation(): the action mask reads curr_states.
        self.update_curr_states(obs)

        obs = self.observation(obs)

        rewards = self.get_rewards(rewards)

        explore_rewards = self.get_explore_reward(obs)
        rewards = [a + b for a, b in zip(rewards, explore_rewards)]
        for i, r in enumerate(explore_rewards):
            infos['n'][i]['exploration_reward'] = r

        for i, instance in enumerate(self.env.instances):
            if instance.get_metadata()['done']:
                instance.begin_task()
                self.past_states[i] = list()
                self.n_steps[i] = 0

        self.update_past_states(obs)

        return obs, torch.tensor([rewards]).T, torch.tensor([dones]).T, infos['n']


class EpisodeInfoWrapper(gym.Wrapper):
    """Attach per-episode statistics to the info dict of finished instances.

    For every instance the wrapper counts the steps since its last episode end.
    When an instance reports ``done`` its info dict receives an ``'episode'``
    entry holding the reward of that final step (``'r'``) and the step count
    (``'steps'``).
    """

    def __init__(self, env, n):
        """Wrap ``env``, which runs ``n`` instances in parallel."""
        super(EpisodeInfoWrapper, self).__init__(env)
        self.n = n
        self.episode_reward = np.zeros(n)
        self.episode_steps = np.zeros(n)

    def reset(self):
        """Clear the statistics and reset the wrapped environment."""
        self.episode_reward = np.zeros(self.n)
        self.episode_steps = np.zeros(self.n)
        return self.env.reset()

    def step(self, actions):
        """Step the wrapped environment and record episode ends."""
        states, rewards, dones, infos = self.env.step(actions)

        for idx, finished in enumerate(dones):
            self.episode_steps[idx] += 1
            if not finished:
                continue
            # Only the reward of the terminal step is kept, not a running sum.
            self.episode_reward[idx] = rewards[idx]
            infos[idx]['episode'] = {'r': self.episode_reward[idx],
                                     'steps': self.episode_steps[idx]}
            self.episode_steps[idx] = 0

        return states, rewards, dones, infos

In [5]:
from transformers import BertTokenizer, BertModel, BertConfig
import torch
from torch import nn
from torch.nn import Linear
from torch.nn.functional import relu


class BertEncoder(nn.Module):
    """Encode token-id sequences with frozen BERT word embeddings and an MLP.

    The word embeddings of ``bert-base-cased`` are averaged over the token
    axis (without gradient) and then passed through a stack of
    LayerNorm -> Linear -> ReLU blocks with residual connections wherever the
    input and output widths of a block agree.
    """

    def __init__(self):
        super().__init__()
        bert_model = 'bert-base-cased'

        # Only the word-embedding table of the pretrained model is kept.
        self.bert_encoder = BertModel.from_pretrained(bert_model).embeddings.word_embeddings
        self.embed_dim = 768
        self.output_size = H_bert
        self.h_dim = 128
        self.n_h_layers = 2

        # embed_dim -> h_dim -> (n_h_layers x h_dim) -> output_size
        widths = [self.embed_dim] + [self.h_dim] * (self.n_h_layers + 1) + [self.output_size]
        blocks = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            blocks.append(nn.Sequential(nn.LayerNorm(d_in), nn.Linear(d_in, d_out), nn.ReLU()))
        self.out_layers = nn.ModuleList(blocks)

    def forward(self, input_ids):
        """Return one ``output_size``-wide vector per row of ``input_ids``."""
        with torch.no_grad():
            pooled = self.bert_encoder(input_ids.long()).mean(dim=1)

        hidden = pooled
        for block in self.out_layers:
            block_input = hidden
            hidden = block(block_input)
            # Residual connection only where the width is unchanged.
            if hidden.shape[-1] == block_input.shape[-1]:
                hidden = hidden + block_input

        return hidden

In [6]:
import torch
from torch import nn
from torch.nn import ModuleList, Parameter, TransformerEncoderLayer, TransformerEncoder
from torch.nn.functional import relu, softmax


# NOTE(review): UNDIRECTED and SELF_LOOP are not referenced in the cells visible
# here; presumably options for a graph encoding of the DOM -- confirm or remove.
UNDIRECTED = True
SELF_LOOP = True
# Module-level tokenizer for the same pretrained model name the wrapper and the
# text encoder use, and the id of its padding token.
bert_model = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_model)
pad_token_id = tokenizer.pad_token_id


class FFNet(nn.Module):
    """Feed-forward stack of LayerNorm -> Linear (-> ReLU) blocks.

    The stack consists of an input block (``inp_dim -> h_dim``), ``h_layers``
    hidden blocks (``h_dim -> h_dim``) and an output block
    (``h_dim -> out_dim``) whose ReLU is optional. A residual connection is
    added around every block whose input and output widths are equal.
    """

    def __init__(self, inp_dim, h_dim=64, out_dim=64, h_layers=4, last_layer_relu=True):
        super(FFNet, self).__init__()
        self.inp_dim = inp_dim
        self.h_dim = h_dim
        self.out_dim = out_dim

        def make_block(d_in, d_out, with_relu=True):
            parts = [nn.LayerNorm(d_in), nn.Linear(d_in, d_out)]
            if with_relu:
                parts.append(nn.ReLU())
            return nn.Sequential(*parts)

        stack = [make_block(self.inp_dim, self.h_dim)]
        stack.extend(make_block(self.h_dim, self.h_dim) for _ in range(h_layers))
        stack.append(make_block(self.h_dim, self.out_dim, with_relu=last_layer_relu))

        self.layers = nn.ModuleList(stack)

    def forward(self, input):
        """Apply all blocks in order and return the final activation."""
        hidden = input
        for block in self.layers:
            block_input = hidden
            hidden = block(block_input)
            # Skip connection only when the block keeps the feature width.
            if hidden.shape[-1] == block_input.shape[-1]:
                hidden = hidden + block_input
        return hidden


class Encoder(nn.Module):
    """Encode a batched observation dict into one state vector per instance.

    Goal words, DOM element texts and DOM element classes are embedded with a
    shared ``BertEncoder``; the per-element embeddings are concatenated with
    the simple DOM features, flattened together with the goal-word embeddings
    and fed through an ``FFNet``.

    The result of the last real computation is kept in ``buffer``. While
    ``flag`` is False, ``forward`` returns that buffer instead of recomputing.
    """

    def __init__(self):
        super(Encoder, self).__init__()
        self.text_encoder = BertEncoder()
        self.tf_size = 64
        self.output_size = 128

        # Each DOM slot contributes text + classes embeddings and 3 features.
        per_vertex = H_bert * 2 + 3
        flat_dim = per_vertex * DOM_MAX_VERTICES + GOAL_MAX_WORDS * H_bert
        self.net = FFNet(flat_dim, 512, self.output_size, 8)

        self.flag = True
        self.buffer = None

    def _embed_rows(self, token_ids):
        """Embed every token row of ``token_ids`` after merging the leading axes."""
        return self.text_encoder(torch.flatten(token_ids, end_dim=-2))

    def forward(self, input):
        """Return the state encoding, or the cached one while ``flag`` is False."""
        if not self.flag:
            return self.buffer

        bs = input["goal_state"].shape[0]

        # goal words encoding
        words = self._embed_rows(input["goal_words"]).view(bs, -1)

        # dom elements encoding
        dom_text_global = self._embed_rows(input["dom_text_tokens"])
        dom_classes_global = self._embed_rows(input["dom_classes_tokens"])
        simple_feats = torch.flatten(input["dom_simple_feats"], end_dim=-2)
        doms = torch.cat([dom_text_global, dom_classes_global, simple_feats], dim=-1).view(bs, -1)

        # forward
        out = self.net(torch.cat([doms, words], dim=-1))

        self.buffer = out
        return out

In [7]:
from rllr.models.ppo import FixedCategorical, ActorCriticNetwork, CriticNetwork
from torch.nn import functional as F


class DiscreteActorNetwork(nn.Module):
    """
    Actor is a policy network. Given state it evaluates
    probability of action given state or sample an action.

    The policy has three heads that share the state encoding:

    * ``dom``    -- which DOM element slot to click (``DOM_MAX_VERTICES`` logits),
    * ``token``  -- which goal word to type (``GOAL_MAX_WORDS`` logits),
    * ``master`` -- 2 logits; index 0 belongs to the ``dom`` group and index 1
      to the ``token`` group.
    """

    def __init__(self, action_size, state_encoder, hidden_size):
        super().__init__()
        self.state_encoder = state_encoder
        input_size = state_encoder.output_size
        self.h_dim = hidden_size
        self.inp_dim = input_size
        self.output_size = action_size
        self.dom_head_out = DOM_MAX_VERTICES
        self.token_head_out = GOAL_MAX_WORDS
        self.master_head_out = 2
        self.n_h_layers = 4

        self.dom_layers = self._make_head(self.dom_head_out)
        self.token_layers = self._make_head(self.token_head_out)
        self.master_layers = self._make_head(self.master_head_out)

    def _make_head(self, out_dim):
        """Build one head: input block, ``n_h_layers`` hidden blocks, linear output."""
        blocks = [nn.Sequential(nn.LayerNorm(self.inp_dim), nn.Linear(self.inp_dim, self.h_dim), nn.ReLU())]
        blocks += [nn.Sequential(nn.LayerNorm(self.h_dim), nn.Linear(self.h_dim, self.h_dim), nn.ReLU())
                   for _ in range(self.n_h_layers)]
        blocks.append(nn.Sequential(nn.LayerNorm(self.h_dim), nn.Linear(self.h_dim, out_dim)))
        return ModuleList(blocks)

    @staticmethod
    def _release_fully_masked_rows(mask):
        """Return a copy of ``mask`` in which rows that are entirely masked are zeroed.

        A row with every entry masked would give a distribution with all
        logits at -inf, so such rows are left completely unmasked instead.
        """
        fully_masked = torch.all(mask, dim=1, keepdim=True).bool()
        return mask.masked_fill(fully_masked, 0)

    def forward(self, states):
        """Return a dict mapping each head name to its masked categorical distribution.

        ``states["action_mask"]`` (batch x actions, 1 = unavailable) is only
        read, never modified.
        """
        states_encoding = self.state_encoder(states)
        action_mask = states["action_mask"]  # BxA

        dom_mask = action_mask[:, :DOM_MAX_VERTICES]
        token_mask = action_mask[:, -GOAL_MAX_WORDS:]
        # A group is unavailable to the master head when all its actions are masked.
        master_mask = torch.cat([torch.all(dom_mask, dim=1, keepdim=True),
                                 torch.all(token_mask, dim=1, keepdim=True)],
                                dim=1).int()

        # The slices above are views of the caller's tensor, so the masks are
        # adjusted out-of-place: writing into them would alter the stored
        # observation and change master_mask when it is evaluated again.
        dom_mask = self._release_fully_masked_rows(dom_mask)
        token_mask = self._release_fully_masked_rows(token_mask)
        master_mask = self._release_fully_masked_rows(master_mask)

        heads_mask = {k: v for k, v in zip(HEADS_KEYS, [dom_mask, token_mask, master_mask])}

        heads_out = list()
        for layers in [self.dom_layers, self.token_layers, self.master_layers]:
            out = states_encoding
            for l in layers:
                block_input = out
                out = l(block_input)
                # Residual connection where the block keeps the width.
                if out.shape[-1] == block_input.shape[-1]:
                    out = out + block_input
            heads_out.append(out)

        heads_out = {k: v for k, v in zip(HEADS_KEYS, heads_out)}

        # Masked actions get a logit of -inf, i.e. probability zero.
        for k in heads_mask:
            heads_out[k] = heads_out[k].double().masked_fill(heads_mask[k] == 1, -float("inf"))

        return {k: FixedCategorical(logits=F.log_softmax(v, dim=1)) for k, v in heads_out.items()}
    


class ACN(ActorCriticNetwork):
    """Actor-critic network with a multi-head discrete actor.

    ``act`` and ``evaluate_actions`` run the actor with its encoder's ``flag``
    set to True and the critic with its encoder's ``flag`` set to False, so the
    critic's encoder returns its cached ``buffer`` (see ``Encoder.forward``).
    NOTE(review): that cache is only filled by the actor pass when both
    encoders are the same object -- presumably the intended setup; confirm at
    the construction site.
    """

    def __init__(self, action_space, actor_state_encoder, critic_state_encoder, actor_hidden_size, critic_hidden_size):
        """Build actor and critic and initialise every Linear layer.

        Raises:
            TypeError: if ``action_space`` is not a ``gym.spaces.Discrete``.
        """
        # Starts the MRO lookup after ActorCriticNetwork, i.e. its own
        # __init__ is skipped on purpose and the parent's is run instead.
        super(ActorCriticNetwork, self).__init__()
        if isinstance(action_space, gym.spaces.Discrete):
            self.actor = DiscreteActorNetwork(action_space.n, actor_state_encoder, actor_hidden_size)
        else:
            # Exceptions must be BaseException instances; raising a bare
            # string would itself fail with a TypeError that hides the message.
            raise TypeError(f'{action_space} not supported')
        self.critic = CriticNetwork(critic_state_encoder, critic_hidden_size)

        def init_params(m):
            # Row-normalised Gaussian weights and zero bias for Linear layers.
            classname = m.__class__.__name__
            if classname.find("Linear") != -1:
                m.weight.data.normal_(0, 1)
                m.weight.data *= 1 / torch.sqrt(m.weight.data.pow(2).sum(1, keepdim=True))
                if m.bias is not None:
                    m.bias.data.fill_(0)

        self.apply(init_params)

    def act(self, states, deterministic=False):
        """Choose one action per head.

        Returns the critic output, a dict of actions and a dict of their
        log-probabilities, both keyed by head name. With ``deterministic`` the
        mode of each distribution is taken instead of a sample.
        """
        self.actor.state_encoder.flag = True
        dist = self.actor.forward(states)
        if deterministic:
            action = {k: dist[k].mode() for k in dist}
        else:
            action = {k: dist[k].sample() for k in dist}
        log_probs = {k: dist[k].log_probs(action[k]) for k in dist}
        self.critic.state_encoder.flag = False
        return self.critic.forward(states), action, log_probs

    def evaluate_actions(self, states, actions):
        """Return critic values, per-head log-probs of ``actions`` and per-head mean entropies."""
        self.actor.state_encoder.flag = True
        dist = self.actor.forward(states)
        self.critic.state_encoder.flag = False
        values = self.critic.forward(states)

        log_probs = {k: dist[k].log_probs(actions[k]) for k in dist}
        dist_entropy = {k: dist[k].entropy().mean() for k in dist}
        return values, log_probs, dist_entropy

# Train agent

In [8]:
import numpy as np
from rllr.env.vec_wrappers import make_vec_envs
from stable_baselines3.common.vec_env import DummyVecEnv


def get_envs(n, **kwargs):
    """Create one wrapped MiniWoB environment per configured task.

    Args:
        n: number of parallel instances per environment; also used to derive
            the seeds ``0 .. n-1``.
        **kwargs: forwarded unchanged to ``MiniWoBEnvironment``.

    Returns:
        A list of environments, in the order of ``task_name``.
    """
    # `task_name` may be a single task rather than a collection (the config
    # cell counts a non-list as one task); iterating a bare string would
    # yield its characters, so wrap it first.
    tasks = task_name if isinstance(task_name, (list, tuple)) else [task_name]
    envs = list()
    for task in tasks:
        env = MiniWoBEnvironment(task, seeds=range(n), num_instances=n, base_url=base_url, **kwargs)
        env = MiniWobClickElementWrapper(env)
        env = EpisodeInfoWrapper(env, n)
        envs.append(env)
    return envs

In [9]:
# Number of parallel instances per task environment (the `n` of get_envs).
n_instances = 4
# One wrapped environment per task; headless=True is forwarded to MiniWoBEnvironment.
envs = get_envs(n_instances, headless=True)

In [10]:
from rllr.models.encoders import GoalStateEncoder, SimpleCNN

# State encoder instance; building it loads the pretrained bert-base-cased
# model through BertEncoder, which produces the log output shown below.
encoder = Encoder()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
from rllr.algo.ppo import PPO

class MultiheadPPO(PPO):
    """PPO variant for a policy with ``master``, ``dom`` and ``token`` heads.

    The policy loss of a sample is the clipped surrogate loss of the master
    head plus that of the head the master action selected (``dom`` when the
    master action is 0, ``token`` otherwise). Each head's entropy enters the
    entropy bonus with its own weight.
    """

    def __init__(self, actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef,
                 entropy_coef, lr=None, eps=None, max_grad_norm=None,
                 dom_entropy=1, token_entropy=1, master_entropy=1):
        super(MultiheadPPO, self).__init__(actor_critic, clip_param, ppo_epoch,
                                           num_mini_batch, value_loss_coef,
                                           entropy_coef, lr, eps, max_grad_norm)
        # Per-head weights of the entropy bonus.
        self.dom_entropy = dom_entropy
        self.token_entropy = token_entropy
        self.master_entropy = master_entropy

    def update(self, rollouts):
        """Run ``ppo_epoch`` passes over ``rollouts``.

        Returns the value loss, the action loss and the weighted entropy,
        each averaged over ``ppo_epoch * num_mini_batch`` updates.
        """
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        value_loss_total = 0
        action_loss_total = 0
        entropy_total = 0

        clip_low = 1.0 - self.clip_param
        clip_high = 1.0 + self.clip_param

        for _ in range(self.ppo_epoch):
            for sample in rollouts.feed_forward_generator(advantages, self.num_mini_batch):
                (obs_batch, actions_batch, value_preds_batch, return_batch, masks_batch,
                 old_action_log_probs_batch, adv_targ) = sample

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = self.actor_critic.evaluate_actions(obs_batch, actions_batch)

                # Clipped surrogate objective, separately for every head.
                head_loss = {}
                for head in action_log_probs:
                    ratio = torch.exp(action_log_probs[head] - old_action_log_probs_batch[head])
                    unclipped = ratio * adv_targ
                    clipped = torch.clamp(ratio, clip_low, clip_high) * adv_targ
                    head_loss[head] = -torch.min(unclipped, clipped)

                # clipped_value_loss:
                value_delta = (values - value_preds_batch).clamp(-self.clip_param, self.clip_param)
                value_pred_clipped = value_preds_batch + value_delta
                value_losses = (values - return_batch).pow(2)
                value_losses_clipped = (value_pred_clipped - return_batch).pow(2)
                value_loss = 0.5 * torch.max(value_losses, value_losses_clipped).mean()

                self.optimizer.zero_grad()

                # Only the sub-head chosen by the master action contributes.
                chosen_head_loss = torch.where(actions_batch["master"] == 0,
                                               head_loss["dom"],
                                               head_loss["token"])
                action_loss = (head_loss["master"] + chosen_head_loss).mean()
                entropy_loss = dist_entropy["master"] * self.master_entropy + \
                               dist_entropy["dom"] * self.dom_entropy + \
                               dist_entropy["token"] * self.token_entropy
                loss = value_loss * self.value_loss_coef + action_loss - entropy_loss * self.entropy_coef

                loss.backward()
                nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm)
                self.optimizer.step()

                value_loss_total += value_loss.item()
                action_loss_total += action_loss.item()
                entropy_total += entropy_loss.item()

        num_updates = self.ppo_epoch * self.num_mini_batch

        return (value_loss_total / num_updates,
                action_loss_total / num_updates,
                entropy_total / num_updates)


In [12]:
from rllr.buffer.rollout import RolloutStorage
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler


class MultiheadRolloutStorage(RolloutStorage):
    """Rollout buffer that keeps one action / log-prob tensor per policy head.

    After the base constructor runs, ``actions`` and ``action_log_probs`` are
    replaced by dicts keyed by ``HEADS_KEYS``. Everything else used here
    (``obs``, ``rewards``, ``value_preds``, ``returns``, ``masks``, ``step``,
    ``num_steps``, ``copy_obs``) comes from ``RolloutStorage``, which is not
    defined in this notebook.
    """

    def __init__(self, num_steps, num_processes, obs_space, action_space):
        super().__init__(num_steps, num_processes, obs_space, action_space)

        def _blank_actions():
            # Discrete spaces store a single integer index per step,
            # anything else stores a float vector of the action dimension.
            if action_space.__class__.__name__ == 'Discrete':
                return torch.zeros(num_steps, num_processes, 1).long()
            return torch.zeros(num_steps, num_processes, action_space.shape[0])

        self.actions = {head: _blank_actions() for head in HEADS_KEYS}
        self.action_log_probs = {head: torch.zeros(num_steps, num_processes, 1) for head in HEADS_KEYS}

    def to(self, device):
        """Move every stored tensor (including the per-head dicts) to ``device``."""
        if self.obs.__class__.__name__ == 'dict':
            for name, tensor in list(self.obs.items()):
                self.obs[name] = tensor.to(device)
        else:
            self.obs = self.obs.to(device)

        for attr in ('rewards', 'value_preds', 'returns', 'masks'):
            setattr(self, attr, getattr(self, attr).to(device))

        self.action_log_probs = {head: log_p.to(device) for head, log_p in self.action_log_probs.items()}
        self.actions = {head: act.to(device) for head, act in self.actions.items()}

    def insert(self, obs, actions, action_log_probs, value_preds, rewards, masks):
        """Record one transition at the current step and advance the cursor.

        ``actions`` and ``action_log_probs`` are indexed by head name; only the
        heads present in ``actions`` are written.
        """
        now, nxt = self.step, self.step + 1

        # Observation and mask belong to the state reached *after* the action.
        self.copy_obs(obs, nxt)
        for head in actions:
            self.actions[head][now].copy_(actions[head])
            self.action_log_probs[head][now].copy_(action_log_probs[head])
        self.value_preds[now].copy_(value_preds)
        self.rewards[now].copy_(rewards)
        self.masks[nxt].copy_(masks)

        # Wrap around once the buffer is full.
        self.step = nxt % self.num_steps

    def feed_forward_generator(self, advantages, num_mini_batch=None, mini_batch_size=None):
        """Yield shuffled mini-batches over the flattened (step, process) axes.

        Each item is the tuple ``(obs_batch, actions_batch, value_preds_batch,
        return_batch, masks_batch, old_action_log_probs_batch, adv_targ)``,
        where the action and log-prob entries are dicts keyed by head and
        ``adv_targ`` is ``None`` when ``advantages`` is ``None``.
        """
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps

        if mini_batch_size is None:
            assert batch_size >= num_mini_batch, (
                "PPO requires the number of processes ({}) "
                "* number of steps ({}) = {} "
                "to be greater than or equal to the number of PPO mini batches ({})."
                "".format(num_processes, num_steps, num_processes * num_steps,
                          num_mini_batch))
            mini_batch_size = batch_size // num_mini_batch

        shuffled = SubsetRandomSampler(range(batch_size))
        sampler = BatchSampler(shuffled, mini_batch_size, drop_last=True)

        for indices in sampler:
            # Observations, value predictions, returns and masks drop their
            # last time index ([:-1]) before being flattened.
            if self.obs.__class__.__name__ == 'dict':
                obs_batch = {}
                for key, stored in self.obs.items():
                    obs_batch[key] = stored[:-1].view(-1, *stored.size()[2:])[indices]
            else:
                obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices]

            actions_batch = {
                head: act.view(-1, act.size(-1))[indices]
                for head, act in self.actions.items()
            }
            old_action_log_probs_batch = {
                head: log_p.view(-1, 1)[indices]
                for head, log_p in self.action_log_probs.items()
            }

            value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]

            adv_targ = None if advantages is None else advantages.view(-1, 1)[indices]

            yield obs_batch, actions_batch, \
                value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ

In [13]:
# Actor-critic network; the same `encoder` object is passed for both encoder
# arguments and NH is used for both hidden-size arguments.
hidden_size = NH
policy = ACN(envs[0].action_space, encoder, encoder, hidden_size, hidden_size)

# Keyword arguments forwarded to MultiheadPPO.
agent_conf = dict(
    clip_param=0.2,
    ppo_epoch=4,
    num_mini_batch=4,
    value_loss_coef=0.5,
    entropy_coef=0.01,
    lr=0.0002,
    eps=1e-5,
    max_grad_norm=0.5,
)

agent = MultiheadPPO(policy, **agent_conf)
agent.to(device)

<__main__.MultiheadPPO at 0x7f3a2f285eb0>

In [14]:
from tqdm import trange
import time
from collections import deque
import torch
import numpy as np


def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Set every param group's learning rate to a linearly decayed value.

    The rate starts at ``initial_lr`` for ``epoch == 0`` and reaches zero when
    ``epoch == total_num_epochs``.
    """
    progress = epoch / float(total_num_epochs)
    decayed_lr = initial_lr - (initial_lr * progress)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr

def get_action_mtx():
    """Return a fresh ``num_task`` x ``n_instances`` grid filled with ``None``.

    Both dimensions are read from module-level globals; every row is a
    distinct list.
    """
    return [[None] * n_instances for _ in range(num_task)]

def get_obs_i_dict(obs, i):
    """Pick element ``i`` from every value of the ``obs`` dict, keeping the keys."""
    return {key: batch[i] for key, batch in obs.items()}

def collect_actions(actions, agent_envs):
    """Scatter per-slot actions into the (task, slot) action grid.

    Slot ``s`` writes ``actions[s].item()`` into the row of the task it is
    currently attached to (``agent_envs[s]``); all other cells stay ``None``.
    """
    grid = get_action_mtx()
    for slot, task in enumerate(agent_envs):
        grid[task][slot] = actions[slot].item()
    return grid
    
def collect_obs(obs, rewards, dones, infos, agent_envs):
    """Gather each slot's step result from the task it is attached to.

    ``obs``, ``rewards``, ``dones`` and ``infos`` are indexed first by task and
    then by slot. A slot whose episode finished is re-attached to a task drawn
    with ``np.random.randint(num_task)``, and its next observation is read from
    that newly drawn task.

    Returns the stacked observation dict, the per-slot reward / done / info
    lists and the updated slot-to-task assignment.
    """
    slot_obs, slot_rewards, slot_dones, slot_infos = [], [], [], []
    next_agent_envs = []

    for slot, task in enumerate(agent_envs):
        finished = dones[task][slot]
        slot_rewards.append(rewards[task][slot])
        slot_infos.append(infos[task][slot])
        slot_dones.append(finished)

        # Finished slots hop to a random task, the others stay where they are.
        next_task = np.random.randint(num_task) if finished else task
        next_agent_envs.append(next_task)
        slot_obs.append(get_obs_i_dict(obs[next_task], slot))

    stacked_obs = {
        key: torch.stack([single[key] for single in slot_obs], dim=0)
        for key in slot_obs[0]
    }
    return stacked_obs, slot_rewards, slot_dones, slot_infos, next_agent_envs

def perform_actions(envs, action_mtx, agent_envs):
    """Step every task environment and regroup the results per agent slot.

    Args:
        envs: indexable collection of environments; ``envs[i]`` receives row
            ``i`` of ``action_mtx``.
        action_mtx: one list of per-slot actions for each task environment.
        agent_envs: current slot-to-task assignment.

    Returns:
        ``(obs, rewards, dones, infos, agent_envs)`` where ``obs`` is the
        stacked observation dict from ``collect_obs``, ``rewards`` is the
        concatenated rewards reshaped to a column, ``dones`` is the
        concatenated done flags, ``infos`` is the per-slot info list and
        ``agent_envs`` is the updated slot-to-task assignment.
    """
    obs, rewards, dones, infos = list(), list(), list(), list()
    for i, env_actions in enumerate(action_mtx):
        ob, reward, done, info = envs[i].step(env_actions)
        obs.append(ob)
        rewards.append(reward)
        dones.append(done)
        infos.append(info)

    obs, rewards, dones, infos, agent_envs = collect_obs(obs, rewards, dones, infos, agent_envs)
    return obs, torch.cat(rewards).view(-1, 1), torch.cat(dones), infos, agent_envs

def reset(envs, agent_envs):
    """Reset the first ``num_task`` environments and build the initial batch.

    Slot ``s`` takes element ``s`` of the observation returned by the task it
    is attached to (``agent_envs[s]``); the per-slot observations are then
    stacked key by key along a new leading dimension.
    """
    task_obs = [envs[task].reset() for task in range(num_task)]
    slot_obs = [
        get_obs_i_dict(task_obs[task], slot)
        for slot, task in enumerate(agent_envs)
    ]
    return {
        key: torch.stack([single[key] for single in slot_obs], dim=0)
        for key in slot_obs[0]
    }

def train_ppo(envs, agent, conf):
    """
    Collect rollouts from the task environments and update ``agent`` with PPO.

    Each update gathers ``conf['training.n_steps']`` environment steps, computes
    returns, calls ``agent.update`` and optionally logs statistics. The agent is
    saved every 50 updates and whenever the running mean / median episode
    reward matches or beats the best value seen so far.

    Args:
        envs: indexable collection of task environments; ``envs[0]`` supplies
            the observation and action spaces for the rollout storage.
        agent: object exposing ``optimizer``, ``act``, ``get_value`` and
            ``update``.
        conf: dict with the keys ``training.n_steps``, ``training.n_processes``,
            ``training.n_env_steps``, ``training.verbose``, ``agent.device``,
            ``agent.lr``, ``agent.gamma``, ``agent.gae_lambda`` and
            ``outputs.path`` (prefix for checkpoints and the ``.txt`` log).

    Returns:
        None. Results are written to files under ``conf['outputs.path']``.

    Note:
        The number of agent slots comes from the module-level ``n_instances``
        while the rollout storage is sized from ``conf['training.n_processes']``;
        the two are expected to be equal.
    """
    env = envs[0]

    # Create the checkpoint/log directory up front so the first save cannot
    # fail after a whole rollout has already been collected.
    out_dir = os.path.dirname(conf['outputs.path'])
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    best_mean, best_median = -float('inf'), -float('inf')
    rollouts = MultiheadRolloutStorage(
        conf['training.n_steps'], conf['training.n_processes'], env.observation_space, env.action_space
    )

    # Every slot starts on a randomly chosen task.
    agent_envs = np.random.randint(0, num_task, n_instances)
    obs = reset(envs, agent_envs)

    rollouts.set_first_obs(obs)
    rollouts.to(conf['agent.device'])

    start = time.time()
    num_updates = int(conf['training.n_env_steps'] // conf['training.n_steps'] // conf['training.n_processes'])

    episode_rewards = deque(maxlen=20)
    exploration_rewards = deque(maxlen=conf["training.n_steps"]*n_instances)
    episode_steps = deque(maxlen=20)

    for j in trange(num_updates):
        update_linear_schedule(agent.optimizer, j, num_updates, conf['agent.lr'])

        for step in range(conf['training.n_steps']):
            # Sample actions
            obs = {k: v.to(conf['agent.device']) for k, v in obs.items()}
            value, action, action_log_prob = agent.act(obs)

            # The master head picks which sub-head acts: 0 -> DOM head,
            # otherwise the token head, whose indices are shifted past the
            # DOM_MAX_VERTICES DOM indices.
            master_action = action["master"]
            dom_action = action["dom"]
            token_action = action["token"]
            true_action = torch.where(master_action==0, dom_action, token_action + DOM_MAX_VERTICES)

            action_mtx = collect_actions(true_action, agent_envs)
            obs, reward, done, infos, agent_envs = perform_actions(envs, action_mtx, agent_envs)

            for info in infos:
                exploration_rewards.append(info["exploration_reward"])
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_steps.append(info['episode']['steps'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        next_value = agent.get_value(rollouts.get_last_obs())
        rollouts.compute_returns(next_value, conf['agent.gamma'], conf['agent.gae_lambda'])

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        if j % conf['training.verbose'] == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * conf['training.n_processes'] * conf['training.n_steps']
            end = time.time()
            print(f'Updates {j}, '
                  f'num timesteps {total_num_steps}, '
                  f'FPS {int(total_num_steps / (end - start))} \n'
                  f'Last {len(episode_rewards)} training episodes: '
                  f'mean/median reward {np.mean(episode_rewards):.2f}/{np.median(episode_rewards):.2f}, '
                  f'min/max reward {np.min(episode_rewards):.2f}/{np.max(episode_rewards):.2f}\n'
                  f'dist_entropy {dist_entropy:.2f}, '
                  f'value_loss {value_loss:.2f}, '
                  f'action_loss {action_loss:.2f}, '
                  f'explor_rew {np.mean(exploration_rewards):.6f} '
                  f'mean_episode_steps {np.mean(episode_steps):.2f}')
            with open(conf['outputs.path']+".txt", 'a') as f:
                f.write("%d;%.4f;%.4f;%.6f;%.4f;%.4f\n" % (
                    total_num_steps, np.mean(episode_rewards),
                    np.median(episode_rewards), np.mean(exploration_rewards),
                    dist_entropy, np.mean(episode_steps)))

        if j % 50 == 0:
            torch.save(agent, conf['outputs.path']+"_"+str(j))

        # Until the first episode finishes the reward window is empty; skip the
        # "best" bookkeeping then, since mean/median of an empty window is NaN
        # (with a RuntimeWarning) and can never beat the current best anyway.
        if len(episode_rewards) > 0:
            mean_reward = np.mean(episode_rewards)
            median_reward = np.median(episode_rewards)

            if mean_reward >= best_mean:
                torch.save(agent, conf['outputs.path']+"_best_mean")
                best_mean = mean_reward

            if median_reward >= best_median:
                torch.save(agent, conf['outputs.path']+"_best_median")
                best_median = median_reward

In [15]:
train_conf = {
    # Agent / optimisation settings
    "agent.lr": 2e-4,
    "agent.device": device,
    "agent.gamma": 0.99,
    "agent.gae_lambda": 0.95,
    # Rollout budget: total env steps, steps per update, parallel slots
    "training.n_env_steps": 8_000_000,
    "training.n_steps": 100,
    "training.n_processes": n_instances,
    # Log every `verbose` updates
    "training.verbose": 1,
    # Prefix for checkpoints and the ".txt" metrics log
    "outputs.path": "./multitask/multitask_flat_ln",
}

In [None]:
# Launch training; checkpoints and the metrics log go under train_conf["outputs.path"].
train_ppo(envs=envs, agent=agent, conf=train_conf)

  0%|                                                                                                                                                              | 0/20000 [00:00<?, ?it/s]

Updates 0, num timesteps 400, FPS 7 
Last 20 training episodes: mean/median reward -1.04/-1.05, min/max reward -1.05/-0.95
dist_entropy 4.47, value_loss 0.68, action_loss 0.12, explor_rew -0.007750 mean_episode_steps 31.25


  0%|                                                                                                                                                  | 1/20000 [00:56<312:28:31, 56.25s/it]

Updates 1, num timesteps 800, FPS 7 
Last 20 training episodes: mean/median reward -0.92/-1.05, min/max reward -1.05/0.55
dist_entropy 4.53, value_loss 0.21, action_loss -0.01, explor_rew -0.015750 mean_episode_steps 31.15


  0%|                                                                                                                                                  | 2/20000 [01:51<309:57:35, 55.80s/it]

Updates 2, num timesteps 1200, FPS 7 
Last 20 training episodes: mean/median reward -0.95/-1.05, min/max reward -1.05/0.85
dist_entropy 4.51, value_loss 0.14, action_loss -0.01, explor_rew -0.026250 mean_episode_steps 43.25


  0%|                                                                                                                                                  | 3/20000 [02:46<307:58:23, 55.44s/it]

Updates 3, num timesteps 1600, FPS 7 
Last 20 training episodes: mean/median reward -0.76/-1.05, min/max reward -1.05/0.85
dist_entropy 4.63, value_loss 0.09, action_loss -0.01, explor_rew -0.028250 mean_episode_steps 51.30


  0%|                                                                                                                                                  | 4/20000 [03:41<307:11:22, 55.31s/it]

Updates 4, num timesteps 2000, FPS 7 
Last 20 training episodes: mean/median reward -0.96/-1.05, min/max reward -1.05/0.65
dist_entropy 4.35, value_loss 0.04, action_loss -0.02, explor_rew -0.026000 mean_episode_steps 52.70


  0%|                                                                                                                                                  | 5/20000 [04:36<306:45:07, 55.23s/it]

Updates 5, num timesteps 2400, FPS 7 
Last 20 training episodes: mean/median reward -0.87/-1.05, min/max reward -1.05/0.85
dist_entropy 4.27, value_loss 0.03, action_loss -0.02, explor_rew -0.014500 mean_episode_steps 26.15


  0%|                                                                                                                                                  | 6/20000 [05:31<306:14:17, 55.14s/it]

Updates 6, num timesteps 2800, FPS 7 
Last 20 training episodes: mean/median reward -0.86/-1.05, min/max reward -1.05/0.85
dist_entropy 4.31, value_loss 0.03, action_loss -0.02, explor_rew -0.015000 mean_episode_steps 39.60


  0%|                                                                                                                                                  | 7/20000 [06:26<305:36:54, 55.03s/it]

Updates 7, num timesteps 3200, FPS 7 
Last 20 training episodes: mean/median reward -0.95/-1.05, min/max reward -1.05/0.85
dist_entropy 4.34, value_loss 0.02, action_loss -0.02, explor_rew -0.007000 mean_episode_steps 31.30


  0%|                                                                                                                                                  | 8/20000 [07:21<305:49:10, 55.07s/it]

Updates 8, num timesteps 3600, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 4.18, value_loss 0.02, action_loss -0.02, explor_rew 0.000250 mean_episode_steps 18.05


  0%|                                                                                                                                                  | 9/20000 [08:17<306:54:27, 55.27s/it]

Updates 9, num timesteps 4000, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 4.16, value_loss 0.00, action_loss -0.04, explor_rew 0.000250 mean_episode_steps 26.75


  0%|                                                                                                                                                 | 10/20000 [09:13<307:10:51, 55.32s/it]

Updates 10, num timesteps 4400, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 4.12, value_loss 0.01, action_loss -0.02, explor_rew 0.004250 mean_episode_steps 15.25


  0%|                                                                                                                                                 | 11/20000 [10:07<306:09:25, 55.14s/it]

Updates 11, num timesteps 4800, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 4.07, value_loss 0.02, action_loss -0.03, explor_rew 0.012000 mean_episode_steps 19.10


  0%|                                                                                                                                                 | 12/20000 [11:02<305:57:40, 55.11s/it]

Updates 12, num timesteps 5200, FPS 7 
Last 20 training episodes: mean/median reward -0.96/-1.05, min/max reward -1.05/0.75
dist_entropy 4.04, value_loss 0.03, action_loss -0.03, explor_rew 0.012500 mean_episode_steps 19.15


  0%|                                                                                                                                                 | 13/20000 [11:58<306:19:56, 55.18s/it]

Updates 13, num timesteps 5600, FPS 7 
Last 20 training episodes: mean/median reward -0.86/-1.05, min/max reward -1.05/0.85
dist_entropy 4.00, value_loss 0.02, action_loss -0.02, explor_rew 0.011500 mean_episode_steps 23.75


  0%|                                                                                                                                                 | 14/20000 [12:54<307:47:23, 55.44s/it]

Updates 14, num timesteps 6000, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 3.90, value_loss 0.02, action_loss -0.05, explor_rew 0.008250 mean_episode_steps 28.25


  0%|                                                                                                                                                 | 15/20000 [13:50<309:23:32, 55.73s/it]

Updates 15, num timesteps 6400, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 3.90, value_loss 0.03, action_loss -0.02, explor_rew 0.021500 mean_episode_steps 32.30


  0%|                                                                                                                                                 | 16/20000 [14:46<310:02:03, 55.85s/it]

Updates 16, num timesteps 6800, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 3.90, value_loss 0.01, action_loss -0.04, explor_rew 0.012250 mean_episode_steps 34.40


  0%|                                                                                                                                                 | 17/20000 [15:43<312:01:31, 56.21s/it]

Updates 17, num timesteps 7200, FPS 7 
Last 20 training episodes: mean/median reward -1.00/-1.05, min/max reward -1.05/0.02
dist_entropy 3.73, value_loss 0.02, action_loss -0.03, explor_rew 0.013000 mean_episode_steps 36.95


  0%|▏                                                                                                                                                | 18/20000 [16:39<311:59:46, 56.21s/it]

Updates 18, num timesteps 7600, FPS 7 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 3.52, value_loss 0.03, action_loss -0.02, explor_rew 0.022250 mean_episode_steps 33.90


  0%|▏                                                                                                                                                | 19/20000 [17:36<312:34:14, 56.32s/it]

Updates 19, num timesteps 8000, FPS 7 
Last 20 training episodes: mean/median reward -1.00/-1.05, min/max reward -1.05/-0.05
dist_entropy 3.47, value_loss 0.06, action_loss -0.03, explor_rew 0.022000 mean_episode_steps 30.30


  0%|▏                                                                                                                                                | 20/20000 [18:33<313:00:06, 56.40s/it]

Updates 20, num timesteps 8400, FPS 7 
Last 20 training episodes: mean/median reward -0.99/-1.05, min/max reward -1.05/0.13
dist_entropy 3.41, value_loss 0.05, action_loss -0.03, explor_rew 0.016750 mean_episode_steps 26.75


  0%|▏                                                                                                                                                | 21/20000 [19:29<313:06:44, 56.42s/it]

Updates 21, num timesteps 8800, FPS 7 
Last 20 training episodes: mean/median reward -0.95/-1.05, min/max reward -1.05/0.02
dist_entropy 3.45, value_loss 0.03, action_loss -0.03, explor_rew 0.019000 mean_episode_steps 24.65


  0%|▏                                                                                                                                                | 22/20000 [20:25<312:55:35, 56.39s/it]

Updates 22, num timesteps 9200, FPS 7 
Last 20 training episodes: mean/median reward -1.00/-1.05, min/max reward -1.05/-0.03
dist_entropy 3.42, value_loss 0.04, action_loss -0.02, explor_rew 0.018750 mean_episode_steps 28.20


  0%|▏                                                                                                                                                | 23/20000 [21:22<312:50:16, 56.38s/it]

Updates 23, num timesteps 9600, FPS 7 
Last 20 training episodes: mean/median reward -0.92/-1.05, min/max reward -1.05/0.45
dist_entropy 3.26, value_loss 0.07, action_loss -0.02, explor_rew 0.022000 mean_episode_steps 26.90


  0%|▏                                                                                                                                                | 24/20000 [22:18<312:44:07, 56.36s/it]

Updates 24, num timesteps 10000, FPS 7 
Last 20 training episodes: mean/median reward -0.82/-1.05, min/max reward -1.05/0.45
dist_entropy 2.98, value_loss 0.06, action_loss -0.03, explor_rew 0.022500 mean_episode_steps 29.70


  0%|▏                                                                                                                                                | 25/20000 [23:14<312:44:43, 56.36s/it]

Updates 25, num timesteps 10400, FPS 7 
Last 20 training episodes: mean/median reward -0.90/-1.05, min/max reward -1.05/0.45
dist_entropy 2.97, value_loss 0.08, action_loss -0.02, explor_rew 0.016250 mean_episode_steps 26.20


  0%|▏                                                                                                                                                | 26/20000 [24:11<312:55:05, 56.40s/it]

Updates 26, num timesteps 10800, FPS 7 
Last 20 training episodes: mean/median reward -0.58/-1.05, min/max reward -1.05/0.85
dist_entropy 2.82, value_loss 0.10, action_loss -0.03, explor_rew 0.022000 mean_episode_steps 24.50


  0%|▏                                                                                                                                                | 27/20000 [25:08<313:19:36, 56.48s/it]

Updates 27, num timesteps 11200, FPS 7 
Last 20 training episodes: mean/median reward -0.60/-1.05, min/max reward -1.05/0.45
dist_entropy 2.82, value_loss 0.09, action_loss -0.02, explor_rew 0.023500 mean_episode_steps 30.10


  0%|▏                                                                                                                                                | 28/20000 [26:04<312:48:26, 56.38s/it]

Updates 28, num timesteps 11600, FPS 7 
Last 20 training episodes: mean/median reward -0.70/-1.05, min/max reward -1.05/0.45
dist_entropy 2.89, value_loss 0.07, action_loss -0.03, explor_rew 0.018000 mean_episode_steps 32.00


  0%|▏                                                                                                                                                | 29/20000 [27:00<313:23:52, 56.49s/it]

Updates 29, num timesteps 12000, FPS 7 
Last 20 training episodes: mean/median reward -0.67/-1.05, min/max reward -1.05/0.45
dist_entropy 2.74, value_loss 0.09, action_loss -0.02, explor_rew 0.020250 mean_episode_steps 30.85


  0%|▏                                                                                                                                                | 30/20000 [27:57<313:42:04, 56.55s/it]

Updates 30, num timesteps 12400, FPS 7 
Last 20 training episodes: mean/median reward -0.71/-1.05, min/max reward -1.05/0.45
dist_entropy 2.64, value_loss 0.06, action_loss -0.02, explor_rew 0.018750 mean_episode_steps 29.15


  0%|▏                                                                                                                                                | 31/20000 [28:54<313:55:06, 56.59s/it]

Updates 31, num timesteps 12800, FPS 7 
Last 20 training episodes: mean/median reward -0.43/-1.05, min/max reward -1.05/0.85
dist_entropy 2.77, value_loss 0.05, action_loss -0.02, explor_rew 0.016000 mean_episode_steps 34.80


  0%|▏                                                                                                                                                | 32/20000 [29:51<314:20:37, 56.67s/it]

Updates 32, num timesteps 13200, FPS 7 
Last 20 training episodes: mean/median reward -0.52/-1.05, min/max reward -1.05/0.45
dist_entropy 3.05, value_loss 0.05, action_loss -0.02, explor_rew 0.014000 mean_episode_steps 31.25


  0%|▏                                                                                                                                                | 33/20000 [30:47<314:17:35, 56.67s/it]

Updates 33, num timesteps 13600, FPS 7 
Last 20 training episodes: mean/median reward -0.32/-0.04, min/max reward -1.05/0.45
dist_entropy 3.10, value_loss 0.06, action_loss -0.03, explor_rew 0.013750 mean_episode_steps 28.45


  0%|▎                                                                                                                                                | 35/20000 [32:41<314:42:48, 56.75s/it]

Updates 34, num timesteps 14000, FPS 7 
Last 20 training episodes: mean/median reward -0.62/-1.05, min/max reward -1.05/0.45
dist_entropy 3.08, value_loss 0.06, action_loss -0.03, explor_rew 0.016000 mean_episode_steps 32.40


  0%|▎                                                                                                                                                | 36/20000 [33:37<313:47:00, 56.58s/it]

Updates 35, num timesteps 14400, FPS 7 
Last 20 training episodes: mean/median reward -0.87/-1.05, min/max reward -1.05/0.45
dist_entropy 3.15, value_loss 0.08, action_loss -0.02, explor_rew 0.009750 mean_episode_steps 36.05


  0%|▎                                                                                                                                                | 37/20000 [34:33<312:53:35, 56.43s/it]

Updates 36, num timesteps 14800, FPS 7 
Last 20 training episodes: mean/median reward -0.90/-1.05, min/max reward -1.05/-0.04
dist_entropy 3.39, value_loss 0.08, action_loss -0.04, explor_rew 0.007000 mean_episode_steps 51.15


  0%|▎                                                                                                                                                | 38/20000 [35:30<312:23:16, 56.34s/it]

Updates 37, num timesteps 15200, FPS 7 
Last 20 training episodes: mean/median reward -1.00/-1.05, min/max reward -1.05/-0.04
dist_entropy 3.73, value_loss 0.11, action_loss -0.01, explor_rew -0.026250 mean_episode_steps 58.20


  0%|▎                                                                                                                                                | 39/20000 [36:26<312:28:52, 56.36s/it]

Updates 38, num timesteps 15600, FPS 7 
Last 20 training episodes: mean/median reward -0.74/-1.05, min/max reward -1.05/0.02
dist_entropy 3.43, value_loss 0.05, action_loss -0.01, explor_rew 0.003500 mean_episode_steps 54.00


  0%|▎                                                                                                                                                | 40/20000 [37:22<312:34:49, 56.38s/it]

Updates 39, num timesteps 16000, FPS 7 
Last 20 training episodes: mean/median reward -0.79/-1.05, min/max reward -1.05/0.02
dist_entropy 3.18, value_loss 0.04, action_loss -0.02, explor_rew 0.011000 mean_episode_steps 35.70


  0%|▎                                                                                                                                                | 41/20000 [38:19<313:09:25, 56.48s/it]

Updates 40, num timesteps 16400, FPS 7 
Last 20 training episodes: mean/median reward -0.85/-1.05, min/max reward -1.05/-0.04
dist_entropy 3.54, value_loss 0.03, action_loss -0.01, explor_rew 0.002250 mean_episode_steps 44.60


  0%|▎                                                                                                                                                | 42/20000 [39:16<313:13:23, 56.50s/it]

Updates 41, num timesteps 16800, FPS 7 
Last 20 training episodes: mean/median reward -0.79/-1.05, min/max reward -1.05/0.02
dist_entropy 3.17, value_loss 0.03, action_loss -0.02, explor_rew 0.012500 mean_episode_steps 44.05


  0%|▎                                                                                                                                                | 43/20000 [40:12<312:29:50, 56.37s/it]

Updates 42, num timesteps 17200, FPS 7 
Last 20 training episodes: mean/median reward -0.80/-1.05, min/max reward -1.05/0.02
dist_entropy 3.10, value_loss 0.03, action_loss -0.02, explor_rew 0.015000 mean_episode_steps 39.40


  0%|▎                                                                                                                                                | 44/20000 [41:08<312:43:42, 56.42s/it]

Updates 43, num timesteps 17600, FPS 7 
Last 20 training episodes: mean/median reward -0.90/-1.05, min/max reward -1.05/0.02
dist_entropy 3.14, value_loss 0.02, action_loss -0.02, explor_rew 0.014500 mean_episode_steps 34.05


  0%|▎                                                                                                                                                | 45/20000 [42:05<313:10:25, 56.50s/it]

Updates 44, num timesteps 18000, FPS 7 
Last 20 training episodes: mean/median reward -0.75/-1.05, min/max reward -1.05/0.02
dist_entropy 3.12, value_loss 0.02, action_loss -0.02, explor_rew 0.019500 mean_episode_steps 35.85


  0%|▎                                                                                                                                                | 46/20000 [43:01<313:02:04, 56.48s/it]

Updates 45, num timesteps 18400, FPS 7 
Last 20 training episodes: mean/median reward -1.00/-1.05, min/max reward -1.05/-0.05
dist_entropy 3.22, value_loss 0.03, action_loss -0.02, explor_rew 0.021250 mean_episode_steps 32.25


  0%|▎                                                                                                                                                | 47/20000 [43:58<312:58:18, 56.47s/it]

Updates 46, num timesteps 18800, FPS 7 
Last 20 training episodes: mean/median reward -0.80/-1.05, min/max reward -1.05/-0.03
dist_entropy 3.12, value_loss 0.02, action_loss -0.02, explor_rew 0.022500 mean_episode_steps 29.65


  0%|▎                                                                                                                                                | 48/20000 [44:54<312:16:04, 56.34s/it]

Updates 47, num timesteps 19200, FPS 7 
Last 20 training episodes: mean/median reward -0.75/-1.05, min/max reward -1.05/-0.03
dist_entropy 3.17, value_loss 0.01, action_loss -0.02, explor_rew 0.028250 mean_episode_steps 31.55


  0%|▎                                                                                                                                                | 49/20000 [45:50<312:21:06, 56.36s/it]

Updates 48, num timesteps 19600, FPS 7 
Last 20 training episodes: mean/median reward -0.70/-1.05, min/max reward -1.05/-0.03
dist_entropy 3.14, value_loss 0.03, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 27.85


  0%|▎                                                                                                                                                | 50/20000 [46:47<312:34:51, 56.41s/it]

Updates 49, num timesteps 20000, FPS 7 
Last 20 training episodes: mean/median reward -0.90/-1.05, min/max reward -1.05/-0.03
dist_entropy 3.18, value_loss 0.03, action_loss -0.01, explor_rew 0.016500 mean_episode_steps 35.70
Updates 50, num timesteps 20400, FPS 7 
Last 20 training episodes: mean/median reward -0.95/-1.05, min/max reward -1.05/-0.04
dist_entropy 3.06, value_loss 0.02, action_loss -0.02, explor_rew 0.023750 mean_episode_steps 44.00


  0%|▍                                                                                                                                                | 52/20000 [48:39<311:52:43, 56.28s/it]

Updates 51, num timesteps 20800, FPS 7 
Last 20 training episodes: mean/median reward -0.85/-1.05, min/max reward -1.05/-0.04
dist_entropy 3.06, value_loss 0.02, action_loss -0.04, explor_rew 0.022000 mean_episode_steps 41.60


  0%|▍                                                                                                                                                | 53/20000 [49:35<311:44:42, 56.26s/it]

Updates 52, num timesteps 21200, FPS 7 
Last 20 training episodes: mean/median reward -0.82/-1.05, min/max reward -1.05/0.55
dist_entropy 3.20, value_loss 0.05, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 31.90


  0%|▍                                                                                                                                                | 54/20000 [50:32<311:51:57, 56.29s/it]

Updates 53, num timesteps 21600, FPS 7 
Last 20 training episodes: mean/median reward -0.85/-1.05, min/max reward -1.05/-0.05
dist_entropy 3.15, value_loss 0.02, action_loss -0.03, explor_rew 0.023750 mean_episode_steps 35.20


  0%|▍                                                                                                                                                | 55/20000 [51:28<311:40:39, 56.26s/it]

Updates 54, num timesteps 22000, FPS 7 
Last 20 training episodes: mean/median reward -0.90/-1.05, min/max reward -1.05/-0.03
dist_entropy 3.14, value_loss 0.02, action_loss -0.03, explor_rew 0.025500 mean_episode_steps 33.55


  0%|▍                                                                                                                                                | 56/20000 [52:24<311:34:39, 56.24s/it]

Updates 55, num timesteps 22400, FPS 7 
Last 20 training episodes: mean/median reward -0.74/-1.05, min/max reward -1.05/0.02
dist_entropy 2.99, value_loss 0.03, action_loss -0.02, explor_rew 0.026500 mean_episode_steps 36.90


  0%|▍                                                                                                                                                | 57/20000 [53:20<311:20:23, 56.20s/it]

Updates 56, num timesteps 22800, FPS 7 
Last 20 training episodes: mean/median reward -0.92/-1.05, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.06, action_loss -0.04, explor_rew 0.027750 mean_episode_steps 35.75


  0%|▍                                                                                                                                                | 58/20000 [54:16<311:18:54, 56.20s/it]

Updates 57, num timesteps 23200, FPS 7 
Last 20 training episodes: mean/median reward -0.95/-1.05, min/max reward -1.05/-0.03
dist_entropy 2.92, value_loss 0.04, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 34.30


  0%|▍                                                                                                                                                | 59/20000 [55:13<311:56:40, 56.32s/it]

Updates 58, num timesteps 23600, FPS 7 
Last 20 training episodes: mean/median reward -0.75/-1.05, min/max reward -1.05/-0.03
dist_entropy 2.83, value_loss 0.03, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 32.95


  0%|▍                                                                                                                                                | 60/20000 [56:09<311:36:02, 56.26s/it]

Updates 59, num timesteps 24000, FPS 7 
Last 20 training episodes: mean/median reward -0.90/-1.05, min/max reward -1.05/-0.03
dist_entropy 2.92, value_loss 0.04, action_loss -0.00, explor_rew 0.023500 mean_episode_steps 34.95


  0%|▍                                                                                                                                                | 61/20000 [57:06<311:59:30, 56.33s/it]

Updates 60, num timesteps 24400, FPS 7 
Last 20 training episodes: mean/median reward -0.76/-1.05, min/max reward -1.05/0.55
dist_entropy 3.04, value_loss 0.03, action_loss -0.02, explor_rew 0.024750 mean_episode_steps 29.75


  0%|▍                                                                                                                                                | 62/20000 [58:02<311:46:38, 56.29s/it]

Updates 61, num timesteps 24800, FPS 7 
Last 20 training episodes: mean/median reward -0.87/-1.05, min/max reward -1.05/0.55
dist_entropy 3.07, value_loss 0.04, action_loss 0.05, explor_rew 0.023000 mean_episode_steps 34.65


  0%|▍                                                                                                                                                | 63/20000 [58:58<311:46:57, 56.30s/it]

Updates 62, num timesteps 25200, FPS 7 
Last 20 training episodes: mean/median reward -0.69/-1.05, min/max reward -1.05/0.45
dist_entropy 3.22, value_loss 0.05, action_loss -0.04, explor_rew 0.012500 mean_episode_steps 42.75


  0%|▍                                                                                                                                                | 64/20000 [59:55<312:00:13, 56.34s/it]

Updates 63, num timesteps 25600, FPS 7 
Last 20 training episodes: mean/median reward -0.76/-1.05, min/max reward -1.05/0.55
dist_entropy 3.25, value_loss 0.05, action_loss -0.03, explor_rew 0.018500 mean_episode_steps 35.30


  0%|▍                                                                                                                                              | 65/20000 [1:00:51<312:31:54, 56.44s/it]

Updates 64, num timesteps 26000, FPS 7 
Last 20 training episodes: mean/median reward -0.57/-1.05, min/max reward -1.05/0.55
dist_entropy 3.15, value_loss 0.07, action_loss -0.03, explor_rew 0.022000 mean_episode_steps 36.80


  0%|▍                                                                                                                                              | 66/20000 [1:01:48<313:00:22, 56.53s/it]

Updates 65, num timesteps 26400, FPS 7 
Last 20 training episodes: mean/median reward -0.58/-1.05, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.06, action_loss -0.03, explor_rew 0.019250 mean_episode_steps 25.70
Updates 66, num timesteps 26800, FPS 7 
Last 20 training episodes: mean/median reward -0.18/-0.03, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.05, action_loss -0.03, explor_rew 0.023000 mean_episode_steps 29.70


  0%|▍                                                                                                                                              | 68/20000 [1:03:42<313:45:23, 56.67s/it]

Updates 67, num timesteps 27200, FPS 7 
Last 20 training episodes: mean/median reward -0.39/-0.04, min/max reward -1.05/0.55
dist_entropy 3.24, value_loss 0.11, action_loss -0.02, explor_rew 0.016250 mean_episode_steps 31.95


  0%|▍                                                                                                                                              | 69/20000 [1:04:38<312:33:44, 56.46s/it]

Updates 68, num timesteps 27600, FPS 7 
Last 20 training episodes: mean/median reward -0.33/-0.05, min/max reward -1.05/0.55
dist_entropy 3.32, value_loss 0.11, action_loss -0.01, explor_rew 0.005250 mean_episode_steps 41.05
Updates 69, num timesteps 28000, FPS 7 
Last 20 training episodes: mean/median reward -0.31/-0.02, min/max reward -1.05/0.55
dist_entropy 3.38, value_loss 0.11, action_loss -0.01, explor_rew -0.004250 mean_episode_steps 24.15


  0%|▌                                                                                                                                              | 70/20000 [1:05:34<311:52:36, 56.34s/it]

Updates 70, num timesteps 28400, FPS 7 
Last 20 training episodes: mean/median reward -0.04/0.04, min/max reward -1.05/0.55
dist_entropy 3.14, value_loss 0.07, action_loss -0.02, explor_rew 0.004750 mean_episode_steps 36.70


  0%|▌                                                                                                                                              | 72/20000 [1:07:27<312:49:11, 56.51s/it]

Updates 71, num timesteps 28800, FPS 7 
Last 20 training episodes: mean/median reward -0.13/-0.05, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.10, action_loss -0.03, explor_rew 0.011500 mean_episode_steps 45.20
Updates 72, num timesteps 29200, FPS 7 
Last 20 training episodes: mean/median reward 0.17/0.45, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.09, action_loss 0.19, explor_rew 0.017500 mean_episode_steps 30.05


  0%|▌                                                                                                                                              | 73/20000 [1:08:25<314:22:42, 56.80s/it]

Updates 73, num timesteps 29600, FPS 7 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.05, action_loss -0.02, explor_rew 0.020250 mean_episode_steps 29.00


  0%|▌                                                                                                                                              | 74/20000 [1:09:22<314:56:25, 56.90s/it]

Updates 74, num timesteps 30000, FPS 7 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.59, value_loss 0.05, action_loss -0.02, explor_rew 0.023500 mean_episode_steps 25.15


  0%|▌                                                                                                                                              | 75/20000 [1:10:19<316:05:16, 57.11s/it]

Updates 75, num timesteps 30400, FPS 7 
Last 20 training episodes: mean/median reward 0.14/0.45, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.07, action_loss -0.02, explor_rew 0.010250 mean_episode_steps 32.20


  0%|▌                                                                                                                                              | 76/20000 [1:11:29<336:53:28, 60.87s/it]

Updates 76, num timesteps 30800, FPS 7 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.04, action_loss -0.04, explor_rew 0.026500 mean_episode_steps 25.10


  0%|▌                                                                                                                                              | 78/20000 [1:14:07<387:18:56, 69.99s/it]

Updates 77, num timesteps 31200, FPS 7 
Last 20 training episodes: mean/median reward -0.23/-0.05, min/max reward -1.05/0.75
dist_entropy 3.05, value_loss 0.20, action_loss -0.03, explor_rew 0.014750 mean_episode_steps 28.45
Updates 78, num timesteps 31600, FPS 6 
Last 20 training episodes: mean/median reward 0.02/0.55, min/max reward -1.05/0.75
dist_entropy 3.12, value_loss 0.05, action_loss -0.01, explor_rew 0.005250 mean_episode_steps 33.70


  0%|▌                                                                                                                                              | 80/20000 [1:16:44<411:43:37, 74.41s/it]

Updates 79, num timesteps 32000, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.23, min/max reward -1.05/0.55
dist_entropy 3.04, value_loss 0.03, action_loss 0.00, explor_rew 0.011500 mean_episode_steps 19.60


  0%|▌                                                                                                                                              | 81/20000 [1:18:02<417:54:12, 75.53s/it]

Updates 80, num timesteps 32400, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.50, min/max reward -1.05/0.55
dist_entropy 3.16, value_loss 0.03, action_loss 0.00, explor_rew 0.014750 mean_episode_steps 22.60
Updates 81, num timesteps 32800, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.03, action_loss -0.01, explor_rew 0.019500 mean_episode_steps 23.80


  0%|▌                                                                                                                                              | 83/20000 [1:20:38<424:14:10, 76.68s/it]

Updates 82, num timesteps 33200, FPS 6 
Last 20 training episodes: mean/median reward -0.17/0.02, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.03, action_loss -0.01, explor_rew 0.019250 mean_episode_steps 23.80


  0%|▌                                                                                                                                              | 84/20000 [1:21:53<420:57:10, 76.09s/it]

Updates 83, num timesteps 33600, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.28, min/max reward -1.05/0.55
dist_entropy 3.03, value_loss 0.03, action_loss -0.02, explor_rew 0.011750 mean_episode_steps 23.50


  0%|▌                                                                                                                                              | 85/20000 [1:23:08<419:52:20, 75.90s/it]

Updates 84, num timesteps 34000, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.23, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.02, action_loss -0.02, explor_rew 0.021750 mean_episode_steps 22.10


  0%|▌                                                                                                                                              | 86/20000 [1:24:25<421:47:52, 76.25s/it]

Updates 85, num timesteps 34400, FPS 6 
Last 20 training episodes: mean/median reward -0.20/0.02, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.02, action_loss -0.02, explor_rew 0.025750 mean_episode_steps 19.85


  0%|▌                                                                                                                                              | 87/20000 [1:25:42<422:39:25, 76.41s/it]

Updates 86, num timesteps 34800, FPS 6 
Last 20 training episodes: mean/median reward -0.09/0.02, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.05, action_loss -0.03, explor_rew 0.009750 mean_episode_steps 22.80


  0%|▋                                                                                                                                              | 88/20000 [1:26:58<422:41:58, 76.42s/it]

Updates 87, num timesteps 35200, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.23, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.02, action_loss -0.03, explor_rew 0.023250 mean_episode_steps 22.25
Updates 88, num timesteps 35600, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.02, action_loss -0.02, explor_rew 0.028250 mean_episode_steps 19.20


  0%|▋                                                                                                                                              | 90/20000 [1:29:32<423:36:03, 76.59s/it]

Updates 89, num timesteps 36000, FPS 6 
Last 20 training episodes: mean/median reward -0.07/0.02, min/max reward -1.05/0.55
dist_entropy 3.04, value_loss 0.03, action_loss -0.02, explor_rew 0.019000 mean_episode_steps 28.80


  0%|▋                                                                                                                                              | 91/20000 [1:30:48<422:45:51, 76.45s/it]

Updates 90, num timesteps 36400, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.00, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.03, action_loss -0.01, explor_rew 0.020750 mean_episode_steps 34.00


  0%|▋                                                                                                                                              | 92/20000 [1:32:01<416:26:54, 75.31s/it]

Updates 91, num timesteps 36800, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.03, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.02, action_loss -0.00, explor_rew 0.026000 mean_episode_steps 29.60


  0%|▋                                                                                                                                              | 93/20000 [1:33:18<419:14:52, 75.82s/it]

Updates 92, num timesteps 37200, FPS 6 
Last 20 training episodes: mean/median reward -0.09/0.02, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.04, action_loss -0.01, explor_rew 0.028500 mean_episode_steps 20.35
Updates 93, num timesteps 37600, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.03, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 20.50


  0%|▋                                                                                                                                              | 95/20000 [1:35:48<416:42:03, 75.36s/it]

Updates 94, num timesteps 38000, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.02, min/max reward -1.05/0.55
dist_entropy 2.68, value_loss 0.03, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 29.60


  0%|▋                                                                                                                                              | 96/20000 [1:37:04<419:08:16, 75.81s/it]

Updates 95, num timesteps 38400, FPS 6 
Last 20 training episodes: mean/median reward -0.17/0.02, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.02, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 27.00
Updates 96, num timesteps 38800, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.55, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.08, action_loss -0.01, explor_rew 0.025250 mean_episode_steps 20.25


  0%|▋                                                                                                                                              | 98/20000 [1:39:38<421:05:32, 76.17s/it]

Updates 97, num timesteps 39200, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.13, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.04, action_loss 0.04, explor_rew 0.030000 mean_episode_steps 20.25


  0%|▋                                                                                                                                              | 99/20000 [1:40:54<421:08:07, 76.18s/it]

Updates 98, num timesteps 39600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.50, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.07, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 20.45


  0%|▋                                                                                                                                             | 100/20000 [1:42:07<415:39:37, 75.19s/it]

Updates 99, num timesteps 40000, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.13, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.07, action_loss -0.01, explor_rew 0.027750 mean_episode_steps 20.20
Updates 100, num timesteps 40400, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.13, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.10, action_loss -0.02, explor_rew 0.018750 mean_episode_steps 24.15


  1%|▋                                                                                                                                             | 101/20000 [1:43:23<417:12:42, 75.48s/it]

Updates 101, num timesteps 40800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.55, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.03, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 24.80


  1%|▋                                                                                                                                             | 102/20000 [1:44:39<418:05:32, 75.64s/it]

Updates 102, num timesteps 41200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.55, min/max reward -1.05/0.55
dist_entropy 2.51, value_loss 0.03, action_loss -0.02, explor_rew 0.028750 mean_episode_steps 18.15


  1%|▋                                                                                                                                             | 104/20000 [1:47:10<416:49:31, 75.42s/it]

Updates 103, num timesteps 41600, FPS 6 
Last 20 training episodes: mean/median reward 0.00/0.08, min/max reward -1.05/0.55
dist_entropy 2.38, value_loss 0.04, action_loss -0.00, explor_rew 0.029000 mean_episode_steps 26.80
Updates 104, num timesteps 42000, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.55, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.02, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 19.70


  1%|▊                                                                                                                                             | 106/20000 [1:49:41<418:30:11, 75.73s/it]

Updates 105, num timesteps 42400, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.03, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.23, action_loss 0.04, explor_rew 0.007000 mean_episode_steps 29.05


  1%|▊                                                                                                                                             | 107/20000 [1:50:57<418:37:27, 75.76s/it]

Updates 106, num timesteps 42800, FPS 6 
Last 20 training episodes: mean/median reward -0.25/-0.30, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.10, action_loss -0.01, explor_rew 0.012750 mean_episode_steps 24.40
Updates 107, num timesteps 43200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.55, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.07, action_loss -0.03, explor_rew 0.006000 mean_episode_steps 25.55


  1%|▊                                                                                                                                             | 108/20000 [1:52:09<411:45:47, 74.52s/it]

Updates 108, num timesteps 43600, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.05, action_loss -0.00, explor_rew 0.018000 mean_episode_steps 19.30


  1%|▊                                                                                                                                             | 110/20000 [1:54:42<417:16:26, 75.52s/it]

Updates 109, num timesteps 44000, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.08, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.08, action_loss -0.02, explor_rew 0.016000 mean_episode_steps 21.80
Updates 110, num timesteps 44400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -1.05/0.55
dist_entropy 2.37, value_loss 0.06, action_loss -0.03, explor_rew 0.024250 mean_episode_steps 21.15


  1%|▊                                                                                                                                             | 112/20000 [1:57:14<418:24:26, 75.74s/it]

Updates 111, num timesteps 44800, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.50, min/max reward -1.05/0.55
dist_entropy 3.11, value_loss 0.17, action_loss -0.04, explor_rew -0.001000 mean_episode_steps 32.25


  1%|▊                                                                                                                                             | 113/20000 [1:58:28<414:58:29, 75.12s/it]

Updates 112, num timesteps 45200, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.05, action_loss -0.02, explor_rew 0.022750 mean_episode_steps 28.10


  1%|▊                                                                                                                                             | 114/20000 [1:59:42<413:51:57, 74.92s/it]

Updates 113, num timesteps 45600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.05, action_loss -0.03, explor_rew 0.016250 mean_episode_steps 20.25


  1%|▊                                                                                                                                             | 115/20000 [2:00:58<414:57:33, 75.12s/it]

Updates 114, num timesteps 46000, FPS 6 
Last 20 training episodes: mean/median reward -0.28/-0.04, min/max reward -1.05/0.75
dist_entropy 2.78, value_loss 0.12, action_loss -0.02, explor_rew 0.027500 mean_episode_steps 17.10


  1%|▊                                                                                                                                             | 116/20000 [2:02:10<410:16:07, 74.28s/it]

Updates 115, num timesteps 46400, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.05, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 21.80
Updates 116, num timesteps 46800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.51, value_loss 0.05, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 24.45


  1%|▊                                                                                                                                             | 118/20000 [2:04:42<414:44:55, 75.10s/it]

Updates 117, num timesteps 47200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.53, value_loss 0.01, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 23.00
Updates 118, num timesteps 47600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.50, min/max reward -0.03/0.55
dist_entropy 2.50, value_loss 0.02, action_loss 0.02, explor_rew 0.030250 mean_episode_steps 14.05


  1%|▊                                                                                                                                             | 119/20000 [2:05:59<417:39:19, 75.63s/it]

Updates 119, num timesteps 48000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -1.05/0.55
dist_entropy 2.40, value_loss 0.06, action_loss -0.01, explor_rew 0.027000 mean_episode_steps 19.20


  1%|▊                                                                                                                                             | 120/20000 [2:07:16<420:42:54, 76.19s/it]

Updates 120, num timesteps 48400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.02, action_loss -0.01, explor_rew 0.028000 mean_episode_steps 20.65


  1%|▊                                                                                                                                             | 121/20000 [2:08:33<421:50:46, 76.39s/it]

Updates 121, num timesteps 48800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.04/0.55
dist_entropy 2.49, value_loss 0.00, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 19.55


  1%|▊                                                                                                                                             | 123/20000 [2:11:03<416:46:34, 75.48s/it]

Updates 122, num timesteps 49200, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.45, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.12, action_loss -0.00, explor_rew 0.021500 mean_episode_steps 21.60


  1%|▉                                                                                                                                             | 124/20000 [2:12:14<410:05:41, 74.28s/it]

Updates 123, num timesteps 49600, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.45, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.06, action_loss -0.04, explor_rew 0.023250 mean_episode_steps 27.15


  1%|▉                                                                                                                                             | 125/20000 [2:13:31<413:25:25, 74.88s/it]

Updates 124, num timesteps 50000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.52, value_loss 0.02, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 22.65


  1%|▉                                                                                                                                             | 126/20000 [2:14:46<414:59:55, 75.17s/it]

Updates 125, num timesteps 50400, FPS 6 
Last 20 training episodes: mean/median reward -0.14/0.02, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.16, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 17.30


  1%|▉                                                                                                                                             | 127/20000 [2:16:03<416:29:33, 75.45s/it]

Updates 126, num timesteps 50800, FPS 6 
Last 20 training episodes: mean/median reward -0.06/0.02, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.08, action_loss -0.02, explor_rew 0.028250 mean_episode_steps 23.50


  1%|▉                                                                                                                                             | 128/20000 [2:17:18<417:10:29, 75.58s/it]

Updates 127, num timesteps 51200, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.50, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.03, action_loss -0.01, explor_rew 0.027000 mean_episode_steps 16.55


  1%|▉                                                                                                                                             | 129/20000 [2:18:34<417:27:33, 75.63s/it]

Updates 128, num timesteps 51600, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.50, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.05, action_loss 0.01, explor_rew 0.020500 mean_episode_steps 26.85
Updates 129, num timesteps 52000, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.55, min/max reward -1.05/0.55
dist_entropy 3.05, value_loss 0.03, action_loss -0.02, explor_rew 0.025000 mean_episode_steps 20.75


  1%|▉                                                                                                                                             | 131/20000 [2:21:05<417:07:43, 75.58s/it]

Updates 130, num timesteps 52400, FPS 6 
Last 20 training episodes: mean/median reward -0.34/-0.04, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.04, action_loss -0.02, explor_rew 0.027750 mean_episode_steps 19.75
Updates 131, num timesteps 52800, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.55, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.03, action_loss -0.01, explor_rew 0.023250 mean_episode_steps 16.45


  1%|▉                                                                                                                                             | 133/20000 [2:23:21<393:07:30, 71.24s/it]

Updates 132, num timesteps 53200, FPS 6 
Last 20 training episodes: mean/median reward -0.15/-0.03, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.04, action_loss -0.02, explor_rew 0.028250 mean_episode_steps 25.15


  1%|▉                                                                                                                                             | 134/20000 [2:24:38<401:46:38, 72.81s/it]

Updates 133, num timesteps 53600, FPS 6 
Last 20 training episodes: mean/median reward -0.13/0.02, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.06, action_loss -0.03, explor_rew 0.031750 mean_episode_steps 21.35
Updates 134, num timesteps 54000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.11, action_loss -0.04, explor_rew 0.021750 mean_episode_steps 21.25


  1%|▉                                                                                                                                             | 136/20000 [2:27:09<410:03:05, 74.31s/it]

Updates 135, num timesteps 54400, FPS 6 
Last 20 training episodes: mean/median reward -0.03/0.08, min/max reward -1.05/0.75
dist_entropy 2.60, value_loss 0.21, action_loss -0.02, explor_rew 0.029500 mean_episode_steps 20.40


  1%|▉                                                                                                                                             | 137/20000 [2:28:24<411:18:06, 74.54s/it]

Updates 136, num timesteps 54800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.03, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 23.30


  1%|▉                                                                                                                                             | 138/20000 [2:29:40<413:03:23, 74.87s/it]

Updates 137, num timesteps 55200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.04, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 15.55


  1%|▉                                                                                                                                             | 139/20000 [2:30:54<411:01:38, 74.50s/it]

Updates 138, num timesteps 55600, FPS 6 
Last 20 training episodes: mean/median reward 0.20/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.04, action_loss 0.01, explor_rew 0.018750 mean_episode_steps 31.65


  1%|▉                                                                                                                                             | 140/20000 [2:32:08<411:39:46, 74.62s/it]

Updates 139, num timesteps 56000, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.02, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 22.15


  1%|█                                                                                                                                             | 141/20000 [2:33:19<405:27:24, 73.50s/it]

Updates 140, num timesteps 56400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.55, value_loss 0.02, action_loss -0.01, explor_rew 0.026000 mean_episode_steps 22.35


  1%|█                                                                                                                                             | 142/20000 [2:34:34<407:35:36, 73.89s/it]

Updates 141, num timesteps 56800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.03/0.55
dist_entropy 2.53, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.75
Updates 142, num timesteps 57200, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.03/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 17.75


  1%|█                                                                                                                                             | 143/20000 [2:35:49<409:44:00, 74.28s/it]

Updates 143, num timesteps 57600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.55, min/max reward -1.05/0.55
dist_entropy 3.13, value_loss 0.10, action_loss -0.01, explor_rew 0.011500 mean_episode_steps 28.10


  1%|█                                                                                                                                             | 144/20000 [2:37:05<412:39:59, 74.82s/it]

Updates 144, num timesteps 58000, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 3.13, value_loss 0.04, action_loss -0.03, explor_rew 0.009250 mean_episode_steps 19.80


  1%|█                                                                                                                                             | 146/20000 [2:39:34<411:44:56, 74.66s/it]

Updates 145, num timesteps 58400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.45, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.04, action_loss -0.03, explor_rew 0.015000 mean_episode_steps 22.60


  1%|█                                                                                                                                             | 147/20000 [2:40:51<414:14:20, 75.12s/it]

Updates 146, num timesteps 58800, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.02, action_loss -0.01, explor_rew 0.022500 mean_episode_steps 18.10
Updates 147, num timesteps 59200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 20.00


  1%|█                                                                                                                                             | 149/20000 [2:43:17<406:29:41, 73.72s/it]

Updates 148, num timesteps 59600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.29, min/max reward -0.05/0.55
dist_entropy 2.65, value_loss 0.03, action_loss -0.03, explor_rew 0.017500 mean_episode_steps 27.65
Updates 149, num timesteps 60000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.02, explor_rew 0.026500 mean_episode_steps 24.65


  1%|█                                                                                                                                             | 150/20000 [2:44:26<398:48:24, 72.33s/it]

Updates 150, num timesteps 60400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.50, min/max reward -1.05/0.55
dist_entropy 2.45, value_loss 0.02, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 18.45


  1%|█                                                                                                                                             | 152/20000 [2:46:30<368:16:50, 66.80s/it]

Updates 151, num timesteps 60800, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.02, min/max reward -0.05/0.55
dist_entropy 2.07, value_loss 0.04, action_loss -0.02, explor_rew 0.021750 mean_episode_steps 24.75


  1%|█                                                                                                                                             | 153/20000 [2:47:27<352:35:19, 63.96s/it]

Updates 152, num timesteps 61200, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.02, min/max reward -1.05/0.55
dist_entropy 2.42, value_loss 0.08, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 18.65
Updates 153, num timesteps 61600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.55, min/max reward -0.03/0.55
dist_entropy 2.82, value_loss 0.07, action_loss -0.01, explor_rew 0.025000 mean_episode_steps 23.30


  1%|█                                                                                                                                             | 154/20000 [2:48:25<342:03:30, 62.05s/it]

Updates 154, num timesteps 62000, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.55, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.06, action_loss 0.08, explor_rew 0.024500 mean_episode_steps 21.85


  1%|█                                                                                                                                             | 155/20000 [2:49:23<334:34:04, 60.69s/it]

Updates 155, num timesteps 62400, FPS 6 
Last 20 training episodes: mean/median reward 0.18/0.55, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.04, action_loss 0.02, explor_rew 0.022250 mean_episode_steps 24.15


  1%|█                                                                                                                                             | 157/20000 [2:51:17<324:40:34, 58.90s/it]

Updates 156, num timesteps 62800, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.28, min/max reward -1.05/0.55
dist_entropy 2.38, value_loss 0.05, action_loss -0.03, explor_rew 0.015000 mean_episode_steps 23.65
Updates 157, num timesteps 63200, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.55, min/max reward -1.05/0.55
dist_entropy 2.35, value_loss 0.07, action_loss -0.02, explor_rew 0.024750 mean_episode_steps 25.10


  1%|█                                                                                                                                             | 158/20000 [2:52:14<322:14:50, 58.47s/it]

Updates 158, num timesteps 63600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -1.05/0.55
dist_entropy 2.24, value_loss 0.04, action_loss -0.02, explor_rew 0.027250 mean_episode_steps 18.70


  1%|█▏                                                                                                                                            | 160/20000 [2:54:09<319:09:51, 57.91s/it]

Updates 159, num timesteps 64000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.03/0.55
dist_entropy 2.38, value_loss 0.02, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 25.30


  1%|█▏                                                                                                                                            | 161/20000 [2:55:06<316:30:47, 57.43s/it]

Updates 160, num timesteps 64400, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.23, min/max reward -1.05/0.55
dist_entropy 2.68, value_loss 0.09, action_loss 0.01, explor_rew 0.006500 mean_episode_steps 33.85


  1%|█▏                                                                                                                                            | 162/20000 [2:56:03<316:24:08, 57.42s/it]

Updates 161, num timesteps 64800, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.55
dist_entropy 2.32, value_loss 0.05, action_loss -0.02, explor_rew 0.026250 mean_episode_steps 22.65


  1%|█▏                                                                                                                                            | 163/20000 [2:57:00<315:59:07, 57.34s/it]

Updates 162, num timesteps 65200, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.45, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.09, action_loss -0.02, explor_rew 0.026000 mean_episode_steps 26.70


  1%|█▏                                                                                                                                            | 164/20000 [2:57:57<315:39:31, 57.29s/it]

Updates 163, num timesteps 65600, FPS 6 
Last 20 training episodes: mean/median reward -0.07/0.45, min/max reward -1.05/0.55
dist_entropy 2.46, value_loss 0.08, action_loss -0.03, explor_rew 0.023750 mean_episode_steps 23.00


  1%|█▏                                                                                                                                            | 165/20000 [2:58:53<313:08:22, 56.83s/it]

Updates 164, num timesteps 66000, FPS 6 
Last 20 training episodes: mean/median reward 0.12/0.45, min/max reward -1.05/0.55
dist_entropy 3.05, value_loss 0.06, action_loss -0.04, explor_rew 0.000500 mean_episode_steps 25.15
Updates 165, num timesteps 66400, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.05/0.55
dist_entropy 2.90, value_loss 0.08, action_loss -0.03, explor_rew 0.000500 mean_episode_steps 21.95


  1%|█▏                                                                                                                                            | 167/20000 [3:00:47<313:42:02, 56.94s/it]

Updates 166, num timesteps 66800, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.07, action_loss -0.00, explor_rew 0.016250 mean_episode_steps 26.80


  1%|█▏                                                                                                                                            | 168/20000 [3:01:44<313:22:42, 56.89s/it]

Updates 167, num timesteps 67200, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.04, action_loss -0.00, explor_rew 0.023250 mean_episode_steps 20.55


  1%|█▏                                                                                                                                            | 169/20000 [3:02:40<313:06:01, 56.84s/it]

Updates 168, num timesteps 67600, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.29, min/max reward -0.05/0.55
dist_entropy 2.62, value_loss 0.03, action_loss -0.02, explor_rew 0.018750 mean_episode_steps 26.50


  1%|█▏                                                                                                                                            | 170/20000 [3:03:37<312:57:53, 56.82s/it]

Updates 169, num timesteps 68000, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.45, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.03, action_loss -0.03, explor_rew 0.024250 mean_episode_steps 21.75


  1%|█▏                                                                                                                                            | 171/20000 [3:04:34<313:06:11, 56.84s/it]

Updates 170, num timesteps 68400, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.45, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.04, action_loss -0.02, explor_rew 0.021250 mean_episode_steps 27.15


  1%|█▏                                                                                                                                            | 172/20000 [3:05:31<313:41:14, 56.95s/it]

Updates 171, num timesteps 68800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.45, min/max reward -0.05/0.55
dist_entropy 2.55, value_loss 0.01, action_loss -0.02, explor_rew 0.024000 mean_episode_steps 17.25


  1%|█▏                                                                                                                                            | 173/20000 [3:06:29<314:32:13, 57.11s/it]

Updates 172, num timesteps 69200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -1.05/0.55
dist_entropy 2.53, value_loss 0.02, action_loss -0.01, explor_rew 0.026250 mean_episode_steps 22.65


  1%|█▏                                                                                                                                            | 174/20000 [3:07:26<314:12:07, 57.05s/it]

Updates 173, num timesteps 69600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.60, value_loss 0.01, action_loss -0.02, explor_rew 0.027250 mean_episode_steps 21.25


  1%|█▏                                                                                                                                            | 175/20000 [3:08:23<314:53:19, 57.18s/it]

Updates 174, num timesteps 70000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 2.45, value_loss 0.01, action_loss -0.01, explor_rew 0.027500 mean_episode_steps 23.25
Updates 175, num timesteps 70400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 26.35


  1%|█▎                                                                                                                                            | 177/20000 [3:10:18<314:40:58, 57.15s/it]

Updates 176, num timesteps 70800, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.02, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.07, action_loss 0.00, explor_rew -0.002250 mean_episode_steps 45.60


  1%|█▎                                                                                                                                            | 178/20000 [3:11:14<314:04:07, 57.04s/it]

Updates 177, num timesteps 71200, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.03, action_loss -0.02, explor_rew 0.007250 mean_episode_steps 26.45


  1%|█▎                                                                                                                                            | 179/20000 [3:12:12<314:11:43, 57.07s/it]

Updates 178, num timesteps 71600, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.45, min/max reward -1.05/0.55
dist_entropy 2.50, value_loss 0.05, action_loss -0.01, explor_rew 0.019000 mean_episode_steps 40.85
Updates 179, num timesteps 72000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.05/0.55
dist_entropy 2.51, value_loss 0.02, action_loss -0.04, explor_rew 0.026500 mean_episode_steps 21.25


  1%|█▎                                                                                                                                            | 181/20000 [3:14:05<312:15:26, 56.72s/it]

Updates 180, num timesteps 72400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.50, min/max reward -0.05/0.55
dist_entropy 2.86, value_loss 0.05, action_loss -0.04, explor_rew 0.000250 mean_episode_steps 35.80


  1%|█▎                                                                                                                                            | 182/20000 [3:15:02<312:44:03, 56.81s/it]

Updates 181, num timesteps 72800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.43, value_loss 0.01, action_loss -0.02, explor_rew 0.029750 mean_episode_steps 23.95
Updates 182, num timesteps 73200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.31, value_loss 0.05, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 21.50


  1%|█▎                                                                                                                                            | 184/20000 [3:16:57<315:25:03, 57.30s/it]

Updates 183, num timesteps 73600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.50, min/max reward -0.05/0.55
dist_entropy 2.38, value_loss 0.01, action_loss -0.01, explor_rew 0.028500 mean_episode_steps 26.85
Updates 184, num timesteps 74000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.05/0.55
dist_entropy 2.35, value_loss 0.00, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 16.65


  1%|█▎                                                                                                                                            | 185/20000 [3:17:55<315:52:47, 57.39s/it]

Updates 185, num timesteps 74400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.03/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 21.45


  1%|█▎                                                                                                                                            | 187/20000 [3:19:50<315:46:19, 57.38s/it]

Updates 186, num timesteps 74800, FPS 6 
Last 20 training episodes: mean/median reward 0.12/0.45, min/max reward -1.05/0.55
dist_entropy 2.67, value_loss 0.08, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 24.70
Updates 187, num timesteps 75200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.03/0.55
dist_entropy 2.49, value_loss 0.01, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 24.10


  1%|█▎                                                                                                                                            | 189/20000 [3:21:45<315:55:35, 57.41s/it]

Updates 188, num timesteps 75600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.05/0.55
dist_entropy 2.39, value_loss 0.00, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 28.25


  1%|█▎                                                                                                                                            | 190/20000 [3:22:42<315:24:40, 57.32s/it]

Updates 189, num timesteps 76000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.03/0.55
dist_entropy 2.45, value_loss 0.00, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 21.85
Updates 190, num timesteps 76400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.55, min/max reward -0.03/0.55
dist_entropy 2.47, value_loss 0.00, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 24.05


  1%|█▎                                                                                                                                            | 192/20000 [3:24:36<314:51:50, 57.22s/it]

Updates 191, num timesteps 76800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.34, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 22.60
Updates 192, num timesteps 77200, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.03/0.55
dist_entropy 2.43, value_loss 0.02, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 17.40


  1%|█▍                                                                                                                                            | 194/20000 [3:26:31<315:26:57, 57.34s/it]

Updates 193, num timesteps 77600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.03/0.55
dist_entropy 2.49, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 24.20


  1%|█▍                                                                                                                                            | 195/20000 [3:27:28<315:11:22, 57.29s/it]

Updates 194, num timesteps 78000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.03/0.55
dist_entropy 2.47, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 18.85


  1%|█▍                                                                                                                                            | 196/20000 [3:28:24<312:26:37, 56.80s/it]

Updates 195, num timesteps 78400, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.48, value_loss 0.04, action_loss -0.02, explor_rew -0.003750 mean_episode_steps 34.85
Updates 196, num timesteps 78800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.05/0.55
dist_entropy 2.80, value_loss 0.05, action_loss -0.02, explor_rew -0.003750 mean_episode_steps 39.50


  1%|█▍                                                                                                                                            | 198/20000 [3:30:17<311:57:02, 56.71s/it]

Updates 197, num timesteps 79200, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.03, explor_rew 0.021250 mean_episode_steps 22.70
Updates 198, num timesteps 79600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.05/0.55
dist_entropy 2.33, value_loss 0.01, action_loss -0.02, explor_rew 0.023500 mean_episode_steps 28.05


  1%|█▍                                                                                                                                            | 200/20000 [3:32:12<313:33:22, 57.01s/it]

Updates 199, num timesteps 80000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.27, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 26.30
Updates 200, num timesteps 80400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.04/0.55
dist_entropy 2.36, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.20


  1%|█▍                                                                                                                                            | 202/20000 [3:34:07<314:36:33, 57.21s/it]

Updates 201, num timesteps 80800, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.52, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 25.65


  1%|█▍                                                                                                                                            | 203/20000 [3:35:04<314:53:59, 57.26s/it]

Updates 202, num timesteps 81200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.04/0.55
dist_entropy 2.53, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 18.65


  1%|█▍                                                                                                                                            | 204/20000 [3:36:01<314:26:10, 57.18s/it]

Updates 203, num timesteps 81600, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.50, min/max reward 0.02/0.55
dist_entropy 2.57, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 20.15
Updates 204, num timesteps 82000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 17.30


  1%|█▍                                                                                                                                            | 206/20000 [3:37:55<314:04:14, 57.12s/it]

Updates 205, num timesteps 82400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.57, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 17.85


  1%|█▍                                                                                                                                            | 207/20000 [3:38:53<314:27:20, 57.19s/it]

Updates 206, num timesteps 82800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.47, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 22.15


  1%|█▍                                                                                                                                            | 208/20000 [3:39:50<314:33:52, 57.22s/it]

Updates 207, num timesteps 83200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 22.10


  1%|█▍                                                                                                                                            | 209/20000 [3:40:45<311:40:58, 56.70s/it]

Updates 208, num timesteps 83600, FPS 6 
Last 20 training episodes: mean/median reward 0.20/-0.00, min/max reward -0.05/0.55
dist_entropy 3.33, value_loss 0.07, action_loss -0.01, explor_rew -0.006250 mean_episode_steps 18.60
Updates 209, num timesteps 84000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.02, explor_rew 0.025000 mean_episode_steps 19.65


  1%|█▍                                                                                                                                            | 210/20000 [3:41:42<312:10:42, 56.79s/it]

Updates 210, num timesteps 84400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.03/0.55
dist_entropy 2.56, value_loss 0.00, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 26.55


  1%|█▌                                                                                                                                            | 212/20000 [3:43:37<313:35:34, 57.05s/it]

Updates 211, num timesteps 84800, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.29, min/max reward -0.05/0.55
dist_entropy 2.55, value_loss 0.02, action_loss 0.18, explor_rew 0.020750 mean_episode_steps 25.00


  1%|█▌                                                                                                                                            | 213/20000 [3:44:34<313:09:18, 56.97s/it]

Updates 212, num timesteps 85200, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.28, min/max reward -1.05/0.55
dist_entropy 1.98, value_loss 0.12, action_loss 0.02, explor_rew -0.011000 mean_episode_steps 34.45


  1%|█▌                                                                                                                                            | 214/20000 [3:45:30<312:17:13, 56.82s/it]

Updates 213, num timesteps 85600, FPS 6 
Last 20 training episodes: mean/median reward -0.04/0.34, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.11, action_loss 0.00, explor_rew -0.020750 mean_episode_steps 53.50


  1%|█▌                                                                                                                                            | 215/20000 [3:46:27<312:20:13, 56.83s/it]

Updates 214, num timesteps 86000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -0.03/0.55
dist_entropy 2.93, value_loss 0.08, action_loss -0.01, explor_rew -0.001500 mean_episode_steps 38.95


  1%|█▌                                                                                                                                            | 216/20000 [3:47:24<312:24:21, 56.85s/it]

Updates 215, num timesteps 86400, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.02, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.04, action_loss -0.00, explor_rew 0.007250 mean_episode_steps 29.60


  1%|█▌                                                                                                                                            | 217/20000 [3:48:21<312:17:33, 56.83s/it]

Updates 216, num timesteps 86800, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.02, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.02, action_loss 0.00, explor_rew 0.017750 mean_episode_steps 32.10


  1%|█▌                                                                                                                                            | 218/20000 [3:49:18<312:27:00, 56.86s/it]

Updates 217, num timesteps 87200, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.28, min/max reward -1.05/0.55
dist_entropy 3.02, value_loss 0.02, action_loss -0.01, explor_rew 0.020500 mean_episode_steps 25.90
Updates 218, num timesteps 87600, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.04, action_loss -0.02, explor_rew 0.025500 mean_episode_steps 21.55


  1%|█▌                                                                                                                                            | 220/20000 [3:51:12<312:45:25, 56.92s/it]

Updates 219, num timesteps 88000, FPS 6 
Last 20 training episodes: mean/median reward -0.11/-0.03, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.04, action_loss -0.01, explor_rew 0.024250 mean_episode_steps 25.85
Updates 220, num timesteps 88400, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.42, value_loss 0.07, action_loss -0.02, explor_rew 0.023250 mean_episode_steps 25.05


  1%|█▌                                                                                                                                            | 221/20000 [3:52:09<313:03:27, 56.98s/it]

Updates 221, num timesteps 88800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.05, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 29.05


  1%|█▌                                                                                                                                            | 223/20000 [3:54:03<313:14:58, 57.02s/it]

Updates 222, num timesteps 89200, FPS 6 
Last 20 training episodes: mean/median reward 0.18/0.08, min/max reward -1.05/0.55
dist_entropy 2.54, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 26.45


  1%|█▌                                                                                                                                            | 224/20000 [3:55:00<313:18:39, 57.03s/it]

Updates 223, num timesteps 89600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.50, min/max reward -0.03/0.55
dist_entropy 2.56, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 18.70


  1%|█▌                                                                                                                                            | 225/20000 [3:55:57<313:20:46, 57.04s/it]

Updates 224, num timesteps 90000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.45, min/max reward -0.05/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.00, explor_rew 0.030500 mean_episode_steps 19.80
Updates 225, num timesteps 90400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 24.85


  1%|█▌                                                                                                                                            | 227/20000 [3:57:52<313:54:29, 57.15s/it]

Updates 226, num timesteps 90800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.05/0.55
dist_entropy 2.53, value_loss 0.01, action_loss -0.01, explor_rew 0.024000 mean_episode_steps 24.85


  1%|█▌                                                                                                                                            | 228/20000 [3:58:49<313:35:33, 57.10s/it]

Updates 227, num timesteps 91200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.47, value_loss 0.00, action_loss -0.02, explor_rew 0.029500 mean_episode_steps 19.20


  1%|█▋                                                                                                                                            | 229/20000 [3:59:46<314:19:58, 57.24s/it]

Updates 228, num timesteps 91600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.50, min/max reward -0.03/0.55
dist_entropy 2.46, value_loss 0.02, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 26.80


  1%|█▋                                                                                                                                            | 230/20000 [4:00:44<314:44:15, 57.31s/it]

Updates 229, num timesteps 92000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.54, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 24.25


  1%|█▋                                                                                                                                            | 231/20000 [4:01:41<314:44:16, 57.31s/it]

Updates 230, num timesteps 92400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.54, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 29.65
Updates 231, num timesteps 92800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 25.45


  1%|█▋                                                                                                                                            | 233/20000 [4:03:34<312:15:25, 56.87s/it]

Updates 232, num timesteps 93200, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.05/0.75
dist_entropy 2.93, value_loss 0.12, action_loss -0.01, explor_rew -0.017000 mean_episode_steps 37.55


  1%|█▋                                                                                                                                            | 234/20000 [4:04:29<308:58:49, 56.27s/it]

Updates 233, num timesteps 93600, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -0.05/0.55
dist_entropy 2.81, value_loss 0.15, action_loss -0.00, explor_rew -0.032750 mean_episode_steps 58.00


  1%|█▋                                                                                                                                            | 235/20000 [4:05:23<305:52:54, 55.71s/it]

Updates 234, num timesteps 94000, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -0.05/0.55
dist_entropy 2.99, value_loss 0.18, action_loss -0.00, explor_rew -0.050000 mean_episode_steps 58.00


  1%|█▋                                                                                                                                            | 236/20000 [4:06:18<303:38:39, 55.31s/it]

Updates 235, num timesteps 94400, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -0.05/0.55
dist_entropy 3.13, value_loss 0.15, action_loss -0.01, explor_rew -0.045000 mean_episode_steps 111.30


  1%|█▋                                                                                                                                            | 237/20000 [4:07:14<304:49:47, 55.53s/it]

Updates 236, num timesteps 94800, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.05/0.55
dist_entropy 3.00, value_loss 0.10, action_loss -0.03, explor_rew -0.002000 mean_episode_steps 69.20


  1%|█▋                                                                                                                                            | 238/20000 [4:08:11<307:06:13, 55.94s/it]

Updates 237, num timesteps 95200, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.06, action_loss -0.01, explor_rew 0.018000 mean_episode_steps 26.75


  1%|█▋                                                                                                                                            | 239/20000 [4:09:08<309:04:52, 56.31s/it]

Updates 238, num timesteps 95600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.63, value_loss 0.04, action_loss -0.02, explor_rew 0.025250 mean_episode_steps 25.75


  1%|█▋                                                                                                                                            | 240/20000 [4:10:05<309:47:14, 56.44s/it]

Updates 239, num timesteps 96000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.03/0.55
dist_entropy 2.54, value_loss 0.01, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 22.35


  1%|█▋                                                                                                                                            | 241/20000 [4:11:02<310:32:44, 56.58s/it]

Updates 240, num timesteps 96400, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.23, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.02, explor_rew 0.029500 mean_episode_steps 25.30
Updates 241, num timesteps 96800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.03/0.55
dist_entropy 2.61, value_loss 0.01, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 20.30


  1%|█▋                                                                                                                                            | 243/20000 [4:12:57<312:58:33, 57.03s/it]

Updates 242, num timesteps 97200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.55, value_loss 0.02, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 25.75


  1%|█▋                                                                                                                                            | 244/20000 [4:13:53<312:41:34, 56.98s/it]

Updates 243, num timesteps 97600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.03/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.02, explor_rew 0.027500 mean_episode_steps 22.45


  1%|█▋                                                                                                                                            | 245/20000 [4:14:51<312:59:00, 57.04s/it]

Updates 244, num timesteps 98000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.05/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.02, explor_rew 0.030750 mean_episode_steps 25.65


  1%|█▋                                                                                                                                            | 246/20000 [4:15:47<311:56:55, 56.85s/it]

Updates 245, num timesteps 98400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.45, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.01, action_loss -0.03, explor_rew 0.025750 mean_episode_steps 21.60


  1%|█▊                                                                                                                                            | 247/20000 [4:16:44<312:22:28, 56.93s/it]

Updates 246, num timesteps 98800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 23.90


  1%|█▊                                                                                                                                            | 248/20000 [4:17:41<311:38:23, 56.80s/it]

Updates 247, num timesteps 99200, FPS 6 
Last 20 training episodes: mean/median reward -0.49/-1.05, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.27, action_loss 0.50, explor_rew -0.008000 mean_episode_steps 39.45
Updates 248, num timesteps 99600, FPS 6 
Last 20 training episodes: mean/median reward 0.04/0.55, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.14, action_loss -0.04, explor_rew 0.026750 mean_episode_steps 18.65


  1%|█▊                                                                                                                                            | 249/20000 [4:18:38<312:26:15, 56.95s/it]

Updates 249, num timesteps 100000, FPS 6 
Last 20 training episodes: mean/median reward 0.20/0.55, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.12, action_loss -0.01, explor_rew -0.042250 mean_episode_steps 16.05


  1%|█▊                                                                                                                                            | 250/20000 [4:19:35<311:56:52, 56.86s/it]

Updates 250, num timesteps 100400, FPS 6 
Last 20 training episodes: mean/median reward -0.07/0.55, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.18, action_loss 0.04, explor_rew -0.034250 mean_episode_steps 101.05


  1%|█▊                                                                                                                                            | 251/20000 [4:20:31<311:57:13, 56.87s/it]

Updates 251, num timesteps 100800, FPS 6 
Last 20 training episodes: mean/median reward -0.07/0.55, min/max reward -1.05/0.55
dist_entropy 3.08, value_loss 0.08, action_loss 0.01, explor_rew -0.049000 mean_episode_steps 101.05


  1%|█▊                                                                                                                                            | 253/20000 [4:22:24<309:51:20, 56.49s/it]

Updates 252, num timesteps 101200, FPS 6 
Last 20 training episodes: mean/median reward -0.24/-0.03, min/max reward -1.05/0.55
dist_entropy 3.18, value_loss 0.05, action_loss 0.00, explor_rew -0.042000 mean_episode_steps 158.85


  1%|█▊                                                                                                                                            | 254/20000 [4:23:20<308:56:43, 56.33s/it]

Updates 253, num timesteps 101600, FPS 6 
Last 20 training episodes: mean/median reward -0.24/-0.03, min/max reward -1.05/0.55
dist_entropy 3.15, value_loss 0.05, action_loss -0.00, explor_rew -0.049000 mean_episode_steps 158.85




Updates 254, num timesteps 102000, FPS 6 
Last 20 training episodes: mean/median reward 0.04/0.55, min/max reward -1.05/0.55
dist_entropy 3.29, value_loss 0.07, action_loss -0.01, explor_rew -0.042250 mean_episode_steps 178.90


  1%|█▊                                                                                                                                            | 256/20000 [4:25:12<308:49:06, 56.31s/it]

Updates 255, num timesteps 102400, FPS 6 
Last 20 training episodes: mean/median reward -0.16/0.50, min/max reward -1.05/0.55
dist_entropy 3.40, value_loss 0.11, action_loss -0.01, explor_rew -0.044500 mean_episode_steps 225.50
Updates 256, num timesteps 102800, FPS 6 
Last 20 training episodes: mean/median reward -0.06/0.55, min/max reward -1.05/0.55
dist_entropy 3.34, value_loss 0.10, action_loss -0.00, explor_rew -0.034250 mean_episode_steps 160.85


  1%|█▊                                                                                                                                            | 258/20000 [4:27:05<309:10:09, 56.38s/it]

Updates 257, num timesteps 103200, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.04, min/max reward -1.05/0.55
dist_entropy 3.33, value_loss 0.09, action_loss -0.00, explor_rew -0.040750 mean_episode_steps 178.00


  1%|█▊                                                                                                                                            | 259/20000 [4:28:02<309:04:22, 56.36s/it]

Updates 258, num timesteps 103600, FPS 6 
Last 20 training episodes: mean/median reward -0.21/-0.04, min/max reward -1.05/0.55
dist_entropy 3.26, value_loss 0.06, action_loss 0.00, explor_rew -0.041750 mean_episode_steps 181.60
Updates 259, num timesteps 104000, FPS 6 
Last 20 training episodes: mean/median reward 0.07/0.55, min/max reward -1.05/0.55
dist_entropy 3.20, value_loss 0.15, action_loss -0.00, explor_rew -0.028250 mean_episode_steps 233.55


  1%|█▊                                                                                                                                            | 260/20000 [4:28:59<310:02:00, 56.54s/it]

Updates 260, num timesteps 104400, FPS 6 
Last 20 training episodes: mean/median reward -0.01/0.55, min/max reward -1.05/0.55
dist_entropy 3.38, value_loss 0.11, action_loss -0.01, explor_rew -0.043750 mean_episode_steps 205.35


  1%|█▊                                                                                                                                            | 262/20000 [4:30:51<309:40:04, 56.48s/it]

Updates 261, num timesteps 104800, FPS 6 
Last 20 training episodes: mean/median reward -0.17/0.50, min/max reward -1.05/0.55
dist_entropy 3.48, value_loss 0.26, action_loss -0.02, explor_rew -0.009250 mean_episode_steps 68.50


  1%|█▊                                                                                                                                            | 263/20000 [4:31:46<306:46:15, 55.95s/it]

Updates 262, num timesteps 105200, FPS 6 
Last 20 training episodes: mean/median reward -0.96/-1.05, min/max reward -1.05/0.65
dist_entropy 3.93, value_loss 0.19, action_loss -0.01, explor_rew -0.029250 mean_episode_steps 58.55


  1%|█▊                                                                                                                                            | 264/20000 [4:32:42<305:48:00, 55.78s/it]

Updates 263, num timesteps 105600, FPS 6 
Last 20 training episodes: mean/median reward -0.87/-1.05, min/max reward -1.05/0.75
dist_entropy 4.12, value_loss 0.12, action_loss -0.01, explor_rew -0.011250 mean_episode_steps 21.25


  1%|█▉                                                                                                                                            | 265/20000 [4:33:37<305:08:21, 55.66s/it]

Updates 264, num timesteps 106000, FPS 6 
Last 20 training episodes: mean/median reward -1.05/-1.05, min/max reward -1.05/-1.05
dist_entropy 3.89, value_loss 0.04, action_loss -0.03, explor_rew -0.006500 mean_episode_steps 41.55


  1%|█▉                                                                                                                                            | 266/20000 [4:34:33<305:15:53, 55.69s/it]

Updates 265, num timesteps 106400, FPS 6 
Last 20 training episodes: mean/median reward -0.81/-1.05, min/max reward -1.05/0.85
dist_entropy 3.72, value_loss 0.06, action_loss -0.03, explor_rew -0.005000 mean_episode_steps 26.40


  1%|█▉                                                                                                                                            | 267/20000 [4:35:28<304:51:49, 55.62s/it]

Updates 266, num timesteps 106800, FPS 6 
Last 20 training episodes: mean/median reward -0.63/-1.05, min/max reward -1.05/0.85
dist_entropy 3.71, value_loss 0.10, action_loss -0.02, explor_rew 0.005500 mean_episode_steps 24.30


  1%|█▉                                                                                                                                            | 268/20000 [4:36:24<305:04:57, 55.66s/it]

Updates 267, num timesteps 107200, FPS 6 
Last 20 training episodes: mean/median reward -0.43/-1.05, min/max reward -1.05/0.75
dist_entropy 3.66, value_loss 0.17, action_loss -0.03, explor_rew 0.007500 mean_episode_steps 20.25


  1%|█▉                                                                                                                                            | 269/20000 [4:37:20<305:08:16, 55.67s/it]

Updates 268, num timesteps 107600, FPS 6 
Last 20 training episodes: mean/median reward -0.39/-0.55, min/max reward -1.05/0.85
dist_entropy 3.45, value_loss 0.11, action_loss -0.03, explor_rew 0.002750 mean_episode_steps 19.25


  1%|█▉                                                                                                                                            | 270/20000 [4:38:16<305:55:03, 55.82s/it]

Updates 269, num timesteps 108000, FPS 6 
Last 20 training episodes: mean/median reward -0.40/-1.05, min/max reward -1.05/0.75
dist_entropy 3.47, value_loss 0.09, action_loss -0.03, explor_rew 0.009250 mean_episode_steps 24.20


  1%|█▉                                                                                                                                            | 271/20000 [4:39:11<305:44:57, 55.79s/it]

Updates 270, num timesteps 108400, FPS 6 
Last 20 training episodes: mean/median reward -0.39/-0.05, min/max reward -1.05/0.85
dist_entropy 3.10, value_loss 0.09, action_loss -0.02, explor_rew 0.004750 mean_episode_steps 34.00


  1%|█▉                                                                                                                                            | 272/20000 [4:40:08<306:30:19, 55.93s/it]

Updates 271, num timesteps 108800, FPS 6 
Last 20 training episodes: mean/median reward -0.34/-0.55, min/max reward -1.05/0.55
dist_entropy 3.08, value_loss 0.11, action_loss -0.03, explor_rew 0.010250 mean_episode_steps 24.10


  1%|█▉                                                                                                                                            | 273/20000 [4:41:04<307:14:01, 56.07s/it]

Updates 272, num timesteps 109200, FPS 6 
Last 20 training episodes: mean/median reward -0.21/-0.02, min/max reward -1.05/0.85
dist_entropy 2.86, value_loss 0.08, action_loss -0.03, explor_rew 0.015500 mean_episode_steps 23.90


  1%|█▉                                                                                                                                            | 274/20000 [4:42:01<308:09:15, 56.24s/it]

Updates 273, num timesteps 109600, FPS 6 
Last 20 training episodes: mean/median reward 0.07/0.28, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.05, action_loss -0.02, explor_rew 0.020750 mean_episode_steps 19.95


  1%|█▉                                                                                                                                            | 275/20000 [4:42:57<308:44:28, 56.35s/it]

Updates 274, num timesteps 110000, FPS 6 
Last 20 training episodes: mean/median reward -0.28/-0.04, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.06, action_loss -0.01, explor_rew 0.021750 mean_episode_steps 27.00


  1%|█▉                                                                                                                                            | 276/20000 [4:43:54<309:27:09, 56.48s/it]

Updates 275, num timesteps 110400, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.50, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.04, action_loss -0.01, explor_rew 0.022500 mean_episode_steps 25.70


  1%|█▉                                                                                                                                            | 277/20000 [4:44:51<310:25:19, 56.66s/it]

Updates 276, num timesteps 110800, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.13, min/max reward -1.05/0.55
dist_entropy 2.53, value_loss 0.06, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 20.10


  1%|█▉                                                                                                                                            | 278/20000 [4:45:49<311:22:08, 56.84s/it]

Updates 277, num timesteps 111200, FPS 6 
Last 20 training episodes: mean/median reward 0.18/0.34, min/max reward -1.05/0.55
dist_entropy 2.54, value_loss 0.08, action_loss -0.01, explor_rew 0.022750 mean_episode_steps 25.15
Updates 278, num timesteps 111600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.03/0.55
dist_entropy 2.47, value_loss 0.04, action_loss -0.02, explor_rew 0.027750 mean_episode_steps 24.25


  1%|█▉                                                                                                                                            | 280/20000 [4:47:43<312:00:28, 56.96s/it]

Updates 279, num timesteps 112000, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.13, min/max reward -0.05/0.55
dist_entropy 2.57, value_loss 0.04, action_loss -0.05, explor_rew 0.016000 mean_episode_steps 20.65


  1%|█▉                                                                                                                                            | 281/20000 [4:48:40<312:06:04, 56.98s/it]

Updates 280, num timesteps 112400, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.45, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.04, action_loss -0.02, explor_rew 0.028750 mean_episode_steps 26.20
Updates 281, num timesteps 112800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.03, action_loss -0.02, explor_rew 0.025250 mean_episode_steps 23.45


  1%|██                                                                                                                                            | 283/20000 [4:50:34<312:49:17, 57.12s/it]

Updates 282, num timesteps 113200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.04, action_loss -0.02, explor_rew 0.029750 mean_episode_steps 23.70
Updates 283, num timesteps 113600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.02, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 22.70


  1%|██                                                                                                                                            | 285/20000 [4:52:28<311:57:11, 56.96s/it]

Updates 284, num timesteps 114000, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.28, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.11, action_loss -0.02, explor_rew 0.031500 mean_episode_steps 26.75


  1%|██                                                                                                                                            | 286/20000 [4:53:25<312:02:37, 56.98s/it]

Updates 285, num timesteps 114400, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.02, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.07, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 25.95


  1%|██                                                                                                                                            | 287/20000 [4:54:22<312:21:27, 57.04s/it]

Updates 286, num timesteps 114800, FPS 6 
Last 20 training episodes: mean/median reward -0.39/-1.05, min/max reward -1.05/0.55
dist_entropy 2.50, value_loss 0.06, action_loss 0.00, explor_rew 0.013000 mean_episode_steps 27.25


  1%|██                                                                                                                                            | 288/20000 [4:55:19<312:13:33, 57.02s/it]

Updates 287, num timesteps 115200, FPS 6 
Last 20 training episodes: mean/median reward -0.14/-0.00, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.03, action_loss -0.00, explor_rew 0.013750 mean_episode_steps 29.65
Updates 288, num timesteps 115600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.55, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.01, explor_rew 0.023250 mean_episode_steps 22.60


  1%|██                                                                                                                                            | 289/20000 [4:56:16<312:15:25, 57.03s/it]

Updates 289, num timesteps 116000, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.02, action_loss -0.01, explor_rew 0.025250 mean_episode_steps 26.25


  1%|██                                                                                                                                            | 291/20000 [4:58:11<313:09:54, 57.20s/it]

Updates 290, num timesteps 116400, FPS 6 
Last 20 training episodes: mean/median reward -0.12/0.02, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.02, explor_rew 0.023500 mean_episode_steps 24.10


  1%|██                                                                                                                                            | 292/20000 [4:59:08<312:15:06, 57.04s/it]

Updates 291, num timesteps 116800, FPS 6 
Last 20 training episodes: mean/median reward -0.12/-0.00, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.02, action_loss -0.01, explor_rew 0.025750 mean_episode_steps 19.70
Updates 292, num timesteps 117200, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.02, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 27.30


  1%|██                                                                                                                                            | 294/20000 [5:01:02<312:38:11, 57.11s/it]

Updates 293, num timesteps 117600, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.28, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.02, action_loss -0.02, explor_rew 0.028250 mean_episode_steps 20.75


  1%|██                                                                                                                                            | 295/20000 [5:01:59<312:42:09, 57.13s/it]

Updates 294, num timesteps 118000, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.26, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 23.65


  1%|██                                                                                                                                            | 296/20000 [5:02:56<312:42:16, 57.13s/it]

Updates 295, num timesteps 118400, FPS 6 
Last 20 training episodes: mean/median reward 0.12/0.28, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.01, action_loss 0.00, explor_rew 0.032500 mean_episode_steps 26.85
Updates 296, num timesteps 118800, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.73, value_loss 0.03, action_loss -0.04, explor_rew 0.006000 mean_episode_steps 25.90


  1%|██                                                                                                                                            | 297/20000 [5:03:54<313:12:52, 57.23s/it]

Updates 297, num timesteps 119200, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.04, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 25.85


  1%|██                                                                                                                                            | 298/20000 [5:04:51<313:42:42, 57.32s/it]

Updates 298, num timesteps 119600, FPS 6 
Last 20 training episodes: mean/median reward 0.12/0.55, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 14.65


  2%|██▏                                                                                                                                           | 300/20000 [5:06:45<312:58:46, 57.19s/it]

Updates 299, num timesteps 120000, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.13, min/max reward -0.03/0.55
dist_entropy 2.30, value_loss 0.09, action_loss 0.00, explor_rew 0.033000 mean_episode_steps 20.50
Updates 300, num timesteps 120400, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.13, min/max reward -0.05/0.55
dist_entropy 2.42, value_loss 0.05, action_loss -0.01, explor_rew 0.010250 mean_episode_steps 29.70


  2%|██▏                                                                                                                                           | 302/20000 [5:08:39<311:31:46, 56.94s/it]

Updates 301, num timesteps 120800, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.08, min/max reward -0.05/0.55
dist_entropy 2.36, value_loss 0.01, action_loss -0.00, explor_rew 0.028250 mean_episode_steps 18.70


  2%|██▏                                                                                                                                           | 303/20000 [5:09:36<311:36:08, 56.95s/it]

Updates 302, num timesteps 121200, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.13, min/max reward -0.05/0.55
dist_entropy 2.37, value_loss 0.01, action_loss -0.00, explor_rew 0.029250 mean_episode_steps 27.75


  2%|██▏                                                                                                                                           | 304/20000 [5:10:33<311:51:35, 57.00s/it]

Updates 303, num timesteps 121600, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.13, min/max reward -0.05/0.55
dist_entropy 2.51, value_loss 0.01, action_loss -0.02, explor_rew 0.030750 mean_episode_steps 24.95


  2%|██▏                                                                                                                                           | 305/20000 [5:11:31<313:12:03, 57.25s/it]

Updates 304, num timesteps 122000, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.03, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.11, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.90


  2%|██▏                                                                                                                                           | 306/20000 [5:12:28<313:35:27, 57.32s/it]

Updates 305, num timesteps 122400, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.08, min/max reward -0.05/0.55
dist_entropy 2.51, value_loss 0.04, action_loss -0.00, explor_rew 0.028750 mean_episode_steps 22.60


  2%|██▏                                                                                                                                           | 307/20000 [5:13:25<312:56:23, 57.21s/it]

Updates 306, num timesteps 122800, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.03, min/max reward -1.05/0.55
dist_entropy 2.36, value_loss 0.05, action_loss -0.01, explor_rew 0.023250 mean_episode_steps 33.65


  2%|██▏                                                                                                                                           | 308/20000 [5:14:22<312:03:20, 57.05s/it]

Updates 307, num timesteps 123200, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.13, min/max reward -0.05/0.55
dist_entropy 2.42, value_loss 0.01, action_loss -0.01, explor_rew 0.023250 mean_episode_steps 30.80
Updates 308, num timesteps 123600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.05/0.55
dist_entropy 2.35, value_loss 0.02, action_loss -0.01, explor_rew 0.025500 mean_episode_steps 42.25


  2%|██▏                                                                                                                                           | 310/20000 [5:16:17<312:47:11, 57.19s/it]

Updates 309, num timesteps 124000, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.13, min/max reward -1.05/0.55
dist_entropy 2.35, value_loss 0.07, action_loss -0.01, explor_rew 0.024250 mean_episode_steps 25.20


  2%|██▏                                                                                                                                           | 311/20000 [5:17:13<311:50:22, 57.02s/it]

Updates 310, num timesteps 124400, FPS 6 
Last 20 training episodes: mean/median reward 0.20/0.29, min/max reward -1.05/0.55
dist_entropy 2.32, value_loss 0.03, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 28.70


  2%|██▏                                                                                                                                           | 312/20000 [5:18:10<310:54:45, 56.85s/it]

Updates 311, num timesteps 124800, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.29, min/max reward -1.05/0.55
dist_entropy 2.35, value_loss 0.08, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 21.35
Updates 312, num timesteps 125200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.05/0.55
dist_entropy 2.37, value_loss 0.01, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 26.15


  2%|██▏                                                                                                                                           | 313/20000 [5:19:07<311:22:15, 56.94s/it]

Updates 313, num timesteps 125600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -1.05/0.55
dist_entropy 2.38, value_loss 0.07, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 15.25


  2%|██▏                                                                                                                                           | 315/20000 [5:21:01<311:49:19, 57.03s/it]

Updates 314, num timesteps 126000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.05/0.55
dist_entropy 2.54, value_loss 0.01, action_loss -0.01, explor_rew 0.026500 mean_episode_steps 23.00
Updates 315, num timesteps 126400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.52, value_loss 0.00, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 23.50


  2%|██▎                                                                                                                                           | 317/20000 [5:22:55<310:42:14, 56.83s/it]

Updates 316, num timesteps 126800, FPS 6 
Last 20 training episodes: mean/median reward -0.08/-0.02, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.16, action_loss 0.00, explor_rew -0.006750 mean_episode_steps 42.65


  2%|██▎                                                                                                                                           | 318/20000 [5:23:52<310:48:32, 56.85s/it]

Updates 317, num timesteps 127200, FPS 6 
Last 20 training episodes: mean/median reward -0.07/-0.00, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.09, action_loss -0.02, explor_rew 0.018250 mean_episode_steps 34.30


  2%|██▎                                                                                                                                           | 319/20000 [5:24:48<310:22:10, 56.77s/it]

Updates 318, num timesteps 127600, FPS 6 
Last 20 training episodes: mean/median reward -0.06/0.23, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.06, action_loss -0.02, explor_rew 0.027250 mean_episode_steps 21.65


  2%|██▎                                                                                                                                           | 320/20000 [5:25:45<310:40:16, 56.83s/it]

Updates 319, num timesteps 128000, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.23, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.08, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 20.65


  2%|██▎                                                                                                                                           | 321/20000 [5:26:41<309:02:37, 56.54s/it]

Updates 320, num timesteps 128400, FPS 6 
Last 20 training episodes: mean/median reward -0.55/-1.05, min/max reward -1.05/0.75
dist_entropy 2.74, value_loss 0.26, action_loss 0.01, explor_rew 0.002250 mean_episode_steps 23.30


  2%|██▎                                                                                                                                           | 322/20000 [5:27:37<308:15:05, 56.39s/it]

Updates 321, num timesteps 128800, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.45, min/max reward -1.05/0.75
dist_entropy 2.64, value_loss 0.15, action_loss 0.01, explor_rew 0.008250 mean_episode_steps 15.50


  2%|██▎                                                                                                                                           | 323/20000 [5:28:34<309:33:57, 56.64s/it]

Updates 322, num timesteps 129200, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.45, min/max reward -1.05/0.75
dist_entropy 2.38, value_loss 0.06, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 18.65


  2%|██▎                                                                                                                                           | 324/20000 [5:29:31<310:30:31, 56.81s/it]

Updates 323, num timesteps 129600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.04, action_loss 0.00, explor_rew 0.026000 mean_episode_steps 22.15
Updates 324, num timesteps 130000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.55, min/max reward -1.05/0.55
dist_entropy 2.52, value_loss 0.02, action_loss 0.00, explor_rew 0.030500 mean_episode_steps 17.65


  2%|██▎                                                                                                                                           | 326/20000 [5:31:26<312:01:18, 57.09s/it]

Updates 325, num timesteps 130400, FPS 6 
Last 20 training episodes: mean/median reward 0.12/0.23, min/max reward -1.05/0.55
dist_entropy 2.52, value_loss 0.04, action_loss -0.01, explor_rew 0.027500 mean_episode_steps 27.55
Updates 326, num timesteps 130800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward 0.02/0.55
dist_entropy 2.35, value_loss 0.02, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 23.45


  2%|██▎                                                                                                                                           | 328/20000 [5:33:21<312:13:23, 57.14s/it]

Updates 327, num timesteps 131200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.34, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.15
Updates 328, num timesteps 131600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward 0.02/0.55
dist_entropy 2.39, value_loss 0.01, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 14.75


  2%|██▎                                                                                                                                           | 329/20000 [5:34:18<313:03:30, 57.29s/it]

Updates 329, num timesteps 132000, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.46, value_loss 0.12, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 20.25


  2%|██▎                                                                                                                                           | 330/20000 [5:35:16<314:19:27, 57.53s/it]

Updates 330, num timesteps 132400, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.33, value_loss 0.01, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 16.65


  2%|██▎                                                                                                                                           | 332/20000 [5:37:11<314:06:22, 57.49s/it]

Updates 331, num timesteps 132800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.05/0.55
dist_entropy 2.32, value_loss 0.02, action_loss 0.04, explor_rew 0.017250 mean_episode_steps 21.90


  2%|██▎                                                                                                                                           | 333/20000 [5:38:08<312:56:57, 57.28s/it]

Updates 332, num timesteps 133200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.03/0.55
dist_entropy 2.64, value_loss 0.02, action_loss 0.01, explor_rew 0.018000 mean_episode_steps 36.30


  2%|██▎                                                                                                                                           | 334/20000 [5:39:05<312:22:34, 57.18s/it]

Updates 333, num timesteps 133600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.03/0.55
dist_entropy 2.46, value_loss 0.02, action_loss -0.01, explor_rew 0.027500 mean_episode_steps 17.70
Updates 334, num timesteps 134000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -1.05/0.55
dist_entropy 2.36, value_loss 0.02, action_loss -0.00, explor_rew 0.029000 mean_episode_steps 18.00


  2%|██▍                                                                                                                                           | 335/20000 [5:40:03<313:23:50, 57.37s/it]

Updates 335, num timesteps 134400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.03/0.55
dist_entropy 2.40, value_loss 0.02, action_loss -0.02, explor_rew 0.028250 mean_episode_steps 24.60


  2%|██▍                                                                                                                                           | 337/20000 [5:41:57<312:51:27, 57.28s/it]

Updates 336, num timesteps 134800, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.02, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.45


  2%|██▍                                                                                                                                           | 338/20000 [5:42:54<312:33:09, 57.23s/it]

Updates 337, num timesteps 135200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 22.45


  2%|██▍                                                                                                                                           | 339/20000 [5:43:52<312:20:18, 57.19s/it]

Updates 338, num timesteps 135600, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.50, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.13, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 24.35


  2%|██▍                                                                                                                                           | 340/20000 [5:44:48<311:53:58, 57.11s/it]

Updates 339, num timesteps 136000, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.45, value_loss 0.07, action_loss -0.02, explor_rew 0.026000 mean_episode_steps 23.30


  2%|██▍                                                                                                                                           | 341/20000 [5:45:45<311:16:53, 57.00s/it]

Updates 340, num timesteps 136400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.03/0.55
dist_entropy 2.38, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 26.70


  2%|██▍                                                                                                                                           | 342/20000 [5:46:42<311:25:12, 57.03s/it]

Updates 341, num timesteps 136800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.43, value_loss 0.01, action_loss -0.01, explor_rew 0.026250 mean_episode_steps 24.20


  2%|██▍                                                                                                                                           | 343/20000 [5:47:39<310:52:30, 56.93s/it]

Updates 342, num timesteps 137200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 16.25


  2%|██▍                                                                                                                                           | 344/20000 [5:48:36<311:00:43, 56.96s/it]

Updates 343, num timesteps 137600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -1.05/0.55
dist_entropy 2.27, value_loss 0.01, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 22.05


  2%|██▍                                                                                                                                           | 345/20000 [5:49:33<311:12:52, 57.00s/it]

Updates 344, num timesteps 138000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.05/0.55
dist_entropy 2.46, value_loss 0.00, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 23.15


  2%|██▍                                                                                                                                           | 346/20000 [5:50:30<310:56:24, 56.95s/it]

Updates 345, num timesteps 138400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -1.05/0.55
dist_entropy 2.45, value_loss 0.01, action_loss -0.01, explor_rew 0.028250 mean_episode_steps 19.25


  2%|██▍                                                                                                                                           | 347/20000 [5:51:27<310:52:19, 56.94s/it]

Updates 346, num timesteps 138800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.03/0.55
dist_entropy 2.36, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 28.40
Updates 347, num timesteps 139200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.50, value_loss 0.00, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 21.50


  2%|██▍                                                                                                                                           | 349/20000 [5:53:21<310:37:30, 56.91s/it]

Updates 348, num timesteps 139600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.05, action_loss -0.06, explor_rew 0.016000 mean_episode_steps 29.75


  2%|██▍                                                                                                                                           | 350/20000 [5:54:18<311:28:40, 57.06s/it]

Updates 349, num timesteps 140000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.38, value_loss 0.02, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 28.35
Updates 350, num timesteps 140400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.23, value_loss 0.02, action_loss 0.01, explor_rew 0.024000 mean_episode_steps 18.95


  2%|██▍                                                                                                                                           | 351/20000 [5:55:16<312:16:28, 57.21s/it]

Updates 351, num timesteps 140800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.37, value_loss 0.01, action_loss -0.01, explor_rew 0.026250 mean_episode_steps 18.60


  2%|██▍                                                                                                                                           | 352/20000 [5:56:13<312:50:59, 57.32s/it]

Updates 352, num timesteps 141200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.31, value_loss 0.01, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 25.60


  2%|██▌                                                                                                                                           | 353/20000 [5:57:11<313:36:16, 57.46s/it]

Updates 353, num timesteps 141600, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.03/0.55
dist_entropy 2.49, value_loss 0.02, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 25.50


  2%|██▌                                                                                                                                           | 355/20000 [5:59:05<312:17:44, 57.23s/it]

Updates 354, num timesteps 142000, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.40, value_loss 0.01, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 22.95


  2%|██▌                                                                                                                                           | 356/20000 [6:00:02<311:20:51, 57.06s/it]

Updates 355, num timesteps 142400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.47, value_loss 0.02, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.30
Updates 356, num timesteps 142800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.52, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 22.60


  2%|██▌                                                                                                                                           | 358/20000 [6:01:57<312:00:53, 57.19s/it]

Updates 357, num timesteps 143200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.34, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 27.25


  2%|██▌                                                                                                                                           | 359/20000 [6:02:53<311:22:13, 57.07s/it]

Updates 358, num timesteps 143600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.02, explor_rew 0.031750 mean_episode_steps 20.85
Updates 359, num timesteps 144000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.52, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 28.10


  2%|██▌                                                                                                                                           | 360/20000 [6:03:50<311:20:04, 57.07s/it]

Updates 360, num timesteps 144400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.53, value_loss 0.01, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 21.20


  2%|██▌                                                                                                                                           | 362/20000 [6:05:44<311:12:58, 57.05s/it]

Updates 361, num timesteps 144800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.03/0.55
dist_entropy 2.35, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 24.20
Updates 362, num timesteps 145200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.46, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 30.30


  2%|██▌                                                                                                                                           | 363/20000 [6:06:42<311:26:09, 57.09s/it]

Updates 363, num timesteps 145600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.03/0.55
dist_entropy 2.40, value_loss 0.00, action_loss -0.02, explor_rew 0.032000 mean_episode_steps 21.10


  2%|██▌                                                                                                                                           | 364/20000 [6:07:39<312:33:34, 57.30s/it]

Updates 364, num timesteps 146000, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.32, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 26.70


  2%|██▌                                                                                                                                           | 366/20000 [6:09:34<311:34:49, 57.13s/it]

Updates 365, num timesteps 146400, FPS 6 
Last 20 training episodes: mean/median reward 0.21/-0.00, min/max reward -0.04/0.55
dist_entropy 2.45, value_loss 0.00, action_loss -0.02, explor_rew 0.033250 mean_episode_steps 18.35


  2%|██▌                                                                                                                                           | 367/20000 [6:10:30<310:32:41, 56.94s/it]

Updates 366, num timesteps 146800, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.29, min/max reward -1.05/0.55
dist_entropy 2.42, value_loss 0.06, action_loss -0.03, explor_rew 0.026500 mean_episode_steps 24.15


  2%|██▌                                                                                                                                           | 368/20000 [6:11:27<310:44:10, 56.98s/it]

Updates 367, num timesteps 147200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.01, action_loss -0.00, explor_rew 0.027750 mean_episode_steps 29.60
Updates 368, num timesteps 147600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.37, value_loss 0.00, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 26.35


  2%|██▋                                                                                                                                           | 370/20000 [6:13:21<310:54:23, 57.02s/it]

Updates 369, num timesteps 148000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.41, value_loss 0.00, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 26.20


  2%|██▋                                                                                                                                           | 371/20000 [6:14:18<311:05:28, 57.05s/it]

Updates 370, num timesteps 148400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.37, value_loss 0.00, action_loss -0.02, explor_rew 0.031500 mean_episode_steps 20.00


  2%|██▋                                                                                                                                           | 372/20000 [6:15:16<311:10:59, 57.07s/it]

Updates 371, num timesteps 148800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.45, min/max reward -0.03/0.55
dist_entropy 2.45, value_loss 0.02, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 18.55


  2%|██▋                                                                                                                                           | 373/20000 [6:16:13<311:00:05, 57.04s/it]

Updates 372, num timesteps 149200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.40, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.80


  2%|██▋                                                                                                                                           | 374/20000 [6:17:09<310:14:10, 56.91s/it]

Updates 373, num timesteps 149600, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.45, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 20.05


  2%|██▋                                                                                                                                           | 375/20000 [6:18:06<310:13:40, 56.91s/it]

Updates 374, num timesteps 150000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.03/0.55
dist_entropy 2.29, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 14.50


  2%|██▋                                                                                                                                           | 376/20000 [6:19:03<310:38:29, 56.99s/it]

Updates 375, num timesteps 150400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.03/0.55
dist_entropy 2.38, value_loss 0.02, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 23.65


  2%|██▋                                                                                                                                           | 377/20000 [6:20:00<310:44:02, 57.01s/it]

Updates 376, num timesteps 150800, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.03/0.55
dist_entropy 2.42, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 26.05


  2%|██▋                                                                                                                                           | 378/20000 [6:20:57<310:32:55, 56.98s/it]

Updates 377, num timesteps 151200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.05/0.55
dist_entropy 2.48, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 13.05
Updates 378, num timesteps 151600, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.03/0.55
dist_entropy 2.51, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 19.10


  2%|██▋                                                                                                                                           | 379/20000 [6:21:55<311:28:09, 57.15s/it]

Updates 379, num timesteps 152000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.38, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 17.40


  2%|██▋                                                                                                                                           | 381/20000 [6:23:49<311:42:18, 57.20s/it]

Updates 380, num timesteps 152400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.05/0.55
dist_entropy 2.48, value_loss 0.00, action_loss -0.02, explor_rew 0.031750 mean_episode_steps 21.15
Updates 381, num timesteps 152800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.41, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 24.75


  2%|██▋                                                                                                                                           | 383/20000 [6:25:43<311:08:01, 57.10s/it]

Updates 382, num timesteps 153200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.45, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 23.55


  2%|██▋                                                                                                                                           | 384/20000 [6:26:40<310:31:40, 56.99s/it]

Updates 383, num timesteps 153600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.05/0.55
dist_entropy 2.57, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 23.85


  2%|██▋                                                                                                                                           | 385/20000 [6:27:37<310:02:41, 56.90s/it]

Updates 384, num timesteps 154000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.65


  2%|██▋                                                                                                                                           | 386/20000 [6:28:34<310:37:36, 57.01s/it]

Updates 385, num timesteps 154400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.47, value_loss 0.00, action_loss -0.02, explor_rew 0.032000 mean_episode_steps 23.70


  2%|██▋                                                                                                                                           | 387/20000 [6:29:31<310:35:35, 57.01s/it]

Updates 386, num timesteps 154800, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.50, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.02, action_loss 0.00, explor_rew 0.027750 mean_episode_steps 23.30


  2%|██▊                                                                                                                                           | 388/20000 [6:30:28<310:30:14, 57.00s/it]

Updates 387, num timesteps 155200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.50, min/max reward -0.05/0.55
dist_entropy 2.51, value_loss 0.00, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 21.15
Updates 388, num timesteps 155600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.05/0.55
dist_entropy 2.61, value_loss 0.01, action_loss -0.01, explor_rew 0.028000 mean_episode_steps 32.75


  2%|██▊                                                                                                                                           | 390/20000 [6:32:22<310:39:44, 57.03s/it]

Updates 389, num timesteps 156000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.46, value_loss 0.00, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 28.10


  2%|██▊                                                                                                                                           | 391/20000 [6:33:19<310:09:51, 56.94s/it]

Updates 390, num timesteps 156400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.50, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.40


  2%|██▊                                                                                                                                           | 392/20000 [6:34:16<309:58:08, 56.91s/it]

Updates 391, num timesteps 156800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.61, value_loss 0.01, action_loss -0.02, explor_rew 0.025500 mean_episode_steps 35.05


  2%|██▊                                                                                                                                           | 393/20000 [6:35:13<309:48:08, 56.88s/it]

Updates 392, num timesteps 157200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.51, value_loss 0.00, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 20.05


  2%|██▊                                                                                                                                           | 394/20000 [6:36:10<310:07:56, 56.95s/it]

Updates 393, num timesteps 157600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.40, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.95


  2%|██▊                                                                                                                                           | 395/20000 [6:37:07<310:11:00, 56.96s/it]

Updates 394, num timesteps 158000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.48, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 24.70


  2%|██▊                                                                                                                                           | 396/20000 [6:38:03<309:31:45, 56.84s/it]

Updates 395, num timesteps 158400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.59, value_loss 0.00, action_loss -0.02, explor_rew 0.033250 mean_episode_steps 23.75


  2%|██▊                                                                                                                                           | 397/20000 [6:39:00<309:46:01, 56.89s/it]

Updates 396, num timesteps 158800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.45, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 28.75
Updates 397, num timesteps 159200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.50, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 18.10


  2%|██▊                                                                                                                                           | 398/20000 [6:39:58<310:34:43, 57.04s/it]

Updates 398, num timesteps 159600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.56, value_loss 0.00, action_loss -0.02, explor_rew 0.032250 mean_episode_steps 26.10


  2%|██▊                                                                                                                                           | 399/20000 [6:40:55<310:53:53, 57.10s/it]

Updates 399, num timesteps 160000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.55, min/max reward -1.05/0.55
dist_entropy 2.63, value_loss 0.02, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 20.05


  2%|██▊                                                                                                                                           | 400/20000 [6:41:52<310:17:33, 56.99s/it]

Updates 400, num timesteps 160400, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.57, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 14.00


  2%|██▊                                                                                                                                           | 402/20000 [6:43:46<310:19:16, 57.00s/it]

Updates 401, num timesteps 160800, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.51, value_loss 0.02, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 27.95


  2%|██▊                                                                                                                                           | 403/20000 [6:44:43<309:59:18, 56.95s/it]

Updates 402, num timesteps 161200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.01, action_loss -0.02, explor_rew 0.033000 mean_episode_steps 22.65
Updates 403, num timesteps 161600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.55, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 21.10


  2%|██▉                                                                                                                                           | 405/20000 [6:46:37<310:08:45, 56.98s/it]

Updates 404, num timesteps 162000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.51, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 20.70
Updates 405, num timesteps 162400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -1.05/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.40


  2%|██▉                                                                                                                                           | 406/20000 [6:47:34<310:14:29, 57.00s/it]

Updates 406, num timesteps 162800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -1.05/0.55
dist_entropy 2.57, value_loss 0.05, action_loss 0.02, explor_rew 0.030000 mean_episode_steps 22.10


  2%|██▉                                                                                                                                           | 408/20000 [6:49:28<310:18:47, 57.02s/it]

Updates 407, num timesteps 163200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.56, value_loss 0.02, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 20.25


  2%|██▉                                                                                                                                           | 409/20000 [6:50:25<309:44:57, 56.92s/it]

Updates 408, num timesteps 163600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 16.50


  2%|██▉                                                                                                                                           | 410/20000 [6:51:21<309:17:02, 56.84s/it]

Updates 409, num timesteps 164000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 17.50
Updates 410, num timesteps 164400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 25.65


  2%|██▉                                                                                                                                           | 411/20000 [6:52:19<310:20:07, 57.03s/it]

Updates 411, num timesteps 164800, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.03/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 22.55


  2%|██▉                                                                                                                                           | 412/20000 [6:53:16<310:20:45, 57.04s/it]

Updates 412, num timesteps 165200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 23.60


  2%|██▉                                                                                                                                           | 414/20000 [6:55:10<310:46:05, 57.12s/it]

Updates 413, num timesteps 165600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.04/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 26.15


  2%|██▉                                                                                                                                           | 415/20000 [6:56:07<310:13:11, 57.02s/it]

Updates 414, num timesteps 166000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.59, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.75
Updates 415, num timesteps 166400, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.05/0.55
dist_entropy 2.61, value_loss 0.01, action_loss -0.01, explor_rew 0.025250 mean_episode_steps 25.15


  2%|██▉                                                                                                                                           | 416/20000 [6:57:04<310:25:36, 57.06s/it]

Updates 416, num timesteps 166800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.01, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 19.90


  2%|██▉                                                                                                                                           | 418/20000 [6:58:59<310:47:55, 57.14s/it]

Updates 417, num timesteps 167200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.59, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 24.75


  2%|██▉                                                                                                                                           | 419/20000 [6:59:55<310:16:14, 57.04s/it]

Updates 418, num timesteps 167600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 25.35


  2%|██▉                                                                                                                                           | 420/20000 [7:00:52<310:01:27, 57.00s/it]

Updates 419, num timesteps 168000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 21.20
Updates 420, num timesteps 168400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.034250 mean_episode_steps 33.25


  2%|██▉                                                                                                                                           | 421/20000 [7:01:49<309:42:47, 56.95s/it]

Updates 421, num timesteps 168800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.05/0.55
dist_entropy 3.08, value_loss 0.08, action_loss -0.02, explor_rew -0.015000 mean_episode_steps 37.35


  2%|███                                                                                                                                           | 423/20000 [7:03:41<307:03:49, 56.47s/it]

Updates 422, num timesteps 169200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.04, action_loss -0.01, explor_rew 0.009500 mean_episode_steps 36.70
Updates 423, num timesteps 169600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.02, explor_rew 0.027750 mean_episode_steps 18.75


  2%|███                                                                                                                                           | 425/20000 [7:05:35<307:49:49, 56.61s/it]

Updates 424, num timesteps 170000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.50
Updates 425, num timesteps 170400, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.10, action_loss 0.01, explor_rew 0.030500 mean_episode_steps 16.20


  2%|███                                                                                                                                           | 426/20000 [7:06:32<309:04:53, 56.85s/it]

Updates 426, num timesteps 170800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.02, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 18.60


  2%|███                                                                                                                                           | 428/20000 [7:08:26<309:38:30, 56.95s/it]

Updates 427, num timesteps 171200, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 22.45


  2%|███                                                                                                                                           | 429/20000 [7:09:22<308:22:13, 56.72s/it]

Updates 428, num timesteps 171600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.06, action_loss -0.01, explor_rew 0.011500 mean_episode_steps 24.30


  2%|███                                                                                                                                           | 430/20000 [7:10:19<307:55:37, 56.64s/it]

Updates 429, num timesteps 172000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.81, value_loss 0.02, action_loss 0.01, explor_rew 0.021000 mean_episode_steps 32.55


  2%|███                                                                                                                                           | 431/20000 [7:11:15<307:43:48, 56.61s/it]

Updates 430, num timesteps 172400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.02, explor_rew 0.034000 mean_episode_steps 16.90


  2%|███                                                                                                                                           | 432/20000 [7:12:12<308:10:36, 56.70s/it]

Updates 431, num timesteps 172800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss 0.01, explor_rew 0.031750 mean_episode_steps 27.10
Updates 432, num timesteps 173200, FPS 6 
Last 20 training episodes: mean/median reward 0.50/0.55, min/max reward -0.03/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 22.40


  2%|███                                                                                                                                           | 433/20000 [7:13:10<309:46:25, 56.99s/it]

Updates 433, num timesteps 173600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 21.30


  2%|███                                                                                                                                           | 434/20000 [7:14:07<310:19:36, 57.10s/it]

Updates 434, num timesteps 174000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.88, value_loss 0.05, action_loss -0.05, explor_rew -0.002250 mean_episode_steps 32.05


  2%|███                                                                                                                                           | 436/20000 [7:16:00<308:47:39, 56.82s/it]

Updates 435, num timesteps 174400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.02, explor_rew 0.032250 mean_episode_steps 20.45


  2%|███                                                                                                                                           | 437/20000 [7:16:57<308:37:36, 56.79s/it]

Updates 436, num timesteps 174800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.85


  2%|███                                                                                                                                           | 438/20000 [7:17:54<309:05:50, 56.88s/it]

Updates 437, num timesteps 175200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 25.85
Updates 438, num timesteps 175600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 25.85


  2%|███                                                                                                                                           | 440/20000 [7:19:48<309:40:34, 57.00s/it]

Updates 439, num timesteps 176000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.02, explor_rew 0.032500 mean_episode_steps 23.40


  2%|███▏                                                                                                                                          | 441/20000 [7:20:45<309:14:31, 56.92s/it]

Updates 440, num timesteps 176400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.05/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 24.25
Updates 441, num timesteps 176800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 15.65


  2%|███▏                                                                                                                                          | 442/20000 [7:21:43<309:59:04, 57.06s/it]

Updates 442, num timesteps 177200, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.05/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 22.40


  2%|███▏                                                                                                                                          | 444/20000 [7:23:37<310:29:16, 57.16s/it]

Updates 443, num timesteps 177600, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 18.00


  2%|███▏                                                                                                                                          | 445/20000 [7:24:34<309:50:14, 57.04s/it]

Updates 444, num timesteps 178000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.58, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 29.80
Updates 445, num timesteps 178400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 18.45


  2%|███▏                                                                                                                                          | 447/20000 [7:26:28<310:01:27, 57.08s/it]

Updates 446, num timesteps 178800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.02, action_loss 0.00, explor_rew 0.029250 mean_episode_steps 20.60


  2%|███▏                                                                                                                                          | 448/20000 [7:27:25<309:55:43, 57.07s/it]

Updates 447, num timesteps 179200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.50


  2%|███▏                                                                                                                                          | 449/20000 [7:28:22<309:52:57, 57.06s/it]

Updates 448, num timesteps 179600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 21.55


  2%|███▏                                                                                                                                          | 450/20000 [7:29:19<309:28:45, 56.99s/it]

Updates 449, num timesteps 180000, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.21, min/max reward -1.05/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 17.15
Updates 450, num timesteps 180400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 24.75


  2%|███▏                                                                                                                                          | 451/20000 [7:30:16<309:49:22, 57.05s/it]

Updates 451, num timesteps 180800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.03/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 17.30


  2%|███▏                                                                                                                                          | 452/20000 [7:31:13<309:55:36, 57.08s/it]

Updates 452, num timesteps 181200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 22.00


  2%|███▏                                                                                                                                          | 453/20000 [7:32:10<309:36:17, 57.02s/it]

Updates 453, num timesteps 181600, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 20.70


  2%|███▏                                                                                                                                          | 455/20000 [7:34:04<308:24:13, 56.80s/it]

Updates 454, num timesteps 182000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.03, explor_rew 0.023750 mean_episode_steps 21.00
Updates 455, num timesteps 182400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 29.75


  2%|███▏                                                                                                                                          | 456/20000 [7:35:01<308:52:16, 56.89s/it]

Updates 456, num timesteps 182800, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.02, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 20.75


  2%|███▎                                                                                                                                          | 458/20000 [7:36:55<308:51:31, 56.90s/it]

Updates 457, num timesteps 183200, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 26.10


  2%|███▎                                                                                                                                          | 459/20000 [7:37:51<308:00:03, 56.74s/it]

Updates 458, num timesteps 183600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -1.05/0.55
dist_entropy 2.67, value_loss 0.01, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 27.50
Updates 459, num timesteps 184000, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 23.35


  2%|███▎                                                                                                                                          | 461/20000 [7:39:46<309:20:34, 57.00s/it]

Updates 460, num timesteps 184400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 23.75


  2%|███▎                                                                                                                                          | 462/20000 [7:40:42<308:52:30, 56.91s/it]

Updates 461, num timesteps 184800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 18.00
Updates 462, num timesteps 185200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.00


  2%|███▎                                                                                                                                          | 463/20000 [7:41:40<310:10:23, 57.15s/it]

Updates 463, num timesteps 185600, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 16.70


  2%|███▎                                                                                                                                          | 465/20000 [7:43:34<310:17:50, 57.18s/it]

Updates 464, num timesteps 186000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.02, explor_rew 0.032500 mean_episode_steps 20.50


  2%|███▎                                                                                                                                          | 466/20000 [7:44:31<309:11:09, 56.98s/it]

Updates 465, num timesteps 186400, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 21.55


  2%|███▎                                                                                                                                          | 467/20000 [7:45:28<308:35:06, 56.87s/it]

Updates 466, num timesteps 186800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.05/0.55
dist_entropy 2.79, value_loss 0.02, action_loss -0.04, explor_rew 0.024000 mean_episode_steps 37.65


  2%|███▎                                                                                                                                          | 468/20000 [7:46:25<309:05:33, 56.97s/it]

Updates 467, num timesteps 187200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 18.75
Updates 468, num timesteps 187600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.05


  2%|███▎                                                                                                                                          | 470/20000 [7:48:19<309:15:21, 57.01s/it]

Updates 469, num timesteps 188000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.05/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 18.15


  2%|███▎                                                                                                                                          | 471/20000 [7:49:16<309:10:19, 56.99s/it]

Updates 470, num timesteps 188400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 25.85


  2%|███▎                                                                                                                                          | 472/20000 [7:50:13<309:16:07, 57.01s/it]

Updates 471, num timesteps 188800, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.45, min/max reward -1.05/0.55
dist_entropy 2.68, value_loss 0.03, action_loss -0.00, explor_rew 0.025250 mean_episode_steps 28.70


  2%|███▎                                                                                                                                          | 473/20000 [7:51:10<308:33:48, 56.89s/it]

Updates 472, num timesteps 189200, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.03, explor_rew 0.023250 mean_episode_steps 31.30
Updates 473, num timesteps 189600, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.55, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.07, action_loss -0.01, explor_rew 0.022500 mean_episode_steps 29.40


  2%|███▎                                                                                                                                          | 474/20000 [7:52:07<308:57:15, 56.96s/it]

Updates 474, num timesteps 190000, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 3.06, value_loss 0.03, action_loss 0.00, explor_rew 0.016500 mean_episode_steps 28.70


  2%|███▍                                                                                                                                          | 476/20000 [7:53:58<305:47:44, 56.39s/it]

Updates 475, num timesteps 190400, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.05, min/max reward -1.05/0.55
dist_entropy 3.10, value_loss 0.04, action_loss 0.02, explor_rew 0.009750 mean_episode_steps 26.65


  2%|███▍                                                                                                                                          | 477/20000 [7:54:55<305:44:06, 56.38s/it]

Updates 476, num timesteps 190800, FPS 6 
Last 20 training episodes: mean/median reward -0.10/0.25, min/max reward -1.05/0.55
dist_entropy 3.06, value_loss 0.01, action_loss -0.01, explor_rew 0.013250 mean_episode_steps 15.40
Updates 477, num timesteps 191200, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 3.05, value_loss 0.03, action_loss 0.01, explor_rew 0.018000 mean_episode_steps 24.90


  2%|███▍                                                                                                                                          | 478/20000 [7:55:51<305:42:55, 56.38s/it]

Updates 478, num timesteps 191600, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.55, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.02, action_loss -0.01, explor_rew 0.025250 mean_episode_steps 20.60


  2%|███▍                                                                                                                                          | 480/20000 [7:57:44<306:04:39, 56.45s/it]

Updates 479, num timesteps 192000, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.26, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.02, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 17.60
Updates 480, num timesteps 192400, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.03, action_loss -0.01, explor_rew 0.026000 mean_episode_steps 15.40


  2%|███▍                                                                                                                                          | 481/20000 [7:58:41<306:25:41, 56.52s/it]

Updates 481, num timesteps 192800, FPS 6 
Last 20 training episodes: mean/median reward 0.20/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.02, action_loss -0.01, explor_rew 0.026000 mean_episode_steps 15.40


  2%|███▍                                                                                                                                          | 482/20000 [7:59:38<307:01:19, 56.63s/it]

Updates 482, num timesteps 193200, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.02, action_loss -0.00, explor_rew 0.030500 mean_episode_steps 16.90


  2%|███▍                                                                                                                                          | 483/20000 [8:00:34<306:31:21, 56.54s/it]

Updates 483, num timesteps 193600, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.02, action_loss -0.01, explor_rew 0.027000 mean_episode_steps 22.10


  2%|███▍                                                                                                                                          | 484/20000 [8:01:31<306:06:44, 56.47s/it]

Updates 484, num timesteps 194000, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.55, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.02, explor_rew 0.029000 mean_episode_steps 16.00


  2%|███▍                                                                                                                                          | 486/20000 [8:03:23<305:52:07, 56.43s/it]

Updates 485, num timesteps 194400, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.02, action_loss -0.00, explor_rew 0.029750 mean_episode_steps 17.85


  2%|███▍                                                                                                                                          | 487/20000 [8:04:20<305:40:01, 56.39s/it]

Updates 486, num timesteps 194800, FPS 6 
Last 20 training episodes: mean/median reward -0.19/-0.04, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.01, explor_rew 0.028000 mean_episode_steps 22.65


  2%|███▍                                                                                                                                          | 488/20000 [8:05:16<305:37:50, 56.39s/it]

Updates 487, num timesteps 195200, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.04, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.029500 mean_episode_steps 21.00


  2%|███▍                                                                                                                                          | 489/20000 [8:06:12<305:10:44, 56.31s/it]

Updates 488, num timesteps 195600, FPS 6 
Last 20 training episodes: mean/median reward 0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 22.30


  2%|███▍                                                                                                                                          | 490/20000 [8:07:09<305:25:56, 56.36s/it]

Updates 489, num timesteps 196000, FPS 6 
Last 20 training episodes: mean/median reward 0.00/0.26, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.02, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 19.85


  2%|███▍                                                                                                                                          | 491/20000 [8:08:05<304:54:15, 56.26s/it]

Updates 490, num timesteps 196400, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.04, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.08, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 16.65
Updates 491, num timesteps 196800, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 20.95


  2%|███▌                                                                                                                                          | 493/20000 [8:09:58<306:07:52, 56.50s/it]

Updates 492, num timesteps 197200, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.03, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.03, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 16.90


  2%|███▌                                                                                                                                          | 494/20000 [8:10:54<305:34:40, 56.40s/it]

Updates 493, num timesteps 197600, FPS 6 
Last 20 training episodes: mean/median reward -0.30/-0.04, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.03, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 16.50
Updates 494, num timesteps 198000, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 17.55


  2%|███▌                                                                                                                                          | 496/20000 [8:12:48<306:38:41, 56.60s/it]

Updates 495, num timesteps 198400, FPS 6 
Last 20 training episodes: mean/median reward -0.15/0.25, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.028000 mean_episode_steps 17.60


  2%|███▌                                                                                                                                          | 497/20000 [8:13:45<307:01:37, 56.67s/it]

Updates 496, num timesteps 198800, FPS 6 
Last 20 training episodes: mean/median reward -0.29/-0.04, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.02, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 19.70
Updates 497, num timesteps 199200, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 20.80


  2%|███▌                                                                                                                                          | 498/20000 [8:14:42<307:25:42, 56.75s/it]

Updates 498, num timesteps 199600, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 18.75


  2%|███▌                                                                                                                                          | 499/20000 [8:15:39<308:09:22, 56.89s/it]

Updates 499, num timesteps 200000, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.02, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 27.35


  2%|███▌                                                                                                                                          | 500/20000 [8:16:36<308:07:19, 56.88s/it]

Updates 500, num timesteps 200400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.01, action_loss -0.02, explor_rew 0.031500 mean_episode_steps 27.20


  3%|███▌                                                                                                                                          | 502/20000 [8:18:29<307:41:17, 56.81s/it]

Updates 501, num timesteps 200800, FPS 6 
Last 20 training episodes: mean/median reward -0.09/-0.04, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.02, explor_rew 0.033750 mean_episode_steps 29.50


  3%|███▌                                                                                                                                          | 503/20000 [8:19:26<306:39:31, 56.62s/it]

Updates 502, num timesteps 201200, FPS 6 
Last 20 training episodes: mean/median reward -0.01/-0.04, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.02, action_loss -0.01, explor_rew 0.035000 mean_episode_steps 26.75


  3%|███▌                                                                                                                                          | 504/20000 [8:20:22<305:57:24, 56.50s/it]

Updates 503, num timesteps 201600, FPS 6 
Last 20 training episodes: mean/median reward -0.08/-0.04, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.01, explor_rew 0.034750 mean_episode_steps 25.45
Updates 504, num timesteps 202000, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.55, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.01, action_loss -0.02, explor_rew 0.033750 mean_episode_steps 28.75


  3%|███▌                                                                                                                                          | 506/20000 [8:22:16<307:13:46, 56.74s/it]

Updates 505, num timesteps 202400, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.01, action_loss -0.02, explor_rew 0.034250 mean_episode_steps 25.30


  3%|███▌                                                                                                                                          | 507/20000 [8:23:13<307:29:03, 56.79s/it]

Updates 506, num timesteps 202800, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.73, value_loss 0.04, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.65
Updates 507, num timesteps 203200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.63, value_loss 0.07, action_loss -0.02, explor_rew 0.033000 mean_episode_steps 20.15


  3%|███▌                                                                                                                                          | 509/20000 [8:25:07<307:54:54, 56.87s/it]

Updates 508, num timesteps 203600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.50, value_loss 0.04, action_loss -0.02, explor_rew 0.032000 mean_episode_steps 19.55


  3%|███▌                                                                                                                                          | 510/20000 [8:26:04<309:05:36, 57.09s/it]

Updates 509, num timesteps 204000, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.03, action_loss 0.00, explor_rew 0.030000 mean_episode_steps 27.50
Updates 510, num timesteps 204400, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.55, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.09, action_loss 0.00, explor_rew 0.022750 mean_episode_steps 16.30


  3%|███▋                                                                                                                                          | 512/20000 [8:27:58<308:40:12, 57.02s/it]

Updates 511, num timesteps 204800, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.75
dist_entropy 2.58, value_loss 0.03, action_loss 0.00, explor_rew 0.027250 mean_episode_steps 23.90


  3%|███▋                                                                                                                                          | 513/20000 [8:28:55<309:02:51, 57.09s/it]

Updates 512, num timesteps 205200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -1.05/0.55
dist_entropy 2.44, value_loss 0.08, action_loss -0.01, explor_rew 0.027000 mean_episode_steps 25.35
Updates 513, num timesteps 205600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.03/0.55
dist_entropy 2.79, value_loss 0.03, action_loss -0.01, explor_rew 0.019750 mean_episode_steps 16.45


  3%|███▋                                                                                                                                          | 514/20000 [8:29:53<309:11:26, 57.12s/it]

Updates 514, num timesteps 206000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.05/0.55
dist_entropy 2.96, value_loss 0.04, action_loss -0.01, explor_rew 0.009000 mean_episode_steps 20.65


  3%|███▋                                                                                                                                          | 515/20000 [8:30:49<308:18:50, 56.96s/it]

Updates 515, num timesteps 206400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.94, value_loss 0.03, action_loss 0.00, explor_rew 0.008000 mean_episode_steps 20.10


  3%|███▋                                                                                                                                          | 517/20000 [8:32:43<308:13:51, 56.95s/it]

Updates 516, num timesteps 206800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.98, value_loss 0.04, action_loss 0.00, explor_rew 0.011500 mean_episode_steps 27.20
Updates 517, num timesteps 207200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.93, value_loss 0.03, action_loss -0.00, explor_rew 0.007250 mean_episode_steps 18.25


  3%|███▋                                                                                                                                          | 518/20000 [8:33:40<307:38:48, 56.85s/it]

Updates 518, num timesteps 207600, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.03/0.55
dist_entropy 2.96, value_loss 0.04, action_loss 0.00, explor_rew 0.008250 mean_episode_steps 20.40


  3%|███▋                                                                                                                                          | 520/20000 [8:35:33<307:09:09, 56.76s/it]

Updates 519, num timesteps 208000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.03/0.55
dist_entropy 2.90, value_loss 0.02, action_loss -0.01, explor_rew 0.011000 mean_episode_steps 17.75
Updates 520, num timesteps 208400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.03, action_loss -0.02, explor_rew 0.027000 mean_episode_steps 20.75


  3%|███▋                                                                                                                                          | 522/20000 [8:37:28<308:09:57, 56.96s/it]

Updates 521, num timesteps 208800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.61, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 16.30


  3%|███▋                                                                                                                                          | 523/20000 [8:38:25<308:11:58, 56.97s/it]

Updates 522, num timesteps 209200, FPS 6 
Last 20 training episodes: mean/median reward 0.04/0.21, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.12, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 20.55
Updates 523, num timesteps 209600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.55, min/max reward -1.05/0.55
dist_entropy 2.60, value_loss 0.03, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 22.55


  3%|███▋                                                                                                                                          | 524/20000 [8:39:22<309:09:37, 57.15s/it]

Updates 524, num timesteps 210000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.52, value_loss 0.01, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 20.25


  3%|███▋                                                                                                                                          | 526/20000 [8:41:16<308:55:42, 57.11s/it]

Updates 525, num timesteps 210400, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.50, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.08, action_loss -0.03, explor_rew 0.027000 mean_episode_steps 25.05


  3%|███▋                                                                                                                                          | 527/20000 [8:42:13<308:32:46, 57.04s/it]

Updates 526, num timesteps 210800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss 0.00, explor_rew 0.031500 mean_episode_steps 27.60


  3%|███▋                                                                                                                                          | 528/20000 [8:43:11<308:46:05, 57.09s/it]

Updates 527, num timesteps 211200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.54, value_loss 0.00, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 30.45


  3%|███▊                                                                                                                                          | 529/20000 [8:44:08<308:51:38, 57.11s/it]

Updates 528, num timesteps 211600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -1.05/0.55
dist_entropy 2.59, value_loss 0.02, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 21.30
Updates 529, num timesteps 212000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 20.75


  3%|███▊                                                                                                                                          | 531/20000 [8:46:01<306:49:00, 56.73s/it]

Updates 530, num timesteps 212400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.45, min/max reward -1.05/0.55
dist_entropy 3.19, value_loss 0.16, action_loss -0.03, explor_rew -0.016250 mean_episode_steps 34.30


  3%|███▊                                                                                                                                          | 532/20000 [8:46:57<306:24:15, 56.66s/it]

Updates 531, num timesteps 212800, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.04/0.55
dist_entropy 2.96, value_loss 0.06, action_loss -0.05, explor_rew 0.008750 mean_episode_steps 23.25


  3%|███▊                                                                                                                                          | 533/20000 [8:47:54<305:57:00, 56.58s/it]

Updates 532, num timesteps 213200, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.02, explor_rew 0.027750 mean_episode_steps 27.10
Updates 533, num timesteps 213600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.02, action_loss -0.02, explor_rew 0.026250 mean_episode_steps 26.60


  3%|███▊                                                                                                                                          | 535/20000 [8:49:48<307:10:19, 56.81s/it]

Updates 534, num timesteps 214000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 28.25


  3%|███▊                                                                                                                                          | 536/20000 [8:50:45<307:40:50, 56.91s/it]

Updates 535, num timesteps 214400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.05/0.55
dist_entropy 2.82, value_loss 0.00, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 28.30


  3%|███▊                                                                                                                                          | 537/20000 [8:51:42<307:06:33, 56.80s/it]

Updates 536, num timesteps 214800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 24.20
Updates 537, num timesteps 215200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.02, explor_rew 0.033250 mean_episode_steps 21.30


  3%|███▊                                                                                                                                          | 539/20000 [8:53:36<308:28:16, 57.06s/it]

Updates 538, num timesteps 215600, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 26.70
Updates 539, num timesteps 216000, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.00, explor_rew 0.029250 mean_episode_steps 22.00


  3%|███▊                                                                                                                                          | 541/20000 [8:55:30<307:59:35, 56.98s/it]

Updates 540, num timesteps 216400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.87, value_loss 0.01, action_loss -0.05, explor_rew 0.016500 mean_episode_steps 35.65


  3%|███▊                                                                                                                                          | 542/20000 [8:56:27<307:45:28, 56.94s/it]

Updates 541, num timesteps 216800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 24.35


  3%|███▊                                                                                                                                          | 543/20000 [8:57:24<307:05:10, 56.82s/it]

Updates 542, num timesteps 217200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.03, action_loss -0.00, explor_rew 0.025000 mean_episode_steps 28.45
Updates 543, num timesteps 217600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 30.10


  3%|███▊                                                                                                                                          | 545/20000 [8:59:18<307:50:05, 56.96s/it]

Updates 544, num timesteps 218000, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.02, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 19.30


  3%|███▉                                                                                                                                          | 546/20000 [9:00:14<306:40:00, 56.75s/it]

Updates 545, num timesteps 218400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.87, value_loss 0.04, action_loss -0.00, explor_rew 0.016250 mean_episode_steps 19.80
Updates 546, num timesteps 218800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.83, value_loss 0.01, action_loss 0.00, explor_rew 0.027000 mean_episode_steps 25.30


  3%|███▉                                                                                                                                          | 548/20000 [9:02:08<307:44:22, 56.95s/it]

Updates 547, num timesteps 219200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.84, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 21.50


  3%|███▉                                                                                                                                          | 549/20000 [9:03:05<307:10:21, 56.85s/it]

Updates 548, num timesteps 219600, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.03, action_loss -0.01, explor_rew 0.020250 mean_episode_steps 28.90
Updates 549, num timesteps 220000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.05/0.55
dist_entropy 2.91, value_loss 0.01, action_loss -0.00, explor_rew 0.026750 mean_episode_steps 22.70


  3%|███▉                                                                                                                                          | 550/20000 [9:04:02<307:35:00, 56.93s/it]

Updates 550, num timesteps 220400, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.02, explor_rew 0.029500 mean_episode_steps 20.90


  3%|███▉                                                                                                                                          | 551/20000 [9:05:00<308:48:41, 57.16s/it]

Updates 551, num timesteps 220800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 23.30


  3%|███▉                                                                                                                                          | 552/20000 [9:05:57<308:22:05, 57.08s/it]

Updates 552, num timesteps 221200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.05/0.55
dist_entropy 2.85, value_loss 0.01, action_loss -0.00, explor_rew 0.029000 mean_episode_steps 24.45


  3%|███▉                                                                                                                                          | 554/20000 [9:07:51<308:03:19, 57.03s/it]

Updates 553, num timesteps 221600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.45, min/max reward -0.03/0.55
dist_entropy 2.86, value_loss 0.02, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 23.90


  3%|███▉                                                                                                                                          | 555/20000 [9:08:48<308:00:08, 57.02s/it]

Updates 554, num timesteps 222000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.50, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 19.85
Updates 555, num timesteps 222400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 17.00


  3%|███▉                                                                                                                                          | 557/20000 [9:10:43<309:05:02, 57.23s/it]

Updates 556, num timesteps 222800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.05/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 22.55


  3%|███▉                                                                                                                                          | 558/20000 [9:11:40<308:59:15, 57.21s/it]

Updates 557, num timesteps 223200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 31.45


  3%|███▉                                                                                                                                          | 559/20000 [9:12:38<309:46:46, 57.36s/it]

Updates 558, num timesteps 223600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.03/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.02, explor_rew 0.030750 mean_episode_steps 31.35
Updates 559, num timesteps 224000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward 0.02/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 17.90


  3%|███▉                                                                                                                                          | 561/20000 [9:14:32<309:43:43, 57.36s/it]

Updates 560, num timesteps 224400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 17.65


  3%|███▉                                                                                                                                          | 562/20000 [9:15:29<308:26:42, 57.13s/it]

Updates 561, num timesteps 224800, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.45, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.02, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 22.90


  3%|███▉                                                                                                                                          | 563/20000 [9:16:26<308:31:04, 57.14s/it]

Updates 562, num timesteps 225200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.03/0.55
dist_entropy 2.81, value_loss 0.00, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 18.75


  3%|████                                                                                                                                          | 564/20000 [9:17:24<309:43:55, 57.37s/it]

Updates 563, num timesteps 225600, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.34, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.11, action_loss 0.02, explor_rew 0.031000 mean_episode_steps 20.95


  3%|████                                                                                                                                          | 565/20000 [9:18:21<308:35:52, 57.16s/it]

Updates 564, num timesteps 226000, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.23, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.07, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 19.55
Updates 565, num timesteps 226400, FPS 6 
Last 20 training episodes: mean/median reward 0.04/0.55, min/max reward -1.05/0.55
dist_entropy 2.43, value_loss 0.06, action_loss -0.02, explor_rew 0.020000 mean_episode_steps 29.70


  3%|████                                                                                                                                          | 566/20000 [9:19:18<308:13:48, 57.10s/it]

Updates 566, num timesteps 226800, FPS 6 
Last 20 training episodes: mean/median reward -0.01/0.55, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.03, action_loss -0.01, explor_rew 0.024250 mean_episode_steps 23.80


  3%|████                                                                                                                                          | 568/20000 [9:21:11<307:30:50, 56.97s/it]

Updates 567, num timesteps 227200, FPS 6 
Last 20 training episodes: mean/median reward 0.12/0.23, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.04, action_loss -0.01, explor_rew 0.023000 mean_episode_steps 21.65
Updates 568, num timesteps 227600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.03/0.55
dist_entropy 2.49, value_loss 0.04, action_loss -0.02, explor_rew 0.024000 mean_episode_steps 24.15


  3%|████                                                                                                                                          | 569/20000 [9:22:09<308:03:23, 57.07s/it]

Updates 569, num timesteps 228000, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.30, value_loss 0.07, action_loss 0.01, explor_rew 0.027250 mean_episode_steps 23.85


  3%|████                                                                                                                                          | 571/20000 [9:24:03<308:23:21, 57.14s/it]

Updates 570, num timesteps 228400, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.34, min/max reward -1.05/0.55
dist_entropy 2.46, value_loss 0.06, action_loss -0.02, explor_rew 0.010750 mean_episode_steps 26.20


  3%|████                                                                                                                                          | 572/20000 [9:25:00<307:33:14, 56.99s/it]

Updates 571, num timesteps 228800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.34, min/max reward 0.02/0.55
dist_entropy 2.23, value_loss 0.02, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 18.45


  3%|████                                                                                                                                          | 573/20000 [9:25:57<307:36:29, 57.00s/it]

Updates 572, num timesteps 229200, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.13, min/max reward 0.02/0.55
dist_entropy 2.39, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 22.10
Updates 573, num timesteps 229600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.55, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.04, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 18.65


  3%|████                                                                                                                                          | 574/20000 [9:26:54<308:02:36, 57.09s/it]

Updates 574, num timesteps 230000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.04, action_loss -0.02, explor_rew 0.025250 mean_episode_steps 24.80


  3%|████                                                                                                                                          | 575/20000 [9:27:52<308:35:32, 57.19s/it]

Updates 575, num timesteps 230400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward 0.02/0.55
dist_entropy 2.62, value_loss 0.01, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 16.40


  3%|████                                                                                                                                          | 577/20000 [9:29:46<308:50:48, 57.24s/it]

Updates 576, num timesteps 230800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.45, min/max reward -0.03/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 19.05
Updates 577, num timesteps 231200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward 0.02/0.55
dist_entropy 2.55, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 21.65


  3%|████                                                                                                                                          | 579/20000 [9:31:41<308:55:10, 57.26s/it]

Updates 578, num timesteps 231600, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.50, min/max reward 0.02/0.55
dist_entropy 2.49, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 21.65
Updates 579, num timesteps 232000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward 0.02/0.55
dist_entropy 2.57, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.95


  3%|████▏                                                                                                                                         | 581/20000 [9:33:35<308:16:46, 57.15s/it]

Updates 580, num timesteps 232400, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.50, min/max reward 0.02/0.55
dist_entropy 2.53, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 13.80
Updates 581, num timesteps 232800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -1.05/0.55
dist_entropy 2.52, value_loss 0.01, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 19.55


  3%|████▏                                                                                                                                         | 583/20000 [9:35:29<308:33:58, 57.21s/it]

Updates 582, num timesteps 233200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.45, min/max reward 0.02/0.55
dist_entropy 2.54, value_loss 0.00, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 19.70
Updates 583, num timesteps 233600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.05/0.55
dist_entropy 2.62, value_loss 0.02, action_loss 0.01, explor_rew 0.026250 mean_episode_steps 29.95


  3%|████▏                                                                                                                                         | 585/20000 [9:37:24<308:28:10, 57.20s/it]

Updates 584, num timesteps 234000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.03/0.55
dist_entropy 2.49, value_loss 0.02, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 21.20


  3%|████▏                                                                                                                                         | 586/20000 [9:38:21<308:46:34, 57.26s/it]

Updates 585, num timesteps 234400, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 16.75
Updates 586, num timesteps 234800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward 0.02/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 14.90


  3%|████▏                                                                                                                                         | 588/20000 [9:40:16<308:16:41, 57.17s/it]

Updates 587, num timesteps 235200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward 0.02/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 18.05
Updates 588, num timesteps 235600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward 0.02/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 24.60


  3%|████▏                                                                                                                                         | 590/20000 [9:42:10<308:30:46, 57.22s/it]

Updates 589, num timesteps 236000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.50, min/max reward 0.02/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 21.15


  3%|████▏                                                                                                                                         | 591/20000 [9:43:07<308:08:43, 57.16s/it]

Updates 590, num timesteps 236400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward 0.02/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.03, explor_rew 0.026750 mean_episode_steps 17.30
Updates 591, num timesteps 236800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.03/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.03, explor_rew 0.030750 mean_episode_steps 23.65


  3%|████▏                                                                                                                                         | 593/20000 [9:45:02<308:12:27, 57.17s/it]

Updates 592, num timesteps 237200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 17.65


  3%|████▏                                                                                                                                         | 594/20000 [9:45:59<308:13:37, 57.18s/it]

Updates 593, num timesteps 237600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.50, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 24.30


  3%|████▏                                                                                                                                         | 595/20000 [9:46:56<308:05:26, 57.16s/it]

Updates 594, num timesteps 238000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward 0.02/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 16.25
Updates 595, num timesteps 238400, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward 0.02/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 25.10


  3%|████▏                                                                                                                                         | 597/20000 [9:48:50<307:48:31, 57.11s/it]

Updates 596, num timesteps 238800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 21.80


  3%|████▏                                                                                                                                         | 598/20000 [9:49:48<307:48:18, 57.11s/it]

Updates 597, num timesteps 239200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.50, min/max reward -0.03/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 21.80


  3%|████▎                                                                                                                                         | 599/20000 [9:50:45<307:34:50, 57.07s/it]

Updates 598, num timesteps 239600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward 0.02/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 28.30


  3%|████▎                                                                                                                                         | 600/20000 [9:51:42<308:04:22, 57.17s/it]

Updates 599, num timesteps 240000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.03/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 23.15
Updates 600, num timesteps 240400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.03/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 18.65


  3%|████▎                                                                                                                                         | 602/20000 [9:53:36<308:05:42, 57.18s/it]

Updates 601, num timesteps 240800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.05/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 28.80


  3%|████▎                                                                                                                                         | 603/20000 [9:54:33<307:16:57, 57.03s/it]

Updates 602, num timesteps 241200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.64, value_loss 0.00, action_loss 88.14, explor_rew 0.032750 mean_episode_steps 28.55


  3%|████▎                                                                                                                                         | 604/20000 [9:55:30<307:08:59, 57.01s/it]

Updates 603, num timesteps 241600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.03/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 18.85


  3%|████▎                                                                                                                                         | 605/20000 [9:56:27<306:58:09, 56.98s/it]

Updates 604, num timesteps 242000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.04, action_loss -0.01, explor_rew 0.028000 mean_episode_steps 25.95


  3%|████▎                                                                                                                                         | 606/20000 [9:57:23<306:12:41, 56.84s/it]

Updates 605, num timesteps 242400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.45


  3%|████▎                                                                                                                                         | 607/20000 [9:58:20<306:08:59, 56.83s/it]

Updates 606, num timesteps 242800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.02, explor_rew 0.032250 mean_episode_steps 18.75
Updates 607, num timesteps 243200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.05/0.55
dist_entropy 2.90, value_loss 0.01, action_loss -0.05, explor_rew 0.016750 mean_episode_steps 25.10


  3%|████▎                                                                                                                                        | 609/20000 [10:00:13<304:41:10, 56.57s/it]

Updates 608, num timesteps 243600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.03, explor_rew 0.017750 mean_episode_steps 24.95


  3%|████▎                                                                                                                                        | 610/20000 [10:01:10<304:55:42, 56.61s/it]

Updates 609, num timesteps 244000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 19.30
Updates 610, num timesteps 244400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.02, explor_rew 0.025500 mean_episode_steps 24.60


  3%|████▎                                                                                                                                        | 612/20000 [10:03:03<305:09:50, 56.66s/it]

Updates 611, num timesteps 244800, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 22.80


  3%|████▎                                                                                                                                        | 613/20000 [10:04:00<306:08:46, 56.85s/it]

Updates 612, num timesteps 245200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 21.60
Updates 613, num timesteps 245600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 22.15


  3%|████▎                                                                                                                                        | 614/20000 [10:04:58<307:17:47, 57.07s/it]

Updates 614, num timesteps 246000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 26.00


  3%|████▎                                                                                                                                        | 616/20000 [10:06:52<307:20:02, 57.08s/it]

Updates 615, num timesteps 246400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 23.30


  3%|████▎                                                                                                                                        | 617/20000 [10:07:49<307:15:36, 57.07s/it]

Updates 616, num timesteps 246800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.03/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 29.55


  3%|████▎                                                                                                                                        | 618/20000 [10:08:46<306:48:33, 56.99s/it]

Updates 617, num timesteps 247200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.50, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 23.45


  3%|████▎                                                                                                                                        | 619/20000 [10:09:43<306:23:24, 56.91s/it]

Updates 618, num timesteps 247600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 23.75
Updates 619, num timesteps 248000, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 21.70


  3%|████▎                                                                                                                                        | 620/20000 [10:10:41<307:50:23, 57.18s/it]

Updates 620, num timesteps 248400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 14.55


  3%|████▍                                                                                                                                        | 622/20000 [10:12:35<307:32:45, 57.14s/it]

Updates 621, num timesteps 248800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 22.85
Updates 622, num timesteps 249200, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.15, action_loss -0.02, explor_rew 0.031500 mean_episode_steps 19.25


  3%|████▍                                                                                                                                        | 623/20000 [10:13:32<307:20:27, 57.10s/it]

Updates 623, num timesteps 249600, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.06, action_loss -0.02, explor_rew 0.029500 mean_episode_steps 26.50


  3%|████▍                                                                                                                                        | 625/20000 [10:15:26<306:44:00, 56.99s/it]

Updates 624, num timesteps 250000, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.04, action_loss -0.03, explor_rew 0.029000 mean_episode_steps 22.35
Updates 625, num timesteps 250400, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.55, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.03, action_loss -0.01, explor_rew 0.027000 mean_episode_steps 20.25


  3%|████▍                                                                                                                                        | 627/20000 [10:17:19<306:09:36, 56.89s/it]

Updates 626, num timesteps 250800, FPS 6 
Last 20 training episodes: mean/median reward 0.00/0.26, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.02, action_loss -0.01, explor_rew 0.027750 mean_episode_steps 21.65


  3%|████▍                                                                                                                                        | 628/20000 [10:18:16<305:31:59, 56.78s/it]

Updates 627, num timesteps 251200, FPS 6 
Last 20 training episodes: mean/median reward -0.10/0.25, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.02, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 18.95


  3%|████▍                                                                                                                                        | 629/20000 [10:19:12<305:11:00, 56.72s/it]

Updates 628, num timesteps 251600, FPS 6 
Last 20 training episodes: mean/median reward 0.08/-0.03, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.01, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 24.50


  3%|████▍                                                                                                                                        | 630/20000 [10:20:09<304:20:38, 56.56s/it]

Updates 629, num timesteps 252000, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.03, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.02, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 21.80


  3%|████▍                                                                                                                                        | 631/20000 [10:21:05<304:10:01, 56.53s/it]

Updates 630, num timesteps 252400, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.04, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 25.15


  3%|████▍                                                                                                                                        | 632/20000 [10:22:01<304:01:19, 56.51s/it]

Updates 631, num timesteps 252800, FPS 6 
Last 20 training episodes: mean/median reward 0.00/0.26, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.02, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 23.50


  3%|████▍                                                                                                                                        | 633/20000 [10:22:58<304:11:21, 56.54s/it]

Updates 632, num timesteps 253200, FPS 6 
Last 20 training episodes: mean/median reward -0.08/-0.03, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.01, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 22.65


  3%|████▍                                                                                                                                        | 634/20000 [10:23:54<303:48:38, 56.48s/it]

Updates 633, num timesteps 253600, FPS 6 
Last 20 training episodes: mean/median reward 0.00/0.25, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 18.55
Updates 634, num timesteps 254000, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.02, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 21.10


  3%|████▍                                                                                                                                        | 636/20000 [10:25:48<304:39:21, 56.64s/it]

Updates 635, num timesteps 254400, FPS 6 
Last 20 training episodes: mean/median reward -0.37/-0.55, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.02, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 12.40


  3%|████▍                                                                                                                                        | 637/20000 [10:26:45<304:32:03, 56.62s/it]

Updates 636, num timesteps 254800, FPS 6 
Last 20 training episodes: mean/median reward -0.23/-0.04, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 26.35
Updates 637, num timesteps 255200, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 32.85


  3%|████▌                                                                                                                                        | 639/20000 [10:28:37<303:26:21, 56.42s/it]

Updates 638, num timesteps 255600, FPS 6 
Last 20 training episodes: mean/median reward -0.01/-0.04, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.01, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 19.15


  3%|████▌                                                                                                                                        | 640/20000 [10:29:34<303:33:28, 56.45s/it]

Updates 639, num timesteps 256000, FPS 6 
Last 20 training episodes: mean/median reward 0.00/0.26, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.02, action_loss 0.00, explor_rew 0.029750 mean_episode_steps 23.90
Updates 640, num timesteps 256400, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.55, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 17.80


  3%|████▌                                                                                                                                        | 641/20000 [10:30:31<304:13:23, 56.57s/it]

Updates 641, num timesteps 256800, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.55, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.02, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 18.75


  3%|████▌                                                                                                                                        | 643/20000 [10:32:24<304:51:52, 56.70s/it]

Updates 642, num timesteps 257200, FPS 6 
Last 20 training episodes: mean/median reward -0.11/-0.05, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.01, action_loss -0.01, explor_rew 0.026000 mean_episode_steps 23.60


  3%|████▌                                                                                                                                        | 644/20000 [10:33:21<304:21:12, 56.61s/it]

Updates 643, num timesteps 257600, FPS 6 
Last 20 training episodes: mean/median reward 0.09/-0.04, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.85
Updates 644, num timesteps 258000, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.20


  3%|████▌                                                                                                                                        | 645/20000 [10:34:17<304:11:21, 56.58s/it]

Updates 645, num timesteps 258400, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.55, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.02, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 30.75


  3%|████▌                                                                                                                                        | 646/20000 [10:35:14<304:04:35, 56.56s/it]

Updates 646, num timesteps 258800, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 20.85


  3%|████▌                                                                                                                                        | 648/20000 [10:37:07<304:35:18, 56.66s/it]

Updates 647, num timesteps 259200, FPS 6 
Last 20 training episodes: mean/median reward 0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 22.85


  3%|████▌                                                                                                                                        | 649/20000 [10:38:04<304:05:25, 56.57s/it]

Updates 648, num timesteps 259600, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.04, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.02, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 24.30


  3%|████▌                                                                                                                                        | 650/20000 [10:39:00<303:38:44, 56.49s/it]

Updates 649, num timesteps 260000, FPS 6 
Last 20 training episodes: mean/median reward -0.11/-0.04, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 24.15
Updates 650, num timesteps 260400, FPS 6 
Last 20 training episodes: mean/median reward -0.29/-0.04, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 19.70


  3%|████▌                                                                                                                                        | 652/20000 [10:40:54<304:44:06, 56.70s/it]

Updates 651, num timesteps 260800, FPS 6 
Last 20 training episodes: mean/median reward 0.01/-0.04, min/max reward -1.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 20.15


  3%|████▌                                                                                                                                        | 653/20000 [10:41:51<305:20:46, 56.82s/it]

Updates 652, num timesteps 261200, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.04, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 25.60


  3%|████▌                                                                                                                                        | 654/20000 [10:42:47<304:34:05, 56.68s/it]

Updates 653, num timesteps 261600, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.02, action_loss -0.01, explor_rew 0.027750 mean_episode_steps 18.55
Updates 654, num timesteps 262000, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.02, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 19.65


  3%|████▌                                                                                                                                        | 656/20000 [10:44:41<304:36:26, 56.69s/it]

Updates 655, num timesteps 262400, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.03, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.02, action_loss -0.01, explor_rew 0.027500 mean_episode_steps 19.40
Updates 656, num timesteps 262800, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.05


  3%|████▋                                                                                                                                        | 658/20000 [10:46:35<305:09:34, 56.80s/it]

Updates 657, num timesteps 263200, FPS 6 
Last 20 training episodes: mean/median reward -0.24/-0.04, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 17.00


  3%|████▋                                                                                                                                        | 659/20000 [10:47:31<304:33:10, 56.69s/it]

Updates 658, num timesteps 263600, FPS 6 
Last 20 training episodes: mean/median reward -0.26/-0.04, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.02, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 18.95
Updates 659, num timesteps 264000, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 15.45


  3%|████▋                                                                                                                                        | 661/20000 [10:49:24<303:50:49, 56.56s/it]

Updates 660, num timesteps 264400, FPS 6 
Last 20 training episodes: mean/median reward -0.10/-0.03, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.02, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 20.50


  3%|████▋                                                                                                                                        | 662/20000 [10:50:21<304:47:56, 56.74s/it]

Updates 661, num timesteps 264800, FPS 6 
Last 20 training episodes: mean/median reward 0.03/-0.03, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 20.10


  3%|████▋                                                                                                                                        | 663/20000 [10:51:17<304:09:46, 56.63s/it]

Updates 662, num timesteps 265200, FPS 6 
Last 20 training episodes: mean/median reward 0.03/-0.03, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.02, explor_rew 0.030750 mean_episode_steps 25.50


  3%|████▋                                                                                                                                        | 664/20000 [10:52:14<304:07:18, 56.62s/it]

Updates 663, num timesteps 265600, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.01, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 21.70


  3%|████▋                                                                                                                                        | 665/20000 [10:53:10<303:42:45, 56.55s/it]

Updates 664, num timesteps 266000, FPS 6 
Last 20 training episodes: mean/median reward -0.04/-0.04, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 15.70
Updates 665, num timesteps 266400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 19.70


  3%|████▋                                                                                                                                        | 667/20000 [10:55:04<304:25:33, 56.69s/it]

Updates 666, num timesteps 266800, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 19.95


  3%|████▋                                                                                                                                        | 668/20000 [10:56:01<303:56:12, 56.60s/it]

Updates 667, num timesteps 267200, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.03, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 25.20
Updates 668, num timesteps 267600, FPS 6 
Last 20 training episodes: mean/median reward -0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.02, action_loss -0.00, explor_rew 0.029750 mean_episode_steps 14.35


  3%|████▋                                                                                                                                        | 669/20000 [10:56:58<304:35:14, 56.72s/it]

Updates 669, num timesteps 268000, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 16.85


  3%|████▋                                                                                                                                        | 671/20000 [10:58:51<304:21:00, 56.68s/it]

Updates 670, num timesteps 268400, FPS 6 
Last 20 training episodes: mean/median reward -0.14/-0.04, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 18.60


  3%|████▋                                                                                                                                        | 672/20000 [10:59:47<303:19:53, 56.50s/it]

Updates 671, num timesteps 268800, FPS 6 
Last 20 training episodes: mean/median reward -0.00/0.20, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.02, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 28.30


  3%|████▋                                                                                                                                        | 673/20000 [11:00:43<302:37:26, 56.37s/it]

Updates 672, num timesteps 269200, FPS 6 
Last 20 training episodes: mean/median reward 0.07/-0.04, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 22.55
Updates 673, num timesteps 269600, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.02, action_loss -0.00, explor_rew 0.029750 mean_episode_steps 20.45


  3%|████▊                                                                                                                                        | 675/20000 [11:02:37<303:41:56, 56.58s/it]

Updates 674, num timesteps 270000, FPS 6 
Last 20 training episodes: mean/median reward -0.12/-0.04, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.01, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 20.55
Updates 675, num timesteps 270400, FPS 6 
Last 20 training episodes: mean/median reward 0.18/0.55, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.00, action_loss -0.01, explor_rew 0.034000 mean_episode_steps 24.45


  3%|████▊                                                                                                                                        | 676/20000 [11:03:33<303:21:20, 56.51s/it]

Updates 676, num timesteps 270800, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.04, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 19.55


  3%|████▊                                                                                                                                        | 678/20000 [11:05:26<303:39:13, 56.58s/it]

Updates 677, num timesteps 271200, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.04, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 25.90


  3%|████▊                                                                                                                                        | 679/20000 [11:06:23<303:38:39, 56.58s/it]

Updates 678, num timesteps 271600, FPS 6 
Last 20 training episodes: mean/median reward -0.01/-0.04, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.01, action_loss 0.00, explor_rew 0.030750 mean_episode_steps 22.75
Updates 679, num timesteps 272000, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.55, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 21.30


  3%|████▊                                                                                                                                        | 680/20000 [11:07:20<303:47:01, 56.61s/it]

Updates 680, num timesteps 272400, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.02, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.75


  3%|████▊                                                                                                                                        | 682/20000 [11:09:13<303:33:41, 56.57s/it]

Updates 681, num timesteps 272800, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 24.30


  3%|████▊                                                                                                                                        | 683/20000 [11:10:10<304:08:27, 56.68s/it]

Updates 682, num timesteps 273200, FPS 6 
Last 20 training episodes: mean/median reward -0.10/0.25, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 19.40


  3%|████▊                                                                                                                                        | 684/20000 [11:11:06<303:54:39, 56.64s/it]

Updates 683, num timesteps 273600, FPS 6 
Last 20 training episodes: mean/median reward -0.06/-0.04, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.01, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 19.60
Updates 684, num timesteps 274000, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 21.15


  3%|████▊                                                                                                                                        | 685/20000 [11:12:04<304:53:21, 56.83s/it]

Updates 685, num timesteps 274400, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 24.30


  3%|████▊                                                                                                                                        | 686/20000 [11:13:00<304:15:36, 56.71s/it]

Updates 686, num timesteps 274800, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.55, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 18.65


  3%|████▊                                                                                                                                        | 688/20000 [11:14:54<304:59:44, 56.86s/it]

Updates 687, num timesteps 275200, FPS 6 
Last 20 training episodes: mean/median reward -0.21/-0.04, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 22.35
Updates 688, num timesteps 275600, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.01, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 22.50


  3%|████▊                                                                                                                                        | 689/20000 [11:15:51<304:40:53, 56.80s/it]

Updates 689, num timesteps 276000, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.55, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 25.00


  3%|████▊                                                                                                                                        | 691/20000 [11:17:44<304:07:19, 56.70s/it]

Updates 690, num timesteps 276400, FPS 6 
Last 20 training episodes: mean/median reward -0.15/0.25, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.01, action_loss -0.02, explor_rew 0.029000 mean_episode_steps 15.10


  3%|████▉                                                                                                                                        | 692/20000 [11:18:41<303:53:18, 56.66s/it]

Updates 691, num timesteps 276800, FPS 6 
Last 20 training episodes: mean/median reward -0.10/0.25, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.03, action_loss 0.01, explor_rew 0.023750 mean_episode_steps 25.95
Updates 692, num timesteps 277200, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 13.00


  3%|████▉                                                                                                                                        | 693/20000 [11:19:38<304:21:37, 56.75s/it]

Updates 693, num timesteps 277600, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.01, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 15.90


  3%|████▉                                                                                                                                        | 695/20000 [11:21:31<303:42:35, 56.64s/it]

Updates 694, num timesteps 278000, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.04, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.03, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 21.60
Updates 695, num timesteps 278400, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 17.05


  3%|████▉                                                                                                                                        | 696/20000 [11:22:28<304:07:44, 56.72s/it]

Updates 696, num timesteps 278800, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.02, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 20.35


  3%|████▉                                                                                                                                        | 697/20000 [11:23:25<304:32:54, 56.80s/it]

Updates 697, num timesteps 279200, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 26.00


  3%|████▉                                                                                                                                        | 699/20000 [11:25:18<304:05:53, 56.72s/it]

Updates 698, num timesteps 279600, FPS 6 
Last 20 training episodes: mean/median reward 0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.02, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 25.35
Updates 699, num timesteps 280000, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.04, action_loss -0.01, explor_rew 0.027750 mean_episode_steps 21.35


  4%|████▉                                                                                                                                        | 700/20000 [11:26:15<304:01:59, 56.71s/it]

Updates 700, num timesteps 280400, FPS 6 
Last 20 training episodes: mean/median reward 0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.02, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 18.65


  4%|████▉                                                                                                                                        | 701/20000 [11:27:12<304:21:38, 56.77s/it]

Updates 701, num timesteps 280800, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.02, action_loss -0.02, explor_rew 0.026000 mean_episode_steps 18.85


  4%|████▉                                                                                                                                        | 702/20000 [11:28:09<304:38:14, 56.83s/it]

Updates 702, num timesteps 281200, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.55, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 23.40


  4%|████▉                                                                                                                                        | 703/20000 [11:29:06<305:22:33, 56.97s/it]

Updates 703, num timesteps 281600, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.55, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.03, action_loss -0.02, explor_rew 0.031750 mean_episode_steps 16.00


  4%|████▉                                                                                                                                        | 705/20000 [11:30:59<304:31:15, 56.82s/it]

Updates 704, num timesteps 282000, FPS 6 
Last 20 training episodes: mean/median reward -0.19/-0.04, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.03, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 22.35


  4%|████▉                                                                                                                                        | 706/20000 [11:31:56<304:51:47, 56.88s/it]

Updates 705, num timesteps 282400, FPS 6 
Last 20 training episodes: mean/median reward -0.22/-0.04, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 23.60
Updates 706, num timesteps 282800, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.55, min/max reward -1.05/0.55
dist_entropy 2.73, value_loss 0.04, action_loss -0.02, explor_rew 0.032250 mean_episode_steps 17.05


  4%|████▉                                                                                                                                        | 707/20000 [11:32:54<305:55:01, 57.08s/it]

Updates 707, num timesteps 283200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.64, value_loss 0.09, action_loss -0.02, explor_rew 0.032000 mean_episode_steps 24.35


  4%|████▉                                                                                                                                        | 709/20000 [11:34:48<305:05:08, 56.93s/it]

Updates 708, num timesteps 283600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.05, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 20.50


  4%|█████                                                                                                                                        | 710/20000 [11:35:45<305:29:47, 57.01s/it]

Updates 709, num timesteps 284000, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.45, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.10, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 20.55
Updates 710, num timesteps 284400, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 17.60


  4%|█████                                                                                                                                        | 712/20000 [11:37:39<305:35:20, 57.04s/it]

Updates 711, num timesteps 284800, FPS 6 
Last 20 training episodes: mean/median reward 0.20/-0.03, min/max reward -0.05/0.55
dist_entropy 2.97, value_loss 0.05, action_loss -0.02, explor_rew 0.005250 mean_episode_steps 29.75
Updates 712, num timesteps 285200, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.55, min/max reward -1.05/0.75
dist_entropy 2.94, value_loss 0.07, action_loss -0.01, explor_rew 0.023750 mean_episode_steps 27.90


  4%|█████                                                                                                                                        | 713/20000 [11:38:36<305:27:10, 57.01s/it]

Updates 713, num timesteps 285600, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.05, action_loss -0.02, explor_rew 0.021000 mean_episode_steps 25.60


  4%|█████                                                                                                                                        | 715/20000 [11:40:30<305:49:50, 57.09s/it]

Updates 714, num timesteps 286000, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.50, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.05, action_loss -0.02, explor_rew 0.020000 mean_episode_steps 29.25


  4%|█████                                                                                                                                        | 716/20000 [11:41:27<304:37:42, 56.87s/it]

Updates 715, num timesteps 286400, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.45, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.05, action_loss -0.01, explor_rew 0.021000 mean_episode_steps 27.90
Updates 716, num timesteps 286800, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.06, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 22.30


  4%|█████                                                                                                                                        | 718/20000 [11:43:21<305:16:25, 57.00s/it]

Updates 717, num timesteps 287200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.05/0.55
dist_entropy 2.73, value_loss 0.01, action_loss -0.01, explor_rew 0.025500 mean_episode_steps 21.95
Updates 718, num timesteps 287600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.90, value_loss 0.02, action_loss -0.01, explor_rew 0.021500 mean_episode_steps 30.25


  4%|█████                                                                                                                                        | 719/20000 [11:44:18<304:48:35, 56.91s/it]

Updates 719, num timesteps 288000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 19.60


  4%|█████                                                                                                                                        | 720/20000 [11:45:15<305:12:34, 56.99s/it]

Updates 720, num timesteps 288400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.03/0.55
dist_entropy 2.83, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 24.55


  4%|█████                                                                                                                                        | 721/20000 [11:46:12<305:20:48, 57.02s/it]

Updates 721, num timesteps 288800, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.07, action_loss -0.03, explor_rew 0.032750 mean_episode_steps 20.90


  4%|█████                                                                                                                                        | 722/20000 [11:47:09<305:26:56, 57.04s/it]

Updates 722, num timesteps 289200, FPS 6 
Last 20 training episodes: mean/median reward 0.04/0.55, min/max reward -1.05/0.55
dist_entropy 2.84, value_loss 0.06, action_loss -0.03, explor_rew 0.028750 mean_episode_steps 20.10


  4%|█████                                                                                                                                        | 724/20000 [11:49:03<304:48:53, 56.93s/it]

Updates 723, num timesteps 289600, FPS 6 
Last 20 training episodes: mean/median reward -0.29/-0.03, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.04, action_loss 0.02, explor_rew 0.029750 mean_episode_steps 13.75


  4%|█████                                                                                                                                        | 725/20000 [11:49:59<303:44:45, 56.73s/it]

Updates 724, num timesteps 290000, FPS 6 
Last 20 training episodes: mean/median reward -0.23/-0.03, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.03, action_loss -0.03, explor_rew 0.026500 mean_episode_steps 17.85


  4%|█████                                                                                                                                        | 726/20000 [11:50:56<303:31:25, 56.69s/it]

Updates 725, num timesteps 290400, FPS 6 
Last 20 training episodes: mean/median reward -0.20/-0.00, min/max reward -1.05/0.55
dist_entropy 2.73, value_loss 0.02, action_loss -0.02, explor_rew 0.022750 mean_episode_steps 18.45
Updates 726, num timesteps 290800, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.55, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.02, action_loss -0.02, explor_rew 0.027500 mean_episode_steps 18.60


  4%|█████▏                                                                                                                                       | 728/20000 [11:52:49<303:08:28, 56.63s/it]

Updates 727, num timesteps 291200, FPS 6 
Last 20 training episodes: mean/median reward 0.06/-0.00, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.026500 mean_episode_steps 26.35
Updates 728, num timesteps 291600, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.55, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.04, action_loss -0.01, explor_rew 0.026250 mean_episode_steps 14.35


  4%|█████▏                                                                                                                                       | 730/20000 [11:54:43<303:36:13, 56.72s/it]

Updates 729, num timesteps 292000, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.03, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 16.70


  4%|█████▏                                                                                                                                       | 731/20000 [11:55:40<303:56:12, 56.78s/it]

Updates 730, num timesteps 292400, FPS 6 
Last 20 training episodes: mean/median reward -0.07/-0.00, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.03, action_loss -0.02, explor_rew 0.027750 mean_episode_steps 13.15


  4%|█████▏                                                                                                                                       | 732/20000 [11:56:36<302:57:54, 56.61s/it]

Updates 731, num timesteps 292800, FPS 6 
Last 20 training episodes: mean/median reward -0.05/-0.03, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 19.50


  4%|█████▏                                                                                                                                       | 733/20000 [11:57:32<302:49:00, 56.58s/it]

Updates 732, num timesteps 293200, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.26, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.02, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 22.00
Updates 733, num timesteps 293600, FPS 6 
Last 20 training episodes: mean/median reward -0.01/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.01, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 18.55


  4%|█████▏                                                                                                                                       | 735/20000 [11:59:27<304:04:08, 56.82s/it]

Updates 734, num timesteps 294000, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.26, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.02, action_loss -0.01, explor_rew 0.027500 mean_episode_steps 20.05


  4%|█████▏                                                                                                                                       | 736/20000 [12:00:23<303:26:07, 56.71s/it]

Updates 735, num timesteps 294400, FPS 6 
Last 20 training episodes: mean/median reward 0.01/0.26, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.02, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 20.60


  4%|█████▏                                                                                                                                       | 737/20000 [12:01:19<301:59:04, 56.44s/it]

Updates 736, num timesteps 294800, FPS 6 
Last 20 training episodes: mean/median reward 0.03/-0.03, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.03, action_loss -0.03, explor_rew 0.020250 mean_episode_steps 16.35


  4%|█████▏                                                                                                                                       | 738/20000 [12:02:15<302:00:19, 56.44s/it]

Updates 737, num timesteps 295200, FPS 6 
Last 20 training episodes: mean/median reward 0.08/-0.03, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.02, action_loss -0.01, explor_rew 0.027000 mean_episode_steps 18.50


  4%|█████▏                                                                                                                                       | 739/20000 [12:03:12<302:19:39, 56.51s/it]

Updates 738, num timesteps 295600, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.50, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.02, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 20.25
Updates 739, num timesteps 296000, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 24.20


  4%|█████▏                                                                                                                                       | 741/20000 [12:05:06<303:05:05, 56.65s/it]

Updates 740, num timesteps 296400, FPS 6 
Last 20 training episodes: mean/median reward -0.02/-0.03, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 18.90
Updates 741, num timesteps 296800, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.01, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 22.45


  4%|█████▏                                                                                                                                       | 742/20000 [12:06:02<302:52:29, 56.62s/it]

Updates 742, num timesteps 297200, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.83, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 20.70


  4%|█████▏                                                                                                                                       | 744/20000 [12:07:56<302:53:51, 56.63s/it]

Updates 743, num timesteps 297600, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.50, min/max reward -1.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.01, explor_rew 0.028500 mean_episode_steps 19.90


  4%|█████▎                                                                                                                                       | 745/20000 [12:08:52<302:33:49, 56.57s/it]

Updates 744, num timesteps 298000, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.28, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.01, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 22.85


  4%|█████▎                                                                                                                                       | 746/20000 [12:09:48<302:11:18, 56.50s/it]

Updates 745, num timesteps 298400, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.28, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.01, action_loss -0.02, explor_rew 0.030250 mean_episode_steps 21.60
Updates 746, num timesteps 298800, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.02, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 16.15


  4%|█████▎                                                                                                                                       | 748/20000 [12:11:42<302:24:48, 56.55s/it]

Updates 747, num timesteps 299200, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.03, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.02, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 19.90


  4%|█████▎                                                                                                                                       | 749/20000 [12:12:38<302:16:28, 56.53s/it]

Updates 748, num timesteps 299600, FPS 6 
Last 20 training episodes: mean/median reward -0.15/-0.03, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.01, action_loss -0.00, explor_rew 0.028250 mean_episode_steps 24.90


  4%|█████▎                                                                                                                                       | 750/20000 [12:13:35<302:35:00, 56.59s/it]

Updates 749, num timesteps 300000, FPS 6 
Last 20 training episodes: mean/median reward -0.08/-0.03, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.01, action_loss 0.00, explor_rew 0.030500 mean_episode_steps 22.60
Updates 750, num timesteps 300400, FPS 6 
Last 20 training episodes: mean/median reward -0.37/-0.54, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.02, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 19.60


  4%|█████▎                                                                                                                                       | 751/20000 [12:14:32<303:06:19, 56.69s/it]

Updates 751, num timesteps 300800, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.85, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 28.55


  4%|█████▎                                                                                                                                       | 753/20000 [12:16:25<303:26:21, 56.76s/it]

Updates 752, num timesteps 301200, FPS 6 
Last 20 training episodes: mean/median reward -0.21/-0.03, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.02, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 23.10
Updates 753, num timesteps 301600, FPS 6 
Last 20 training episodes: mean/median reward -0.07/0.55, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.02, action_loss -0.02, explor_rew 0.029000 mean_episode_steps 18.50


  4%|█████▎                                                                                                                                       | 755/20000 [12:18:20<304:20:31, 56.93s/it]

Updates 754, num timesteps 302000, FPS 6 
Last 20 training episodes: mean/median reward 0.03/-0.03, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.02, action_loss -0.03, explor_rew 0.022250 mean_episode_steps 19.05
Updates 755, num timesteps 302400, FPS 6 
Last 20 training episodes: mean/median reward 0.07/0.55, min/max reward -1.05/0.55
dist_entropy 3.03, value_loss 0.02, action_loss -0.01, explor_rew 0.024250 mean_episode_steps 22.10


  4%|█████▎                                                                                                                                       | 757/20000 [12:20:14<304:42:41, 57.01s/it]

Updates 756, num timesteps 302800, FPS 6 
Last 20 training episodes: mean/median reward -0.23/-0.03, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.03, action_loss -0.01, explor_rew 0.026500 mean_episode_steps 20.25


  4%|█████▎                                                                                                                                       | 758/20000 [12:21:11<304:09:34, 56.91s/it]

Updates 757, num timesteps 303200, FPS 6 
Last 20 training episodes: mean/median reward -0.05/-0.03, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 21.70


  4%|█████▎                                                                                                                                       | 759/20000 [12:22:07<302:34:33, 56.61s/it]

Updates 758, num timesteps 303600, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.26, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.02, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 22.30


  4%|█████▎                                                                                                                                       | 760/20000 [12:23:03<302:51:03, 56.67s/it]

Updates 759, num timesteps 304000, FPS 6 
Last 20 training episodes: mean/median reward -0.24/-0.04, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 29.95
Updates 760, num timesteps 304400, FPS 6 
Last 20 training episodes: mean/median reward 0.08/0.55, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.02, action_loss 0.45, explor_rew 0.030000 mean_episode_steps 16.55


  4%|█████▎                                                                                                                                       | 762/20000 [12:24:57<303:50:28, 56.86s/it]

Updates 761, num timesteps 304800, FPS 6 
Last 20 training episodes: mean/median reward -0.18/-0.03, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 18.40


  4%|█████▍                                                                                                                                       | 763/20000 [12:25:54<303:38:34, 56.82s/it]

Updates 762, num timesteps 305200, FPS 6 
Last 20 training episodes: mean/median reward -0.05/0.25, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 21.70
Updates 763, num timesteps 305600, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.55, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 22.05


  4%|█████▍                                                                                                                                       | 764/20000 [12:26:51<303:39:55, 56.83s/it]

Updates 764, num timesteps 306000, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.75


  4%|█████▍                                                                                                                                       | 766/20000 [12:28:45<303:35:43, 56.82s/it]

Updates 765, num timesteps 306400, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.04, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.02, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 29.15


  4%|█████▍                                                                                                                                       | 767/20000 [12:29:41<303:28:59, 56.81s/it]

Updates 766, num timesteps 306800, FPS 6 
Last 20 training episodes: mean/median reward 0.12/-0.04, min/max reward -1.05/0.55
dist_entropy 2.98, value_loss 0.01, action_loss -0.02, explor_rew 0.031500 mean_episode_steps 29.45


  4%|█████▍                                                                                                                                       | 768/20000 [12:30:38<303:25:10, 56.80s/it]

Updates 767, num timesteps 307200, FPS 6 
Last 20 training episodes: mean/median reward -0.35/-0.04, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.01, action_loss -0.02, explor_rew 0.033500 mean_episode_steps 22.60


  4%|█████▍                                                                                                                                       | 769/20000 [12:31:35<302:49:27, 56.69s/it]

Updates 768, num timesteps 307600, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.03, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 21.40


  4%|█████▍                                                                                                                                       | 770/20000 [12:32:31<302:54:32, 56.71s/it]

Updates 769, num timesteps 308000, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.04, min/max reward -1.05/0.55
dist_entropy 3.02, value_loss 0.02, action_loss -0.02, explor_rew 0.032250 mean_episode_steps 25.80


  4%|█████▍                                                                                                                                       | 771/20000 [12:33:28<302:35:59, 56.65s/it]

Updates 770, num timesteps 308400, FPS 6 
Last 20 training episodes: mean/median reward -0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 3.01, value_loss 0.01, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 23.70


  4%|█████▍                                                                                                                                       | 772/20000 [12:34:24<302:05:16, 56.56s/it]

Updates 771, num timesteps 308800, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.26, min/max reward -1.05/0.55
dist_entropy 3.00, value_loss 0.01, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 24.10


  4%|█████▍                                                                                                                                       | 773/20000 [12:35:21<302:20:35, 56.61s/it]

Updates 772, num timesteps 309200, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.04, min/max reward -1.05/0.55
dist_entropy 3.01, value_loss 0.01, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 23.60
Updates 773, num timesteps 309600, FPS 6 
Last 20 training episodes: mean/median reward 0.11/0.55, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 33.95


  4%|█████▍                                                                                                                                       | 775/20000 [12:37:14<302:00:57, 56.55s/it]

Updates 774, num timesteps 310000, FPS 6 
Last 20 training episodes: mean/median reward -0.12/-0.04, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.01, action_loss -0.01, explor_rew 0.034750 mean_episode_steps 24.40


  4%|█████▍                                                                                                                                       | 776/20000 [12:38:11<302:01:54, 56.56s/it]

Updates 775, num timesteps 310400, FPS 6 
Last 20 training episodes: mean/median reward 0.12/-0.04, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.01, action_loss -0.02, explor_rew 0.034750 mean_episode_steps 29.20


  4%|█████▍                                                                                                                                       | 777/20000 [12:39:08<302:21:40, 56.62s/it]

Updates 776, num timesteps 310800, FPS 6 
Last 20 training episodes: mean/median reward -0.11/-0.04, min/max reward -1.05/0.55
dist_entropy 3.02, value_loss 0.04, action_loss -0.01, explor_rew 0.026000 mean_episode_steps 24.45


  4%|█████▍                                                                                                                                       | 778/20000 [12:40:04<301:57:06, 56.55s/it]

Updates 777, num timesteps 311200, FPS 6 
Last 20 training episodes: mean/median reward 0.15/0.25, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.01, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 26.05
Updates 778, num timesteps 311600, FPS 6 
Last 20 training episodes: mean/median reward -0.02/0.55, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.06, action_loss -0.06, explor_rew 0.011750 mean_episode_steps 21.60


  4%|█████▍                                                                                                                                       | 780/20000 [12:41:57<302:03:50, 56.58s/it]

Updates 779, num timesteps 312000, FPS 6 
Last 20 training episodes: mean/median reward -0.01/-0.04, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.02, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 24.50


  4%|█████▌                                                                                                                                       | 781/20000 [12:42:54<302:46:24, 56.71s/it]

Updates 780, num timesteps 312400, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.04, min/max reward -1.05/0.55
dist_entropy 2.99, value_loss 0.02, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 32.60


  4%|█████▌                                                                                                                                       | 782/20000 [12:43:51<302:43:48, 56.71s/it]

Updates 781, num timesteps 312800, FPS 6 
Last 20 training episodes: mean/median reward -0.30/-0.04, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.02, action_loss -0.01, explor_rew 0.034750 mean_episode_steps 26.95
Updates 782, num timesteps 313200, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 20.10


  4%|█████▌                                                                                                                                       | 783/20000 [12:44:48<302:45:47, 56.72s/it]

Updates 783, num timesteps 313600, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.94, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 24.60


  4%|█████▌                                                                                                                                       | 785/20000 [12:46:41<303:09:10, 56.80s/it]

Updates 784, num timesteps 314000, FPS 6 
Last 20 training episodes: mean/median reward -0.06/-0.04, min/max reward -1.05/0.55
dist_entropy 2.89, value_loss 0.02, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 24.75


  4%|█████▌                                                                                                                                       | 786/20000 [12:47:38<302:42:44, 56.72s/it]

Updates 785, num timesteps 314400, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.25, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.01, action_loss -0.03, explor_rew 0.024250 mean_episode_steps 25.15


  4%|█████▌                                                                                                                                       | 787/20000 [12:48:35<302:40:05, 56.71s/it]

Updates 786, num timesteps 314800, FPS 6 
Last 20 training episodes: mean/median reward 0.20/0.25, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.02, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 25.75
Updates 787, num timesteps 315200, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.55, min/max reward -1.05/0.55
dist_entropy 3.03, value_loss 0.02, action_loss -0.04, explor_rew 0.015250 mean_episode_steps 26.50


  4%|█████▌                                                                                                                                       | 788/20000 [12:49:31<301:44:20, 56.54s/it]

Updates 788, num timesteps 315600, FPS 6 
Last 20 training episodes: mean/median reward -0.07/0.55, min/max reward -1.05/0.55
dist_entropy 2.97, value_loss 0.02, action_loss -0.00, explor_rew 0.030250 mean_episode_steps 29.40


  4%|█████▌                                                                                                                                       | 790/20000 [12:51:25<302:29:06, 56.69s/it]

Updates 789, num timesteps 316000, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.04, min/max reward -1.05/0.55
dist_entropy 2.90, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 23.40
Updates 790, num timesteps 316400, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.55, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 22.85


  4%|█████▌                                                                                                                                       | 792/20000 [12:53:19<303:24:17, 56.86s/it]

Updates 791, num timesteps 316800, FPS 6 
Last 20 training episodes: mean/median reward 0.02/-0.04, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.01, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 24.50


  4%|█████▌                                                                                                                                       | 793/20000 [12:54:15<303:17:45, 56.85s/it]

Updates 792, num timesteps 317200, FPS 6 
Last 20 training episodes: mean/median reward -0.03/-0.04, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.03, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 21.75


  4%|█████▌                                                                                                                                       | 794/20000 [12:55:11<301:40:02, 56.54s/it]

Updates 793, num timesteps 317600, FPS 6 
Last 20 training episodes: mean/median reward -0.01/-0.04, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.04, action_loss -0.02, explor_rew 0.020750 mean_episode_steps 35.05


  4%|█████▌                                                                                                                                       | 795/20000 [12:56:08<302:08:05, 56.64s/it]

Updates 794, num timesteps 318000, FPS 6 
Last 20 training episodes: mean/median reward -0.10/-0.04, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 23.70


  4%|█████▌                                                                                                                                       | 796/20000 [12:57:05<302:35:26, 56.72s/it]

Updates 795, num timesteps 318400, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.04, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 28.10
Updates 796, num timesteps 318800, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 23.95


  4%|█████▋                                                                                                                                       | 798/20000 [12:58:59<303:39:19, 56.93s/it]

Updates 797, num timesteps 319200, FPS 6 
Last 20 training episodes: mean/median reward -0.20/-0.04, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.03, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 23.85
Updates 798, num timesteps 319600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.55, min/max reward -0.04/0.55
dist_entropy 2.58, value_loss 0.07, action_loss -0.02, explor_rew 0.034000 mean_episode_steps 30.70


  4%|█████▋                                                                                                                                       | 800/20000 [13:00:53<303:22:58, 56.88s/it]

Updates 799, num timesteps 320000, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.13, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.02, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 22.75
Updates 800, num timesteps 320400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.95


  4%|█████▋                                                                                                                                       | 802/20000 [13:02:48<304:17:17, 57.06s/it]

Updates 801, num timesteps 320800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.29, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 21.30


  4%|█████▋                                                                                                                                       | 803/20000 [13:03:44<303:47:12, 56.97s/it]

Updates 802, num timesteps 321200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.13, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.01, action_loss 0.01, explor_rew 0.027500 mean_episode_steps 21.15
Updates 803, num timesteps 321600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.03/0.55
dist_entropy 2.59, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 19.95


  4%|█████▋                                                                                                                                       | 804/20000 [13:04:42<304:47:14, 57.16s/it]

Updates 804, num timesteps 322000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss 0.01, explor_rew 0.032750 mean_episode_steps 25.70


  4%|█████▋                                                                                                                                       | 805/20000 [13:05:39<305:20:17, 57.27s/it]

Updates 805, num timesteps 322400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.55, min/max reward -0.03/0.55
dist_entropy 2.55, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 20.10


  4%|█████▋                                                                                                                                       | 807/20000 [13:07:34<305:07:35, 57.23s/it]

Updates 806, num timesteps 322800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.34, min/max reward -0.04/0.55
dist_entropy 2.57, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.85


  4%|█████▋                                                                                                                                       | 808/20000 [13:08:31<304:53:19, 57.19s/it]

Updates 807, num timesteps 323200, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.13, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.04, action_loss -0.05, explor_rew 0.019000 mean_episode_steps 25.70
Updates 808, num timesteps 323600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -0.05/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 20.60


  4%|█████▋                                                                                                                                       | 809/20000 [13:09:28<304:56:58, 57.20s/it]

Updates 809, num timesteps 324000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.54, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 21.35


  4%|█████▋                                                                                                                                       | 811/20000 [13:11:22<303:57:22, 57.02s/it]

Updates 810, num timesteps 324400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.13, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.02, explor_rew 0.033750 mean_episode_steps 21.00
Updates 811, num timesteps 324800, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 20.40


  4%|█████▋                                                                                                                                       | 812/20000 [13:12:19<303:47:24, 57.00s/it]

Updates 812, num timesteps 325200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 19.60


  4%|█████▋                                                                                                                                       | 814/20000 [13:14:13<303:35:40, 56.97s/it]

Updates 813, num timesteps 325600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 22.05


  4%|█████▋                                                                                                                                       | 815/20000 [13:15:10<303:58:34, 57.04s/it]

Updates 814, num timesteps 326000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 23.10


  4%|█████▊                                                                                                                                       | 816/20000 [13:16:07<303:59:24, 57.05s/it]

Updates 815, num timesteps 326400, FPS 6 
Last 20 training episodes: mean/median reward -0.01/0.20, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.09, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 15.65
Updates 816, num timesteps 326800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 31.90


  4%|█████▊                                                                                                                                       | 818/20000 [13:18:02<304:17:38, 57.11s/it]

Updates 817, num timesteps 327200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.00, explor_rew 0.030000 mean_episode_steps 24.60


  4%|█████▊                                                                                                                                       | 819/20000 [13:18:58<303:40:59, 57.00s/it]

Updates 818, num timesteps 327600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 30.65


  4%|█████▊                                                                                                                                       | 820/20000 [13:19:55<303:13:37, 56.91s/it]

Updates 819, num timesteps 328000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 18.95
Updates 820, num timesteps 328400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 22.15


  4%|█████▊                                                                                                                                       | 822/20000 [13:21:49<303:28:35, 56.97s/it]

Updates 821, num timesteps 328800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 18.10
Updates 822, num timesteps 329200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.04, action_loss 0.05, explor_rew 0.030250 mean_episode_steps 16.65


  4%|█████▊                                                                                                                                       | 823/20000 [13:22:47<304:40:45, 57.20s/it]

Updates 823, num timesteps 329600, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 3.05, value_loss 0.15, action_loss 0.01, explor_rew -0.008750 mean_episode_steps 34.25


  4%|█████▊                                                                                                                                       | 825/20000 [13:24:39<301:42:44, 56.64s/it]

Updates 824, num timesteps 330000, FPS 6 
Last 20 training episodes: mean/median reward 0.10/-0.03, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.07, action_loss -0.00, explor_rew 0.014750 mean_episode_steps 40.35
Updates 825, num timesteps 330400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.59, value_loss 0.07, action_loss 0.04, explor_rew 0.022000 mean_episode_steps 26.70


  4%|█████▊                                                                                                                                       | 827/20000 [13:26:34<303:46:59, 57.04s/it]

Updates 826, num timesteps 330800, FPS 6 
Last 20 training episodes: mean/median reward 0.03/0.08, min/max reward -1.05/0.55
dist_entropy 2.54, value_loss 0.05, action_loss -0.02, explor_rew 0.027500 mean_episode_steps 25.35


  4%|█████▊                                                                                                                                       | 828/20000 [13:27:31<304:01:42, 57.09s/it]

Updates 827, num timesteps 331200, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.50, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.03, action_loss 0.02, explor_rew 0.024250 mean_episode_steps 26.70


  4%|█████▊                                                                                                                                       | 829/20000 [13:28:28<303:02:03, 56.90s/it]

Updates 828, num timesteps 331600, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.29, min/max reward -1.05/0.55
dist_entropy 2.63, value_loss 0.05, action_loss -0.02, explor_rew 0.016750 mean_episode_steps 26.00


  4%|█████▊                                                                                                                                       | 830/20000 [13:29:25<303:23:06, 56.97s/it]

Updates 829, num timesteps 332000, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.08, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.03, action_loss -0.02, explor_rew 0.024500 mean_episode_steps 25.50


  4%|█████▊                                                                                                                                       | 831/20000 [13:30:22<303:16:53, 56.96s/it]

Updates 830, num timesteps 332400, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.13, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.03, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 17.50


  4%|█████▊                                                                                                                                       | 832/20000 [13:31:19<303:23:02, 56.98s/it]

Updates 831, num timesteps 332800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.05/0.55
dist_entropy 2.82, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 29.75


  4%|█████▊                                                                                                                                       | 833/20000 [13:32:16<303:20:37, 56.97s/it]

Updates 832, num timesteps 333200, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 21.65
Updates 833, num timesteps 333600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 23.80


  4%|█████▉                                                                                                                                       | 834/20000 [13:33:13<303:17:53, 56.97s/it]

Updates 834, num timesteps 334000, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 20.55


  4%|█████▉                                                                                                                                       | 836/20000 [13:35:07<303:16:18, 56.97s/it]

Updates 835, num timesteps 334400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.03, explor_rew 0.032250 mean_episode_steps 19.15


  4%|█████▉                                                                                                                                       | 837/20000 [13:36:03<302:34:03, 56.84s/it]

Updates 836, num timesteps 334800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 20.15


  4%|█████▉                                                                                                                                       | 838/20000 [13:37:00<302:52:33, 56.90s/it]

Updates 837, num timesteps 335200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.90


  4%|█████▉                                                                                                                                       | 839/20000 [13:37:57<302:18:36, 56.80s/it]

Updates 838, num timesteps 335600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 24.15


  4%|█████▉                                                                                                                                       | 840/20000 [13:38:54<302:23:07, 56.82s/it]

Updates 839, num timesteps 336000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 24.50


  4%|█████▉                                                                                                                                       | 841/20000 [13:39:50<301:47:37, 56.71s/it]

Updates 840, num timesteps 336400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 27.70
Updates 841, num timesteps 336800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.65


  4%|█████▉                                                                                                                                       | 842/20000 [13:40:48<302:47:06, 56.90s/it]

Updates 842, num timesteps 337200, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.05/0.55
dist_entropy 2.83, value_loss 0.02, action_loss -0.02, explor_rew 0.018250 mean_episode_steps 24.65


  4%|█████▉                                                                                                                                       | 843/20000 [13:41:44<302:46:47, 56.90s/it]

Updates 843, num timesteps 337600, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.00, explor_rew 0.030500 mean_episode_steps 22.20


  4%|█████▉                                                                                                                                       | 844/20000 [13:42:42<303:30:54, 57.04s/it]

Updates 844, num timesteps 338000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 30.40


  4%|█████▉                                                                                                                                       | 845/20000 [13:43:39<303:59:03, 57.13s/it]

Updates 845, num timesteps 338400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 21.60


  4%|█████▉                                                                                                                                       | 846/20000 [13:44:37<304:33:04, 57.24s/it]

Updates 846, num timesteps 338800, FPS 6 
Last 20 training episodes: mean/median reward 0.48/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 18.40


  4%|█████▉                                                                                                                                       | 847/20000 [13:45:34<305:08:33, 57.35s/it]

Updates 847, num timesteps 339200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 27.15


  4%|█████▉                                                                                                                                       | 849/20000 [13:47:29<304:33:09, 57.25s/it]

Updates 848, num timesteps 339600, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.45, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.30


  4%|█████▉                                                                                                                                       | 850/20000 [13:48:25<303:38:48, 57.08s/it]

Updates 849, num timesteps 340000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.05/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 26.70
Updates 850, num timesteps 340400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 17.75


  4%|█████▉                                                                                                                                       | 851/20000 [13:49:23<304:13:24, 57.19s/it]

Updates 851, num timesteps 340800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 21.70


  4%|██████                                                                                                                                       | 853/20000 [13:51:16<302:56:12, 56.96s/it]

Updates 852, num timesteps 341200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.034750 mean_episode_steps 20.85
Updates 853, num timesteps 341600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 21.95


  4%|██████                                                                                                                                       | 855/20000 [13:53:10<302:54:27, 56.96s/it]

Updates 854, num timesteps 342000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 22.35
Updates 855, num timesteps 342400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.00, explor_rew 0.032250 mean_episode_steps 15.65


  4%|██████                                                                                                                                       | 857/20000 [13:55:04<301:59:28, 56.79s/it]

Updates 856, num timesteps 342800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.90
Updates 857, num timesteps 343200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.05/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 18.00


  4%|██████                                                                                                                                       | 859/20000 [13:56:59<303:32:24, 57.09s/it]

Updates 858, num timesteps 343600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 20.55


  4%|██████                                                                                                                                       | 860/20000 [13:57:56<303:22:45, 57.06s/it]

Updates 859, num timesteps 344000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.25
Updates 860, num timesteps 344400, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 20.05


  4%|██████                                                                                                                                       | 861/20000 [13:58:53<303:40:39, 57.12s/it]

Updates 861, num timesteps 344800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 20.05


  4%|██████                                                                                                                                       | 862/20000 [13:59:50<304:12:50, 57.22s/it]

Updates 862, num timesteps 345200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 29.15


  4%|██████                                                                                                                                       | 864/20000 [14:01:45<303:57:45, 57.18s/it]

Updates 863, num timesteps 345600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 23.80


  4%|██████                                                                                                                                       | 865/20000 [14:02:41<302:56:11, 56.99s/it]

Updates 864, num timesteps 346000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 36.15


  4%|██████                                                                                                                                       | 866/20000 [14:03:38<303:22:54, 57.08s/it]

Updates 865, num timesteps 346400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.80


  4%|██████                                                                                                                                       | 867/20000 [14:04:35<303:05:43, 57.03s/it]

Updates 866, num timesteps 346800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 24.40


  4%|██████                                                                                                                                       | 868/20000 [14:05:32<302:40:09, 56.95s/it]

Updates 867, num timesteps 347200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.02, explor_rew 0.033250 mean_episode_steps 20.85
Updates 868, num timesteps 347600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 29.45


  4%|██████▏                                                                                                                                      | 870/20000 [14:07:26<302:19:07, 56.89s/it]

Updates 869, num timesteps 348000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 22.65


  4%|██████▏                                                                                                                                      | 871/20000 [14:08:23<301:59:21, 56.83s/it]

Updates 870, num timesteps 348400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.05/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 21.25


  4%|██████▏                                                                                                                                      | 872/20000 [14:09:19<301:59:01, 56.84s/it]

Updates 871, num timesteps 348800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 16.35


  4%|██████▏                                                                                                                                      | 873/20000 [14:10:16<301:38:26, 56.77s/it]

Updates 872, num timesteps 349200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss 0.00, explor_rew 0.032750 mean_episode_steps 27.05


  4%|██████▏                                                                                                                                      | 874/20000 [14:11:13<302:15:51, 56.89s/it]

Updates 873, num timesteps 349600, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 15.25


  4%|██████▏                                                                                                                                      | 875/20000 [14:12:10<302:35:21, 56.96s/it]

Updates 874, num timesteps 350000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.59, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 20.20
Updates 875, num timesteps 350400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.01, action_loss 0.00, explor_rew 0.031750 mean_episode_steps 23.10


  4%|██████▏                                                                                                                                      | 876/20000 [14:13:07<302:17:08, 56.90s/it]

Updates 876, num timesteps 350800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss 0.01, explor_rew 0.030500 mean_episode_steps 21.80


  4%|██████▏                                                                                                                                      | 877/20000 [14:14:05<303:12:12, 57.08s/it]

Updates 877, num timesteps 351200, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.55, min/max reward -1.05/0.55
dist_entropy 2.92, value_loss 0.20, action_loss -0.00, explor_rew 0.018500 mean_episode_steps 31.20


  4%|██████▏                                                                                                                                      | 879/20000 [14:15:58<302:09:33, 56.89s/it]

Updates 878, num timesteps 351600, FPS 6 
Last 20 training episodes: mean/median reward -0.36/-1.05, min/max reward -1.05/0.55
dist_entropy 3.12, value_loss 0.17, action_loss 0.03, explor_rew -0.010000 mean_episode_steps 32.25


  4%|██████▏                                                                                                                                      | 880/20000 [14:16:54<301:10:19, 56.71s/it]

Updates 879, num timesteps 352000, FPS 6 
Last 20 training episodes: mean/median reward -0.47/-1.05, min/max reward -1.05/0.55
dist_entropy 3.16, value_loss 0.08, action_loss 0.00, explor_rew -0.013750 mean_episode_steps 39.80
Updates 880, num timesteps 352400, FPS 6 
Last 20 training episodes: mean/median reward 0.09/0.55, min/max reward -1.05/0.55
dist_entropy 3.14, value_loss 0.07, action_loss -0.03, explor_rew 0.010500 mean_episode_steps 21.45


  4%|██████▏                                                                                                                                      | 881/20000 [14:17:51<301:29:08, 56.77s/it]

Updates 881, num timesteps 352800, FPS 6 
Last 20 training episodes: mean/median reward 0.06/0.55, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.06, action_loss -0.02, explor_rew 0.006000 mean_episode_steps 25.95


  4%|██████▏                                                                                                                                      | 883/20000 [14:19:44<300:48:34, 56.65s/it]

Updates 882, num timesteps 353200, FPS 6 
Last 20 training episodes: mean/median reward -0.07/-0.00, min/max reward -1.05/0.55
dist_entropy 2.65, value_loss 0.05, action_loss -0.02, explor_rew 0.013750 mean_episode_steps 23.05
Updates 883, num timesteps 353600, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.55, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.06, action_loss 0.00, explor_rew 0.012750 mean_episode_steps 25.10


  4%|██████▏                                                                                                                                      | 884/20000 [14:20:41<301:12:45, 56.73s/it]

Updates 884, num timesteps 354000, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.55, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.05, action_loss 0.00, explor_rew 0.012250 mean_episode_steps 29.20


  4%|██████▏                                                                                                                                      | 886/20000 [14:22:36<302:31:04, 56.98s/it]

Updates 885, num timesteps 354400, FPS 6 
Last 20 training episodes: mean/median reward 0.05/0.23, min/max reward -1.05/0.55
dist_entropy 2.49, value_loss 0.05, action_loss -0.00, explor_rew 0.010500 mean_episode_steps 30.30


  4%|██████▎                                                                                                                                      | 887/20000 [14:23:33<302:14:59, 56.93s/it]

Updates 886, num timesteps 354800, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.05/0.55
dist_entropy 2.31, value_loss 0.06, action_loss -0.02, explor_rew 0.021500 mean_episode_steps 34.50
Updates 887, num timesteps 355200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.05/0.55
dist_entropy 2.31, value_loss 0.03, action_loss -0.02, explor_rew 0.024750 mean_episode_steps 22.30


  4%|██████▎                                                                                                                                      | 889/20000 [14:25:27<303:03:15, 57.09s/it]

Updates 888, num timesteps 355600, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.23, min/max reward -1.05/0.55
dist_entropy 2.44, value_loss 0.02, action_loss -0.01, explor_rew 0.023750 mean_episode_steps 22.45


  4%|██████▎                                                                                                                                      | 890/20000 [14:26:24<303:02:28, 57.09s/it]

Updates 889, num timesteps 356000, FPS 6 
Last 20 training episodes: mean/median reward 0.14/0.13, min/max reward -1.05/0.55
dist_entropy 2.56, value_loss 0.03, action_loss -0.02, explor_rew 0.025750 mean_episode_steps 21.15


  4%|██████▎                                                                                                                                      | 891/20000 [14:27:21<302:54:21, 57.07s/it]

Updates 890, num timesteps 356400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -1.05/0.55
dist_entropy 2.47, value_loss 0.02, action_loss 0.02, explor_rew 0.029250 mean_episode_steps 19.75
Updates 891, num timesteps 356800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.55, min/max reward -0.04/0.55
dist_entropy 2.50, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 19.25


  4%|██████▎                                                                                                                                      | 893/20000 [14:29:15<302:38:23, 57.02s/it]

Updates 892, num timesteps 357200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.57, value_loss 0.02, action_loss -0.02, explor_rew 0.032000 mean_episode_steps 21.10


  4%|██████▎                                                                                                                                      | 894/20000 [14:30:13<303:12:38, 57.13s/it]

Updates 893, num timesteps 357600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.04/0.55
dist_entropy 2.57, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 24.40


  4%|██████▎                                                                                                                                      | 895/20000 [14:31:09<302:13:09, 56.95s/it]

Updates 894, num timesteps 358000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.03, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 26.25
Updates 895, num timesteps 358400, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.03/0.55
dist_entropy 2.56, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 22.55


  4%|██████▎                                                                                                                                      | 896/20000 [14:32:06<302:08:15, 56.94s/it]

Updates 896, num timesteps 358800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.56, value_loss 0.00, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 16.60


  4%|██████▎                                                                                                                                      | 897/20000 [14:33:03<302:48:47, 57.07s/it]

Updates 897, num timesteps 359200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.59, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 24.95


  4%|██████▎                                                                                                                                      | 899/20000 [14:34:57<302:35:13, 57.03s/it]

Updates 898, num timesteps 359600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.50, min/max reward -0.04/0.55
dist_entropy 2.59, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.05


  4%|██████▎                                                                                                                                      | 900/20000 [14:35:54<302:33:43, 57.03s/it]

Updates 899, num timesteps 360000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.20
Updates 900, num timesteps 360400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.59, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 24.75


  5%|██████▎                                                                                                                                      | 902/20000 [14:37:48<302:28:16, 57.02s/it]

Updates 901, num timesteps 360800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.56, value_loss 0.03, action_loss 0.00, explor_rew 0.028250 mean_episode_steps 24.85
Updates 902, num timesteps 361200, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.55, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.95


  5%|██████▎                                                                                                                                      | 903/20000 [14:38:46<302:35:54, 57.04s/it]

Updates 903, num timesteps 361600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.50, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 25.60


  5%|██████▍                                                                                                                                      | 905/20000 [14:40:40<302:36:57, 57.05s/it]

Updates 904, num timesteps 362000, FPS 6 
Last 20 training episodes: mean/median reward 0.48/0.50, min/max reward -0.04/0.55
dist_entropy 2.53, value_loss 0.00, action_loss -0.03, explor_rew 0.032500 mean_episode_steps 21.35


  5%|██████▍                                                                                                                                      | 906/20000 [14:41:36<302:07:15, 56.96s/it]

Updates 905, num timesteps 362400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.54, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 24.25
Updates 906, num timesteps 362800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.52, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 28.25


  5%|██████▍                                                                                                                                      | 908/20000 [14:43:31<303:02:06, 57.14s/it]

Updates 907, num timesteps 363200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.56, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 26.65
Updates 908, num timesteps 363600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.56, value_loss 0.02, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 21.95


  5%|██████▍                                                                                                                                      | 909/20000 [14:44:29<303:09:25, 57.17s/it]

Updates 909, num timesteps 364000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -1.05/0.85
dist_entropy 2.53, value_loss 0.01, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 21.80


  5%|██████▍                                                                                                                                      | 911/20000 [14:46:23<302:52:05, 57.12s/it]

Updates 910, num timesteps 364400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.05/0.55
dist_entropy 2.55, value_loss 0.00, action_loss -0.02, explor_rew 0.029750 mean_episode_steps 21.55


  5%|██████▍                                                                                                                                      | 912/20000 [14:47:20<302:16:30, 57.01s/it]

Updates 911, num timesteps 364800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 24.20


  5%|██████▍                                                                                                                                      | 913/20000 [14:48:16<301:45:42, 56.92s/it]

Updates 912, num timesteps 365200, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.01, explor_rew 0.034000 mean_episode_steps 21.35
Updates 913, num timesteps 365600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.90


  5%|██████▍                                                                                                                                      | 915/20000 [14:50:10<301:56:32, 56.96s/it]

Updates 914, num timesteps 366000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.03, explor_rew 0.033250 mean_episode_steps 16.80
Updates 915, num timesteps 366400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.028250 mean_episode_steps 26.80


  5%|██████▍                                                                                                                                      | 917/20000 [14:52:04<301:28:25, 56.87s/it]

Updates 916, num timesteps 366800, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.05/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 22.55
Updates 917, num timesteps 367200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.02, explor_rew 0.030000 mean_episode_steps 21.50


  5%|██████▍                                                                                                                                      | 919/20000 [14:53:58<301:36:54, 56.91s/it]

Updates 918, num timesteps 367600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.02, explor_rew 0.033750 mean_episode_steps 23.55
Updates 919, num timesteps 368000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.60


  5%|██████▍                                                                                                                                      | 921/20000 [14:55:53<303:42:48, 57.31s/it]

Updates 920, num timesteps 368400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 20.00


  5%|██████▌                                                                                                                                      | 922/20000 [14:56:50<302:50:57, 57.15s/it]

Updates 921, num timesteps 368800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 18.75


  5%|██████▌                                                                                                                                      | 923/20000 [14:57:47<302:20:41, 57.06s/it]

Updates 922, num timesteps 369200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 21.10


  5%|██████▌                                                                                                                                      | 924/20000 [14:58:44<302:16:44, 57.05s/it]

Updates 923, num timesteps 369600, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.01, explor_rew 0.030000 mean_episode_steps 22.00


  5%|██████▌                                                                                                                                      | 925/20000 [14:59:41<301:36:15, 56.92s/it]

Updates 924, num timesteps 370000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.01, action_loss 0.00, explor_rew 0.033000 mean_episode_steps 23.65
Updates 925, num timesteps 370400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 21.00


  5%|██████▌                                                                                                                                      | 926/20000 [15:00:38<302:11:37, 57.04s/it]

Updates 926, num timesteps 370800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 23.20


  5%|██████▌                                                                                                                                      | 928/20000 [15:02:32<301:54:17, 56.99s/it]

Updates 927, num timesteps 371200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 24.10


  5%|██████▌                                                                                                                                      | 929/20000 [15:03:29<301:47:06, 56.97s/it]

Updates 928, num timesteps 371600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 24.05


  5%|██████▌                                                                                                                                      | 930/20000 [15:04:26<301:53:42, 56.99s/it]

Updates 929, num timesteps 372000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.65
Updates 930, num timesteps 372400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 23.95


  5%|██████▌                                                                                                                                      | 931/20000 [15:05:23<301:54:33, 57.00s/it]

Updates 931, num timesteps 372800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 23.65


  5%|██████▌                                                                                                                                      | 933/20000 [15:07:17<302:05:04, 57.04s/it]

Updates 932, num timesteps 373200, FPS 6 
Last 20 training episodes: mean/median reward 0.48/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 19.70


  5%|██████▌                                                                                                                                      | 934/20000 [15:08:14<301:54:44, 57.01s/it]

Updates 933, num timesteps 373600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.02, explor_rew 0.032250 mean_episode_steps 20.25


  5%|██████▌                                                                                                                                      | 935/20000 [15:09:11<302:01:55, 57.03s/it]

Updates 934, num timesteps 374000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 24.65
Updates 935, num timesteps 374400, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 22.55


  5%|██████▌                                                                                                                                      | 936/20000 [15:10:09<303:18:37, 57.28s/it]

Updates 936, num timesteps 374800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.01, action_loss 0.02, explor_rew 0.030500 mean_episode_steps 18.70


  5%|██████▌                                                                                                                                      | 938/20000 [15:12:04<303:01:14, 57.23s/it]

Updates 937, num timesteps 375200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 26.30


  5%|██████▌                                                                                                                                      | 939/20000 [15:13:00<302:17:29, 57.09s/it]

Updates 938, num timesteps 375600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 22.85


  5%|██████▋                                                                                                                                      | 940/20000 [15:13:57<301:56:21, 57.03s/it]

Updates 939, num timesteps 376000, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 19.20


  5%|██████▋                                                                                                                                      | 941/20000 [15:14:54<301:59:36, 57.04s/it]

Updates 940, num timesteps 376400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss 0.00, explor_rew 0.031500 mean_episode_steps 24.20
Updates 941, num timesteps 376800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 19.55


  5%|██████▋                                                                                                                                      | 942/20000 [15:15:51<302:12:52, 57.09s/it]

Updates 942, num timesteps 377200, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.35


  5%|██████▋                                                                                                                                      | 944/20000 [15:17:46<301:57:32, 57.05s/it]

Updates 943, num timesteps 377600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 24.80


  5%|██████▋                                                                                                                                      | 945/20000 [15:18:43<302:46:47, 57.20s/it]

Updates 944, num timesteps 378000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.00, explor_rew 0.032750 mean_episode_steps 18.00


  5%|██████▋                                                                                                                                      | 946/20000 [15:19:40<302:03:45, 57.07s/it]

Updates 945, num timesteps 378400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/-0.00, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.02, action_loss 0.21, explor_rew 0.020000 mean_episode_steps 31.25


  5%|██████▋                                                                                                                                      | 947/20000 [15:20:37<302:05:00, 57.08s/it]

Updates 946, num timesteps 378800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 26.05
Updates 947, num timesteps 379200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.05/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 23.15


  5%|██████▋                                                                                                                                      | 948/20000 [15:21:35<302:41:12, 57.19s/it]

Updates 948, num timesteps 379600, FPS 6 
Last 20 training episodes: mean/median reward -0.09/0.55, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.22, action_loss 0.04, explor_rew 0.031000 mean_episode_steps 27.90


  5%|██████▋                                                                                                                                      | 950/20000 [15:23:29<301:47:56, 57.03s/it]

Updates 949, num timesteps 380000, FPS 6 
Last 20 training episodes: mean/median reward -0.15/0.25, min/max reward -1.05/0.55
dist_entropy 2.80, value_loss 0.10, action_loss 0.03, explor_rew 0.010750 mean_episode_steps 25.90
Updates 950, num timesteps 380400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.58, value_loss 0.07, action_loss 0.01, explor_rew 0.016500 mean_episode_steps 19.35


  5%|██████▋                                                                                                                                      | 951/20000 [15:24:26<302:37:43, 57.19s/it]

Updates 951, num timesteps 380800, FPS 6 
Last 20 training episodes: mean/median reward 0.18/0.55, min/max reward -1.05/0.55
dist_entropy 2.19, value_loss 0.05, action_loss 0.00, explor_rew -0.006500 mean_episode_steps 41.00


  5%|██████▋                                                                                                                                      | 953/20000 [15:26:20<301:37:17, 57.01s/it]

Updates 952, num timesteps 381200, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.26, min/max reward -1.05/0.55
dist_entropy 2.40, value_loss 0.05, action_loss -0.02, explor_rew 0.011000 mean_episode_steps 32.40


  5%|██████▋                                                                                                                                      | 954/20000 [15:27:17<301:58:06, 57.08s/it]

Updates 953, num timesteps 381600, FPS 6 
Last 20 training episodes: mean/median reward 0.16/0.23, min/max reward -1.05/0.55
dist_entropy 2.42, value_loss 0.04, action_loss 0.00, explor_rew 0.018500 mean_episode_steps 32.90
Updates 954, num timesteps 382000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.44, value_loss 0.03, action_loss -0.01, explor_rew 0.024250 mean_episode_steps 23.55


  5%|██████▋                                                                                                                                      | 955/20000 [15:28:14<301:31:33, 57.00s/it]

Updates 955, num timesteps 382400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.05/0.55
dist_entropy 2.36, value_loss 0.02, action_loss -0.02, explor_rew 0.021000 mean_episode_steps 27.35


  5%|██████▋                                                                                                                                      | 957/20000 [15:30:08<301:51:25, 57.06s/it]

Updates 956, num timesteps 382800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.54, value_loss 0.02, action_loss -0.02, explor_rew 0.031000 mean_episode_steps 26.15
Updates 957, num timesteps 383200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.03/0.55
dist_entropy 2.68, value_loss 0.04, action_loss 0.02, explor_rew 0.031000 mean_episode_steps 17.95


  5%|██████▊                                                                                                                                      | 959/20000 [15:32:03<302:20:36, 57.16s/it]

Updates 958, num timesteps 383600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.04, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 13.70


  5%|██████▊                                                                                                                                      | 960/20000 [15:33:00<302:39:08, 57.22s/it]

Updates 959, num timesteps 384000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.02, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 22.55
Updates 960, num timesteps 384400, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward 0.02/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 23.85


  5%|██████▊                                                                                                                                      | 961/20000 [15:33:58<304:00:54, 57.48s/it]

Updates 961, num timesteps 384800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.03/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 17.25


  5%|██████▊                                                                                                                                      | 963/20000 [15:35:53<303:35:28, 57.41s/it]

Updates 962, num timesteps 385200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 29.80


  5%|██████▊                                                                                                                                      | 964/20000 [15:36:51<303:43:40, 57.44s/it]

Updates 963, num timesteps 385600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 18.95
Updates 964, num timesteps 386000, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.01, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 20.45


  5%|██████▊                                                                                                                                      | 966/20000 [15:39:19<350:18:51, 66.26s/it]

Updates 965, num timesteps 386400, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 16.30
Updates 966, num timesteps 386800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 16.40


  5%|██████▊                                                                                                                                      | 968/20000 [15:41:52<378:32:44, 71.60s/it]

Updates 967, num timesteps 387200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 24.00
Updates 968, num timesteps 387600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.55, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.04, action_loss -0.02, explor_rew 0.022750 mean_episode_steps 25.75


  5%|██████▊                                                                                                                                      | 970/20000 [15:44:25<390:42:32, 73.91s/it]

Updates 969, num timesteps 388000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 22.55


  5%|██████▊                                                                                                                                      | 971/20000 [15:45:41<394:19:27, 74.60s/it]

Updates 970, num timesteps 388400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.05/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 25.25


  5%|██████▊                                                                                                                                      | 972/20000 [15:46:57<396:26:58, 75.01s/it]

Updates 971, num timesteps 388800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.05
Updates 972, num timesteps 389200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 18.85


  5%|██████▊                                                                                                                                      | 973/20000 [15:48:10<393:11:30, 74.39s/it]

Updates 973, num timesteps 389600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 20.90


  5%|██████▊                                                                                                                                      | 975/20000 [15:50:39<395:00:53, 74.75s/it]

Updates 974, num timesteps 390000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 26.05


  5%|██████▉                                                                                                                                      | 976/20000 [15:51:56<397:15:02, 75.17s/it]

Updates 975, num timesteps 390400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.25


  5%|██████▉                                                                                                                                      | 977/20000 [15:53:11<398:13:36, 75.36s/it]

Updates 976, num timesteps 390800, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 22.70


  5%|██████▉                                                                                                                                      | 978/20000 [15:54:28<400:01:34, 75.71s/it]

Updates 977, num timesteps 391200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 23.20
Updates 978, num timesteps 391600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.45


  5%|██████▉                                                                                                                                      | 980/20000 [15:57:01<401:44:38, 76.04s/it]

Updates 979, num timesteps 392000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 22.45


  5%|██████▉                                                                                                                                      | 981/20000 [15:58:13<396:21:10, 75.02s/it]

Updates 980, num timesteps 392400, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.21, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 17.90


  5%|██████▉                                                                                                                                      | 982/20000 [15:59:31<400:49:37, 75.87s/it]

Updates 981, num timesteps 392800, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.45, min/max reward -1.05/0.55
dist_entropy 2.68, value_loss 0.01, action_loss -0.00, explor_rew 0.033750 mean_episode_steps 24.00
Updates 982, num timesteps 393200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 23.35


  5%|██████▉                                                                                                                                      | 983/20000 [16:00:48<402:20:32, 76.17s/it]

Updates 983, num timesteps 393600, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.02, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 17.05


  5%|██████▉                                                                                                                                      | 985/20000 [16:03:19<401:38:08, 76.04s/it]

Updates 984, num timesteps 394000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 26.90
Updates 985, num timesteps 394400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 19.65


  5%|██████▉                                                                                                                                      | 986/20000 [16:04:36<402:33:10, 76.22s/it]

Updates 986, num timesteps 394800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.02, explor_rew 0.033000 mean_episode_steps 19.00


  5%|██████▉                                                                                                                                      | 988/20000 [16:07:09<402:30:40, 76.22s/it]

Updates 987, num timesteps 395200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.01, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 22.85


  5%|██████▉                                                                                                                                      | 989/20000 [16:08:22<397:20:58, 75.24s/it]

Updates 988, num timesteps 395600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 24.15


  5%|██████▉                                                                                                                                      | 990/20000 [16:09:37<398:12:15, 75.41s/it]

Updates 989, num timesteps 396000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 23.05


  5%|██████▉                                                                                                                                      | 991/20000 [16:10:53<398:30:17, 75.47s/it]

Updates 990, num timesteps 396400, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.02, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.05
Updates 991, num timesteps 396800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 26.65


  5%|███████                                                                                                                                      | 993/20000 [16:13:21<393:13:56, 74.48s/it]

Updates 992, num timesteps 397200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 23.35


  5%|███████                                                                                                                                      | 994/20000 [16:14:21<371:28:44, 70.36s/it]

Updates 993, num timesteps 397600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 23.05


  5%|███████                                                                                                                                      | 995/20000 [16:15:22<355:16:19, 67.30s/it]

Updates 994, num timesteps 398000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 26.35


  5%|███████                                                                                                                                      | 996/20000 [16:16:22<344:28:46, 65.26s/it]

Updates 995, num timesteps 398400, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 18.00
Updates 996, num timesteps 398800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.05/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 22.75


  5%|███████                                                                                                                                      | 998/20000 [16:18:20<327:11:56, 61.99s/it]

Updates 997, num timesteps 399200, FPS 6 
Last 20 training episodes: mean/median reward 0.18/-0.04, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.02, explor_rew 0.034000 mean_episode_steps 19.80


  5%|███████                                                                                                                                      | 999/20000 [16:19:18<319:40:17, 60.57s/it]

Updates 998, num timesteps 399600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.95
Updates 999, num timesteps 400000, FPS 6 
Last 20 training episodes: mean/median reward 0.53/0.55, min/max reward 0.45/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 19.50


  5%|███████                                                                                                                                     | 1000/20000 [16:20:15<315:14:27, 59.73s/it]

Updates 1000, num timesteps 400400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.85


  5%|███████                                                                                                                                     | 1002/20000 [16:22:10<308:53:39, 58.53s/it]

Updates 1001, num timesteps 400800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 24.90
Updates 1002, num timesteps 401200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 17.80


  5%|███████                                                                                                                                     | 1003/20000 [16:23:07<306:17:23, 58.04s/it]

Updates 1003, num timesteps 401600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.50


  5%|███████                                                                                                                                     | 1005/20000 [16:25:02<304:11:13, 57.65s/it]

Updates 1004, num timesteps 402000, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.29, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 33.20
Updates 1005, num timesteps 402400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 22.65


  5%|███████                                                                                                                                     | 1006/20000 [16:25:59<303:06:51, 57.45s/it]

Updates 1006, num timesteps 402800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 18.15


  5%|███████                                                                                                                                     | 1007/20000 [16:26:55<302:02:48, 57.25s/it]

Updates 1007, num timesteps 403200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.85


  5%|███████                                                                                                                                     | 1009/20000 [16:28:50<301:39:47, 57.18s/it]

Updates 1008, num timesteps 403600, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.01, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 20.40


  5%|███████                                                                                                                                     | 1010/20000 [16:29:47<301:12:10, 57.10s/it]

Updates 1009, num timesteps 404000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 21.80


  5%|███████                                                                                                                                     | 1011/20000 [16:30:44<300:59:55, 57.06s/it]

Updates 1010, num timesteps 404400, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 19.55


  5%|███████                                                                                                                                     | 1012/20000 [16:31:41<301:05:07, 57.08s/it]

Updates 1011, num timesteps 404800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 25.40
Updates 1012, num timesteps 405200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.03, action_loss 0.00, explor_rew 0.030500 mean_episode_steps 21.55


  5%|███████                                                                                                                                     | 1013/20000 [16:32:37<300:16:26, 56.93s/it]

Updates 1013, num timesteps 405600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 18.85


  5%|███████                                                                                                                                     | 1015/20000 [16:34:31<300:10:02, 56.92s/it]

Updates 1014, num timesteps 406000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.25


  5%|███████                                                                                                                                     | 1016/20000 [16:35:28<300:23:04, 56.96s/it]

Updates 1015, num timesteps 406400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 16.00


  5%|███████                                                                                                                                     | 1017/20000 [16:36:25<300:11:29, 56.93s/it]

Updates 1016, num timesteps 406800, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.23, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 22.80


  5%|███████▏                                                                                                                                    | 1018/20000 [16:37:22<300:17:46, 56.95s/it]

Updates 1017, num timesteps 407200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 21.05


  5%|███████▏                                                                                                                                    | 1019/20000 [16:38:19<300:19:40, 56.96s/it]

Updates 1018, num timesteps 407600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.03, explor_rew 0.029000 mean_episode_steps 25.35
Updates 1019, num timesteps 408000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.67, value_loss 0.02, action_loss 0.00, explor_rew 0.031750 mean_episode_steps 16.00


  5%|███████▏                                                                                                                                    | 1020/20000 [16:39:16<300:46:07, 57.05s/it]

Updates 1020, num timesteps 408400, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 19.50


  5%|███████▏                                                                                                                                    | 1022/20000 [16:41:10<299:54:17, 56.89s/it]

Updates 1021, num timesteps 408800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.05/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 27.05
Updates 1022, num timesteps 409200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.05


  5%|███████▏                                                                                                                                    | 1024/20000 [16:43:03<299:07:39, 56.75s/it]

Updates 1023, num timesteps 409600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 26.40
Updates 1024, num timesteps 410000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.30


  5%|███████▏                                                                                                                                    | 1026/20000 [16:44:58<300:12:16, 56.96s/it]

Updates 1025, num timesteps 410400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 27.25


  5%|███████▏                                                                                                                                    | 1027/20000 [16:45:54<299:35:42, 56.85s/it]

Updates 1026, num timesteps 410800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 18.60
Updates 1027, num timesteps 411200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.50


  5%|███████▏                                                                                                                                    | 1029/20000 [16:47:48<299:54:58, 56.91s/it]

Updates 1028, num timesteps 411600, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.02, explor_rew 0.033250 mean_episode_steps 28.35


  5%|███████▏                                                                                                                                    | 1030/20000 [16:48:45<299:19:18, 56.80s/it]

Updates 1029, num timesteps 412000, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.02, explor_rew 0.033500 mean_episode_steps 32.20


  5%|███████▏                                                                                                                                    | 1031/20000 [16:49:41<298:50:59, 56.72s/it]

Updates 1030, num timesteps 412400, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.20, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 25.30


  5%|███████▏                                                                                                                                    | 1032/20000 [16:50:38<299:19:18, 56.81s/it]

Updates 1031, num timesteps 412800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.05/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 22.15
Updates 1032, num timesteps 413200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 22.10


  5%|███████▏                                                                                                                                    | 1034/20000 [16:52:33<299:55:21, 56.93s/it]

Updates 1033, num timesteps 413600, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.02, explor_rew 0.034000 mean_episode_steps 22.65


  5%|███████▏                                                                                                                                    | 1035/20000 [16:53:29<299:34:52, 56.87s/it]

Updates 1034, num timesteps 414000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.02, explor_rew 0.019750 mean_episode_steps 22.10
Updates 1035, num timesteps 414400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 21.70


  5%|███████▎                                                                                                                                    | 1036/20000 [16:54:26<299:09:49, 56.79s/it]

Updates 1036, num timesteps 414800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.03, explor_rew 0.030250 mean_episode_steps 22.65


  5%|███████▎                                                                                                                                    | 1038/20000 [16:56:20<299:17:37, 56.82s/it]

Updates 1037, num timesteps 415200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 27.65


  5%|███████▎                                                                                                                                    | 1039/20000 [16:57:17<299:19:08, 56.83s/it]

Updates 1038, num timesteps 415600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 23.20


  5%|███████▎                                                                                                                                    | 1040/20000 [16:58:14<299:45:44, 56.92s/it]

Updates 1039, num timesteps 416000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss 0.00, explor_rew 0.033250 mean_episode_steps 24.85
Updates 1040, num timesteps 416400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 26.20


  5%|███████▎                                                                                                                                    | 1042/20000 [17:00:07<299:21:53, 56.85s/it]

Updates 1041, num timesteps 416800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 24.85


  5%|███████▎                                                                                                                                    | 1043/20000 [17:01:04<298:51:54, 56.76s/it]

Updates 1042, num timesteps 417200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.02, action_loss -0.02, explor_rew 0.023750 mean_episode_steps 27.60


  5%|███████▎                                                                                                                                    | 1044/20000 [17:02:00<298:35:42, 56.71s/it]

Updates 1043, num timesteps 417600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.02, action_loss -0.00, explor_rew 0.026000 mean_episode_steps 32.95


  5%|███████▎                                                                                                                                    | 1045/20000 [17:02:57<298:48:29, 56.75s/it]

Updates 1044, num timesteps 418000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.01, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 21.80


  5%|███████▎                                                                                                                                    | 1046/20000 [17:03:53<297:33:44, 56.52s/it]

Updates 1045, num timesteps 418400, FPS 6 
Last 20 training episodes: mean/median reward -0.25/-0.04, min/max reward -1.05/0.45
dist_entropy 2.79, value_loss 0.27, action_loss 0.04, explor_rew 0.016750 mean_episode_steps 27.10


  5%|███████▎                                                                                                                                    | 1047/20000 [17:04:47<293:25:41, 55.73s/it]

Updates 1046, num timesteps 418800, FPS 6 
Last 20 training episodes: mean/median reward -0.06/-0.04, min/max reward -1.05/0.75
dist_entropy 3.27, value_loss 0.16, action_loss 0.03, explor_rew -0.042000 mean_episode_steps 24.70


  5%|███████▎                                                                                                                                    | 1048/20000 [17:05:42<292:00:12, 55.47s/it]

Updates 1047, num timesteps 419200, FPS 6 
Last 20 training episodes: mean/median reward -0.13/-0.04, min/max reward -1.05/0.55
dist_entropy 2.88, value_loss 0.13, action_loss -0.01, explor_rew -0.015250 mean_episode_steps 86.95


  5%|███████▎                                                                                                                                    | 1049/20000 [17:06:38<293:46:37, 55.81s/it]

Updates 1048, num timesteps 419600, FPS 6 
Last 20 training episodes: mean/median reward 0.06/-0.04, min/max reward -1.05/0.55
dist_entropy 2.87, value_loss 0.06, action_loss -0.01, explor_rew 0.025250 mean_episode_steps 29.40


  5%|███████▎                                                                                                                                    | 1050/20000 [17:07:35<294:32:57, 55.96s/it]

Updates 1049, num timesteps 420000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 2.98, value_loss 0.05, action_loss -0.03, explor_rew 0.011750 mean_episode_steps 31.90
Updates 1050, num timesteps 420400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 3.14, value_loss 0.10, action_loss 0.01, explor_rew -0.013250 mean_episode_steps 22.55


  5%|███████▎                                                                                                                                    | 1052/20000 [17:09:26<294:09:30, 55.89s/it]

Updates 1051, num timesteps 420800, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.05/0.55
dist_entropy 3.18, value_loss 0.09, action_loss -0.02, explor_rew -0.012750 mean_episode_steps 48.90


  5%|███████▎                                                                                                                                    | 1053/20000 [17:10:22<294:20:10, 55.92s/it]

Updates 1052, num timesteps 421200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 3.17, value_loss 0.05, action_loss -0.03, explor_rew 0.004000 mean_episode_steps 29.50


  5%|███████▍                                                                                                                                    | 1054/20000 [17:11:19<295:13:21, 56.10s/it]

Updates 1053, num timesteps 421600, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.91, value_loss 0.03, action_loss 0.01, explor_rew 0.019500 mean_episode_steps 30.70


  5%|███████▍                                                                                                                                    | 1055/20000 [17:12:16<296:51:32, 56.41s/it]

Updates 1054, num timesteps 422000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.05/0.55
dist_entropy 2.75, value_loss 0.03, action_loss -0.01, explor_rew 0.024000 mean_episode_steps 27.40


  5%|███████▍                                                                                                                                    | 1056/20000 [17:13:12<296:13:02, 56.29s/it]

Updates 1055, num timesteps 422400, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.08, action_loss -0.01, explor_rew 0.010250 mean_episode_steps 35.40


  5%|███████▍                                                                                                                                    | 1057/20000 [17:14:08<296:26:55, 56.34s/it]

Updates 1056, num timesteps 422800, FPS 6 
Last 20 training episodes: mean/median reward 0.07/0.45, min/max reward -1.05/0.55
dist_entropy 2.86, value_loss 0.06, action_loss -0.01, explor_rew 0.016500 mean_episode_steps 33.45


  5%|███████▍                                                                                                                                    | 1058/20000 [17:15:05<296:06:41, 56.28s/it]

Updates 1057, num timesteps 423200, FPS 6 
Last 20 training episodes: mean/median reward 0.12/-0.04, min/max reward -1.05/0.55
dist_entropy 3.13, value_loss 0.06, action_loss -0.01, explor_rew 0.002500 mean_episode_steps 26.50


  5%|███████▍                                                                                                                                    | 1059/20000 [17:16:02<297:17:37, 56.50s/it]

Updates 1058, num timesteps 423600, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.45, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.04, action_loss -0.03, explor_rew 0.029000 mean_episode_steps 29.30


  5%|███████▍                                                                                                                                    | 1060/20000 [17:16:59<298:07:46, 56.67s/it]

Updates 1059, num timesteps 424000, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.45, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 21.55


  5%|███████▍                                                                                                                                    | 1061/20000 [17:17:55<297:53:30, 56.62s/it]

Updates 1060, num timesteps 424400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.00, explor_rew 0.027750 mean_episode_steps 22.60


  5%|███████▍                                                                                                                                    | 1062/20000 [17:18:52<297:42:48, 56.59s/it]

Updates 1061, num timesteps 424800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.76, value_loss 0.03, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 25.00


  5%|███████▍                                                                                                                                    | 1063/20000 [17:19:48<297:54:05, 56.63s/it]

Updates 1062, num timesteps 425200, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 27.60


  5%|███████▍                                                                                                                                    | 1064/20000 [17:20:45<298:02:09, 56.66s/it]

Updates 1063, num timesteps 425600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.01, action_loss 0.01, explor_rew 0.030500 mean_episode_steps 22.65
Updates 1064, num timesteps 426000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -1.05/0.55
dist_entropy 2.73, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 32.15


  5%|███████▍                                                                                                                                    | 1065/20000 [17:21:42<298:55:13, 56.83s/it]

Updates 1065, num timesteps 426400, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.55, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.04, action_loss -0.02, explor_rew 0.025500 mean_episode_steps 18.15


  5%|███████▍                                                                                                                                    | 1067/20000 [17:23:36<299:15:46, 56.90s/it]

Updates 1066, num timesteps 426800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 2.83, value_loss 0.02, action_loss -0.04, explor_rew 0.025000 mean_episode_steps 21.50


  5%|███████▍                                                                                                                                    | 1068/20000 [17:24:33<299:17:06, 56.91s/it]

Updates 1067, num timesteps 427200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 26.80


  5%|███████▍                                                                                                                                    | 1069/20000 [17:25:30<298:57:03, 56.85s/it]

Updates 1068, num timesteps 427600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 25.50
Updates 1069, num timesteps 428000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.05/0.55
dist_entropy 3.05, value_loss 0.08, action_loss -0.03, explor_rew -0.017500 mean_episode_steps 28.55


  5%|███████▍                                                                                                                                    | 1070/20000 [17:26:25<295:52:40, 56.27s/it]

Updates 1070, num timesteps 428400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -1.05/0.55
dist_entropy 3.54, value_loss 0.16, action_loss 0.00, explor_rew -0.037500 mean_episode_steps 43.00


  5%|███████▌                                                                                                                                    | 1072/20000 [17:28:16<293:46:03, 55.87s/it]

Updates 1071, num timesteps 428800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.05/0.55
dist_entropy 3.34, value_loss 0.07, action_loss 0.04, explor_rew -0.003250 mean_episode_steps 31.40
Updates 1072, num timesteps 429200, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.05/0.55
dist_entropy 2.95, value_loss 0.02, action_loss -0.00, explor_rew 0.012250 mean_episode_steps 40.75


  5%|███████▌                                                                                                                                    | 1074/20000 [17:30:08<293:41:09, 55.86s/it]

Updates 1073, num timesteps 429600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 3.04, value_loss 0.03, action_loss -0.02, explor_rew 0.003500 mean_episode_steps 32.05


  5%|███████▌                                                                                                                                    | 1075/20000 [17:31:04<294:50:33, 56.09s/it]

Updates 1074, num timesteps 430000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.00, explor_rew 0.025500 mean_episode_steps 27.15
Updates 1075, num timesteps 430400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 20.90


  5%|███████▌                                                                                                                                    | 1077/20000 [17:32:58<296:58:17, 56.50s/it]

Updates 1076, num timesteps 430800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.03, explor_rew 0.028750 mean_episode_steps 25.75


  5%|███████▌                                                                                                                                    | 1078/20000 [17:33:55<297:47:34, 56.66s/it]

Updates 1077, num timesteps 431200, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 26.85


  5%|███████▌                                                                                                                                    | 1079/20000 [17:34:52<298:12:52, 56.74s/it]

Updates 1078, num timesteps 431600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.05/0.55
dist_entropy 2.75, value_loss 0.00, action_loss 2.21, explor_rew 0.030250 mean_episode_steps 22.95
Updates 1079, num timesteps 432000, FPS 6 
Last 20 training episodes: mean/median reward 0.10/0.55, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.09, action_loss -0.01, explor_rew 0.029000 mean_episode_steps 21.20


  5%|███████▌                                                                                                                                    | 1080/20000 [17:35:49<298:48:35, 56.86s/it]

Updates 1080, num timesteps 432400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.61, value_loss 0.02, action_loss -0.00, explor_rew 0.026500 mean_episode_steps 24.80


  5%|███████▌                                                                                                                                    | 1081/20000 [17:36:46<299:18:44, 56.95s/it]

Updates 1081, num timesteps 432800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 23.85


  5%|███████▌                                                                                                                                    | 1083/20000 [17:38:39<297:45:00, 56.66s/it]

Updates 1082, num timesteps 433200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.05/0.55
dist_entropy 3.02, value_loss 0.06, action_loss -0.02, explor_rew -0.010750 mean_episode_steps 33.95


  5%|███████▌                                                                                                                                    | 1084/20000 [17:39:36<297:47:43, 56.67s/it]

Updates 1083, num timesteps 433600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.05/0.55
dist_entropy 2.73, value_loss 0.02, action_loss -0.00, explor_rew 0.025750 mean_episode_steps 25.70


  5%|███████▌                                                                                                                                    | 1085/20000 [17:40:33<297:56:35, 56.71s/it]

Updates 1084, num timesteps 434000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 25.95


  5%|███████▌                                                                                                                                    | 1086/20000 [17:41:30<298:08:11, 56.75s/it]

Updates 1085, num timesteps 434400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 2.73, value_loss 0.01, action_loss -0.00, explor_rew 0.027750 mean_episode_steps 20.85


  5%|███████▌                                                                                                                                    | 1087/20000 [17:42:27<298:38:09, 56.84s/it]

Updates 1086, num timesteps 434800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.05/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 25.55
Updates 1087, num timesteps 435200, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 18.85


  5%|███████▌                                                                                                                                    | 1089/20000 [17:44:21<298:48:15, 56.88s/it]

Updates 1088, num timesteps 435600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.67, value_loss 0.01, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 25.50
Updates 1089, num timesteps 436000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.03/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 19.85


  5%|███████▋                                                                                                                                    | 1091/20000 [17:46:15<299:12:41, 56.97s/it]

Updates 1090, num timesteps 436400, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.01, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 26.80


  5%|███████▋                                                                                                                                    | 1092/20000 [17:47:12<298:52:19, 56.90s/it]

Updates 1091, num timesteps 436800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.03/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 20.55


  5%|███████▋                                                                                                                                    | 1093/20000 [17:48:09<299:02:07, 56.94s/it]

Updates 1092, num timesteps 437200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.05/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 19.90


  5%|███████▋                                                                                                                                    | 1094/20000 [17:49:06<299:18:31, 56.99s/it]

Updates 1093, num timesteps 437600, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -1.05/0.55
dist_entropy 2.55, value_loss 0.03, action_loss 0.00, explor_rew 0.028250 mean_episode_steps 25.40
Updates 1094, num timesteps 438000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.00, explor_rew 0.027750 mean_episode_steps 20.95


  5%|███████▋                                                                                                                                    | 1095/20000 [17:50:03<299:09:13, 56.97s/it]

Updates 1095, num timesteps 438400, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.55, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.10, action_loss -0.03, explor_rew 0.027500 mean_episode_steps 26.90


  5%|███████▋                                                                                                                                    | 1096/20000 [17:51:00<299:08:47, 56.97s/it]

Updates 1096, num timesteps 438800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.02, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 18.05


  5%|███████▋                                                                                                                                    | 1098/20000 [17:52:54<299:12:50, 56.99s/it]

Updates 1097, num timesteps 439200, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.13, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 23.80
Updates 1098, num timesteps 439600, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.55, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.09, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 22.30


  6%|███████▋                                                                                                                                    | 1100/20000 [17:54:48<299:46:02, 57.10s/it]

Updates 1099, num timesteps 440000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.50, min/max reward -0.03/0.55
dist_entropy 2.61, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 21.80
Updates 1100, num timesteps 440400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.57, value_loss 0.01, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 40.40


  6%|███████▋                                                                                                                                    | 1102/20000 [17:56:43<300:15:04, 57.20s/it]

Updates 1101, num timesteps 440800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.03/0.55
dist_entropy 2.62, value_loss 0.01, action_loss -0.01, explor_rew 0.030250 mean_episode_steps 20.95
Updates 1102, num timesteps 441200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.03/0.55
dist_entropy 2.66, value_loss 0.00, action_loss 0.00, explor_rew 0.031750 mean_episode_steps 19.55


  6%|███████▋                                                                                                                                    | 1103/20000 [17:57:40<300:37:09, 57.27s/it]

Updates 1103, num timesteps 441600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.03/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 18.20


  6%|███████▋                                                                                                                                    | 1104/20000 [17:58:37<300:34:55, 57.27s/it]

Updates 1104, num timesteps 442000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.01, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 18.70


  6%|███████▋                                                                                                                                    | 1106/20000 [18:00:32<300:36:19, 57.28s/it]

Updates 1105, num timesteps 442400, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.03/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 20.20


  6%|███████▋                                                                                                                                    | 1107/20000 [18:01:29<299:49:28, 57.13s/it]

Updates 1106, num timesteps 442800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.03/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 23.20


  6%|███████▊                                                                                                                                    | 1108/20000 [18:02:26<299:55:39, 57.15s/it]

Updates 1107, num timesteps 443200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.03/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 24.30
Updates 1108, num timesteps 443600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.03/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.00


  6%|███████▊                                                                                                                                    | 1110/20000 [18:04:20<299:24:11, 57.06s/it]

Updates 1109, num timesteps 444000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.03/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 21.05


  6%|███████▊                                                                                                                                    | 1111/20000 [18:05:17<299:24:08, 57.06s/it]

Updates 1110, num timesteps 444400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 22.60


  6%|███████▊                                                                                                                                    | 1112/20000 [18:06:14<300:04:18, 57.19s/it]

Updates 1111, num timesteps 444800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.03/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 25.10
Updates 1112, num timesteps 445200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward 0.02/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 25.50


  6%|███████▊                                                                                                                                    | 1114/20000 [18:08:08<299:12:41, 57.03s/it]

Updates 1113, num timesteps 445600, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.04, action_loss -0.02, explor_rew 0.030500 mean_episode_steps 20.35
Updates 1114, num timesteps 446000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.03/0.55
dist_entropy 2.71, value_loss 0.07, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 22.40


  6%|███████▊                                                                                                                                    | 1116/20000 [18:10:03<299:54:07, 57.17s/it]

Updates 1115, num timesteps 446400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.03/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 19.95
Updates 1116, num timesteps 446800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.03/0.55
dist_entropy 2.75, value_loss 0.01, action_loss 0.02, explor_rew 0.031000 mean_episode_steps 21.25


  6%|███████▊                                                                                                                                    | 1118/20000 [18:11:57<299:44:41, 57.15s/it]

Updates 1117, num timesteps 447200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.03, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 17.90


  6%|███████▊                                                                                                                                    | 1119/20000 [18:12:55<300:26:49, 57.29s/it]

Updates 1118, num timesteps 447600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 19.45


  6%|███████▊                                                                                                                                    | 1120/20000 [18:13:52<300:37:42, 57.32s/it]

Updates 1119, num timesteps 448000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 24.35
Updates 1120, num timesteps 448400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 22.80


  6%|███████▊                                                                                                                                    | 1122/20000 [18:15:46<299:19:49, 57.08s/it]

Updates 1121, num timesteps 448800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 23.50
Updates 1122, num timesteps 449200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 18.55


  6%|███████▊                                                                                                                                    | 1124/20000 [18:17:40<298:39:28, 56.96s/it]

Updates 1123, num timesteps 449600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.05/0.55
dist_entropy 2.83, value_loss 0.00, action_loss -0.03, explor_rew 0.028000 mean_episode_steps 27.95


  6%|███████▉                                                                                                                                    | 1125/20000 [18:18:37<298:27:31, 56.92s/it]

Updates 1124, num timesteps 450000, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -0.05/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 19.05
Updates 1125, num timesteps 450400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 20.70


  6%|███████▉                                                                                                                                    | 1126/20000 [18:19:34<298:56:45, 57.02s/it]

Updates 1126, num timesteps 450800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 20.65


  6%|███████▉                                                                                                                                    | 1128/20000 [18:21:28<299:04:18, 57.05s/it]

Updates 1127, num timesteps 451200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 24.70
Updates 1128, num timesteps 451600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.90


  6%|███████▉                                                                                                                                    | 1130/20000 [18:23:22<298:35:58, 56.97s/it]

Updates 1129, num timesteps 452000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 29.60
Updates 1130, num timesteps 452400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 18.95


  6%|███████▉                                                                                                                                    | 1131/20000 [18:24:20<299:23:15, 57.12s/it]

Updates 1131, num timesteps 452800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 18.10


  6%|███████▉                                                                                                                                    | 1132/20000 [18:25:17<299:38:50, 57.17s/it]

Updates 1132, num timesteps 453200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 27.35


  6%|███████▉                                                                                                                                    | 1134/20000 [18:27:10<298:25:09, 56.94s/it]

Updates 1133, num timesteps 453600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 19.05


  6%|███████▉                                                                                                                                    | 1135/20000 [18:28:07<297:58:43, 56.86s/it]

Updates 1134, num timesteps 454000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.86, value_loss 0.00, action_loss -0.02, explor_rew 0.034000 mean_episode_steps 27.05


  6%|███████▉                                                                                                                                    | 1136/20000 [18:29:04<297:54:01, 56.85s/it]

Updates 1135, num timesteps 454400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 22.65


  6%|███████▉                                                                                                                                    | 1137/20000 [18:30:01<297:28:19, 56.77s/it]

Updates 1136, num timesteps 454800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.05/0.55
dist_entropy 2.95, value_loss 0.06, action_loss -0.02, explor_rew 0.014500 mean_episode_steps 29.25


  6%|███████▉                                                                                                                                    | 1138/20000 [18:30:57<296:16:42, 56.55s/it]

Updates 1137, num timesteps 455200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 3.24, value_loss 0.06, action_loss -0.02, explor_rew 0.001000 mean_episode_steps 34.95
Updates 1138, num timesteps 455600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.02, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 17.20


  6%|███████▉                                                                                                                                    | 1140/20000 [18:32:50<296:58:35, 56.69s/it]

Updates 1139, num timesteps 456000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss 0.00, explor_rew 0.033250 mean_episode_steps 17.30
Updates 1140, num timesteps 456400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss 0.03, explor_rew 0.032250 mean_episode_steps 29.20


  6%|███████▉                                                                                                                                    | 1141/20000 [18:33:47<297:32:57, 56.80s/it]

Updates 1141, num timesteps 456800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 24.75


  6%|████████                                                                                                                                    | 1143/20000 [18:35:41<297:22:21, 56.77s/it]

Updates 1142, num timesteps 457200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.02, explor_rew 0.032500 mean_episode_steps 23.65
Updates 1143, num timesteps 457600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.05/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.02, explor_rew 0.028000 mean_episode_steps 24.55


  6%|████████                                                                                                                                    | 1145/20000 [18:37:34<296:56:30, 56.70s/it]

Updates 1144, num timesteps 458000, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 18.30


  6%|████████                                                                                                                                    | 1146/20000 [18:38:30<296:12:03, 56.56s/it]

Updates 1145, num timesteps 458400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 28.05
Updates 1146, num timesteps 458800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 24.65


  6%|████████                                                                                                                                    | 1148/20000 [18:40:24<296:32:49, 56.63s/it]

Updates 1147, num timesteps 459200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 22.20


  6%|████████                                                                                                                                    | 1149/20000 [18:41:21<296:58:31, 56.71s/it]

Updates 1148, num timesteps 459600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 16.75
Updates 1149, num timesteps 460000, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.02, explor_rew 0.031500 mean_episode_steps 19.15


  6%|████████                                                                                                                                    | 1150/20000 [18:42:18<297:29:52, 56.82s/it]

Updates 1150, num timesteps 460400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.05/0.55
dist_entropy 2.82, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 24.60


  6%|████████                                                                                                                                    | 1152/20000 [18:44:12<297:34:09, 56.84s/it]

Updates 1151, num timesteps 460800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 21.00


  6%|████████                                                                                                                                    | 1153/20000 [18:45:08<297:12:09, 56.77s/it]

Updates 1152, num timesteps 461200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.45, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 20.80


  6%|████████                                                                                                                                    | 1154/20000 [18:46:05<296:56:25, 56.72s/it]

Updates 1153, num timesteps 461600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.15
Updates 1154, num timesteps 462000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 26.45


  6%|████████                                                                                                                                    | 1156/20000 [18:47:58<297:03:39, 56.75s/it]

Updates 1155, num timesteps 462400, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 19.40
Updates 1156, num timesteps 462800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.40


  6%|████████                                                                                                                                    | 1158/20000 [18:49:52<296:50:01, 56.71s/it]

Updates 1157, num timesteps 463200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 22.65


  6%|████████                                                                                                                                    | 1159/20000 [18:50:48<296:30:23, 56.65s/it]

Updates 1158, num timesteps 463600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 19.70


  6%|████████                                                                                                                                    | 1160/20000 [18:51:45<296:39:01, 56.68s/it]

Updates 1159, num timesteps 464000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 22.30
Updates 1160, num timesteps 464400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 24.05


  6%|████████▏                                                                                                                                   | 1162/20000 [18:53:39<296:42:58, 56.70s/it]

Updates 1161, num timesteps 464800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 17.60


  6%|████████▏                                                                                                                                   | 1163/20000 [18:54:35<296:18:34, 56.63s/it]

Updates 1162, num timesteps 465200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.02, explor_rew 0.032500 mean_episode_steps 20.55


  6%|████████▏                                                                                                                                   | 1164/20000 [18:55:31<295:57:18, 56.56s/it]

Updates 1163, num timesteps 465600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 16.75
Updates 1164, num timesteps 466000, FPS 6 
Last 20 training episodes: mean/median reward 0.49/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.70


  6%|████████▏                                                                                                                                   | 1166/20000 [18:57:24<294:35:14, 56.31s/it]

Updates 1165, num timesteps 466400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 26.30


  6%|████████▏                                                                                                                                   | 1167/20000 [18:58:20<293:51:59, 56.17s/it]

Updates 1166, num timesteps 466800, FPS 6 
Last 20 training episodes: mean/median reward 0.19/0.21, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 27.80
Updates 1167, num timesteps 467200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.60


  6%|████████▏                                                                                                                                   | 1168/20000 [18:59:16<294:00:34, 56.20s/it]

Updates 1168, num timesteps 467600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 19.95


  6%|████████▏                                                                                                                                   | 1170/20000 [19:01:08<292:58:52, 56.01s/it]

Updates 1169, num timesteps 468000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 32.80


  6%|████████▏                                                                                                                                   | 1171/20000 [19:02:03<292:27:16, 55.92s/it]

Updates 1170, num timesteps 468400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 24.00
Updates 1171, num timesteps 468800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.01, explor_rew 0.032750 mean_episode_steps 20.65


  6%|████████▏                                                                                                                                   | 1173/20000 [19:03:55<292:33:22, 55.94s/it]

Updates 1172, num timesteps 469200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 22.50


  6%|████████▏                                                                                                                                   | 1174/20000 [19:04:51<292:12:57, 55.88s/it]

Updates 1173, num timesteps 469600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 26.70
Updates 1174, num timesteps 470000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.02, explor_rew 0.031250 mean_episode_steps 19.60


  6%|████████▏                                                                                                                                   | 1176/20000 [19:06:44<293:28:29, 56.13s/it]

Updates 1175, num timesteps 470400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 21.50


  6%|████████▏                                                                                                                                   | 1177/20000 [19:07:40<292:38:45, 55.97s/it]

Updates 1176, num timesteps 470800, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 20.55
Updates 1177, num timesteps 471200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 27.20


  6%|████████▏                                                                                                                                   | 1178/20000 [19:08:36<293:15:32, 56.09s/it]

Updates 1178, num timesteps 471600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 18.75


  6%|████████▎                                                                                                                                   | 1179/20000 [19:09:33<294:10:51, 56.27s/it]

Updates 1179, num timesteps 472000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 21.60


  6%|████████▎                                                                                                                                   | 1180/20000 [19:10:29<294:12:10, 56.28s/it]

Updates 1180, num timesteps 472400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 24.15


  6%|████████▎                                                                                                                                   | 1182/20000 [19:12:21<293:01:58, 56.06s/it]

Updates 1181, num timesteps 472800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.02, explor_rew 0.029000 mean_episode_steps 28.70


  6%|████████▎                                                                                                                                   | 1183/20000 [19:13:17<293:32:04, 56.16s/it]

Updates 1182, num timesteps 473200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 20.50


  6%|████████▎                                                                                                                                   | 1184/20000 [19:14:12<291:47:37, 55.83s/it]

Updates 1183, num timesteps 473600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.05/0.55
dist_entropy 2.86, value_loss 0.04, action_loss 0.01, explor_rew 0.011250 mean_episode_steps 21.35


  6%|████████▎                                                                                                                                   | 1185/20000 [19:15:07<290:59:36, 55.68s/it]

Updates 1184, num timesteps 474000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.05/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.01, explor_rew 0.022500 mean_episode_steps 23.55
Updates 1185, num timesteps 474400, FPS 6 
Last 20 training episodes: mean/median reward 0.50/0.55, min/max reward -0.04/0.55
dist_entropy 2.81, value_loss 0.01, action_loss 0.01, explor_rew 0.026750 mean_episode_steps 17.00


  6%|████████▎                                                                                                                                   | 1186/20000 [19:16:04<291:53:13, 55.85s/it]

Updates 1186, num timesteps 474800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 26.00


  6%|████████▎                                                                                                                                   | 1188/20000 [19:17:55<291:06:09, 55.71s/it]

Updates 1187, num timesteps 475200, FPS 6 
Last 20 training episodes: mean/median reward -0.10/0.25, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.06, action_loss -0.02, explor_rew 0.029250 mean_episode_steps 15.30


  6%|████████▎                                                                                                                                   | 1189/20000 [19:18:50<290:30:16, 55.60s/it]

Updates 1188, num timesteps 475600, FPS 6 
Last 20 training episodes: mean/median reward 0.10/-0.04, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.01, explor_rew 0.035000 mean_episode_steps 20.70


  6%|████████▎                                                                                                                                   | 1190/20000 [19:19:46<290:18:11, 55.56s/it]

Updates 1189, num timesteps 476000, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.01, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 24.30
Updates 1190, num timesteps 476400, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.01, explor_rew 0.029500 mean_episode_steps 16.75


  6%|████████▎                                                                                                                                   | 1192/20000 [19:21:39<292:21:29, 55.96s/it]

Updates 1191, num timesteps 476800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 25.70
Updates 1192, num timesteps 477200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 22.70


  6%|████████▎                                                                                                                                   | 1193/20000 [19:22:35<293:17:56, 56.14s/it]

Updates 1193, num timesteps 477600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 26.75


  6%|████████▎                                                                                                                                   | 1195/20000 [19:24:27<292:10:32, 55.93s/it]

Updates 1194, num timesteps 478000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 20.00


  6%|████████▎                                                                                                                                   | 1196/20000 [19:25:22<292:00:31, 55.90s/it]

Updates 1195, num timesteps 478400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 23.10


  6%|████████▍                                                                                                                                   | 1197/20000 [19:26:18<291:36:26, 55.83s/it]

Updates 1196, num timesteps 478800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 20.30
Updates 1197, num timesteps 479200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 24.70


  6%|████████▍                                                                                                                                   | 1199/20000 [19:28:10<292:11:15, 55.95s/it]

Updates 1198, num timesteps 479600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.02, explor_rew 0.033250 mean_episode_steps 21.70


  6%|████████▍                                                                                                                                   | 1200/20000 [19:29:06<291:32:27, 55.83s/it]

Updates 1199, num timesteps 480000, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 19.45
Updates 1200, num timesteps 480400, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 21.05


  6%|████████▍                                                                                                                                   | 1201/20000 [19:30:02<292:15:39, 55.97s/it]

Updates 1201, num timesteps 480800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 18.70


  6%|████████▍                                                                                                                                   | 1202/20000 [19:30:58<292:01:33, 55.93s/it]

Updates 1202, num timesteps 481200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 22.40


  6%|████████▍                                                                                                                                   | 1203/20000 [19:31:54<292:24:22, 56.00s/it]

Updates 1203, num timesteps 481600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.20


  6%|████████▍                                                                                                                                   | 1205/20000 [19:33:46<291:57:41, 55.92s/it]

Updates 1204, num timesteps 482000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 23.35
Updates 1205, num timesteps 482400, FPS 6 
Last 20 training episodes: mean/median reward 0.51/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.031000 mean_episode_steps 23.45


  6%|████████▍                                                                                                                                   | 1207/20000 [19:35:38<291:53:54, 55.92s/it]

Updates 1206, num timesteps 482800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 29.95


  6%|████████▍                                                                                                                                   | 1208/20000 [19:36:34<291:23:29, 55.82s/it]

Updates 1207, num timesteps 483200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.80


  6%|████████▍                                                                                                                                   | 1209/20000 [19:37:29<290:36:27, 55.67s/it]

Updates 1208, num timesteps 483600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 18.35


  6%|████████▍                                                                                                                                   | 1210/20000 [19:38:25<290:45:08, 55.71s/it]

Updates 1209, num timesteps 484000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss 0.01, explor_rew 0.033250 mean_episode_steps 22.05
Updates 1210, num timesteps 484400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.58, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.90


  6%|████████▍                                                                                                                                   | 1211/20000 [19:39:21<291:25:55, 55.84s/it]

Updates 1211, num timesteps 484800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 36.60


  6%|████████▍                                                                                                                                   | 1212/20000 [19:40:17<291:37:35, 55.88s/it]

Updates 1212, num timesteps 485200, FPS 6 
Last 20 training episodes: mean/median reward 0.48/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 15.40


  6%|████████▍                                                                                                                                   | 1214/20000 [19:42:08<290:44:00, 55.71s/it]

Updates 1213, num timesteps 485600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 22.60


  6%|████████▌                                                                                                                                   | 1215/20000 [19:43:04<290:40:21, 55.71s/it]

Updates 1214, num timesteps 486000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.50


  6%|████████▌                                                                                                                                   | 1216/20000 [19:44:00<290:51:57, 55.75s/it]

Updates 1215, num timesteps 486400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 22.45


  6%|████████▌                                                                                                                                   | 1217/20000 [19:44:55<290:29:57, 55.68s/it]

Updates 1216, num timesteps 486800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.20


  6%|████████▌                                                                                                                                   | 1218/20000 [19:45:51<290:07:54, 55.61s/it]

Updates 1217, num timesteps 487200, FPS 6 
Last 20 training episodes: mean/median reward 0.26/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.034750 mean_episode_steps 22.60


  6%|████████▌                                                                                                                                   | 1219/20000 [19:46:46<289:46:50, 55.55s/it]

Updates 1218, num timesteps 487600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 21.80


  6%|████████▌                                                                                                                                   | 1220/20000 [19:47:42<290:00:17, 55.59s/it]

Updates 1219, num timesteps 488000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 26.25
Updates 1220, num timesteps 488400, FPS 6 
Last 20 training episodes: mean/median reward 0.50/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss 0.00, explor_rew 0.031500 mean_episode_steps 20.10


  6%|████████▌                                                                                                                                   | 1221/20000 [19:48:38<291:08:35, 55.81s/it]

Updates 1221, num timesteps 488800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 20.85


  6%|████████▌                                                                                                                                   | 1223/20000 [19:50:30<291:24:14, 55.87s/it]

Updates 1222, num timesteps 489200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 26.20


  6%|████████▌                                                                                                                                   | 1224/20000 [19:51:25<290:56:03, 55.78s/it]

Updates 1223, num timesteps 489600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.05


  6%|████████▌                                                                                                                                   | 1225/20000 [19:52:21<290:13:13, 55.65s/it]

Updates 1224, num timesteps 490000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 31.00


  6%|████████▌                                                                                                                                   | 1226/20000 [19:53:16<290:08:29, 55.64s/it]

Updates 1225, num timesteps 490400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 23.55


  6%|████████▌                                                                                                                                   | 1227/20000 [19:54:12<290:05:20, 55.63s/it]

Updates 1226, num timesteps 490800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 21.95


  6%|████████▌                                                                                                                                   | 1228/20000 [19:55:07<289:46:58, 55.57s/it]

Updates 1227, num timesteps 491200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 19.70
Updates 1228, num timesteps 491600, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.03/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 23.40


  6%|████████▌                                                                                                                                   | 1229/20000 [19:56:04<290:42:54, 55.75s/it]

Updates 1229, num timesteps 492000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss 0.00, explor_rew 0.031750 mean_episode_steps 18.70


  6%|████████▌                                                                                                                                   | 1231/20000 [19:57:56<291:19:40, 55.88s/it]

Updates 1230, num timesteps 492400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 24.45
Updates 1231, num timesteps 492800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 18.75


  6%|████████▋                                                                                                                                   | 1233/20000 [19:59:47<290:41:21, 55.76s/it]

Updates 1232, num timesteps 493200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -1.05/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 21.65


  6%|████████▋                                                                                                                                   | 1234/20000 [20:00:43<291:02:41, 55.83s/it]

Updates 1233, num timesteps 493600, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 27.45
Updates 1234, num timesteps 494000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss 0.01, explor_rew 0.032000 mean_episode_steps 26.25


  6%|████████▋                                                                                                                                   | 1236/20000 [20:02:35<290:43:42, 55.78s/it]

Updates 1235, num timesteps 494400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.65


  6%|████████▋                                                                                                                                   | 1237/20000 [20:03:30<290:51:19, 55.81s/it]

Updates 1236, num timesteps 494800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 22.05
Updates 1237, num timesteps 495200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 25.45


  6%|████████▋                                                                                                                                   | 1239/20000 [20:05:21<290:06:58, 55.67s/it]

Updates 1238, num timesteps 495600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.027250 mean_episode_steps 26.35


  6%|████████▋                                                                                                                                   | 1240/20000 [20:06:18<290:45:24, 55.80s/it]

Updates 1239, num timesteps 496000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 21.90


  6%|████████▋                                                                                                                                   | 1241/20000 [20:07:14<291:12:45, 55.89s/it]

Updates 1240, num timesteps 496400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 25.15
Updates 1241, num timesteps 496800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 23.60


  6%|████████▋                                                                                                                                   | 1242/20000 [20:08:10<292:02:14, 56.05s/it]

Updates 1242, num timesteps 497200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.40


  6%|████████▋                                                                                                                                   | 1244/20000 [20:10:01<291:01:17, 55.86s/it]

Updates 1243, num timesteps 497600, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.05/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 23.80


  6%|████████▋                                                                                                                                   | 1245/20000 [20:10:58<291:21:10, 55.92s/it]

Updates 1244, num timesteps 498000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 21.85


  6%|████████▋                                                                                                                                   | 1246/20000 [20:11:52<289:24:58, 55.56s/it]

Updates 1245, num timesteps 498400, FPS 6 
Last 20 training episodes: mean/median reward 0.09/-0.03, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.05, action_loss -0.01, explor_rew 0.009250 mean_episode_steps 34.25


  6%|████████▋                                                                                                                                   | 1247/20000 [20:12:48<289:17:07, 55.53s/it]

Updates 1246, num timesteps 498800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.04, action_loss -0.03, explor_rew 0.018750 mean_episode_steps 20.90


  6%|████████▋                                                                                                                                   | 1248/20000 [20:13:43<289:31:44, 55.58s/it]

Updates 1247, num timesteps 499200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.50, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.02, action_loss -0.03, explor_rew 0.027000 mean_episode_steps 22.10
Updates 1248, num timesteps 499600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.55, min/max reward -1.05/0.55
dist_entropy 2.81, value_loss 0.02, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 21.70


  6%|████████▋                                                                                                                                   | 1249/20000 [20:14:39<289:54:29, 55.66s/it]

Updates 1249, num timesteps 500000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 24.40


  6%|████████▊                                                                                                                                   | 1250/20000 [20:15:35<290:25:43, 55.76s/it]

Updates 1250, num timesteps 500400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss 0.01, explor_rew 0.032250 mean_episode_steps 18.90


  6%|████████▊                                                                                                                                   | 1251/20000 [20:16:31<291:06:58, 55.90s/it]

Updates 1251, num timesteps 500800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.60


  6%|████████▊                                                                                                                                   | 1252/20000 [20:17:27<291:06:48, 55.90s/it]

Updates 1252, num timesteps 501200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.40


  6%|████████▊                                                                                                                                   | 1254/20000 [20:19:19<290:53:25, 55.86s/it]

Updates 1253, num timesteps 501600, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 28.40


  6%|████████▊                                                                                                                                   | 1255/20000 [20:20:15<290:19:41, 55.76s/it]

Updates 1254, num timesteps 502000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 17.25
Updates 1255, num timesteps 502400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.84, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 24.80


  6%|████████▊                                                                                                                                   | 1257/20000 [20:22:05<289:13:46, 55.55s/it]

Updates 1256, num timesteps 502800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 25.55


  6%|████████▊                                                                                                                                   | 1258/20000 [20:23:01<289:23:14, 55.59s/it]

Updates 1257, num timesteps 503200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 28.35


  6%|████████▊                                                                                                                                   | 1259/20000 [20:23:57<289:25:36, 55.60s/it]

Updates 1258, num timesteps 503600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 20.65


  6%|████████▊                                                                                                                                   | 1260/20000 [20:24:52<289:20:24, 55.58s/it]

Updates 1259, num timesteps 504000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 31.10
Updates 1260, num timesteps 504400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 25.05


  6%|████████▊                                                                                                                                   | 1261/20000 [20:25:48<289:46:19, 55.67s/it]

Updates 1261, num timesteps 504800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.02, explor_rew 0.031750 mean_episode_steps 19.70


  6%|████████▊                                                                                                                                   | 1263/20000 [20:27:40<289:49:21, 55.68s/it]

Updates 1262, num timesteps 505200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 19.15
Updates 1263, num timesteps 505600, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 24.15


  6%|████████▊                                                                                                                                   | 1265/20000 [20:29:31<289:53:01, 55.70s/it]

Updates 1264, num timesteps 506000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -0.05/0.55
dist_entropy 2.83, value_loss 0.00, action_loss -0.02, explor_rew 0.033500 mean_episode_steps 26.60


  6%|████████▊                                                                                                                                   | 1266/20000 [20:30:27<290:03:27, 55.74s/it]

Updates 1265, num timesteps 506400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.02, explor_rew 0.032750 mean_episode_steps 27.45


  6%|████████▊                                                                                                                                   | 1267/20000 [20:31:22<289:18:12, 55.60s/it]

Updates 1266, num timesteps 506800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.034000 mean_episode_steps 28.50
Updates 1267, num timesteps 507200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.70


  6%|████████▉                                                                                                                                   | 1268/20000 [20:32:18<289:51:29, 55.71s/it]

Updates 1268, num timesteps 507600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 25.40


  6%|████████▉                                                                                                                                   | 1269/20000 [20:33:14<290:04:29, 55.75s/it]

Updates 1269, num timesteps 508000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 23.70


  6%|████████▉                                                                                                                                   | 1270/20000 [20:34:10<290:58:04, 55.93s/it]

Updates 1270, num timesteps 508400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.00, explor_rew 0.032500 mean_episode_steps 24.95


  6%|████████▉                                                                                                                                   | 1272/20000 [20:36:02<290:40:22, 55.87s/it]

Updates 1271, num timesteps 508800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 21.75
Updates 1272, num timesteps 509200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 30.20


  6%|████████▉                                                                                                                                   | 1274/20000 [20:37:54<290:21:18, 55.82s/it]

Updates 1273, num timesteps 509600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 22.10


  6%|████████▉                                                                                                                                   | 1275/20000 [20:38:49<290:26:25, 55.84s/it]

Updates 1274, num timesteps 510000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 22.20


  6%|████████▉                                                                                                                                   | 1276/20000 [20:39:45<289:52:16, 55.73s/it]

Updates 1275, num timesteps 510400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 27.95
Updates 1276, num timesteps 510800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.04/0.55
dist_entropy 2.78, value_loss 0.00, action_loss 0.01, explor_rew 0.031750 mean_episode_steps 22.45


  6%|████████▉                                                                                                                                   | 1278/20000 [20:41:35<288:01:05, 55.38s/it]

Updates 1277, num timesteps 511200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.05/0.55
dist_entropy 3.19, value_loss 0.10, action_loss -0.03, explor_rew -0.019250 mean_episode_steps 33.60


  6%|████████▉                                                                                                                                   | 1279/20000 [20:42:31<287:56:01, 55.37s/it]

Updates 1278, num timesteps 511600, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.45, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 24.25


  6%|████████▉                                                                                                                                   | 1280/20000 [20:43:26<288:21:38, 55.45s/it]

Updates 1279, num timesteps 512000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.95
Updates 1280, num timesteps 512400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 23.30


  6%|████████▉                                                                                                                                   | 1282/20000 [20:45:18<288:56:37, 55.57s/it]

Updates 1281, num timesteps 512800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss 0.01, explor_rew 0.032500 mean_episode_steps 25.00


  6%|████████▉                                                                                                                                   | 1283/20000 [20:46:13<289:13:48, 55.63s/it]

Updates 1282, num timesteps 513200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 18.90


  6%|████████▉                                                                                                                                   | 1284/20000 [20:47:09<289:41:30, 55.72s/it]

Updates 1283, num timesteps 513600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.25
Updates 1284, num timesteps 514000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 21.55


  6%|█████████                                                                                                                                   | 1286/20000 [20:49:01<290:04:15, 55.80s/it]

Updates 1285, num timesteps 514400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 26.25


  6%|█████████                                                                                                                                   | 1287/20000 [20:49:56<289:14:38, 55.64s/it]

Updates 1286, num timesteps 514800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 27.40


  6%|█████████                                                                                                                                   | 1288/20000 [20:50:52<289:00:36, 55.60s/it]

Updates 1287, num timesteps 515200, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 29.45


  6%|█████████                                                                                                                                   | 1289/20000 [20:51:47<288:44:14, 55.55s/it]

Updates 1288, num timesteps 515600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 19.30
Updates 1289, num timesteps 516000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -1.05/0.55
dist_entropy 2.75, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 18.80


  6%|█████████                                                                                                                                   | 1291/20000 [20:53:39<289:27:25, 55.70s/it]

Updates 1290, num timesteps 516400, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.00, explor_rew 0.034250 mean_episode_steps 26.75


  6%|█████████                                                                                                                                   | 1292/20000 [20:54:35<289:30:00, 55.71s/it]

Updates 1291, num timesteps 516800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.02, explor_rew 0.031750 mean_episode_steps 22.85
Updates 1292, num timesteps 517200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 26.55


  6%|█████████                                                                                                                                   | 1293/20000 [20:55:31<289:29:04, 55.71s/it]

Updates 1293, num timesteps 517600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.84, value_loss 0.04, action_loss 0.04, explor_rew 0.018250 mean_episode_steps 37.70


  6%|█████████                                                                                                                                   | 1294/20000 [20:56:26<289:19:50, 55.68s/it]

Updates 1294, num timesteps 518000, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 18.50


  6%|█████████                                                                                                                                   | 1295/20000 [20:57:22<289:46:41, 55.77s/it]

Updates 1295, num timesteps 518400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.81, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 19.90


  6%|█████████                                                                                                                                   | 1297/20000 [20:59:14<289:22:15, 55.70s/it]

Updates 1296, num timesteps 518800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.80, value_loss 0.00, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 20.90
Updates 1297, num timesteps 519200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.08, action_loss 0.01, explor_rew 0.028500 mean_episode_steps 29.85


  6%|█████████                                                                                                                                   | 1299/20000 [21:01:05<289:34:54, 55.75s/it]

Updates 1298, num timesteps 519600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 25.40
Updates 1299, num timesteps 520000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.00, explor_rew 0.031500 mean_episode_steps 19.50


  6%|█████████                                                                                                                                   | 1300/20000 [21:02:01<290:13:14, 55.87s/it]

Updates 1300, num timesteps 520400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 12.20


  7%|█████████                                                                                                                                   | 1301/20000 [21:02:58<290:58:53, 56.02s/it]

Updates 1301, num timesteps 520800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.05/0.55
dist_entropy 2.68, value_loss 0.01, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 20.45


  7%|█████████                                                                                                                                   | 1303/20000 [21:04:50<291:56:23, 56.21s/it]

Updates 1302, num timesteps 521200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 22.70
Updates 1303, num timesteps 521600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 24.95


  7%|█████████▏                                                                                                                                  | 1305/20000 [21:06:42<291:06:40, 56.06s/it]

Updates 1304, num timesteps 522000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 21.25


  7%|█████████▏                                                                                                                                  | 1306/20000 [21:07:38<290:32:09, 55.95s/it]

Updates 1305, num timesteps 522400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 22.70
Updates 1306, num timesteps 522800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 23.65


  7%|█████████▏                                                                                                                                  | 1307/20000 [21:08:34<290:41:29, 55.98s/it]

Updates 1307, num timesteps 523200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.01, explor_rew 0.032000 mean_episode_steps 21.70


  7%|█████████▏                                                                                                                                  | 1309/20000 [21:10:26<290:40:30, 55.99s/it]

Updates 1308, num timesteps 523600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 20.55
Updates 1309, num timesteps 524000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 19.40


  7%|█████████▏                                                                                                                                  | 1310/20000 [21:11:22<290:51:56, 56.03s/it]

Updates 1310, num timesteps 524400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss 0.01, explor_rew 0.031500 mean_episode_steps 19.65


  7%|█████████▏                                                                                                                                  | 1312/20000 [21:13:14<290:48:01, 56.02s/it]

Updates 1311, num timesteps 524800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.40
Updates 1312, num timesteps 525200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss 0.00, explor_rew 0.034000 mean_episode_steps 21.10


  7%|█████████▏                                                                                                                                  | 1314/20000 [21:15:06<290:36:02, 55.99s/it]

Updates 1313, num timesteps 525600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 26.35


  7%|█████████▏                                                                                                                                  | 1315/20000 [21:16:02<289:49:58, 55.84s/it]

Updates 1314, num timesteps 526000, FPS 6 
Last 20 training episodes: mean/median reward 0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.01, explor_rew 0.034000 mean_episode_steps 27.55
Updates 1315, num timesteps 526400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.65


  7%|█████████▏                                                                                                                                  | 1317/20000 [21:17:54<290:58:06, 56.07s/it]

Updates 1316, num timesteps 526800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 25.40


  7%|█████████▏                                                                                                                                  | 1318/20000 [21:18:51<291:01:15, 56.08s/it]

Updates 1317, num timesteps 527200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 22.35
Updates 1318, num timesteps 527600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.85


  7%|█████████▏                                                                                                                                  | 1320/20000 [21:20:42<289:44:14, 55.84s/it]

Updates 1319, num timesteps 528000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.00, explor_rew 0.033750 mean_episode_steps 28.85
Updates 1320, num timesteps 528400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 24.60


  7%|█████████▎                                                                                                                                  | 1322/20000 [21:22:34<289:41:09, 55.83s/it]

Updates 1321, num timesteps 528800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 21.85


  7%|█████████▎                                                                                                                                  | 1323/20000 [21:23:29<289:34:42, 55.82s/it]

Updates 1322, num timesteps 529200, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 22.15


  7%|█████████▎                                                                                                                                  | 1324/20000 [21:24:25<289:25:45, 55.79s/it]

Updates 1323, num timesteps 529600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 25.45


  7%|█████████▎                                                                                                                                  | 1325/20000 [21:25:21<289:30:35, 55.81s/it]

Updates 1324, num timesteps 530000, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 27.20


  7%|█████████▎                                                                                                                                  | 1326/20000 [21:26:17<289:03:55, 55.73s/it]

Updates 1325, num timesteps 530400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 23.95
Updates 1326, num timesteps 530800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 25.55


  7%|█████████▎                                                                                                                                  | 1327/20000 [21:27:13<290:36:31, 56.03s/it]

Updates 1327, num timesteps 531200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.01, explor_rew 0.028750 mean_episode_steps 18.00


  7%|█████████▎                                                                                                                                  | 1329/20000 [21:29:06<290:57:15, 56.10s/it]

Updates 1328, num timesteps 531600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 20.30
Updates 1329, num timesteps 532000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 23.20


  7%|█████████▎                                                                                                                                  | 1331/20000 [21:30:57<289:44:20, 55.87s/it]

Updates 1330, num timesteps 532400, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 19.95
Updates 1331, num timesteps 532800, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 19.00


  7%|█████████▎                                                                                                                                  | 1333/20000 [21:32:49<290:10:00, 55.96s/it]

Updates 1332, num timesteps 533200, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.034250 mean_episode_steps 19.80
Updates 1333, num timesteps 533600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 18.80


  7%|█████████▎                                                                                                                                  | 1334/20000 [21:33:45<290:04:15, 55.94s/it]

Updates 1334, num timesteps 534000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss 0.00, explor_rew 0.031750 mean_episode_steps 22.20


  7%|█████████▎                                                                                                                                  | 1336/20000 [21:35:37<289:57:51, 55.93s/it]

Updates 1335, num timesteps 534400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 26.10


  7%|█████████▎                                                                                                                                  | 1337/20000 [21:36:33<290:24:52, 56.02s/it]

Updates 1336, num timesteps 534800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 21.75
Updates 1337, num timesteps 535200, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.55, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 15.70


  7%|█████████▎                                                                                                                                  | 1339/20000 [21:38:25<290:27:54, 56.04s/it]

Updates 1338, num timesteps 535600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.60
Updates 1339, num timesteps 536000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 21.30


  7%|█████████▍                                                                                                                                  | 1341/20000 [21:40:18<290:59:51, 56.14s/it]

Updates 1340, num timesteps 536400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 14.15


  7%|█████████▍                                                                                                                                  | 1342/20000 [21:41:14<291:00:30, 56.15s/it]

Updates 1341, num timesteps 536800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 22.25
Updates 1342, num timesteps 537200, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 20.10


  7%|█████████▍                                                                                                                                  | 1344/20000 [21:43:07<291:39:10, 56.28s/it]

Updates 1343, num timesteps 537600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 23.95


  7%|█████████▍                                                                                                                                  | 1345/20000 [21:44:03<290:39:55, 56.09s/it]

Updates 1344, num timesteps 538000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 21.40
Updates 1345, num timesteps 538400, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.45


  7%|█████████▍                                                                                                                                  | 1346/20000 [21:44:58<290:22:17, 56.04s/it]

Updates 1346, num timesteps 538800, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.55, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.03, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 14.50


  7%|█████████▍                                                                                                                                  | 1348/20000 [21:46:50<289:06:11, 55.80s/it]

Updates 1347, num timesteps 539200, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.13, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.04, action_loss -0.01, explor_rew 0.027500 mean_episode_steps 30.00


  7%|█████████▍                                                                                                                                  | 1349/20000 [21:47:46<289:39:09, 55.91s/it]

Updates 1348, num timesteps 539600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.00, explor_rew 0.029000 mean_episode_steps 19.80


  7%|█████████▍                                                                                                                                  | 1350/20000 [21:48:41<289:04:15, 55.80s/it]

Updates 1349, num timesteps 540000, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.21, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.01, action_loss -0.03, explor_rew 0.031250 mean_episode_steps 25.20
Updates 1350, num timesteps 540400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 15.20


  7%|█████████▍                                                                                                                                  | 1352/20000 [21:50:33<288:52:49, 55.77s/it]

Updates 1351, num timesteps 540800, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.45, min/max reward -1.05/0.55
dist_entropy 2.95, value_loss 0.05, action_loss -0.00, explor_rew 0.003250 mean_episode_steps 36.25
Updates 1352, num timesteps 541200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.02, action_loss -0.01, explor_rew 0.025250 mean_episode_steps 40.70


  7%|█████████▍                                                                                                                                  | 1354/20000 [21:52:25<288:45:38, 55.75s/it]

Updates 1353, num timesteps 541600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss 0.01, explor_rew 0.032500 mean_episode_steps 29.05


  7%|█████████▍                                                                                                                                  | 1355/20000 [21:53:20<288:08:38, 55.64s/it]

Updates 1354, num timesteps 542000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 22.20


  7%|█████████▍                                                                                                                                  | 1356/20000 [21:54:16<288:18:54, 55.67s/it]

Updates 1355, num timesteps 542400, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss 0.01, explor_rew 0.033000 mean_episode_steps 17.10
Updates 1356, num timesteps 542800, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 28.05


  7%|█████████▌                                                                                                                                  | 1358/20000 [21:56:08<288:53:46, 55.79s/it]

Updates 1357, num timesteps 543200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 26.00
Updates 1358, num timesteps 543600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 21.25


  7%|█████████▌                                                                                                                                  | 1359/20000 [21:57:04<289:43:50, 55.95s/it]

Updates 1359, num timesteps 544000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 31.60


  7%|█████████▌                                                                                                                                  | 1361/20000 [21:58:55<289:03:05, 55.83s/it]

Updates 1360, num timesteps 544400, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.00, explor_rew 0.032750 mean_episode_steps 20.80


  7%|█████████▌                                                                                                                                  | 1362/20000 [21:59:51<288:43:39, 55.77s/it]

Updates 1361, num timesteps 544800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.033750 mean_episode_steps 25.65


  7%|█████████▌                                                                                                                                  | 1363/20000 [22:00:47<288:40:48, 55.76s/it]

Updates 1362, num timesteps 545200, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.01, action_loss 0.00, explor_rew 0.033750 mean_episode_steps 25.25
Updates 1363, num timesteps 545600, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 26.65


  7%|█████████▌                                                                                                                                  | 1364/20000 [22:01:43<289:32:05, 55.93s/it]

Updates 1364, num timesteps 546000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss 0.00, explor_rew 0.033250 mean_episode_steps 22.25


  7%|█████████▌                                                                                                                                  | 1365/20000 [22:02:39<289:14:17, 55.88s/it]

Updates 1365, num timesteps 546400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss 0.01, explor_rew 0.033500 mean_episode_steps 21.65


  7%|█████████▌                                                                                                                                  | 1366/20000 [22:03:35<289:06:47, 55.86s/it]

Updates 1366, num timesteps 546800, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -0.04/0.55
dist_entropy 2.79, value_loss 0.01, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 27.30


  7%|█████████▌                                                                                                                                  | 1368/20000 [22:05:26<288:41:01, 55.78s/it]

Updates 1367, num timesteps 547200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 23.05


  7%|█████████▌                                                                                                                                  | 1369/20000 [22:06:22<289:12:16, 55.88s/it]

Updates 1368, num timesteps 547600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 19.80


  7%|█████████▌                                                                                                                                  | 1370/20000 [22:07:18<289:00:34, 55.85s/it]

Updates 1369, num timesteps 548000, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 16.10


  7%|█████████▌                                                                                                                                  | 1371/20000 [22:08:14<288:49:34, 55.81s/it]

Updates 1370, num timesteps 548400, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.05/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.00, explor_rew 0.034000 mean_episode_steps 25.50


  7%|█████████▌                                                                                                                                  | 1372/20000 [22:09:09<287:58:09, 55.65s/it]

Updates 1371, num timesteps 548800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.00, explor_rew 0.034250 mean_episode_steps 29.70


  7%|█████████▌                                                                                                                                  | 1373/20000 [22:10:05<288:17:44, 55.72s/it]

Updates 1372, num timesteps 549200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 26.00


  7%|█████████▌                                                                                                                                  | 1374/20000 [22:11:01<288:11:19, 55.70s/it]

Updates 1373, num timesteps 549600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.02, explor_rew 0.034250 mean_episode_steps 27.25


  7%|█████████▋                                                                                                                                  | 1375/20000 [22:11:56<288:30:59, 55.77s/it]

Updates 1374, num timesteps 550000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss 0.00, explor_rew 0.033250 mean_episode_steps 16.55
Updates 1375, num timesteps 550400, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 22.80


  7%|█████████▋                                                                                                                                  | 1377/20000 [22:13:48<288:16:34, 55.73s/it]

Updates 1376, num timesteps 550800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -1.05/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 26.65


  7%|█████████▋                                                                                                                                  | 1378/20000 [22:14:43<288:07:48, 55.70s/it]

Updates 1377, num timesteps 551200, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 17.30


  7%|█████████▋                                                                                                                                  | 1379/20000 [22:15:39<288:19:38, 55.74s/it]

Updates 1378, num timesteps 551600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 27.60
Updates 1379, num timesteps 552000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 20.50


  7%|█████████▋                                                                                                                                  | 1381/20000 [22:17:31<288:35:10, 55.80s/it]

Updates 1380, num timesteps 552400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 24.05


  7%|█████████▋                                                                                                                                  | 1382/20000 [22:18:27<288:20:34, 55.75s/it]

Updates 1381, num timesteps 552800, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 17.40


  7%|█████████▋                                                                                                                                  | 1383/20000 [22:19:22<288:13:00, 55.73s/it]

Updates 1382, num timesteps 553200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 27.15


  7%|█████████▋                                                                                                                                  | 1384/20000 [22:20:18<288:16:12, 55.75s/it]

Updates 1383, num timesteps 553600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 19.30


  7%|█████████▋                                                                                                                                  | 1385/20000 [22:21:14<289:08:28, 55.92s/it]

Updates 1384, num timesteps 554000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 21.00


  7%|█████████▋                                                                                                                                  | 1386/20000 [22:22:11<289:18:50, 55.95s/it]

Updates 1385, num timesteps 554400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 23.85


  7%|█████████▋                                                                                                                                  | 1387/20000 [22:23:07<289:21:44, 55.97s/it]

Updates 1386, num timesteps 554800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 23.90


  7%|█████████▋                                                                                                                                  | 1388/20000 [22:24:02<288:42:29, 55.84s/it]

Updates 1387, num timesteps 555200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.02, action_loss -0.01, explor_rew 0.026750 mean_episode_steps 27.60


  7%|█████████▋                                                                                                                                  | 1389/20000 [22:24:58<288:43:11, 55.85s/it]

Updates 1388, num timesteps 555600, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 27.10


  7%|█████████▋                                                                                                                                  | 1390/20000 [22:25:53<287:47:42, 55.67s/it]

Updates 1389, num timesteps 556000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 25.30
Updates 1390, num timesteps 556400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 18.10


  7%|█████████▋                                                                                                                                  | 1392/20000 [22:27:45<288:38:01, 55.84s/it]

Updates 1391, num timesteps 556800, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.01, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 23.95


  7%|█████████▊                                                                                                                                  | 1393/20000 [22:28:41<288:00:29, 55.72s/it]

Updates 1392, num timesteps 557200, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.45, min/max reward -1.05/0.55
dist_entropy 2.68, value_loss 0.01, action_loss -0.00, explor_rew 0.033500 mean_episode_steps 24.15


  7%|█████████▊                                                                                                                                  | 1394/20000 [22:29:37<288:23:05, 55.80s/it]

Updates 1393, num timesteps 557600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 23.15


  7%|█████████▊                                                                                                                                  | 1395/20000 [22:30:32<288:05:36, 55.75s/it]

Updates 1394, num timesteps 558000, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.45, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.01, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 18.50
Updates 1395, num timesteps 558400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 22.60


  7%|█████████▊                                                                                                                                  | 1397/20000 [22:32:25<288:47:01, 55.88s/it]

Updates 1396, num timesteps 558800, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 24.20


  7%|█████████▊                                                                                                                                  | 1398/20000 [22:33:20<288:06:05, 55.76s/it]

Updates 1397, num timesteps 559200, FPS 6 
Last 20 training episodes: mean/median reward 0.27/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.01, action_loss -0.00, explor_rew 0.034000 mean_episode_steps 23.15
Updates 1398, num timesteps 559600, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 24.75


  7%|█████████▊                                                                                                                                  | 1400/20000 [22:35:12<288:32:22, 55.85s/it]

Updates 1399, num timesteps 560000, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss 0.00, explor_rew 0.034000 mean_episode_steps 17.80
Updates 1400, num timesteps 560400, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 23.20


  7%|█████████▊                                                                                                                                  | 1402/20000 [22:37:04<288:46:49, 55.90s/it]

Updates 1401, num timesteps 560800, FPS 6 
Last 20 training episodes: mean/median reward 0.23/0.20, min/max reward -0.04/0.55
dist_entropy 2.77, value_loss 0.00, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 20.80
Updates 1402, num timesteps 561200, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.01, action_loss 0.00, explor_rew 0.031500 mean_episode_steps 19.55


  7%|█████████▊                                                                                                                                  | 1404/20000 [22:38:56<289:24:15, 56.03s/it]

Updates 1403, num timesteps 561600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 23.35


  7%|█████████▊                                                                                                                                  | 1405/20000 [22:39:52<288:09:01, 55.79s/it]

Updates 1404, num timesteps 562000, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033750 mean_episode_steps 29.00


  7%|█████████▊                                                                                                                                  | 1406/20000 [22:40:48<288:27:59, 55.85s/it]

Updates 1405, num timesteps 562400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 25.75
Updates 1406, num timesteps 562800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.01, action_loss 0.00, explor_rew 0.032500 mean_episode_steps 23.75


  7%|█████████▊                                                                                                                                  | 1407/20000 [22:41:44<288:49:44, 55.92s/it]

Updates 1407, num timesteps 563200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss 0.00, explor_rew 0.030750 mean_episode_steps 18.20


  7%|█████████▊                                                                                                                                  | 1408/20000 [22:42:40<288:42:44, 55.90s/it]

Updates 1408, num timesteps 563600, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 18.90


  7%|█████████▊                                                                                                                                  | 1409/20000 [22:43:36<289:32:40, 56.07s/it]

Updates 1409, num timesteps 564000, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.55, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.00, action_loss 0.01, explor_rew 0.031500 mean_episode_steps 19.90


  7%|█████████▊                                                                                                                                  | 1410/20000 [22:44:32<289:11:31, 56.00s/it]

Updates 1410, num timesteps 564400, FPS 6 
Last 20 training episodes: mean/median reward 0.53/0.55, min/max reward 0.45/0.55
dist_entropy 3.16, value_loss 0.14, action_loss -0.02, explor_rew -0.030500 mean_episode_steps 21.15


  7%|█████████▉                                                                                                                                  | 1412/20000 [22:46:21<285:53:44, 55.37s/it]

Updates 1411, num timesteps 564800, FPS 6 
Last 20 training episodes: mean/median reward 0.22/0.45, min/max reward -1.05/0.55
dist_entropy 3.16, value_loss 0.04, action_loss 0.00, explor_rew 0.006750 mean_episode_steps 25.25
Updates 1412, num timesteps 565200, FPS 6 
Last 20 training episodes: mean/median reward 0.48/0.55, min/max reward -0.05/0.55
dist_entropy 2.91, value_loss 0.02, action_loss 0.01, explor_rew 0.023500 mean_episode_steps 21.75


  7%|█████████▉                                                                                                                                  | 1414/20000 [22:48:13<286:45:45, 55.54s/it]

Updates 1413, num timesteps 565600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.05/0.55
dist_entropy 2.84, value_loss 0.01, action_loss 0.00, explor_rew 0.025750 mean_episode_steps 24.05
Updates 1414, num timesteps 566000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.55, min/max reward -0.05/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 17.75


  7%|█████████▉                                                                                                                                  | 1416/20000 [22:50:05<288:27:41, 55.88s/it]

Updates 1415, num timesteps 566400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward -0.04/0.55
dist_entropy 2.75, value_loss 0.00, action_loss 0.00, explor_rew 0.029250 mean_episode_steps 24.50


  7%|█████████▉                                                                                                                                  | 1417/20000 [22:51:01<288:29:34, 55.89s/it]

Updates 1416, num timesteps 566800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.03/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.030500 mean_episode_steps 21.30
Updates 1417, num timesteps 567200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.00, explor_rew 0.029750 mean_episode_steps 24.90


  7%|█████████▉                                                                                                                                  | 1419/20000 [22:52:53<288:37:39, 55.92s/it]

Updates 1418, num timesteps 567600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 24.05


  7%|█████████▉                                                                                                                                  | 1420/20000 [22:53:49<288:47:57, 55.96s/it]

Updates 1419, num timesteps 568000, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.50
Updates 1420, num timesteps 568400, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 20.70


  7%|█████████▉                                                                                                                                  | 1422/20000 [22:55:41<288:25:49, 55.89s/it]

Updates 1421, num timesteps 568800, FPS 6 
Last 20 training episodes: mean/median reward 0.24/0.21, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.033750 mean_episode_steps 29.85


  7%|█████████▉                                                                                                                                  | 1423/20000 [22:56:37<288:21:34, 55.88s/it]

Updates 1422, num timesteps 569200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 14.70


  7%|█████████▉                                                                                                                                  | 1424/20000 [22:57:33<288:27:43, 55.90s/it]

Updates 1423, num timesteps 569600, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.50, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 21.30


  7%|█████████▉                                                                                                                                  | 1425/20000 [22:58:29<288:31:38, 55.92s/it]

Updates 1424, num timesteps 570000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 23.05


  7%|█████████▉                                                                                                                                  | 1426/20000 [22:59:23<285:26:39, 55.32s/it]

Updates 1425, num timesteps 570400, FPS 6 
Last 20 training episodes: mean/median reward 0.17/0.20, min/max reward -1.05/0.55
dist_entropy 3.18, value_loss 0.19, action_loss 0.08, explor_rew -0.024000 mean_episode_steps 35.85
Updates 1426, num timesteps 570800, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.01, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 22.55


  7%|█████████▉                                                                                                                                  | 1427/20000 [23:00:20<287:49:42, 55.79s/it]

Updates 1427, num timesteps 571200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -1.05/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 20.40


  7%|█████████▉                                                                                                                                  | 1428/20000 [23:01:16<288:03:00, 55.84s/it]

Updates 1428, num timesteps 571600, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.05/0.55
dist_entropy 3.12, value_loss 0.06, action_loss -0.04, explor_rew -0.004250 mean_episode_steps 42.65


  7%|██████████                                                                                                                                  | 1429/20000 [23:02:10<286:30:50, 55.54s/it]

Updates 1429, num timesteps 572000, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.05/0.55
dist_entropy 2.94, value_loss 0.02, action_loss 0.00, explor_rew 0.010750 mean_episode_steps 28.55


  7%|██████████                                                                                                                                  | 1431/20000 [23:04:01<285:33:40, 55.36s/it]

Updates 1430, num timesteps 572400, FPS 6 
Last 20 training episodes: mean/median reward 0.16/-0.01, min/max reward -1.05/0.55
dist_entropy 2.96, value_loss 0.02, action_loss -0.01, explor_rew 0.018000 mean_episode_steps 23.60


  7%|██████████                                                                                                                                  | 1432/20000 [23:04:57<286:16:57, 55.51s/it]

Updates 1431, num timesteps 572800, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.45, min/max reward -0.05/0.55
dist_entropy 2.88, value_loss 0.01, action_loss -0.02, explor_rew 0.020500 mean_episode_steps 19.70


  7%|██████████                                                                                                                                  | 1433/20000 [23:05:52<285:53:58, 55.43s/it]

Updates 1432, num timesteps 573200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.05/0.55
dist_entropy 2.85, value_loss 0.00, action_loss -0.02, explor_rew 0.024000 mean_episode_steps 27.70
Updates 1433, num timesteps 573600, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.55, min/max reward -1.05/0.55
dist_entropy 2.93, value_loss 0.02, action_loss 0.00, explor_rew 0.021000 mean_episode_steps 27.80


  7%|██████████                                                                                                                                  | 1434/20000 [23:06:48<286:15:40, 55.51s/it]

Updates 1434, num timesteps 574000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.05/0.55
dist_entropy 2.87, value_loss 0.02, action_loss -0.01, explor_rew 0.025500 mean_episode_steps 28.15


  7%|██████████                                                                                                                                  | 1436/20000 [23:08:39<286:46:03, 55.61s/it]

Updates 1435, num timesteps 574400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.05/0.55
dist_entropy 2.80, value_loss 0.02, action_loss -0.01, explor_rew 0.029250 mean_episode_steps 19.50
Updates 1436, num timesteps 574800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 22.55


  7%|██████████                                                                                                                                  | 1437/20000 [23:09:35<287:32:24, 55.76s/it]

Updates 1437, num timesteps 575200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.03/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 16.15


  7%|██████████                                                                                                                                  | 1439/20000 [23:11:28<288:34:15, 55.97s/it]

Updates 1438, num timesteps 575600, FPS 6 
Last 20 training episodes: mean/median reward 0.50/0.50, min/max reward 0.45/0.55
dist_entropy 2.60, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 18.65


  7%|██████████                                                                                                                                  | 1440/20000 [23:12:23<288:20:21, 55.93s/it]

Updates 1439, num timesteps 576000, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.03/0.55
dist_entropy 2.59, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 17.25


  7%|██████████                                                                                                                                  | 1441/20000 [23:13:19<288:14:55, 55.91s/it]

Updates 1440, num timesteps 576400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.03/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 23.65


  7%|██████████                                                                                                                                  | 1442/20000 [23:14:15<288:34:47, 55.98s/it]

Updates 1441, num timesteps 576800, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.45, min/max reward 0.02/0.55
dist_entropy 2.60, value_loss 0.00, action_loss 0.00, explor_rew 0.031750 mean_episode_steps 27.45
Updates 1442, num timesteps 577200, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward 0.02/0.55
dist_entropy 2.62, value_loss 0.00, action_loss 0.00, explor_rew 0.031250 mean_episode_steps 26.05


  7%|██████████                                                                                                                                  | 1444/20000 [23:16:07<288:23:47, 55.95s/it]

Updates 1443, num timesteps 577600, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.03/0.55
dist_entropy 2.61, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 30.80


  7%|██████████                                                                                                                                  | 1445/20000 [23:17:03<287:59:48, 55.88s/it]

Updates 1444, num timesteps 578000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.03/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 26.70


  7%|██████████                                                                                                                                  | 1446/20000 [23:17:59<287:53:44, 55.86s/it]

Updates 1445, num timesteps 578400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.03/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.00


  7%|██████████▏                                                                                                                                 | 1447/20000 [23:18:55<287:48:47, 55.85s/it]

Updates 1446, num timesteps 578800, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 23.25


  7%|██████████▏                                                                                                                                 | 1448/20000 [23:19:51<288:31:24, 55.99s/it]

Updates 1447, num timesteps 579200, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.01, action_loss 0.79, explor_rew 0.030500 mean_episode_steps 21.95
Updates 1448, num timesteps 579600, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.04/0.55
dist_entropy 2.76, value_loss 0.01, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 27.85


  7%|██████████▏                                                                                                                                 | 1450/20000 [23:21:43<288:41:58, 56.03s/it]

Updates 1449, num timesteps 580000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 22.35
Updates 1450, num timesteps 580400, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 29.65


  7%|██████████▏                                                                                                                                 | 1451/20000 [23:22:39<288:35:43, 56.01s/it]

Updates 1451, num timesteps 580800, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss 0.00, explor_rew 0.032750 mean_episode_steps 22.95


  7%|██████████▏                                                                                                                                 | 1453/20000 [23:24:31<288:02:06, 55.91s/it]

Updates 1452, num timesteps 581200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.00, explor_rew 0.031750 mean_episode_steps 20.55
Updates 1453, num timesteps 581600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 21.75


  7%|██████████▏                                                                                                                                 | 1454/20000 [23:25:27<288:27:56, 55.99s/it]

Updates 1454, num timesteps 582000, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.55, min/max reward -0.05/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 25.95


  7%|██████████▏                                                                                                                                 | 1456/20000 [23:27:19<287:56:53, 55.90s/it]

Updates 1455, num timesteps 582400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.031000 mean_episode_steps 18.95


  7%|██████████▏                                                                                                                                 | 1457/20000 [23:28:15<288:24:04, 55.99s/it]

Updates 1456, num timesteps 582800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.03/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 24.00


  7%|██████████▏                                                                                                                                 | 1458/20000 [23:29:11<287:54:05, 55.90s/it]

Updates 1457, num timesteps 583200, FPS 6 
Last 20 training episodes: mean/median reward 0.21/-0.00, min/max reward -0.05/0.55
dist_entropy 2.68, value_loss 0.01, action_loss 0.00, explor_rew 0.032500 mean_episode_steps 24.50


  7%|██████████▏                                                                                                                                 | 1459/20000 [23:30:07<287:41:59, 55.86s/it]

Updates 1458, num timesteps 583600, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss 0.00, explor_rew 0.030500 mean_episode_steps 21.20


  7%|██████████▏                                                                                                                                 | 1460/20000 [23:31:02<287:03:07, 55.74s/it]

Updates 1459, num timesteps 584000, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.45, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 24.95
Updates 1460, num timesteps 584400, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss 0.08, explor_rew 0.027750 mean_episode_steps 20.05


  7%|██████████▏                                                                                                                                 | 1461/20000 [23:31:58<287:08:10, 55.76s/it]

Updates 1461, num timesteps 584800, FPS 6 
Last 20 training episodes: mean/median reward 0.21/0.55, min/max reward -1.05/0.55
dist_entropy 2.78, value_loss 0.03, action_loss 0.00, explor_rew 0.029500 mean_episode_steps 21.85


  7%|██████████▏                                                                                                                                 | 1463/20000 [23:33:50<288:18:49, 55.99s/it]

Updates 1462, num timesteps 585200, FPS 6 
Last 20 training episodes: mean/median reward 0.13/0.50, min/max reward -1.05/0.55
dist_entropy 2.76, value_loss 0.07, action_loss -0.02, explor_rew 0.030750 mean_episode_steps 23.50
Updates 1463, num timesteps 585600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.04, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 23.55


  7%|██████████▎                                                                                                                                 | 1465/20000 [23:35:43<288:25:13, 56.02s/it]

Updates 1464, num timesteps 586000, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 20.80


  7%|██████████▎                                                                                                                                 | 1466/20000 [23:36:38<288:00:44, 55.94s/it]

Updates 1465, num timesteps 586400, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.50, min/max reward -0.03/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 22.15
Updates 1466, num timesteps 586800, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.60


  7%|██████████▎                                                                                                                                 | 1468/20000 [23:38:31<289:24:14, 56.22s/it]

Updates 1467, num timesteps 587200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.45, min/max reward -0.03/0.55
dist_entropy 2.69, value_loss 0.01, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 29.10


  7%|██████████▎                                                                                                                                 | 1469/20000 [23:39:27<288:52:51, 56.12s/it]

Updates 1468, num timesteps 587600, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.03/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.031750 mean_episode_steps 23.70


  7%|██████████▎                                                                                                                                 | 1470/20000 [23:40:23<288:24:07, 56.03s/it]

Updates 1469, num timesteps 588000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.03/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 24.05
Updates 1470, num timesteps 588400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.03/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 20.10


  7%|██████████▎                                                                                                                                 | 1472/20000 [23:42:15<288:18:20, 56.02s/it]

Updates 1471, num timesteps 588800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.03/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 22.05


  7%|██████████▎                                                                                                                                 | 1473/20000 [23:43:11<288:03:40, 55.97s/it]

Updates 1472, num timesteps 589200, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.50, min/max reward -0.03/0.55
dist_entropy 2.78, value_loss 0.02, action_loss -0.02, explor_rew 0.028500 mean_episode_steps 22.70


  7%|██████████▎                                                                                                                                 | 1474/20000 [23:44:07<287:29:34, 55.87s/it]

Updates 1473, num timesteps 589600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.03/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 21.75


  7%|██████████▎                                                                                                                                 | 1475/20000 [23:45:03<287:47:16, 55.93s/it]

Updates 1474, num timesteps 590000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.45, min/max reward -0.03/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 17.75


  7%|██████████▎                                                                                                                                 | 1476/20000 [23:45:58<286:58:06, 55.77s/it]

Updates 1475, num timesteps 590400, FPS 6 
Last 20 training episodes: mean/median reward 0.04/-0.03, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.02, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 20.90


  7%|██████████▎                                                                                                                                 | 1477/20000 [23:46:54<286:58:37, 55.77s/it]

Updates 1476, num timesteps 590800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 20.45


  7%|██████████▎                                                                                                                                 | 1478/20000 [23:47:50<287:36:15, 55.90s/it]

Updates 1477, num timesteps 591200, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -1.05/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.00, explor_rew 0.032000 mean_episode_steps 22.95


  7%|██████████▎                                                                                                                                 | 1479/20000 [23:48:46<287:10:54, 55.82s/it]

Updates 1478, num timesteps 591600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 24.35


  7%|██████████▎                                                                                                                                 | 1480/20000 [23:49:41<286:45:35, 55.74s/it]

Updates 1479, num timesteps 592000, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.50, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 24.25


  7%|██████████▎                                                                                                                                 | 1481/20000 [23:50:37<286:38:53, 55.72s/it]

Updates 1480, num timesteps 592400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 24.95


  7%|██████████▎                                                                                                                                 | 1482/20000 [23:51:32<286:17:47, 55.66s/it]

Updates 1481, num timesteps 592800, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 21.95
Updates 1482, num timesteps 593200, FPS 6 
Last 20 training episodes: mean/median reward 0.29/0.55, min/max reward -0.04/0.55
dist_entropy 2.73, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 22.75


  7%|██████████▍                                                                                                                                 | 1484/20000 [23:53:24<286:40:24, 55.74s/it]

Updates 1483, num timesteps 593600, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 27.65
Updates 1484, num timesteps 594000, FPS 6 
Last 20 training episodes: mean/median reward 0.45/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.00, explor_rew 0.033250 mean_episode_steps 19.05


  7%|██████████▍                                                                                                                                 | 1485/20000 [23:54:20<287:18:37, 55.86s/it]

Updates 1485, num timesteps 594400, FPS 6 
Last 20 training episodes: mean/median reward 0.37/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 22.30


  7%|██████████▍                                                                                                                                 | 1487/20000 [23:56:11<286:18:37, 55.68s/it]

Updates 1486, num timesteps 594800, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 21.30
Updates 1487, num timesteps 595200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 18.60


  7%|██████████▍                                                                                                                                 | 1488/20000 [23:57:08<287:25:20, 55.89s/it]

Updates 1488, num timesteps 595600, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.72, value_loss 0.01, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 19.00


  7%|██████████▍                                                                                                                                 | 1490/20000 [23:58:59<287:12:10, 55.86s/it]

Updates 1489, num timesteps 596000, FPS 6 
Last 20 training episodes: mean/median reward 0.48/0.50, min/max reward -0.03/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.00, explor_rew 0.030750 mean_episode_steps 19.80


  7%|██████████▍                                                                                                                                 | 1491/20000 [23:59:55<287:01:00, 55.82s/it]

Updates 1490, num timesteps 596400, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.50, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 25.65


  7%|██████████▍                                                                                                                                 | 1492/20000 [24:00:51<286:25:04, 55.71s/it]

Updates 1491, num timesteps 596800, FPS 6 
Last 20 training episodes: mean/median reward 0.32/0.45, min/max reward -0.04/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 28.15


  7%|██████████▍                                                                                                                                 | 1493/20000 [24:01:47<287:12:38, 55.87s/it]

Updates 1492, num timesteps 597200, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.50, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 29.45


  7%|██████████▍                                                                                                                                 | 1494/20000 [24:02:42<286:45:50, 55.78s/it]

Updates 1493, num timesteps 597600, FPS 6 
Last 20 training episodes: mean/median reward 0.28/0.50, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.00, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 23.10


  7%|██████████▍                                                                                                                                 | 1495/20000 [24:03:38<286:50:56, 55.80s/it]

Updates 1494, num timesteps 598000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.50, min/max reward -0.04/0.55
dist_entropy 2.67, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 27.40


  7%|██████████▍                                                                                                                                 | 1496/20000 [24:04:34<286:21:53, 55.71s/it]

Updates 1495, num timesteps 598400, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.45, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.01, action_loss 0.00, explor_rew 0.033500 mean_episode_steps 20.15
Updates 1496, num timesteps 598800, FPS 6 
Last 20 training episodes: mean/median reward 0.46/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 17.70


  7%|██████████▍                                                                                                                                 | 1497/20000 [24:05:30<287:15:54, 55.89s/it]

Updates 1497, num timesteps 599200, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 18.85


  7%|██████████▍                                                                                                                                 | 1498/20000 [24:06:26<287:39:21, 55.97s/it]

Updates 1498, num timesteps 599600, FPS 6 
Last 20 training episodes: mean/median reward 0.42/0.55, min/max reward -0.04/0.55
dist_entropy 2.92, value_loss 0.03, action_loss 0.01, explor_rew 0.011500 mean_episode_steps 24.00


  7%|██████████▍                                                                                                                                 | 1499/20000 [24:07:22<287:16:20, 55.90s/it]

Updates 1499, num timesteps 600000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 3.00, value_loss 0.03, action_loss -0.03, explor_rew 0.011750 mean_episode_steps 19.60


  8%|██████████▌                                                                                                                                 | 1500/20000 [24:08:17<286:04:14, 55.67s/it]

Updates 1500, num timesteps 600400, FPS 6 
Last 20 training episodes: mean/median reward 0.16/-0.04, min/max reward -1.05/0.55
dist_entropy 2.68, value_loss 0.01, action_loss 0.01, explor_rew 0.033750 mean_episode_steps 33.40


  8%|██████████▌                                                                                                                                 | 1502/20000 [24:10:09<286:06:45, 55.68s/it]

Updates 1501, num timesteps 600800, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.50, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.00, explor_rew 0.033000 mean_episode_steps 21.40


  8%|██████████▌                                                                                                                                 | 1503/20000 [24:11:04<286:14:07, 55.71s/it]

Updates 1502, num timesteps 601200, FPS 6 
Last 20 training episodes: mean/median reward 0.34/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.01, action_loss 0.01, explor_rew 0.033000 mean_episode_steps 23.55


  8%|██████████▌                                                                                                                                 | 1504/20000 [24:12:00<286:25:55, 55.75s/it]

Updates 1503, num timesteps 601600, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.01, explor_rew 0.032250 mean_episode_steps 16.15
Updates 1504, num timesteps 602000, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.01, action_loss -0.01, explor_rew 0.033250 mean_episode_steps 23.90


  8%|██████████▌                                                                                                                                 | 1506/20000 [24:13:53<287:23:25, 55.94s/it]

Updates 1505, num timesteps 602400, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.45, min/max reward -0.04/0.55
dist_entropy 2.65, value_loss 0.00, action_loss -0.00, explor_rew 0.034250 mean_episode_steps 20.40
Updates 1506, num timesteps 602800, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.031500 mean_episode_steps 25.85


  8%|██████████▌                                                                                                                                 | 1507/20000 [24:14:49<287:21:24, 55.94s/it]

Updates 1507, num timesteps 603200, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -1.05/0.55
dist_entropy 2.66, value_loss 0.01, action_loss -0.00, explor_rew 0.032250 mean_episode_steps 20.60


  8%|██████████▌                                                                                                                                 | 1508/20000 [24:15:45<287:52:26, 56.04s/it]

Updates 1508, num timesteps 603600, FPS 6 
Last 20 training episodes: mean/median reward 0.36/0.55, min/max reward -0.04/0.55
dist_entropy 2.63, value_loss 0.00, action_loss -0.02, explor_rew 0.033500 mean_episode_steps 29.65


  8%|██████████▌                                                                                                                                 | 1509/20000 [24:16:41<288:33:27, 56.18s/it]

Updates 1509, num timesteps 604000, FPS 6 
Last 20 training episodes: mean/median reward 0.44/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.032750 mean_episode_steps 24.50


  8%|██████████▌                                                                                                                                 | 1510/20000 [24:17:38<289:01:39, 56.27s/it]

Updates 1510, num timesteps 604400, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 22.65


  8%|██████████▌                                                                                                                                 | 1511/20000 [24:18:34<288:59:58, 56.27s/it]

Updates 1511, num timesteps 604800, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.04/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.01, explor_rew 0.033000 mean_episode_steps 22.15


  8%|██████████▌                                                                                                                                 | 1513/20000 [24:20:25<287:24:39, 55.97s/it]

Updates 1512, num timesteps 605200, FPS 6 
Last 20 training episodes: mean/median reward 0.41/0.45, min/max reward -0.04/0.55
dist_entropy 2.64, value_loss 0.00, action_loss -0.01, explor_rew 0.032000 mean_episode_steps 16.00


  8%|██████████▌                                                                                                                                 | 1514/20000 [24:21:21<287:08:26, 55.92s/it]

Updates 1513, num timesteps 605600, FPS 6 
Last 20 training episodes: mean/median reward 0.30/0.45, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.01, action_loss -0.01, explor_rew 0.033500 mean_episode_steps 35.50
Updates 1514, num timesteps 606000, FPS 6 
Last 20 training episodes: mean/median reward 0.33/0.55, min/max reward -1.05/0.55
dist_entropy 2.59, value_loss 0.07, action_loss -0.00, explor_rew 0.021750 mean_episode_steps 23.85


  8%|██████████▌                                                                                                                                 | 1516/20000 [24:23:12<286:04:27, 55.72s/it]

Updates 1515, num timesteps 606400, FPS 6 
Last 20 training episodes: mean/median reward -0.00/-0.03, min/max reward -1.05/0.55
dist_entropy 2.71, value_loss 0.08, action_loss 0.01, explor_rew 0.033000 mean_episode_steps 23.35


  8%|██████████▌                                                                                                                                 | 1517/20000 [24:24:08<285:17:22, 55.57s/it]

Updates 1516, num timesteps 606800, FPS 6 
Last 20 training episodes: mean/median reward 0.25/0.23, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.03, action_loss -0.01, explor_rew 0.029750 mean_episode_steps 24.90


  8%|██████████▋                                                                                                                                 | 1518/20000 [24:25:04<286:07:34, 55.73s/it]

Updates 1517, num timesteps 607200, FPS 6 
Last 20 training episodes: mean/median reward 0.31/0.50, min/max reward -1.05/0.55
dist_entropy 2.64, value_loss 0.02, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 22.15


  8%|██████████▋                                                                                                                                 | 1519/20000 [24:25:59<285:44:30, 55.66s/it]

Updates 1518, num timesteps 607600, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.50, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss 0.00, explor_rew 0.032000 mean_episode_steps 24.80
Updates 1519, num timesteps 608000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.00, action_loss -0.00, explor_rew 0.032500 mean_episode_steps 22.00


  8%|██████████▋                                                                                                                                 | 1521/20000 [24:27:51<286:06:34, 55.74s/it]

Updates 1520, num timesteps 608400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.03/0.55
dist_entropy 2.72, value_loss 0.00, action_loss -0.01, explor_rew 0.030750 mean_episode_steps 27.20
Updates 1521, num timesteps 608800, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.03/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.00, explor_rew 0.031500 mean_episode_steps 18.55


  8%|██████████▋                                                                                                                                 | 1522/20000 [24:28:47<286:54:36, 55.90s/it]

Updates 1522, num timesteps 609200, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.70, value_loss 0.01, action_loss -0.01, explor_rew 0.032750 mean_episode_steps 24.25


  8%|██████████▋                                                                                                                                 | 1523/20000 [24:29:43<287:10:43, 55.95s/it]

Updates 1523, num timesteps 609600, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.68, value_loss 0.00, action_loss -0.01, explor_rew 0.032500 mean_episode_steps 14.45


  8%|██████████▋                                                                                                                                 | 1524/20000 [24:30:39<286:56:19, 55.91s/it]

Updates 1524, num timesteps 610000, FPS 6 
Last 20 training episodes: mean/median reward 0.47/0.55, min/max reward -0.04/0.55
dist_entropy 2.62, value_loss 0.00, action_loss -0.00, explor_rew 0.031250 mean_episode_steps 16.75


  8%|██████████▋                                                                                                                                 | 1526/20000 [24:32:31<286:11:10, 55.77s/it]

Updates 1525, num timesteps 610400, FPS 6 
Last 20 training episodes: mean/median reward 0.40/0.45, min/max reward -0.05/0.55
dist_entropy 2.99, value_loss 0.03, action_loss -0.04, explor_rew 0.000250 mean_episode_steps 36.35


  8%|██████████▋                                                                                                                                 | 1527/20000 [24:33:26<284:54:52, 55.52s/it]

Updates 1526, num timesteps 610800, FPS 6 
Last 20 training episodes: mean/median reward 0.35/0.45, min/max reward -0.05/0.55
dist_entropy 2.77, value_loss 0.01, action_loss -0.01, explor_rew 0.021000 mean_episode_steps 35.30
Updates 1527, num timesteps 611200, FPS 6 
Last 20 training episodes: mean/median reward 0.38/0.55, min/max reward -0.05/0.55
dist_entropy 2.74, value_loss 0.00, action_loss -0.01, explor_rew 0.026250 mean_episode_steps 20.95


  8%|██████████▋                                                                                                                                 | 1528/20000 [24:34:22<285:37:27, 55.67s/it]

Updates 1528, num timesteps 611600, FPS 6 
Last 20 training episodes: mean/median reward 0.43/0.55, min/max reward -0.04/0.55
dist_entropy 2.66, value_loss 0.01, action_loss -0.01, explor_rew 0.031250 mean_episode_steps 19.45


  8%|██████████▋                                                                                                                                 | 1529/20000 [24:35:18<286:23:14, 55.82s/it]

Updates 1529, num timesteps 612000, FPS 6 
Last 20 training episodes: mean/median reward 0.39/0.55, min/max reward -0.04/0.55
dist_entropy 2.69, value_loss 0.00, action_loss -0.01, explor_rew 0.030000 mean_episode_steps 20.80


  8%|██████████▋                                                                                                                                 | 1530/20000 [24:36:14<286:30:22, 55.84s/it]

In [None]:
# Release every environment in `envs` (presumably the MiniWoB environments
# created earlier in this notebook -- confirm against the cell that builds it).
# Each close() is attempted independently so that one failing environment does
# not prevent the remaining ones from being closed; the first error seen is
# re-raised afterwards so the failure is still visible.
first_close_error = None
for env in envs:
    try:
        env.close()
    except Exception as exc:
        # Remember only the first failure; keep closing the rest.
        if first_close_error is None:
            first_close_error = exc
if first_close_error is not None:
    raise first_close_error