# Enable GPU

In [2]:
import torch
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

# Actor and Critic Network



In [3]:
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class Actor_Net(nn.Module):
  def __init__(self, input_dims, output_dims, num_neurons = 128):
    super(Actor_Net, self).__init__()
    self.fc1 = nn.Linear(input_dims, num_neurons)
    self.actor = nn.Linear(num_neurons, output_dims)
    self.log_probs = []
    self.entropies = []

  def forward(self, state):
    x = F.relu(self.fc1(state))
    x = F.softmax(self.actor(x), dim = 1)

    return x

  def get_action(self, state):
    with torch.no_grad():
      probs = self.forward(state)
      dist = Categorical(probs = probs)
      action = dist.sample()
    return action
  
  def eval_action(self, state):
    probs = self.forward(state)
    dist = Categorical(probs = probs)
    action = dist.sample().to(device)
    log_prob = dist.log_prob(action)
    entropy = dist.entropy()
    self.log_probs.append(log_prob)
    self.entropies.append(entropy)

    return action

class Critic_Net(nn.Module):
  def __init__ (self, input_dims, output_dims, num_neurons = 128):
    super(Critic_Net, self).__init__()

    self.values = []
    self.next_values = []
    
    self.fc1 = nn.Linear(input_dims, num_neurons)
    self.critic = nn.Linear(num_neurons, 1)

  def forward (self, state):
    x = F.relu(self.fc1(state))
    x = self.critic(x)

    return x

In [4]:
import torch.optim as optim
import numpy as np
import gym

class Actor_Critic_Agent(nn.Module):
  def __init__(self, input_dims, output_dims, optimizer = 'RMSprop', num_neurons = 128 , gamma = 0.99, actor_lr=0.001, critic_lr = 0.01):
    super(Actor_Critic_Agent, self).__init__()
    self.actor_net = Actor_Net(input_dims= input_dims, output_dims= output_dims, num_neurons= num_neurons).to(device)
    self.critic_net = Critic_Net(input_dims=input_dims, output_dims= output_dims, num_neurons= num_neurons).to(device)
    self.gamma = gamma
    if optimizer == 'RMSprop':
      self.actor_optimizer = optim.RMSprop(params = self.actor_net.parameters(), lr =actor_lr)
      self.critic_optimizer = optim.RMSprop(params = self.critic_net.parameters(), lr = critic_lr)
    else:
      self.actor_optimizer = optim.Adam(params = self.actor_net.parameters(), lr = actor_lr)
      self.critic_optimizer = optim.Adam(params = self.critic_net.parameters(), lr = critic_lr)

  def learn_mean(self, rewards, dones):
    value_criteration = nn.MSELoss()
    value_losses = []
    actor_losses = []
    self.critic_net.next_values = torch.cat(self.critic_net.next_values, dim = 0).squeeze(0)
    self.critic_net.values = torch.cat(self.critic_net.values, dim = 0).squeeze(0)
    self.actor_net.log_probs = torch.cat(self.actor_net.log_probs, dim = 0)
    self.actor_net.entropies = torch.cat(self.actor_net.entropies, dim = 0)

    for reward, entropy, log_prob, v, v_next, done in zip(rewards ,self.actor_net.entropies, self.actor_net.log_probs, self.critic_net.values, self.critic_net.next_values, dones):
      td_target = reward + self.gamma * v_next * done
      td_error = td_target - v
      value_loss = value_criteration(v, td_target.detach())- 0.001 * entropy.detach()
      actor_loss = - log_prob * td_error.detach() 
      value_losses.append(value_loss)
      actor_losses.append(actor_loss)

    self.critic_optimizer.zero_grad()
    value_losses = torch.stack(value_losses).sum()
    value_losses.backward()
    self.critic_optimizer.step()  

    self.actor_optimizer.zero_grad()
    actor_losses = torch.stack(actor_losses).sum()
    actor_losses.backward()
    self.actor_optimizer.step()

      
    # clear out memory 
    self.actor_net.log_probs = []
    self.actor_net.entropies = []
    self.critic_net.values = []
    self.critic_net.next_values = []



# Without Wandb

In [5]:
import gym
import time
import pdb

env = gym.make('CartPole-v1')
env.seed(543)
torch.manual_seed(543)
state_dims = env.observation_space.shape[0]
action_dims = env.action_space.n
agent = Actor_Critic_Agent(input_dims= state_dims, output_dims = action_dims)

def train():

  num_ep = 2000
  print_every = 100
  running_score = 10
  start = time.time()

  rewards = []
  dones = []

  for ep in range(1, num_ep + 1):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    dones = []

    while not done:
      state = torch.tensor([state]).float().to(device)
      action = agent.actor_net.eval_action(state)
      v = agent.critic_net(state)

      next_state, reward, done, _ = env.step(action.item())
      v_next = agent.critic_net(torch.tensor([next_state]).float().to(device))
  
      agent.critic_net.values.append(v.squeeze(0))
      agent.critic_net.next_values.append(v_next.squeeze(0))
      rewards.append(reward)
      dones.append(1 - done)
      
      # update episode
      score += reward
      state = next_state

      if done:
        break

    # update agent
    #pdb.set_trace()
    agent.learn_mean(rewards,dones)
   
    # calculating score and running score
    running_score = 0.05 * score + (1 - 0.05) * running_score

    if ep % print_every == 0:
      print('episode: {}, running score: {}, time elapsed: {}'.format(ep, running_score, time.time() - start))





In [None]:
train() #RMS

episode: 100, running score: 43.32507441570408, time elapsed: 4.842878341674805
episode: 200, running score: 129.30332722904944, time elapsed: 19.552313089370728


# Wtih wandb

In [None]:
!pip install wandb
!wandb login


In [None]:
import wandb
sweep_config = dict()
sweep_config['method'] = 'grid'
sweep_config['metric'] = {'name': 'running_score', 'goal': 'maximize'}
sweep_config['parameters'] = {'learning': {'value': 'learn_mean'}, 'actor_learning_rate': {'values' : [0.01, 0.001, 0.0001,0.0003,0.00001]}, 'critic_learning_rate' : {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]}
                              , 'num_neurons': {'value': 128 }, 'optimizer': {'values' : ['RMSprop', 'Adam']}}

sweep_id = wandb.sweep(sweep_config, project = 'Advantage_Actor_Critic')

Create sweep with ID: t9gia22t
Sweep URL: https://wandb.ai/ko120/Advantage_Actor_Critic/sweeps/t9gia22t


In [None]:
import gym 
import torch
import time
import wandb



def train():
  wandb.init(config = {'env':'CartPole-v1','algorithm:': 'Actor_Critic','architecture': 'seperate','num_laeyrs':'2'}, project = 'Advantage_Actor_Critic',group = 'Cart_128_neurons_2_layer')
  config = wandb.config

  env = gym.make('CartPole-v1')
  env.seed(543)
  torch.manual_seed(543)

  state_dim = env.observation_space.shape[0]
  action_dim = env.action_space.n

  device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')
  agent = Actor_Critic_Agent(input_dims= state_dim, output_dims= action_dim, optimizer = config.optimizer, num_neurons= config.num_neurons, actor_lr = config.actor_learning_rate, critic_lr = config.critic_learning_rate)


  num_ep = 3000
  print_interval = 100
  save_interval = 1000
  running_score = 10
  start = time.time()

  
  wandb.watch(agent)
  for ep in range(1,num_ep+1):
    state = env.reset()
    score = 0
    done = False
    rewards = []
    dones = []

    while not done:
      state = torch.tensor([state]).float().to(device)
      action = agent.actor_net.eval_action(state)
      v = agent.critic_net(state)

      next_state, reward, done, _ = env.step(action.item())
      v_next = agent.critic_net(torch.tensor([next_state]).float().to(device))
  
      agent.critic_net.values.append(v.squeeze(0))
      agent.critic_net.next_values.append(v_next.squeeze(0))
      rewards.append(reward)
      dones.append(1 - done)
      
      # update episode
      score += reward
      state = next_state

      if done:
        break

    # update agent
    agent.learn_mean(rewards,dones)
   
    # calculating score and running score
    running_score = 0.05 * score + (1 - 0.05) * running_score

    wandb.log({'episode': ep, 'running_score': running_score})   

    if ep % print_interval == 0:
      print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))    
    
    if ep % save_interval == 0:
      save_name_actor = 'actor_' + str(ep) + '.pt'
      torch.save(agent.actor_net.state_dict(),save_name_actor)
      save_name_critic = 'critic_' + str(ep) + '.pt'
      torch.save(agent.critic_net.state_dict(),save_name_critic)
      wandb.save(save_name_actor)
      wandb.save(save_name_critic)

    if ep == num_ep:
      dummy_input = torch.rand(1,4).to(device)
      torch.onnx.export(agent.actor_net,dummy_input,'final_model_actor.onnx')
      wandb.save('final_model_actor.onnx')
      torch.onnx.export(agent.critic_net, dummy_input, 'final_model_critic.onnx')
      wandb.save('final_model_critic.onnx')
    

In [None]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: wivnmds7 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 9.320976246076176, ended at 2.0
episode 200 average reward 9.500732875399857, ended at 4.2
episode 300 average reward 9.300742705361715, ended at 6.4
episode 400 average reward 9.277204480629702, ended at 8.6
episode 500 average reward 9.273707051135274, ended at 10.8
episode 600 average reward 9.327889863203684, ended at 13.0
episode 700 average reward 9.333155867973675, ended at 15.4
episode 800 average reward 9.523549421141825, ended at 17.6
episode 900 average reward 9.673708625666013, ended at 19.7
episode 1000 average reward 9.748745763117851, ended at 21.9
episode 1100 average reward 9.323193182969527, ended at 24.2
episode 1200 average reward 56.42889769119886, ended at 31.3
episode 1300 average reward 43.74196023208914, ended at 41.4
episode 1400 average reward 67.21059343899846, ended at 56.4
episode 1500 average reward 75.91453703193939, ended at 70.0
episode 1600 average reward 312.7339309544486, ended at 112.1
episode 1700 average reward 491.6694

VBox(children=(Label(value=' 0.18MB of 0.18MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,497.6113
_runtime,1497.0
_timestamp,1627527224.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▅▇█████████████████
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 23g5a6tq with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.393700613898634, ended at 2.1
episode 200 average reward 10.894709220097706, ended at 4.3
episode 300 average reward 51.989124686948216, ended at 12.4
episode 400 average reward 203.69745507042563, ended at 35.4
episode 500 average reward 47.54950302248991, ended at 56.8
episode 600 average reward 92.8884477211901, ended at 69.0
episode 700 average reward 85.14868442109272, ended at 83.9
episode 800 average reward 90.31060358700215, ended at 101.6
episode 900 average reward 130.65703923534898, ended at 123.5
episode 1000 average reward 185.8257517386689, ended at 158.8
episode 1100 average reward 422.64762493258826, ended at 221.2
episode 1200 average reward 27.15940252918527, ended at 232.1
episode 1300 average reward 25.83159086560288, ended at 237.2
episode 1400 average reward 29.841646116682767, ended at 243.2
episode 1500 average reward 30.07053742243523, ended at 249.7
episode 1600 average reward 36.46616493916145, ended at 256.9
episode 1700 average 

VBox(children=(Label(value=' 0.19MB of 0.19MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,75.62835
_runtime,481.0
_timestamp,1627527711.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▂▄▂▂▃▂▂▃▄▄█▂▁▁▁▁▁▂▂▂▂▂▃▄▂▂▂▂▃▃▄▂▂▃▂▂
_runtime,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: uw0vj72l with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 9.323047870238417, ended at 2.0
episode 200 average reward 9.500745140511238, ended at 4.1
episode 300 average reward 9.300742777977668, ended at 6.3
episode 400 average reward 9.277204481059627, ended at 8.5
episode 500 average reward 9.273707051137817, ended at 10.6
episode 600 average reward 9.327889863203705, ended at 12.7
episode 700 average reward 9.333155867973675, ended at 14.8
episode 800 average reward 9.523549421141825, ended at 17.0
episode 900 average reward 9.673708625666013, ended at 19.1
episode 1000 average reward 9.17122064557445, ended at 21.3
episode 1100 average reward 9.39649773679353, ended at 23.5
episode 1200 average reward 9.193564571194404, ended at 25.8
episode 1300 average reward 9.48526540298333, ended at 27.9
episode 1400 average reward 9.437208248169437, ended at 30.2
episode 1500 average reward 9.555995245145606, ended at 32.5
episode 1600 average reward 9.372828596848617, ended at 34.7
episode 1700 average reward 9.4268708731

VBox(children=(Label(value=' 0.21MB of 0.21MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22451
_runtime,68.0
_timestamp,1627527785.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▇▃▅▂▂▄▁▃▄▄▇▃▄▁▅▅▂▅▄▂▄▄▅▄▆▃▆▄▄▅▄▄▃▁▄▄█▄█▃
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ygowlc1q with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.482441320427503, ended at 2.3
episode 200 average reward 9.521694115910226, ended at 4.4
episode 300 average reward 9.300866806998652, ended at 6.5
episode 400 average reward 9.281812195275764, ended at 8.6
episode 500 average reward 9.273734331244475, ended at 10.6
episode 600 average reward 9.32789002471637, ended at 12.7
episode 700 average reward 9.827598664513028, ended at 14.9
episode 800 average reward 9.26144886325921, ended at 17.0
episode 900 average reward 9.110503240776072, ended at 19.1
episode 1000 average reward 9.545656955308921, ended at 21.2
episode 1100 average reward 9.321990789748574, ended at 23.4
episode 1200 average reward 9.56841024704241, ended at 25.6
episode 1300 average reward 9.348897438109866, ended at 27.7
episode 1400 average reward 9.250522490296776, ended at 29.9
episode 1500 average reward 9.16773866121093, ended at 32.1
episode 1600 average reward 9.47137972380003, ended at 34.2
episode 1700 average reward 9.900633984295

VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,40.16527
_runtime,193.0
_timestamp,1627527984.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▂▂▂▂▂▂█▂▂▂▂▂
_runtime,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▆▇▇███
_timestamp,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▆▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 9msbp2mq with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 9.322985811087863, ended at 2.0
episode 200 average reward 9.279185567218038, ended at 4.0
episode 300 average reward 9.457789522408772, ended at 6.1
episode 400 average reward 9.267436658340829, ended at 8.2
episode 500 average reward 9.35007257483729, ended at 10.4
episode 600 average reward 9.419099381342358, ended at 12.7
episode 700 average reward 9.46243508339491, ended at 14.9
episode 800 average reward 9.259286901606998, ended at 17.1
episode 900 average reward 9.110490440818939, ended at 19.3
episode 1000 average reward 9.545656879526401, ended at 21.5
episode 1100 average reward 9.321990789299901, ended at 23.9
episode 1200 average reward 9.446234309539754, ended at 26.2
episode 1300 average reward 9.348174091901857, ended at 28.5
episode 1400 average reward 9.250518207704413, ended at 30.8
episode 1500 average reward 9.167738635855713, ended at 33.1
episode 1600 average reward 9.43553113140906, ended at 35.3
episode 1700 average reward 9.3367891916

VBox(children=(Label(value=' 0.23MB of 0.23MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.50817
_runtime,68.0
_timestamp,1627528057.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▂▂▃▄▂▄▄▃▂▁▃▂▄▂▂▃▂▂▃▃▃▂▂▂▃▂▃▃▃▃▃▂▁▃▃▅▃▂▃
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: i6rwob6k with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.6155461231107, ended at 2.4
episode 200 average reward 10.348622036051804, ended at 4.8
episode 300 average reward 9.309286169768178, ended at 7.0
episode 400 average reward 9.277255062460366, ended at 9.3
episode 500 average reward 9.27933860292073, ended at 11.5
episode 600 average reward 9.327923204970592, ended at 13.7
episode 700 average reward 9.334986023755194, ended at 16.0
episode 800 average reward 9.523560256632605, ended at 18.3
episode 900 average reward 9.673708689817854, ended at 20.6
episode 1000 average reward 9.171220645954264, ended at 22.8
episode 1100 average reward 9.396497736795776, ended at 25.2
episode 1200 average reward 9.193564571194418, ended at 27.4
episode 1300 average reward 9.48526540298333, ended at 29.6
episode 1400 average reward 9.437208248169437, ended at 31.8
episode 1500 average reward 9.555995245145606, ended at 34.0
episode 1600 average reward 9.372828596848617, ended at 36.2
episode 1700 average reward 9.4268708731

VBox(children=(Label(value=' 0.25MB of 0.25MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22369
_runtime,69.0
_timestamp,1627528132.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▂▃▁▁▁▁▁▁▁▂▁▁▁▂▂▁▁▁▁▁▁▂▁▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: udmv9phl with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 9.319351382733059, ended at 2.0
episode 200 average reward 9.279164049478767, ended at 4.2
episode 300 average reward 9.457789395012366, ended at 6.4
episode 400 average reward 9.267436657586574, ended at 8.6
episode 500 average reward 9.350072574832824, ended at 10.8
episode 600 average reward 9.419099381342326, ended at 13.0
episode 700 average reward 9.46243508339491, ended at 15.2
episode 800 average reward 9.259286901606998, ended at 17.4
episode 900 average reward 9.110490440818939, ended at 19.5
episode 1000 average reward 9.545656879526401, ended at 21.6
episode 1100 average reward 9.321990789299901, ended at 23.8
episode 1200 average reward 9.446234309539754, ended at 26.0
episode 1300 average reward 9.348174091901857, ended at 28.2
episode 1400 average reward 9.250518207704413, ended at 30.4
episode 1500 average reward 9.167738635855713, ended at 32.6
episode 1600 average reward 9.43553113140906, ended at 34.7
episode 1700 average reward 9.336789191

VBox(children=(Label(value=' 0.26MB of 0.26MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22369
_runtime,68.0
_timestamp,1627528205.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▁▁▁▂▁▂▂▁▁▁▂▁▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁█▂▁▁▁▂▂▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ynx1szp6 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.6155461231107, ended at 2.5
episode 200 average reward 9.50813304769929, ended at 4.6
episode 300 average reward 9.300786518298054, ended at 6.7
episode 400 average reward 9.277204740025473, ended at 8.8
episode 500 average reward 9.273707052671034, ended at 10.9
episode 600 average reward 9.32788986321278, ended at 13.0
episode 700 average reward 9.333155867973732, ended at 15.1
episode 800 average reward 9.523549421141825, ended at 17.1
episode 900 average reward 9.673708625666013, ended at 19.3
episode 1000 average reward 9.17122064557445, ended at 21.4
episode 1100 average reward 9.39649773679353, ended at 23.6
episode 1200 average reward 9.193564571194404, ended at 25.9
episode 1300 average reward 9.48526540298333, ended at 28.1
episode 1400 average reward 9.437208248169437, ended at 30.3
episode 1500 average reward 9.555995245145606, ended at 32.6
episode 1600 average reward 9.372828596848617, ended at 34.8
episode 1700 average reward 9.42687087317219

VBox(children=(Label(value=' 0.27MB of 0.27MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22369
_runtime,68.0
_timestamp,1627528278.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▂▂▁▁▁▁▁▁▁▂▁▁▁▂▂▁▁▁▁▁▁▂▁▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: xzkcn85g with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 9.330509379812371, ended at 2.1
episode 200 average reward 9.279230110726518, ended at 4.1
episode 300 average reward 9.457789786129913, ended at 6.2
episode 400 average reward 9.267436659902199, ended at 8.3
episode 500 average reward 9.350072574846532, ended at 10.3
episode 600 average reward 9.419099381342413, ended at 12.5
episode 700 average reward 9.46243508339491, ended at 14.5
episode 800 average reward 9.259286901606998, ended at 16.6
episode 900 average reward 9.110490440818939, ended at 18.7
episode 1000 average reward 9.545656879526401, ended at 20.8
episode 1100 average reward 9.321990789299901, ended at 23.0
episode 1200 average reward 9.446234309539754, ended at 25.1
episode 1300 average reward 9.348174091901857, ended at 27.3
episode 1400 average reward 9.250518207704413, ended at 29.4
episode 1500 average reward 9.167738635855713, ended at 31.6
episode 1600 average reward 9.43553113140906, ended at 33.8
episode 1700 average reward 9.336789191

VBox(children=(Label(value=' 0.29MB of 0.29MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.50817
_runtime,68.0
_timestamp,1627528352.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▂▂▂▃▁▃▃▃▂▁▃▂▃▂▁▃▂▁▂▂▃▂▂▁▂▂▂▂▃▂▃▂▁▂▃▄▂▂▂
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: qjm9pm3k with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.01
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 9.6155461231107, ended at 2.3
episode 200 average reward 11.440452241039267, ended at 4.8
episode 300 average reward 9.3553382658701, ended at 7.1
episode 400 average reward 9.277527715240993, ended at 9.3
episode 500 average reward 9.273708964855233, ended at 11.6
episode 600 average reward 9.32788987453392, ended at 13.8
episode 700 average reward 9.333155868040755, ended at 16.1
episode 800 average reward 9.523549421142219, ended at 18.4
episode 900 average reward 9.673708625666015, ended at 20.6
episode 1000 average reward 9.17122064557445, ended at 22.8
episode 1100 average reward 9.39649773679353, ended at 25.2
episode 1200 average reward 9.193564571194404, ended at 27.6
episode 1300 average reward 9.48526540298333, ended at 29.9
episode 1400 average reward 9.437208248169437, ended at 32.3
episode 1500 average reward 9.555995245145606, ended at 34.5
episode 1600 average reward 9.372828596848617, ended at 36.7
episode 1700 average reward 9.61747565291304

VBox(children=(Label(value=' 0.30MB of 0.30MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22369
_runtime,70.0
_timestamp,1627528427.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▂▃▁▁▁▁▁▁▁▂▁▁▁▂▂▁▁▁▁▁▁▂▁▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: lppgqebl with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 38.49117845323453, ended at 5.4
episode 200 average reward 90.98396247552904, ended at 19.8
episode 300 average reward 234.6990089442594, ended at 56.8
episode 400 average reward 429.7758558350067, ended at 132.5
episode 500 average reward 464.1382824032045, ended at 222.4
episode 600 average reward 480.24216683987, ended at 318.6
episode 700 average reward 499.7075886278137, ended at 416.7
episode 800 average reward 499.9982687699262, ended at 516.2
episode 900 average reward 499.99998975020134, ended at 616.0
episode 1000 average reward 499.99999993931544, ended at 714.5
episode 1100 average reward 499.9999999996403, ended at 816.7
episode 1200 average reward 499.9999999999974, ended at 917.1
episode 1300 average reward 499.99999999999903, ended at 1015.6
episode 1400 average reward 499.99999999999903, ended at 1115.5
episode 1500 average reward 499.99999999999903, ended at 1214.1
episode 1600 average reward 499.99999999999903, ended at 1314.4
episode 1700 

VBox(children=(Label(value=' 0.31MB of 0.31MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,499.95555
_runtime,2711.0
_timestamp,1627531143.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▃▅▆██████████████████████████████████
_runtime,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: jg7e6uyz with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.127096391631376, ended at 4.1
episode 200 average reward 46.88088931368053, ended at 12.7
episode 300 average reward 130.29734753758626, ended at 31.7
episode 400 average reward 102.96884628686925, ended at 68.8
episode 500 average reward 71.41966375582756, ended at 93.0
episode 600 average reward 149.96102165371136, ended at 116.4
episode 700 average reward 176.29007432864043, ended at 149.0
episode 800 average reward 291.07787461774853, ended at 192.8
episode 900 average reward 129.28146845242324, ended at 232.8
episode 1000 average reward 175.18641910894772, ended at 275.2
episode 1100 average reward 181.50422997864825, ended at 309.6
episode 1200 average reward 360.91919283614226, ended at 369.2
episode 1300 average reward 495.65447287099516, ended at 463.1
episode 1400 average reward 499.9742721796545, ended at 562.5
episode 1500 average reward 499.99984767768746, ended at 662.5
episode 1600 average reward 499.4254025550004, ended at 761.4
episode 170

VBox(children=(Label(value=' 0.33MB of 0.33MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,500.0
_runtime,2163.0
_timestamp,1627533311.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▃▃▂▂▃▃▄▅▄▃▃▄▆███████████████████████
_runtime,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 0egf1a2p with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 10.100727452244117, ended at 2.2
episode 200 average reward 10.098491133079564, ended at 4.4
episode 300 average reward 31.047342971086774, ended at 9.6
episode 400 average reward 43.20151360756624, ended at 17.1
episode 500 average reward 47.69530800571525, ended at 25.9
episode 600 average reward 63.43312705248867, ended at 36.9
episode 700 average reward 68.18393646097374, ended at 50.9
episode 800 average reward 99.38069628757123, ended at 68.0
episode 900 average reward 149.86604246327923, ended at 93.1
episode 1000 average reward 78.12386610641325, ended at 106.7
episode 1100 average reward 91.59125622462376, ended at 127.4
episode 1200 average reward 118.18070118601455, ended at 150.1
episode 1300 average reward 140.4383836232385, ended at 173.6
episode 1400 average reward 149.4655417432933, ended at 205.0
episode 1500 average reward 171.55494731018914, ended at 234.8
episode 1600 average reward 360.55169819776137, ended at 291.2
episode 1700 average r

VBox(children=(Label(value=' 0.34MB of 0.34MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,105.46283
_runtime,629.0
_timestamp,1627533945.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▂▂▂▃▃▃▂▃▃▃▄▄▄▅█▆▂▃▃▄▄▄▃▃▃▃▃▃▂▃▂▃▃
_runtime,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: vvjm2uaj with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 15.688438248989923, ended at 4.0
episode 200 average reward 10.624975964757729, ended at 6.8
episode 300 average reward 9.791802208233651, ended at 9.2
episode 400 average reward 10.385584423567064, ended at 11.6
episode 500 average reward 12.469043999351891, ended at 14.5
episode 600 average reward 15.505706509341886, ended at 17.3
episode 700 average reward 35.368848239246304, ended at 22.0
episode 800 average reward 56.37180779915586, ended at 33.0
episode 900 average reward 68.91299393537524, ended at 45.0
episode 1000 average reward 73.752645260424, ended at 59.5
episode 1100 average reward 89.64149811099615, ended at 77.1
episode 1200 average reward 145.28636851649955, ended at 102.3
episode 1300 average reward 187.99701766475866, ended at 136.0
episode 1400 average reward 99.10936266159108, ended at 159.1
episode 1500 average reward 84.80766299215598, ended at 176.7
episode 1600 average reward 92.8804446927266, ended at 194.8
episode 1700 average rewar

VBox(children=(Label(value=' 0.35MB of 0.35MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,45.38115
_runtime,410.0
_timestamp,1627534360.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▂▃▃▃▄▅▆▆█▅▄▄▅▅▅▅▅▆▇▂▃▄▄▃▃▃▃▃▃▃▃
_runtime,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: fw88t04x with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 14.218639415528147, ended at 2.7
episode 200 average reward 10.912389425921376, ended at 5.2
episode 300 average reward 9.640467358403493, ended at 7.4
episode 400 average reward 9.649130390266096, ended at 9.7
episode 500 average reward 9.297028545202501, ended at 11.9
episode 600 average reward 9.336111302742296, ended at 14.1
episode 700 average reward 9.333204543246694, ended at 16.4
episode 800 average reward 9.666049709325199, ended at 18.7
episode 900 average reward 9.738178410631058, ended at 20.9
episode 1000 average reward 9.171602340820163, ended at 23.2
episode 1100 average reward 9.396499996631384, ended at 25.5
episode 1200 average reward 9.193564584573839, ended at 27.8
episode 1300 average reward 9.485265403062542, ended at 30.1
episode 1400 average reward 9.437208248169908, ended at 32.3
episode 1500 average reward 9.55599524514561, ended at 34.5
episode 1600 average reward 9.372828596848617, ended at 36.7
episode 1700 average reward 9.426870

VBox(children=(Label(value=' 0.37MB of 0.37MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22369
_runtime,69.0
_timestamp,1627534435.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▆▆▂▂▂▁▁▁▁▂▁▁▁▁▂▁▁▁▁▁▁▂▁▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: m9443xdd with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 17.56341872623613, ended at 3.9
episode 200 average reward 13.903675330140285, ended at 6.9
episode 300 average reward 11.474363893334926, ended at 9.6
episode 400 average reward 10.746946192719347, ended at 12.2
episode 500 average reward 9.8630783206856, ended at 14.6
episode 600 average reward 9.795798773275042, ended at 17.0
episode 700 average reward 9.830811503731807, ended at 19.4
episode 800 average reward 9.717636859583344, ended at 21.7
episode 900 average reward 9.772868072472656, ended at 23.9
episode 1000 average reward 9.209353221027804, ended at 26.1
episode 1100 average reward 9.466370609192104, ended at 28.4
episode 1200 average reward 9.250027824377073, ended at 30.7
episode 1300 average reward 9.485599695323675, ended at 32.9
episode 1400 average reward 9.445309698438765, ended at 35.2
episode 1500 average reward 9.621255453364217, ended at 37.4
episode 1600 average reward 9.387057064449957, ended at 39.6
episode 1700 average reward 9.42695

VBox(children=(Label(value=' 0.38MB of 0.38MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22375
_runtime,73.0
_timestamp,1627534514.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▇▄▃▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 9f4col0q with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 11.131495161854097, ended at 2.6
episode 200 average reward 9.885001154235361, ended at 5.0
episode 300 average reward 9.340458468379818, ended at 7.2
episode 400 average reward 9.27743961896516, ended at 9.4
episode 500 average reward 9.304444130051161, ended at 11.4
episode 600 average reward 9.348462525552133, ended at 13.5
episode 700 average reward 9.337118382196055, ended at 15.6
episode 800 average reward 9.525499153302654, ended at 17.8
episode 900 average reward 9.707471699239294, ended at 20.0
episode 1000 average reward 9.833441118209244, ended at 22.3
episode 1100 average reward 12.503637313731057, ended at 25.7
episode 1200 average reward 20.85652761509138, ended at 29.2
episode 1300 average reward 38.2479278246824, ended at 36.7
episode 1400 average reward 35.33824074300959, ended at 44.2
episode 1500 average reward 52.69601570181552, ended at 53.5
episode 1600 average reward 42.16523061549264, ended at 61.9
episode 1700 average reward 48.079254

VBox(children=(Label(value=' 0.39MB of 0.39MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,237.34555
_runtime,256.0
_timestamp,1627534776.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▂▃▃▃▃▃▃▃▃▃▃▃▄▅█
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇█
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ckav9s9c with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 17.200056626317927, ended at 3.7
episode 200 average reward 10.731601296456487, ended at 6.4
episode 300 average reward 9.800728946674008, ended at 8.6
episode 400 average reward 9.395073585267308, ended at 10.8
episode 500 average reward 9.439322168158235, ended at 12.9
episode 600 average reward 9.348367525895979, ended at 15.0
episode 700 average reward 9.334928644012512, ended at 17.2
episode 800 average reward 9.587694568869471, ended at 19.3
episode 900 average reward 9.711726481187247, ended at 21.5
episode 1000 average reward 9.179351548515234, ended at 23.8
episode 1100 average reward 9.434582194085655, ended at 26.2
episode 1200 average reward 9.272839636373673, ended at 28.5
episode 1300 average reward 9.68713425294333, ended at 30.9
episode 1400 average reward 9.68589103762107, ended at 33.3
episode 1500 average reward 9.70221024666102, ended at 35.6
episode 1600 average reward 10.68355814790492, ended at 38.3
episode 1700 average reward 9.7366954

VBox(children=(Label(value=' 0.41MB of 0.41MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,87.51169
_runtime,173.0
_timestamp,1627534954.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▄▃▄▄▄▆▇▇▇▆███
_runtime,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▆▆▆▇▇█
_timestamp,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▆▆▆▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: se6adsw0 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 13.718617446074026, ended at 2.7
episode 200 average reward 11.100607457005788, ended at 5.2
episode 300 average reward 9.744366803209541, ended at 7.4
episode 400 average reward 9.452645770362027, ended at 9.7
episode 500 average reward 9.380098377254884, ended at 11.9
episode 600 average reward 9.418677780143854, ended at 14.1
episode 700 average reward 9.406897859749009, ended at 16.4
episode 800 average reward 9.672543823699337, ended at 18.6
episode 900 average reward 9.77005018764094, ended at 20.9
episode 1000 average reward 9.240839942043745, ended at 23.1
episode 1100 average reward 9.466743649481948, ended at 25.5
episode 1200 average reward 9.199485931016069, ended at 27.8
episode 1300 average reward 9.485300460567178, ended at 30.1
episode 1400 average reward 9.443007566793886, ended at 32.4
episode 1500 average reward 9.556029580180981, ended at 34.6
episode 1600 average reward 9.372828800130202, ended at 36.8
episode 1700 average reward 9.426870

VBox(children=(Label(value=' 0.42MB of 0.42MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22369
_runtime,70.0
_timestamp,1627535029.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▇▅▂▂▂▁▂▁▂▂▁▁▁▁▂▁▁▁▁▁▁▂▁▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: abjj9c32 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 17.53297415418677, ended at 3.9
episode 200 average reward 12.591964375892367, ended at 7.0
episode 300 average reward 11.966616323200302, ended at 9.7
episode 400 average reward 10.166780157774648, ended at 12.3
episode 500 average reward 10.247645188041885, ended at 14.7
episode 600 average reward 9.72928069416185, ended at 17.0
episode 700 average reward 10.218308782674505, ended at 19.5
episode 800 average reward 10.129692770129399, ended at 21.9
episode 900 average reward 9.921238990639196, ended at 24.2
episode 1000 average reward 9.514541365000733, ended at 26.4
episode 1100 average reward 9.498301728412528, ended at 28.6
episode 1200 average reward 9.425134368164274, ended at 31.0
episode 1300 average reward 9.83488286276887, ended at 33.2
episode 1400 average reward 10.00406811375428, ended at 35.4
episode 1500 average reward 10.277219331582621, ended at 37.7
episode 1600 average reward 9.508052350450098, ended at 39.9
episode 1700 average reward 9.7

VBox(children=(Label(value=' 0.43MB of 0.43MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.24777
_runtime,73.0
_timestamp,1627535108.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▇▄▃▃▂▂▂▂▂▂▂▁▁▁▂▁▁▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 8fzb135m with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 26.545242394900374, ended at 5.0
episode 200 average reward 28.29136948790017, ended at 10.5
episode 300 average reward 30.226030380891107, ended at 17.1
episode 400 average reward 34.770768270386654, ended at 24.7
episode 500 average reward 40.489256408070304, ended at 32.7
episode 600 average reward 43.16239833216411, ended at 41.2
episode 700 average reward 60.90007147548488, ended at 51.6
episode 800 average reward 54.94657811561271, ended at 62.4
episode 900 average reward 52.62810600488378, ended at 73.7
episode 1000 average reward 72.83678860840894, ended at 87.4
episode 1100 average reward 68.47585528201705, ended at 101.8
episode 1200 average reward 82.33935705507763, ended at 117.7
episode 1300 average reward 112.53696791844897, ended at 137.1
episode 1400 average reward 128.78364279354003, ended at 161.0
episode 1500 average reward 138.02747162074502, ended at 187.6
episode 1600 average reward 124.23148812428101, ended at 213.1
episode 1700 average

VBox(children=(Label(value=' 0.45MB of 0.45MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,411.1106
_runtime,1064.0
_timestamp,1627536178.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▂▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▆▆▆▇▇▇▇█▇█▇█
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇█
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 6x94wawx with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.407100014355482, ended at 4.2
episode 200 average reward 23.116248479496186, ended at 9.0
episode 300 average reward 22.949174579293377, ended at 13.8
episode 400 average reward 34.02454750534225, ended at 20.2
episode 500 average reward 39.62572745776359, ended at 28.3
episode 600 average reward 41.26190045596035, ended at 36.5
episode 700 average reward 46.98562532564131, ended at 45.5
episode 800 average reward 53.114490462921964, ended at 55.5
episode 900 average reward 60.73085594009105, ended at 66.4
episode 1000 average reward 80.34328977811363, ended at 80.4
episode 1100 average reward 77.99339184810411, ended at 94.6
episode 1200 average reward 87.95928137077595, ended at 111.0
episode 1300 average reward 141.4817756345705, ended at 136.1
episode 1400 average reward 155.98280613597143, ended at 163.9
episode 1500 average reward 197.92752278442322, ended at 198.0
episode 1600 average reward 216.48905039282454, ended at 237.9
episode 1700 average re

VBox(children=(Label(value=' 0.46MB of 0.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,489.2129
_runtime,1334.0
_timestamp,1627537518.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▅▆▆▆▆▇▇▇▇▇▇▇█▇███
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 46e8nm2o with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 20.31701436279174, ended at 4.3
episode 200 average reward 18.530987097251934, ended at 7.7
episode 300 average reward 17.01360296130387, ended at 11.3
episode 400 average reward 20.860178159522736, ended at 15.4
episode 500 average reward 19.91298711142951, ended at 19.7
episode 600 average reward 24.64686382859735, ended at 24.4
episode 700 average reward 30.321341118731766, ended at 30.5
episode 800 average reward 34.33271526363201, ended at 37.6
episode 900 average reward 33.63333238141048, ended at 44.6
episode 1000 average reward 41.924397911936374, ended at 52.2
episode 1100 average reward 44.017657464687424, ended at 60.6
episode 1200 average reward 49.55290712376267, ended at 69.2
episode 1300 average reward 50.541708414632396, ended at 78.4
episode 1400 average reward 51.73784471800152, ended at 88.5
episode 1500 average reward 51.38907462467695, ended at 99.6
episode 1600 average reward 54.976407857031965, ended at 110.8
episode 1700 average reward

VBox(children=(Label(value=' 0.47MB of 0.47MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,208.68999
_runtime,455.0
_timestamp,1627537978.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▇███
_runtime,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▇▇█
_timestamp,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: cuec9ama with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 19.22855130201222, ended at 4.3
episode 200 average reward 18.909204037755686, ended at 8.3
episode 300 average reward 18.739272817640227, ended at 12.4
episode 400 average reward 15.665947120528209, ended at 15.9
episode 500 average reward 17.692760212470326, ended at 19.6
episode 600 average reward 20.545006392400303, ended at 23.7
episode 700 average reward 26.944732765512875, ended at 29.1
episode 800 average reward 31.13497062365848, ended at 34.6
episode 900 average reward 34.238402117711, ended at 40.9
episode 1000 average reward 44.98055882877101, ended at 49.3
episode 1100 average reward 49.483214918232356, ended at 58.6
episode 1200 average reward 51.68389336135749, ended at 69.4
episode 1300 average reward 49.824151307129206, ended at 80.2
episode 1400 average reward 62.43269773309989, ended at 92.0
episode 1500 average reward 67.14904041104664, ended at 104.3
episode 1600 average reward 70.852924798151, ended at 117.3
episode 1700 average reward 9

VBox(children=(Label(value=' 0.49MB of 0.49MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,352.63259
_runtime,695.0
_timestamp,1627538679.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▇█
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇█
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ityv2lhv with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 24.498639256739676, ended at 4.5
episode 200 average reward 24.833727208453066, ended at 9.5
episode 300 average reward 19.681741614311168, ended at 13.3
episode 400 average reward 18.851759074474167, ended at 17.4
episode 500 average reward 16.754874748661155, ended at 21.1
episode 600 average reward 18.47905400839913, ended at 25.3
episode 700 average reward 17.229704931285408, ended at 29.1
episode 800 average reward 15.023611636044272, ended at 32.4
episode 900 average reward 15.710393510383325, ended at 35.6
episode 1000 average reward 14.010921135061706, ended at 38.8
episode 1100 average reward 13.886136876475295, ended at 41.8
episode 1200 average reward 12.7966117433662, ended at 44.7
episode 1300 average reward 12.824546447642534, ended at 47.6
episode 1400 average reward 12.268897653661917, ended at 50.2
episode 1500 average reward 11.748762260467661, ended at 52.9
episode 1600 average reward 12.63487703120646, ended at 55.6
episode 1700 average re

VBox(children=(Label(value=' 0.50MB of 0.50MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,12.43203
_runtime,94.0
_timestamp,1627538779.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▆██▆▆▅▄▆▅▄▃▃▃▃▃▂▂▂▂▁▁▂▃▁▂▁▁▁▁▂▁▁▁▁▂▁▁▂▁▁
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 0sudew35 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 21.656913914353566, ended at 4.3
episode 200 average reward 23.33872504179144, ended at 8.7
episode 300 average reward 21.861351037693296, ended at 13.1
episode 400 average reward 17.887514920373103, ended at 17.0
episode 500 average reward 18.759890499892915, ended at 20.9
episode 600 average reward 19.989279102375388, ended at 25.1
episode 700 average reward 17.315564794758203, ended at 28.9
episode 800 average reward 16.26121400203889, ended at 32.6
episode 900 average reward 15.166587118194375, ended at 36.2
episode 1000 average reward 12.614790289114257, ended at 39.3
episode 1100 average reward 14.628381778390505, ended at 42.3
episode 1200 average reward 12.47218887840548, ended at 45.2
episode 1300 average reward 12.630248764814818, ended at 48.1
episode 1400 average reward 12.11466202913179, ended at 50.9
episode 1500 average reward 11.570788919911907, ended at 53.5
episode 1600 average reward 11.216784082224484, ended at 56.0
episode 1700 average re

VBox(children=(Label(value=' 0.51MB of 0.51MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,10.59549
_runtime,93.0
_timestamp,1627538878.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▇█▇▇▇▆▇▇▆▆▆▅▅▃▃▄▃▂▂▂▃▂▂▂▂▂▂▂▂▂▁▁▁▂▁▂▁▁▁▁
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: zhkdggqy with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 20.517006783397946, ended at 4.1
episode 200 average reward 18.164346968872106, ended at 7.8
episode 300 average reward 17.361260559934795, ended at 11.5
episode 400 average reward 16.226652935997492, ended at 14.8
episode 500 average reward 17.350287269952357, ended at 18.3
episode 600 average reward 15.213713524415684, ended at 21.6
episode 700 average reward 14.34676895422499, ended at 24.5
episode 800 average reward 14.796248356448734, ended at 27.6
episode 900 average reward 14.059698364874244, ended at 30.6
episode 1000 average reward 15.285587327537748, ended at 33.6
episode 1100 average reward 15.251454487691154, ended at 36.9
episode 1200 average reward 15.856791106447279, ended at 40.3
episode 1300 average reward 17.841435658562148, ended at 43.9
episode 1400 average reward 17.84899287527099, ended at 47.7
episode 1500 average reward 17.863438538331017, ended at 51.8
episode 1600 average reward 24.54548030022953, ended at 56.5
episode 1700 average r

VBox(children=(Label(value=' 0.52MB of 0.52MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,51.19263
_runtime,167.0
_timestamp,1627539051.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▂▂▂▃▂▃▃▅▅▄▄▆▅▇█▇▇▇▇███
_runtime,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇██
_timestamp,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: n2r6ps81 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 20.319270305983324, ended at 3.9
episode 200 average reward 16.905148131659477, ended at 7.9
episode 300 average reward 19.2921635162932, ended at 12.4
episode 400 average reward 17.339865085846835, ended at 16.1
episode 500 average reward 14.958410882651654, ended at 19.8
episode 600 average reward 14.318916813012677, ended at 23.2
episode 700 average reward 13.438937880564687, ended at 26.5
episode 800 average reward 14.186601139667982, ended at 29.7
episode 900 average reward 12.43772416807405, ended at 32.6
episode 1000 average reward 12.687361907584474, ended at 35.4
episode 1100 average reward 14.209598051558254, ended at 38.6
episode 1200 average reward 13.435360810609806, ended at 41.5
episode 1300 average reward 13.590310363103923, ended at 44.5
episode 1400 average reward 15.078528463514955, ended at 47.7
episode 1500 average reward 17.05270446037111, ended at 51.2
episode 1600 average reward 21.249106841882018, ended at 55.2
episode 1700 average re

VBox(children=(Label(value=' 0.54MB of 0.54MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,78.48316
_runtime,201.0
_timestamp,1627539258.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▄▅▄▅▆▅▆▆▆▅▆█████
_runtime,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇█
_timestamp,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: cnahpu9o with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 21.486966001691464, ended at 3.9
episode 200 average reward 18.04597449704868, ended at 7.7
episode 300 average reward 20.591665280503037, ended at 11.8
episode 400 average reward 20.677941901660038, ended at 16.0
episode 500 average reward 20.495105601211637, ended at 20.0
episode 600 average reward 19.268213854067984, ended at 24.1
episode 700 average reward 20.140091790769304, ended at 28.0
episode 800 average reward 19.931083770114867, ended at 32.0
episode 900 average reward 17.561463015675272, ended at 35.9
episode 1000 average reward 15.463835081026126, ended at 39.5
episode 1100 average reward 17.533493907843187, ended at 43.4
episode 1200 average reward 13.252711759035256, ended at 46.8
episode 1300 average reward 14.211859632215935, ended at 50.1
episode 1400 average reward 15.405686530920498, ended at 53.4
episode 1500 average reward 12.976161633277473, ended at 56.4
episode 1600 average reward 12.64114994406764, ended at 59.2
episode 1700 average 

VBox(children=(Label(value=' 0.55MB of 0.55MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,10.68468
_runtime,97.0
_timestamp,1627539361.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▆▇▆▇▅█▆▇▆▆▆▅▅▄▅▄▃▄▄▄▃▃▃▂▃▃▂▂▂▂▁▂▂▁▂▂▁▁▂▁
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 75kgikfu with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0001
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 23.051224014743116, ended at 4.4
episode 200 average reward 21.263334020315085, ended at 9.1
episode 300 average reward 22.712687521202962, ended at 13.8
episode 400 average reward 20.291792448596556, ended at 18.5
episode 500 average reward 21.132372118708727, ended at 23.0
episode 600 average reward 22.79822227616106, ended at 27.1
episode 700 average reward 19.956620482705674, ended at 31.3
episode 800 average reward 20.328737374604014, ended at 35.4
episode 900 average reward 18.7321414651388, ended at 39.1
episode 1000 average reward 14.813651897844721, ended at 42.3
episode 1100 average reward 16.43058723680688, ended at 45.9
episode 1200 average reward 13.660691646992193, ended at 49.0
episode 1300 average reward 13.61767339694182, ended at 52.1
episode 1400 average reward 14.940149532529682, ended at 55.3
episode 1500 average reward 12.60859407793336, ended at 58.4
episode 1600 average reward 13.045953031433971, ended at 61.5
episode 1700 average rewa

VBox(children=(Label(value=' 0.56MB of 0.56MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,10.25862
_runtime,102.0
_timestamp,1627539469.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▆▇▅▇█▅▇▅▇▅▅▄▄▃▄▃▃▄▃▃▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▁▁▂▂▁
_runtime,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 60cjgz86 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 32.09077247945594, ended at 5.8
episode 200 average reward 43.399165117159946, ended at 14.4
episode 300 average reward 56.97014651221504, ended at 24.3
episode 400 average reward 70.85001776467622, ended at 36.5
episode 500 average reward 128.69720908538724, ended at 60.5
episode 600 average reward 165.8324159488941, ended at 92.4
episode 700 average reward 211.80881247618575, ended at 128.2
episode 800 average reward 257.70859812747057, ended at 182.7
episode 900 average reward 368.0986261417445, ended at 249.2
episode 1000 average reward 451.31409107980386, ended at 335.7
episode 1100 average reward 475.4835078598013, ended at 427.3
episode 1200 average reward 478.2158590025827, ended at 522.1
episode 1300 average reward 477.87572541913295, ended at 614.6
episode 1400 average reward 498.8260491038842, ended at 712.3
episode 1500 average reward 499.70678631019854, ended at 810.8
episode 1600 average reward 490.3202567046688, ended at 908.5
episode 1700 aver

VBox(children=(Label(value=' 0.58MB of 0.58MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,500.0
_runtime,2293.0
_timestamp,1627541768.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▁▂▃▃▃▄▅▅▆▇██████████████████████████
_runtime,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: fzdbynv3 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 18.424518663072565, ended at 3.8
episode 200 average reward 27.873311664343788, ended at 8.4
episode 300 average reward 42.33054965889336, ended at 15.6
episode 400 average reward 61.745907479619135, ended at 26.7
episode 500 average reward 81.72378800769087, ended at 41.1
episode 600 average reward 119.6130814481791, ended at 64.0
episode 700 average reward 158.1029350724095, ended at 93.3
episode 800 average reward 188.3729707631223, ended at 126.2
episode 900 average reward 235.04725398927928, ended at 171.1
episode 1000 average reward 384.5134904742514, ended at 239.0
episode 1100 average reward 443.4318152989121, ended at 323.6
episode 1200 average reward 490.5463718327637, ended at 418.7
episode 1300 average reward 495.7117807231135, ended at 515.4
episode 1400 average reward 498.39244870921726, ended at 614.0
episode 1500 average reward 499.459613464542, ended at 712.4
episode 1600 average reward 499.9968006257261, ended at 810.1
episode 1700 average r

VBox(children=(Label(value=' 0.59MB of 0.59MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,500.0
_runtime,2185.0
_timestamp,1627543958.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▂▂▂▃▃▃▄▅▆▇█████████████████████████
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ursroc9m with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 13.263413515864448, ended at 3.4
episode 200 average reward 11.030207710733118, ended at 6.0
episode 300 average reward 22.008712408515315, ended at 9.4
episode 400 average reward 24.378023762167032, ended at 14.3
episode 500 average reward 35.889893036715115, ended at 21.0
episode 600 average reward 44.58462768660734, ended at 29.2
episode 700 average reward 51.10625589046285, ended at 38.6
episode 800 average reward 49.26684788611758, ended at 48.6
episode 900 average reward 48.896838565502506, ended at 59.0
episode 1000 average reward 60.292190069293916, ended at 70.1
episode 1100 average reward 61.4336021895396, ended at 82.5
episode 1200 average reward 75.95676363769158, ended at 97.3
episode 1300 average reward 98.905859969814, ended at 114.6
episode 1400 average reward 122.86709968702966, ended at 137.8
episode 1500 average reward 120.69127302159089, ended at 159.3
episode 1600 average reward 108.2142001968934, ended at 179.2
episode 1700 average rewar

VBox(children=(Label(value=' 0.60MB of 0.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,243.70864
_runtime,703.0
_timestamp,1627544666.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▄▄▄▃▄▅▅▅▇▆▄▄▄▆▇█▅▅▅██▇
_runtime,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: ihqoe51b with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 18.95273608413951, ended at 3.9
episode 200 average reward 13.563961078059728, ended at 7.0
episode 300 average reward 14.35580876012201, ended at 10.1
episode 400 average reward 15.470731728077737, ended at 13.2
episode 500 average reward 28.775874501145385, ended at 17.8
episode 600 average reward 41.50824692612722, ended at 24.8
episode 700 average reward 49.30720755225275, ended at 33.7
episode 800 average reward 64.60403122813639, ended at 46.1
episode 900 average reward 74.09417120221264, ended at 59.2
episode 1000 average reward 100.94473634345835, ended at 78.8
episode 1100 average reward 136.83270499770592, ended at 104.0
episode 1200 average reward 108.11643254931207, ended at 125.4
episode 1300 average reward 164.1977230752807, ended at 166.1
episode 1400 average reward 60.918064092404315, ended at 178.1
episode 1500 average reward 84.69299855267553, ended at 195.8
episode 1600 average reward 88.43255447188135, ended at 213.6
episode 1700 average r

VBox(children=(Label(value=' 0.62MB of 0.62MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,56.89358
_runtime,461.0
_timestamp,1627545133.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▂▂▃▃▃▅▅▇▅▇▇▃▅▅▅▄▅▅▄▄▅▅▆▅▇█▄▄▄▃▃▃▃
_runtime,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: bi8gi0ft with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 22.99010724848736, ended at 4.2
episode 200 average reward 17.067555751023953, ended at 8.0
episode 300 average reward 15.135846891769232, ended at 11.2
episode 400 average reward 13.01665822110251, ended at 14.3
episode 500 average reward 12.296602221245555, ended at 16.9
episode 600 average reward 11.003937202816186, ended at 19.4
episode 700 average reward 10.920786401399571, ended at 21.7
episode 800 average reward 10.24314768444847, ended at 24.1
episode 900 average reward 10.71223543161477, ended at 26.3
episode 1000 average reward 9.868080276548492, ended at 28.5
episode 1100 average reward 9.902513560296214, ended at 30.8
episode 1200 average reward 9.620296954913321, ended at 33.1
episode 1300 average reward 9.825903258223274, ended at 35.5
episode 1400 average reward 9.507279991351572, ended at 37.8
episode 1500 average reward 9.572354745731342, ended at 40.1
episode 1600 average reward 9.444128447008316, ended at 42.4
episode 1700 average reward 9.

VBox(children=(Label(value=' 0.63MB of 0.63MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,10.69042
_runtime,76.0
_timestamp,1627545215.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▆█▇▄▅▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 3ujpbgz3 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 19.803152515800818, ended at 4.0
episode 200 average reward 18.22874208834332, ended at 7.8
episode 300 average reward 16.174931796216047, ended at 11.2
episode 400 average reward 14.956008052960971, ended at 14.5
episode 500 average reward 12.720347807827855, ended at 17.3
episode 600 average reward 10.828623329669943, ended at 20.0
episode 700 average reward 10.66616877489783, ended at 22.6
episode 800 average reward 10.284585399019582, ended at 25.2
episode 900 average reward 11.058016420422405, ended at 27.6
episode 1000 average reward 9.547851285341725, ended at 30.0
episode 1100 average reward 9.693059220173698, ended at 32.5
episode 1200 average reward 9.535493980197622, ended at 34.9
episode 1300 average reward 9.655774661791938, ended at 37.3
episode 1400 average reward 10.01979171235155, ended at 39.6
episode 1500 average reward 9.652633081948414, ended at 42.0
episode 1600 average reward 9.408352908871038, ended at 44.3
episode 1700 average reward 

VBox(children=(Label(value=' 0.64MB of 0.64MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.36758
_runtime,78.0
_timestamp,1627545299.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,██▇▆▅▅▃▃▂▂▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 7nlluqtp with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 18.61222393883686, ended at 4.6
episode 200 average reward 16.49157881094041, ended at 8.4
episode 300 average reward 13.170479215740354, ended at 11.6
episode 400 average reward 12.60636950280949, ended at 14.4
episode 500 average reward 11.963504562816082, ended at 17.3
episode 600 average reward 11.836166680338362, ended at 20.1
episode 700 average reward 11.562129183550502, ended at 22.8
episode 800 average reward 12.318984837548733, ended at 25.4
episode 900 average reward 11.696582483905496, ended at 27.8
episode 1000 average reward 10.97448916521732, ended at 30.2
episode 1100 average reward 12.014514416717219, ended at 32.8
episode 1200 average reward 13.216249463556032, ended at 35.5
episode 1300 average reward 17.7437936010526, ended at 38.8
episode 1400 average reward 26.282651140978356, ended at 43.7
episode 1500 average reward 36.68320507726758, ended at 50.2
episode 1600 average reward 42.660598164448956, ended at 58.0
episode 1700 average rewar

VBox(children=(Label(value=' 0.66MB of 0.66MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,72.44936
_runtime,218.0
_timestamp,1627545523.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▄▅▄▅▅▅▆▆▆▆▅▇▆▆▇▆▇▇▇█
_runtime,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇██
_timestamp,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: pp2j1em2 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 17.46010108705662, ended at 3.7
episode 200 average reward 17.587768623396286, ended at 7.5
episode 300 average reward 12.919732127895253, ended at 10.7
episode 400 average reward 11.408130624264821, ended at 13.3
episode 500 average reward 10.345520609149105, ended at 15.9
episode 600 average reward 10.403592954070277, ended at 18.4
episode 700 average reward 9.827044683498183, ended at 20.8
episode 800 average reward 10.084932136207298, ended at 23.2
episode 900 average reward 10.145940622825542, ended at 25.6
episode 1000 average reward 9.60761531173061, ended at 27.9
episode 1100 average reward 9.626373723369852, ended at 30.3
episode 1200 average reward 9.930905280170599, ended at 32.8
episode 1300 average reward 10.989640759685388, ended at 35.2
episode 1400 average reward 15.015984121234911, ended at 38.3
episode 1500 average reward 15.43077917440905, ended at 41.4
episode 1600 average reward 25.300587969287363, ended at 45.9
episode 1700 average rewar

VBox(children=(Label(value=' 0.67MB of 0.67MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,105.78708
_runtime,251.0
_timestamp,1627545780.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▄▄▄▄▅▄▅▅▅▅▅▇▆█▆██
_runtime,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▇▇▇█
_timestamp,▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▇▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 8wmt2pha with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 19.759001599443287, ended at 4.2
episode 200 average reward 18.83928257902684, ended at 7.8
episode 300 average reward 16.132467913846806, ended at 11.3
episode 400 average reward 14.726555554513839, ended at 14.5
episode 500 average reward 14.218417715904918, ended at 17.5
episode 600 average reward 12.853564084634133, ended at 20.5
episode 700 average reward 13.044082524228317, ended at 23.3
episode 800 average reward 12.347061717727035, ended at 26.1
episode 900 average reward 12.054297879166956, ended at 28.7
episode 1000 average reward 11.899863019934644, ended at 31.2
episode 1100 average reward 11.465858593825807, ended at 33.9
episode 1200 average reward 12.041276514082554, ended at 36.4
episode 1300 average reward 10.789123052113045, ended at 38.8
episode 1400 average reward 10.692240612252908, ended at 41.2
episode 1500 average reward 10.73221304039556, ended at 43.6
episode 1600 average reward 9.93334281222339, ended at 46.1
episode 1700 average re

VBox(children=(Label(value=' 0.68MB of 0.68MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.28878
_runtime,80.0
_timestamp,1627545868.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,█▇▇▅▅▅▄▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 38fzboec with config:
[34m[1mwandb[0m: 	actor_learning_rate: 0.0003
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 19.970456137377624, ended at 4.0
episode 200 average reward 18.33940439908967, ended at 7.9
episode 300 average reward 22.616620419767592, ended at 12.0
episode 400 average reward 16.488991002163345, ended at 15.4
episode 500 average reward 17.356493558267, ended at 19.0
episode 600 average reward 15.614025681023794, ended at 22.4
episode 700 average reward 14.612790190690209, ended at 25.9
episode 800 average reward 13.233270839315264, ended at 29.0
episode 900 average reward 13.530500703577475, ended at 32.2
episode 1000 average reward 11.92427597056068, ended at 35.0
episode 1100 average reward 11.58511239037678, ended at 37.9
episode 1200 average reward 10.821141690892485, ended at 40.6
episode 1300 average reward 11.060097934012466, ended at 43.2
episode 1400 average reward 10.809512220284766, ended at 45.8
episode 1500 average reward 10.734346900440663, ended at 48.4
episode 1600 average reward 9.997196776910089, ended at 50.7
episode 1700 average rewar

VBox(children=(Label(value=' 0.70MB of 0.70MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.43508
_runtime,84.0
_timestamp,1627545959.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,██▇▇▇▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: s29aqwrx with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 21.95019387420569, ended at 4.3
episode 200 average reward 20.360769003620472, ended at 8.8
episode 300 average reward 21.43147391284045, ended at 13.6
episode 400 average reward 21.90724306005727, ended at 18.5
episode 500 average reward 23.505307930427406, ended at 23.4
episode 600 average reward 22.317767693736823, ended at 28.2
episode 700 average reward 22.925962920730793, ended at 32.9
episode 800 average reward 21.84474155152528, ended at 37.9
episode 900 average reward 24.820318909016905, ended at 43.3
episode 1000 average reward 23.209764496203817, ended at 49.0
episode 1100 average reward 23.84764262749921, ended at 54.5
episode 1200 average reward 21.352399616728455, ended at 59.4
episode 1300 average reward 24.225489225900862, ended at 64.6
episode 1400 average reward 25.879485957846626, ended at 69.9
episode 1500 average reward 24.70576475567905, ended at 75.0
episode 1600 average reward 25.366599527345656, ended at 80.4
episode 1700 average rewa

VBox(children=(Label(value=' 0.71MB of 0.71MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,35.32685
_runtime,165.0
_timestamp,1627546131.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▁▂▁▃▁▁▂▂▄▃▃▄▄▄▃▄▄▄▄▄▃▅▅▅▇▅▅▆▅▇▇▆█▄█▅█▇
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: c77dn7uw with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.01
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 25.183558096789277, ended at 4.6
episode 200 average reward 22.09731240420407, ended at 9.3
episode 300 average reward 18.95188295862078, ended at 13.8
episode 400 average reward 20.81537449814472, ended at 18.5
episode 500 average reward 20.427939106069324, ended at 23.3
episode 600 average reward 23.343444886091618, ended at 27.8
episode 700 average reward 22.044628126027856, ended at 32.2
episode 800 average reward 23.902509872048235, ended at 37.0
episode 900 average reward 23.95846215918607, ended at 41.9
episode 1000 average reward 22.534066073683455, ended at 47.0
episode 1100 average reward 29.35736878676489, ended at 52.0
episode 1200 average reward 21.763375164169535, ended at 57.1
episode 1300 average reward 21.124084094605927, ended at 61.9
episode 1400 average reward 26.188337386922143, ended at 67.7
episode 1500 average reward 23.94721826700816, ended at 73.1
episode 1600 average reward 23.949953705708555, ended at 78.6
episode 1700 average rewa

VBox(children=(Label(value=' 0.72MB of 0.72MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,29.96969
_runtime,164.0
_timestamp,1627546301.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▄▂▃▁▃▃▂▂▃▃▃▅▃▄▃▁▃▅▅▃▃▄▆▆▅▇▅▄▆▄▅▆▅█▄▇▅▇▅
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 3szcfqxz with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 22.367287454687165, ended at 4.3
episode 200 average reward 21.09734712643457, ended at 8.7
episode 300 average reward 21.086356247483565, ended at 13.0
episode 400 average reward 19.07091953175093, ended at 17.0
episode 500 average reward 21.446973883224867, ended at 21.3
episode 600 average reward 22.44389009138466, ended at 25.9
episode 700 average reward 22.740254820868813, ended at 31.1
episode 800 average reward 22.213804577407526, ended at 35.8
episode 900 average reward 22.39220400560287, ended at 40.5
episode 1000 average reward 22.35563515144277, ended at 45.2
episode 1100 average reward 21.601274206693393, ended at 49.8
episode 1200 average reward 22.44597402414551, ended at 54.6
episode 1300 average reward 23.214153078455215, ended at 59.5
episode 1400 average reward 22.944035426503444, ended at 64.5
episode 1500 average reward 24.02908698567394, ended at 69.8
episode 1600 average reward 23.963746313035895, ended at 75.1
episode 1700 average rewar

VBox(children=(Label(value=' 0.74MB of 0.74MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,33.19377
_runtime,156.0
_timestamp,1627546463.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▄▃▃▁▁▂▂▄▃▂▂▂▃▄▄▃▄▃▃▄▄▃▅▅█▄▄▆▆▆▆█▇▆▆▅█▇█
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: p1vox139 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.34919068439769, ended at 4.1
episode 200 average reward 20.66534585328419, ended at 8.2
episode 300 average reward 19.201035829160674, ended at 12.1
episode 400 average reward 19.580921539726333, ended at 16.2
episode 500 average reward 21.743766527948086, ended at 20.5
episode 600 average reward 23.232366248234126, ended at 25.2
episode 700 average reward 21.200700744424502, ended at 29.8
episode 800 average reward 18.946109418940253, ended at 34.4
episode 900 average reward 20.282878591200983, ended at 39.0
episode 1000 average reward 22.960426895065453, ended at 44.0
episode 1100 average reward 23.386347373260463, ended at 49.2
episode 1200 average reward 22.642027309978037, ended at 53.9
episode 1300 average reward 22.626267641308516, ended at 58.8
episode 1400 average reward 27.662426646330214, ended at 64.3
episode 1500 average reward 24.86848499096924, ended at 69.1
episode 1600 average reward 27.05134640812649, ended at 74.2
episode 1700 average re

VBox(children=(Label(value=' 0.75MB of 0.75MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,34.27846
_runtime,157.0
_timestamp,1627546627.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▁▂▁▁▂▃▂▃▂▁▂▃▃▂▂▄▅▃▅▅▄▄▆▅▃▄▇▅▄▅▇▆▅▇▇▆█▇
_runtime,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 9x2f3ws6 with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 22.046357601977178, ended at 4.4
episode 200 average reward 20.34109306467247, ended at 8.8
episode 300 average reward 17.613885628342874, ended at 12.8
episode 400 average reward 18.391765834486048, ended at 17.2
episode 500 average reward 21.774673505933492, ended at 21.5
episode 600 average reward 20.544394613751976, ended at 25.5
episode 700 average reward 21.34239577781354, ended at 30.2
episode 800 average reward 21.40511392381546, ended at 34.8
episode 900 average reward 22.3672333698722, ended at 39.4
episode 1000 average reward 18.64114830578389, ended at 43.6
episode 1100 average reward 21.19319826550689, ended at 48.4
episode 1200 average reward 19.907275718166744, ended at 52.8
episode 1300 average reward 17.67215320766846, ended at 56.8
episode 1400 average reward 20.44604116991339, ended at 60.8
episode 1500 average reward 18.452076636764957, ended at 64.5
episode 1600 average reward 18.965714992030545, ended at 68.4
episode 1700 average reward 

VBox(children=(Label(value=' 0.76MB of 0.76MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,16.38298
_runtime,124.0
_timestamp,1627546757.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▅█▄▃▅▄▅▅▅▅▅▆▆▃▃▆▄▄▃▃▆▃▃▂▃▄▄▂▂▃▃▄▃▂▁▄▁▁▁▂
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: m69xjxch with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0001
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.02826083168772, ended at 4.2
episode 200 average reward 20.340985922215175, ended at 8.3
episode 300 average reward 21.719834634750423, ended at 12.5
episode 400 average reward 20.065872229540915, ended at 16.5
episode 500 average reward 21.50009424181895, ended at 20.7
episode 600 average reward 18.295354841900203, ended at 24.4
episode 700 average reward 19.600092856942542, ended at 28.7
episode 800 average reward 19.487349009472695, ended at 32.9
episode 900 average reward 20.9919615167906, ended at 37.4
episode 1000 average reward 22.34643359869227, ended at 42.0
episode 1100 average reward 19.609552846420357, ended at 46.0
episode 1200 average reward 17.327188149048837, ended at 50.1
episode 1300 average reward 20.416218272218483, ended at 54.1
episode 1400 average reward 17.687410218033353, ended at 57.7
episode 1500 average reward 19.107702474631708, ended at 61.6
episode 1600 average reward 17.847167299206316, ended at 65.5
episode 1700 average rew

VBox(children=(Label(value=' 0.78MB of 0.78MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,17.62525
_runtime,122.0
_timestamp,1627546886.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▄█▅▃▄▄▅▆▄▅▅▆▅▅▅▅▃▄▂▅▄▃▅▂▂▂▂▁▂▃▂▄▂▄▂▂▂▂▄▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: g22k8qzu with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 22.046357601977178, ended at 4.1
episode 200 average reward 21.096146054420245, ended at 8.5
episode 300 average reward 19.114755172397686, ended at 12.6
episode 400 average reward 21.815992721096855, ended at 16.7
episode 500 average reward 20.698677604941174, ended at 20.6
episode 600 average reward 20.83618107725596, ended at 24.8
episode 700 average reward 22.131113378835977, ended at 29.2
episode 800 average reward 17.851778852710414, ended at 33.3
episode 900 average reward 19.305963498893874, ended at 37.8
episode 1000 average reward 20.39021359313006, ended at 41.8
episode 1100 average reward 19.16599276324549, ended at 46.2
episode 1200 average reward 19.328289651438464, ended at 50.6
episode 1300 average reward 22.023279219369844, ended at 55.0
episode 1400 average reward 16.745521295872507, ended at 58.8
episode 1500 average reward 17.021901841566077, ended at 62.8
episode 1600 average reward 18.17680695038146, ended at 66.5
episode 1700 average re

VBox(children=(Label(value=' 0.79MB of 0.79MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,20.58347
_runtime,129.0
_timestamp,1627547022.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▄█▇▂▃▆▄▇▄▅▃▄▂▂▃▃▇▅▁▄▃▂▄▄▄▂▄▁▄▆▃▄▅▇▅▅▆█▆▅
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: r5jqnihy with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 0.0003
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.02826083168772, ended at 4.4
episode 200 average reward 17.959281385743154, ended at 8.5
episode 300 average reward 17.81407912474177, ended at 12.8
episode 400 average reward 23.446880628729517, ended at 17.5
episode 500 average reward 20.627705354769446, ended at 21.6
episode 600 average reward 20.02276792288111, ended at 25.8
episode 700 average reward 23.760649850802995, ended at 30.0
episode 800 average reward 20.014224269634294, ended at 34.1
episode 900 average reward 18.6668061101292, ended at 38.1
episode 1000 average reward 20.50299301581837, ended at 42.1
episode 1100 average reward 19.685582547935162, ended at 46.5
episode 1200 average reward 21.254632171597862, ended at 50.7
episode 1300 average reward 22.032228743792924, ended at 54.9
episode 1400 average reward 17.38187000679952, ended at 58.6
episode 1500 average reward 19.38005535385276, ended at 63.2
episode 1600 average reward 20.382840000660853, ended at 67.4
episode 1700 average reward

VBox(children=(Label(value=' 0.80MB of 0.80MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,24.24685
_runtime,135.0
_timestamp,1627547164.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▃▆▃▃▄▅▂▄▃▅▄▄▂▄▄▄▄▅▁▄▃▄▄▃▄▂▃▄▅▅▆▅▄▄▄▆▅▆█▇
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 8v6pawzl with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: RMSprop


episode 100 average reward 22.365866017187162, ended at 4.5
episode 200 average reward 21.29214542016999, ended at 9.0
episode 300 average reward 19.184107257593134, ended at 13.2
episode 400 average reward 19.54330828168993, ended at 17.3
episode 500 average reward 21.402900380808727, ended at 21.6
episode 600 average reward 25.108267448429167, ended at 26.0
episode 700 average reward 20.05636490963607, ended at 30.3
episode 800 average reward 25.810105069734036, ended at 34.8
episode 900 average reward 19.475182686015263, ended at 39.1
episode 1000 average reward 19.60798256949253, ended at 43.2
episode 1100 average reward 18.637105006219656, ended at 47.2
episode 1200 average reward 22.468107541407928, ended at 52.1
episode 1300 average reward 19.51128814699193, ended at 56.6
episode 1400 average reward 19.3995182506707, ended at 61.1
episode 1500 average reward 20.664587201571667, ended at 65.7
episode 1600 average reward 20.908232726075884, ended at 70.4
episode 1700 average rewar

VBox(children=(Label(value=' 0.82MB of 0.82MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,19.26564
_runtime,133.0
_timestamp,1627547303.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▃▇▃▄▄▁▆▄█▂▄▅▄▄▂▆▅▄▂▄▃▄▄▂▅▄▁▇▁▃▆▂▂▄▆▄▁▂▂▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: w48wnakr with config:
[34m[1mwandb[0m: 	actor_learning_rate: 1e-05
[34m[1mwandb[0m: 	critic_learning_rate: 1e-05
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	num_neurons: 128
[34m[1mwandb[0m: 	optimizer: Adam


episode 100 average reward 22.02826083168772, ended at 4.4
episode 200 average reward 23.915060653591283, ended at 8.9
episode 300 average reward 19.529958212613973, ended at 13.4
episode 400 average reward 20.012544668608147, ended at 17.8
episode 500 average reward 22.766900354863303, ended at 22.7
episode 600 average reward 23.949102055515915, ended at 27.4
episode 700 average reward 23.98487926746266, ended at 32.2
episode 800 average reward 22.257725139399007, ended at 36.6
episode 900 average reward 21.183578631694704, ended at 40.6
episode 1000 average reward 25.615198667384625, ended at 45.1
episode 1100 average reward 21.52429362781972, ended at 49.3
episode 1200 average reward 20.90847775076167, ended at 53.8
episode 1300 average reward 21.881818278107996, ended at 58.2
episode 1400 average reward 19.620668831303114, ended at 62.4
episode 1500 average reward 19.61684378930741, ended at 66.6
episode 1600 average reward 20.6829890430845, ended at 70.7
episode 1700 average rewar

VBox(children=(Label(value=' 0.83MB of 0.83MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,21.83107
_runtime,135.0
_timestamp,1627547444.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▃▆▃▃▇▄▆▅▆▆▃▃▅█▄▅▅▄▂▅▂▄▅▂▅▃▇▅▃▃▃▄▄▁▆▃▃▄▂▆
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# You can see the result here!
[Report Link](https://wandb.ai/ko120/Advantage_Actor_Critic/reports/TD-Actor-Critic-Learning-rate-tune---Vmlldzo4OTIwODg)