# Enable GPU

In [1]:
import torch
# Use the first CUDA device when one is actually usable, otherwise the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Actor-Critic Shared Network

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCritic_Net(nn.Module):
  """Actor and critic heads on top of one shared hidden layer.

  Args:
    input_dims: size of the state vector fed to the shared layer.
    output_dims: number of discrete actions (size of the policy output).
    fc1_dims: width of the shared hidden layer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128):
    super(ActorCritic_Net, self).__init__()
    self.fc1 = nn.Linear(input_dims, fc1_dims)      # shared feature layer
    self.actor = nn.Linear(fc1_dims, output_dims)   # policy logits
    self.critic = nn.Linear(fc1_dims, 1)            # state-value estimate

  def forward(self, state):
    """Return ``(pi, value)`` for ``state``.

    ``state`` may be a single state of shape ``(input_dims,)`` or a batch of
    shape ``(batch, input_dims)``. ``pi`` holds action probabilities that sum
    to one along the last axis; ``value`` has a trailing dimension of size 1.
    """
    x = F.relu(self.fc1(state))
    # Normalise over the last (action) axis so that both batched and
    # unbatched inputs are handled; for 2-D input this equals dim = 1.
    pi = F.softmax(self.actor(x), dim = -1)
    value = self.critic(x)
    return (pi, value)



# REINFORCE with Baseline Agent

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical 
import numpy as np

class ActorCritic(nn.Module):
  """REINFORCE-with-baseline agent on top of a shared actor/critic network.

  The agent samples actions with ``get_action`` during an episode and is then
  trained once per episode with one of three update schemes:

  * ``learn_mean``     - a single optimizer step on the loss summed over the
                         whole episode (uses the graphs cached by
                         ``get_action``).
  * ``learn_forward``  - one optimizer step per time step, first to last.
  * ``learn_backward`` - one optimizer step per time step, last to first.

  Args:
    input_dims: size of the state vector.
    output_dims: number of discrete actions.
    fc1_dims: width of the shared hidden layer.
    gamma: discount factor used for the Monte-Carlo returns.
    lr: learning rate of the RMSprop optimizer.
  """

  def __init__(self, input_dims, output_dims, fc1_dims = 128, gamma = 0.99, lr = 1e-3):
    super(ActorCritic, self).__init__()
    self.ac_net = ActorCritic_Net(input_dims = input_dims, output_dims = output_dims, fc1_dims = fc1_dims)
    self.optimizer = optim.RMSprop(params = self.ac_net.parameters(), lr = lr)
    self.gamma = gamma

    # Per-step buffers filled by get_action(); consumed by learn_mean() and
    # reset at the end of every learn_* call.
    self.log_probs = []
    self.values = []

  def get_action(self, state):
    """Sample an action for ``state`` and cache its log-prob and value.

    Args:
      state: tensor accepted by the network (the training loops in this
        notebook pass shape ``(1, input_dims)``), already on the model's device.

    Returns:
      The sampled action as a tensor.
    """
    pi, v = self.ac_net(state)

    distribution = Categorical(probs = pi)
    action = distribution.sample()
    self.log_probs.append(distribution.log_prob(action))
    self.values.append(v)

    return action

  def _device(self):
    """Device the network parameters live on (follows ``.to(device)``)."""
    return next(self.ac_net.parameters()).device

  def _clear_memory(self):
    """Drop the per-step buffers collected by ``get_action``."""
    self.values = []
    self.log_probs = []

  def _compute_returns(self, rewards, return_norm):
    """Discounted returns G_t in chronological order as a float32 tensor."""
    returns = []
    G = 0.0
    # Accumulate from the last step backwards, then restore time order;
    # appending + one reverse avoids the quadratic cost of insert(0, ...).
    for reward in reversed(list(rewards)):
      G = float(reward) + self.gamma * G
      returns.append(G)
    returns.reverse()
    returns = torch.tensor(returns, dtype = torch.float32, device = self._device())

    # The (unbiased) std of a single element is NaN, which would poison the
    # weights, so one-step episodes are left un-normalised.
    if return_norm and returns.numel() > 1:
      eps = np.finfo(np.float32).eps.item()
      returns = (returns - returns.mean()) / (returns.std() + eps)
    return returns

  def _stepwise_update(self, returns, states, actions):
    """Run one optimizer step per ``(return, state, action)`` triple, in order."""
    for G, state, action in zip(returns, states, actions):
      pi, v = self.ac_net(state.unsqueeze(0))
      log_prob = Categorical(probs = pi).log_prob(action)
      # .item() turns the baseline into a plain number so no gradient flows
      # from the actor loss into the critic head.
      advantage = G - v.item()
      actor_loss = (-log_prob * advantage).sum()
      critic_loss = F.smooth_l1_loss(v, G.expand_as(v))
      self.optimizer.zero_grad()
      (actor_loss + critic_loss).backward()
      self.optimizer.step()

  def learn_mean(self, rewards, states, actions, return_norm = True):
    """Single update from the whole episode using the cached graphs.

    Note that the per-step actor and critic losses are summed (not averaged)
    over the episode before the optimizer step.

    Args:
      rewards: per-step rewards in chronological order.
      states: unused; accepted so all learn_* methods share one signature.
      actions: unused; accepted so all learn_* methods share one signature.
      return_norm: standardise the returns (skipped for one-step episodes).

    Raises:
      ValueError: if the number of rewards differs from the number of
        ``get_action`` calls recorded since the last update.
    """
    if len(rewards) == 0 or len(self.values) == 0:
      self._clear_memory()
      return
    if len(rewards) != len(self.values):
      raise ValueError('got {} rewards for {} recorded actions'.format(len(rewards), len(self.values)))

    returns = self._compute_returns(rewards, return_norm)

    # Flatten to one entry per time step regardless of the per-step shape
    # (values are (1, 1) and log-probs (1,) for batched single states).
    values = torch.stack(self.values).reshape(-1)
    log_probs = torch.stack(self.log_probs).reshape(-1)

    # Baseline is detached so the actor loss does not train the critic head.
    advantages = returns - values.detach()
    actor_loss = -(log_probs * advantages).sum()
    critic_loss = F.smooth_l1_loss(values, returns, reduction = 'sum')

    self.optimizer.zero_grad()
    (actor_loss + critic_loss).backward()
    self.optimizer.step()

    self._clear_memory()

  def learn_forward(self, rewards, states, actions, return_norm = True):
    """Per-step updates walking the episode from the first step to the last.

    Args:
      rewards: per-step rewards in chronological order.
      states: list of state tensors, concatenated along dim 0.
      actions: list of integer actions taken at each step.
      return_norm: standardise the returns (skipped for one-step episodes).
    """
    if len(rewards) == 0:
      self._clear_memory()
      return

    device = self._device()
    returns = self._compute_returns(rewards, return_norm)
    states = torch.cat(states, dim = 0).to(device)
    actions = torch.tensor(actions).to(device)

    self._stepwise_update(returns, states, actions)

    # The graphs cached by get_action() are not used by this scheme.
    self._clear_memory()

  def learn_backward(self, rewards, states, actions, return_norm = True):
    """Per-step updates walking the episode from the last step to the first.

    Args:
      rewards: per-step rewards in chronological order.
      states: list of state tensors, concatenated along dim 0.
      actions: list of integer actions taken at each step.
      return_norm: standardise the returns (skipped for one-step episodes).
    """
    if len(rewards) == 0:
      self._clear_memory()
      return

    device = self._device()
    # Normalisation is order-independent, so flipping afterwards is safe.
    returns = self._compute_returns(rewards, return_norm).flip(dims = [0])
    states = torch.cat(states, dim = 0).to(device).flip(dims = [0])
    actions = torch.tensor(actions).to(device).flip(dims = [0])

    self._stepwise_update(returns, states, actions)

    # The graphs cached by get_action() are not used by this scheme.
    self._clear_memory()


# Without Wandb

In [6]:
import gym
import torch
import time

def train(num_ep = 3000, print_interval = 100, lr = 0.001, seed = 543):
  """Train the agent on CartPole-v1 with per-step backward updates.

  Args:
    num_ep: number of episodes to run.
    print_interval: print the running score every this many episodes.
    lr: learning rate passed to the agent.
    seed: seed applied to the environment and to torch.

  Progress is printed to stdout; nothing is returned.
  """
  start = time.time()

  env = gym.make('CartPole-v1')
  try:
    env.seed(seed)
    torch.manual_seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # is_available must be called; the bare function object is always truthy.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    agent = ActorCritic(input_dims = state_dim, output_dims = action_dim, lr = lr).to(device)
    running_score = 10

    for ep in range(num_ep):
      state = env.reset()
      score = 0
      done = False
      rewards = []
      states = []
      actions = []

      while not done:
        # Build the (1, state_dim) float tensor directly from the observation
        # instead of wrapping the array in a Python list first.
        state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action.item())

        # saving episode
        rewards.append(reward)
        states.append(state)
        actions.append(action.item())

        # update score and state
        score += reward
        state = next_state

      # exponential moving average of the episode score
      running_score = 0.05 * score + (1 - 0.05) * running_score

      # train the agent
      agent.learn_backward(rewards, states, actions, return_norm = True)

      if ep % print_interval == 0:
        print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))
  finally:
    # Release the environment even if training is interrupted.
    env.close()
  

In [None]:
# Run the training loop defined by the most recently executed train() cell.
train()

# With Wandb

In [None]:
# Notebook shell escapes: install the wandb package, then run its login command.
!pip install wandb
!wandb login

In [None]:
import wandb
# Grid sweep: every update scheme crossed with every learning rate, with
# return normalisation and hidden width held fixed. The sweep maximises the
# 'running_score' metric.
sweep_config = {
  'method': 'grid',
  'metric': {'name': 'running_score', 'goal': 'maximize'},
  'parameters': {
    'learning': {'values': ['learn_mean', 'learn_forward', 'learn_backward']},
    'learning_rate': {'values': [0.01, 0.001, 0.0001, 0.0003, 0.00001]},
    'norm_return': {'value': True},
    'num_neurons': {'value': 128},
  },
}

# Register the sweep under the project and keep its id for the agent cell.
sweep_id = wandb.sweep(sweep_config, project = 'REINFORCE_Baseline')

Create sweep with ID: uticyas7
Sweep URL: https://wandb.ai/ko120/REINFORCE_Baseline/sweeps/uticyas7


In [None]:
import gym 
import torch
import time
import wandb

def train():
  """Run one wandb sweep trial of the agent on CartPole-v1.

  Reads ``learning``, ``learning_rate``, ``num_neurons`` and (optionally)
  ``norm_return`` from ``wandb.config``, logs the running score every episode,
  uploads a checkpoint every ``save_interval`` episodes and exports the final
  network to ONNX.

  Raises:
    ValueError: if ``config.learning`` is not one of the learn_* schemes.
  """
  wandb.init(config = {'env':'CartPole-v1','algorithm': 'REINFORCE_Baseline','architecture': 'shared','num_layers':'2'}, project = 'REINFORCE_Baseline',group = 'Cart_REINFORCE_Baseline_with_128_neurons_RMSProp')
  config = wandb.config

  start = time.time()

  env = gym.make('CartPole-v1')
  try:
    env.seed(543)
    torch.manual_seed(543)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # is_available must be called; the bare function object is always truthy.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    agent = ActorCritic(input_dims = state_dim, output_dims = action_dim, lr = config.learning_rate, fc1_dims = config.num_neurons).to(device)
    num_ep = 3000
    print_interval = 100
    save_interval = 1000
    running_score = 10

    # Resolve the update scheme once; an unknown name would otherwise let the
    # whole run proceed without ever training the agent.
    learners = {
      'learn_mean': agent.learn_mean,
      'learn_forward': agent.learn_forward,
      'learn_backward': agent.learn_backward,
    }
    if config.learning not in learners:
      raise ValueError('unknown learning scheme: {!r}'.format(config.learning))
    learn = learners[config.learning]

    # Honour the swept value; default to True when the config does not set it.
    norm_return = config.get('norm_return', True)

    wandb.watch(agent)
    for ep in range(1, num_ep + 1):
      state = env.reset()
      score = 0
      done = False
      rewards = []
      states = []
      actions = []
      while not done:
        # Build the (1, state_dim) float tensor directly from the observation.
        state = torch.tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action.item())

        # saving episode
        rewards.append(reward)
        states.append(state)
        actions.append(action.item())

        # update score and state
        score += reward
        state = next_state

      # exponential moving average of the episode score
      running_score = 0.05 * score + (1 - 0.05) * running_score
      wandb.log({'episode': ep, 'running_score': running_score})

      # train the agent
      learn(rewards, states, actions, return_norm = norm_return)

      if ep % print_interval == 0:
        print('episode {} average reward {}, ended at {:.01f}'.format(ep, running_score, time.time() - start))

      if ep % save_interval == 0:
        save_name = 'agent_' + str(ep) + '.pt'
        torch.save(agent.state_dict(), save_name)
        wandb.save(save_name)

    # Export the trained network once all episodes are done; the dummy input
    # follows the environment's observation size instead of a fixed 4.
    dummy_input = torch.rand(1, state_dim).to(device)
    torch.onnx.export(agent.ac_net, dummy_input, 'final_model.onnx')
    wandb.save('final_model.onnx')
  finally:
    # Release the environment even if the trial fails or is interrupted.
    env.close()
    

In [None]:
# Start a sweep agent for sweep_id, passing train as the function it runs.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: sa12jwhb with config:
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 59.97391039064079, ended at 7.0
episode 200 average reward 61.89096405838821, ended at 22.4
episode 300 average reward 287.4959541386293, ended at 67.4
episode 400 average reward 441.40501878301546, ended at 126.3
episode 500 average reward 386.12215299398105, ended at 191.9
episode 600 average reward 497.46071711913794, ended at 285.3
episode 700 average reward 469.79866650347964, ended at 366.6
episode 800 average reward 471.2086979303794, ended at 445.6
episode 900 average reward 499.82954025480495, ended at 542.2
episode 1000 average reward 439.45723819070673, ended at 630.4
episode 1100 average reward 499.64155480962796, ended at 728.4
episode 1200 average reward 499.99787781477613, ended at 825.4
episode 1300 average reward 497.060488231461, ended at 920.0
episode 1400 average reward 478.8244108332537, ended at 1013.6
episode 1500 average reward 461.9305351492036, ended at 1108.4
episode 1600 average reward 446.1696586564902, ended at 1178.4
episode 170

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,499.92257
_runtime,2279.0
_timestamp,1627082513.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▂▂▃▃▆▄▇██▆██▇██████▅▇▇▇██▆███▆▇▇▄▆▅▅▇▇█
_runtime,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 8ef9mlqk with config:
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 31.519144179477014, ended at 5.3
episode 200 average reward 41.387473371873995, ended at 12.9
episode 300 average reward 138.09489831881822, ended at 31.1
episode 400 average reward 183.8797629143809, ended at 64.6
episode 500 average reward 204.38286168727623, ended at 111.6
episode 600 average reward 363.24235984622266, ended at 177.5
episode 700 average reward 362.02785569962225, ended at 256.5
episode 800 average reward 430.3839872608341, ended at 339.0
episode 900 average reward 442.4633884185407, ended at 424.0
episode 1000 average reward 470.67591662449246, ended at 517.9
episode 1100 average reward 476.44191623301117, ended at 610.8
episode 1200 average reward 470.7739156753023, ended at 703.8
episode 1300 average reward 473.826222249592, ended at 799.2
episode 1400 average reward 461.8787267570851, ended at 894.2
episode 1500 average reward 488.13461487827004, ended at 989.6
episode 1600 average reward 499.8881726470901, ended at 1088.1
episode 1700 

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,489.2466
_runtime,2461.0
_timestamp,1627084979.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▂▃▃▅▅▆▆▇▇███▇███▇████████████▇███████
_runtime,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: eh4f5h03 with config:
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 21.4211520777414, ended at 4.4
episode 200 average reward 23.071729395498682, ended at 9.1
episode 300 average reward 20.02667682818426, ended at 13.5
episode 400 average reward 21.247921870312226, ended at 18.5
episode 500 average reward 28.11395757883904, ended at 23.6
episode 600 average reward 24.306529486222182, ended at 29.0
episode 700 average reward 27.43574973897528, ended at 35.4
episode 800 average reward 30.64988388961609, ended at 41.6
episode 900 average reward 30.53708003144177, ended at 47.7
episode 1000 average reward 34.47774825112114, ended at 54.0
episode 1100 average reward 34.0013994461929, ended at 61.1
episode 1200 average reward 41.44641815106113, ended at 68.4
episode 1300 average reward 33.43475656672272, ended at 76.4
episode 1400 average reward 36.363921395374, ended at 85.0
episode 1500 average reward 40.207903922367386, ended at 93.8
episode 1600 average reward 52.57258686539141, ended at 104.2
episode 1700 average reward 47.057

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,148.83803
_runtime,360.0
_timestamp,1627085344.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▂▃▃▃▃▃▃▄▄▅▅▅▅▅▆▇▆█
_runtime,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇█
_timestamp,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇█
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 5lc066ja with config:
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 20.96766757682591, ended at 4.3
episode 200 average reward 23.1388445999379, ended at 9.2
episode 300 average reward 29.336993212585007, ended at 14.9
episode 400 average reward 34.628083497620736, ended at 21.7
episode 500 average reward 38.729865619112076, ended at 29.9
episode 600 average reward 56.78739823325429, ended at 40.5
episode 700 average reward 82.82363964505736, ended at 54.7
episode 800 average reward 104.79172477889163, ended at 73.0
episode 900 average reward 118.00949060481128, ended at 94.6
episode 1000 average reward 162.69424566672916, ended at 123.0
episode 1100 average reward 165.47174778068782, ended at 154.1
episode 1200 average reward 201.01081811202377, ended at 192.4
episode 1300 average reward 227.1594204743566, ended at 235.5
episode 1400 average reward 274.1980165428583, ended at 284.3
episode 1500 average reward 262.2286793411837, ended at 337.4
episode 1600 average reward 309.43684679842215, ended at 398.2
episode 1700 average

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,460.62959
_runtime,1534.0
_timestamp,1627086883.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇███████
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇██
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: cfltlxcl with config:
[34m[1mwandb[0m: 	learning: learn_mean
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 22.027825238688308, ended at 4.6
episode 200 average reward 22.64946639315034, ended at 9.3
episode 300 average reward 21.18971667659817, ended at 13.8
episode 400 average reward 22.647629465188764, ended at 18.6
episode 500 average reward 20.57378298956933, ended at 23.2
episode 600 average reward 23.42052464126032, ended at 28.0
episode 700 average reward 22.15058574050925, ended at 32.9
episode 800 average reward 20.84625504656594, ended at 37.5
episode 900 average reward 25.57870862333955, ended at 42.1
episode 1000 average reward 19.580020290253547, ended at 46.4
episode 1100 average reward 23.080865958592746, ended at 51.4
episode 1200 average reward 23.114500178101558, ended at 55.9
episode 1300 average reward 23.04314900865914, ended at 60.6
episode 1400 average reward 20.2330402046395, ended at 65.7
episode 1500 average reward 23.92803951138116, ended at 70.7
episode 1600 average reward 25.694352723690372, ended at 75.6
episode 1700 average reward 24

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,24.20117
_runtime,150.0
_timestamp,1627087039.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▅▃▃▁▆▁▄▄▃▃▁▁▁▅▁▄█▃▂▂▇▇▃▂▄▄▁▅▇▃▂▆▅▄▆█▃▇▆
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: siigpi3e with config:
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 13.59333063629175, ended at 5.5
episode 200 average reward 9.53359835301858, ended at 10.1
episode 300 average reward 9.3460622863823, ended at 14.6
episode 400 average reward 15.597774181925795, ended at 20.7
episode 500 average reward 9.387551523120251, ended at 25.2
episode 600 average reward 9.4542381413555, ended at 29.6
episode 700 average reward 9.529072458741247, ended at 34.0
episode 800 average reward 9.260127642592426, ended at 38.4
episode 900 average reward 9.110495418450512, ended at 42.7
episode 1000 average reward 9.621299086301303, ended at 47.2
episode 1100 average reward 34.9437003045367, ended at 57.9
episode 1200 average reward 11.084086694296932, ended at 70.2
episode 1300 average reward 9.496458294454849, ended at 74.6
episode 1400 average reward 9.437274516010456, ended at 79.0
episode 1500 average reward 9.555995637486298, ended at 83.5
episode 1600 average reward 9.372828599171484, ended at 87.9
episode 1700 average reward 9.42687087

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.23686
_runtime,153.0
_timestamp,1627087197.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▁▁▁▅▁▁▁▁▁▁▁▁▇█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: vdsh8f7m with config:
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 89.95264449204595, ended at 28.6
episode 200 average reward 114.16127914268749, ended at 71.9
episode 300 average reward 137.5960305564844, ended at 135.5
episode 400 average reward 96.74224311657204, ended at 188.0
episode 500 average reward 69.58382446125881, ended at 219.9
episode 600 average reward 70.58838928356104, ended at 253.4
episode 700 average reward 105.29869427577376, ended at 289.5
episode 800 average reward 116.79302069802849, ended at 347.9
episode 900 average reward 85.53590328035072, ended at 383.4
episode 1000 average reward 96.32042508867126, ended at 426.7
episode 1100 average reward 161.67201513466821, ended at 487.2
episode 1200 average reward 115.00154862125424, ended at 566.0
episode 1300 average reward 93.99162727702215, ended at 612.6
episode 1400 average reward 90.54545132081655, ended at 650.1
episode 1500 average reward 198.0906842770164, ended at 706.8
episode 1600 average reward 34.8412017286297, ended at 774.8
episode 1700 av

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.22431
_runtime,840.0
_timestamp,1627088043.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▃▅▄▄▃▃▃▄▄▃▃▄▄█▄▃▃▄█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██████████████████
_timestamp,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██████████████████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 8t3wogcx with config:
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 26.313140397647206, ended at 11.1
episode 200 average reward 40.05454106487343, ended at 26.6
episode 300 average reward 71.7082154957238, ended at 53.5
episode 400 average reward 213.2475562602905, ended at 121.7
episode 500 average reward 278.24326323582613, ended at 233.2
episode 600 average reward 396.215113729367, ended at 400.1
episode 700 average reward 368.89962670365986, ended at 571.8
episode 800 average reward 390.0720576159393, ended at 766.3
episode 900 average reward 380.9278297112747, ended at 936.2
episode 1000 average reward 445.7876459569513, ended at 1137.6
episode 1100 average reward 486.0946327426862, ended at 1339.1
episode 1200 average reward 495.5375071291634, ended at 1552.8
episode 1300 average reward 431.4817126293418, ended at 1762.6
episode 1400 average reward 450.601634089287, ended at 1969.5
episode 1500 average reward 471.54140163813486, ended at 2181.4
episode 1600 average reward 303.63590558646723, ended at 2372.2
episode 170

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,419.24321
_runtime,5148.0
_timestamp,1627093197.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▂▃▄▆▆▆█▆▆▇███▇███▅▅██▇███▇█▇█▇██▇▇█▇
_runtime,▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: c8swi82z with config:
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 35.870598534108616, ended at 12.9
episode 200 average reward 124.89828735294802, ended at 58.3
episode 300 average reward 234.14957790004846, ended at 155.8
episode 400 average reward 384.05290892954844, ended at 314.1
episode 500 average reward 336.8202701637642, ended at 478.2
episode 600 average reward 402.46684891127444, ended at 675.6
episode 700 average reward 120.6395373947515, ended at 714.7
episode 800 average reward 172.4346541816798, ended at 799.3
episode 900 average reward 158.52798776416867, ended at 867.6
episode 1000 average reward 294.86940064804514, ended at 990.7
episode 1100 average reward 484.2275383095235, ended at 1194.8
episode 1200 average reward 474.40267047420065, ended at 1411.2
episode 1300 average reward 470.18271675562545, ended at 1601.6
episode 1400 average reward 360.0337132172557, ended at 1769.5
episode 1500 average reward 401.0711856646495, ended at 1938.6
episode 1600 average reward 460.405579963313, ended at 2115.4
episo

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,198.78306
_runtime,4461.0
_timestamp,1627097664.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▃▄▆▅▇▆▃▃▃▄▅██▇█▆▇▆▇█▇▇▄▅▅▇▅▆████▆▇▆▅▂
_runtime,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: lm25e6xm with config:
[34m[1mwandb[0m: 	learning: learn_forward
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 22.755056258088082, ended at 9.5
episode 200 average reward 22.290075668268017, ended at 19.1
episode 300 average reward 20.467349903036883, ended at 28.6
episode 400 average reward 23.77027599759585, ended at 38.4
episode 500 average reward 24.582901600300282, ended at 48.6
episode 600 average reward 24.174880418218535, ended at 59.3
episode 700 average reward 20.34162753597591, ended at 70.0
episode 800 average reward 23.219263106799815, ended at 81.2
episode 900 average reward 25.00570940786282, ended at 92.6
episode 1000 average reward 26.463142905824547, ended at 105.3
episode 1100 average reward 33.32105035740504, ended at 119.3
episode 1200 average reward 27.663532473552323, ended at 131.7
episode 1300 average reward 27.847474265425127, ended at 144.7
episode 1400 average reward 31.60818244389899, ended at 159.4
episode 1500 average reward 32.83929558865753, ended at 173.7
episode 1600 average reward 36.827648653394355, ended at 189.5
episode 1700 aver

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,84.06581
_runtime,504.0
_timestamp,1627098174.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▁▁▁▂▂▁▂▂▂▂▃▂▃▃▂▃▃▃▃▄▅▃▄▄▅▄▄▅▆▅▅▅███▇
_runtime,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇██
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: bqp8zx41 with config:
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 19.196979562646145, ended at 13.4
episode 200 average reward 9.337644835745538, ended at 17.9
episode 300 average reward 9.463881135388034, ended at 22.5
episode 400 average reward 9.267472723913471, ended at 27.0
episode 500 average reward 9.350072788364566, ended at 31.6
episode 600 average reward 9.419099382606548, ended at 36.1
episode 700 average reward 104.2876550345088, ended at 51.4
episode 800 average reward 9.820702387152165, ended at 55.7
episode 900 average reward 10.340543799883376, ended at 61.6
episode 1000 average reward 9.17516866270855, ended at 66.0
episode 1100 average reward 9.39652111114433, ended at 70.4
episode 1200 average reward 9.19356470958293, ended at 74.9
episode 1300 average reward 9.485265403802662, ended at 79.3
episode 1400 average reward 36.07689849686818, ended at 98.5
episode 1500 average reward 53.41821912236184, ended at 124.1
episode 1600 average reward 46.00663477564272, ended at 148.1
episode 1700 average reward 58.2

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,9.35164
_runtime,274.0
_timestamp,1627098454.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▂▂▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▅▄▆▄▅▂▁▁▁▁▁▁█▂▂▁▁▁▁▁▁▁
_runtime,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇██████
_timestamp,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇██████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: rqxnri1k with config:
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 163.9298319715372, ended at 50.1
episode 200 average reward 339.6923098700983, ended at 183.7
episode 300 average reward 85.9202855896491, ended at 259.0
episode 400 average reward 142.5477230992821, ended at 359.4
episode 500 average reward 127.17007386264035, ended at 416.4
episode 600 average reward 287.4886190783078, ended at 536.9
episode 700 average reward 157.7739697281776, ended at 648.6
episode 800 average reward 232.35122190541327, ended at 738.1
episode 900 average reward 426.25109595704896, ended at 926.3
episode 1000 average reward 337.9217904849235, ended at 1127.2
episode 1100 average reward 474.09810144849433, ended at 1322.9
episode 1200 average reward 470.78134851223393, ended at 1511.0
episode 1300 average reward 294.94477801344635, ended at 1708.7
episode 1400 average reward 194.75920404450915, ended at 1804.7
episode 1500 average reward 321.9992585837499, ended at 1941.0
episode 1600 average reward 409.4277922283893, ended at 2101.7
episo

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,484.24165
_runtime,4861.0
_timestamp,1627103321.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▃▅▄▅▃▂▄▅▃▃▆█▇▇▇█▅▄▄▅▇█▆▇▇▆█▆▄▇███▇▇█▇█▇
_runtime,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: fpx5mc1h with config:
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 24.689141419694522, ended at 10.3
episode 200 average reward 34.90406674145161, ended at 24.3
episode 300 average reward 56.62398348096365, ended at 45.4
episode 400 average reward 144.38492501072247, ended at 98.6
episode 500 average reward 247.51447438217667, ended at 209.3
episode 600 average reward 447.0566194718332, ended at 376.8
episode 700 average reward 468.0126120562754, ended at 581.0
episode 800 average reward 488.6458957611719, ended at 789.6
episode 900 average reward 481.94793260694524, ended at 1008.4
episode 1000 average reward 482.3242167624411, ended at 1227.6
episode 1100 average reward 480.40484197362144, ended at 1446.6
episode 1200 average reward 495.10992796658417, ended at 1667.3
episode 1300 average reward 487.1112474720673, ended at 1887.4
episode 1400 average reward 486.0135953877326, ended at 2107.1
episode 1500 average reward 489.7851471714744, ended at 2329.9
episode 1600 average reward 456.5824013274805, ended at 2544.9
episode

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,488.89433
_runtime,5555.0
_timestamp,1627108882.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▂▃▄▅▇█▇██████████▇▇███▇███▇▇▇█████▇█
_runtime,▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 3h3ipum3 with config:
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 42.671742125498355, ended at 15.9
episode 200 average reward 139.61590714070144, ended at 61.6
episode 300 average reward 428.12800250025714, ended at 221.8
episode 400 average reward 344.768381417998, ended at 410.7
episode 500 average reward 466.9988913003535, ended at 607.6
episode 600 average reward 263.1214076511686, ended at 767.1
episode 700 average reward 480.88463844804613, ended at 981.5
episode 800 average reward 484.18586212928983, ended at 1200.0
episode 900 average reward 487.82916104439124, ended at 1419.0
episode 1000 average reward 481.1407991871439, ended at 1631.7
episode 1100 average reward 477.10430386965714, ended at 1844.5
episode 1200 average reward 483.6666163809662, ended at 2048.7
episode 1300 average reward 479.0354244846262, ended at 2254.6
episode 1400 average reward 468.7960131024215, ended at 2455.0
episode 1500 average reward 490.50380613900467, ended at 2672.5
episode 1600 average reward 460.71357383483013, ended at 2886.3
ep

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,497.58988
_runtime,5873.0
_timestamp,1627114762.0
_step,3000.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▂▆▇▇▇▇▆██████▇██▇██▇████████▇███▇█▇███
_runtime,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Agent Starting Run: 7urrzckl with config:
[34m[1mwandb[0m: 	learning: learn_backward
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	norm_return: True
[34m[1mwandb[0m: 	num_neurons: 128


episode 100 average reward 22.755056258088082, ended at 9.8
episode 200 average reward 22.290075668268017, ended at 20.0
episode 300 average reward 18.065510604768143, ended at 29.8
episode 400 average reward 23.844039612709008, ended at 41.1
episode 500 average reward 27.368024955917125, ended at 53.9
episode 600 average reward 29.092309375654686, ended at 66.6
episode 700 average reward 24.627655672379724, ended at 79.3
episode 800 average reward 25.093283083045897, ended at 91.5
episode 900 average reward 27.507672337342314, ended at 104.5
episode 1000 average reward 27.85420258490636, ended at 117.2
episode 1100 average reward 28.44699977824292, ended at 131.2
episode 1200 average reward 25.674642547149496, ended at 143.6
episode 1300 average reward 39.10251351900882, ended at 159.6
episode 1400 average reward 28.777853219849504, ended at 174.1
episode 1500 average reward 32.71761911911992, ended at 189.7
episode 1600 average reward 34.41307239995592, ended at 205.9
episode 1700 av

VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,3000.0
running_score,68.951
_runtime,522.0
_timestamp,1627115289.0
_step,2999.0


0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
running_score,▁▁▁▁▂▁▂▂▂▂▂▂▂▂▃▂▂▃▃▂▃▃▃▄▃▅▄▃▄▄▅▅▅▆▄▅█▇▇█
_runtime,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇██
_timestamp,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇██
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# You can see the results here!
[Report Link](https://wandb.ai/ko120/REINFORCE_Baseline/reports/REINFORCE-with-Baseline-forward-and-backward--Vmlldzo4NzM4ODE)