In [18]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch.nn as nn
import torch
from itertools import count
from torch.utils.tensorboard import SummaryWriter
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
HIDDEN = 32
MODEL_NAME = 'model_cart_pole_v1'

writer = SummaryWriter()

class Behavior(torch.nn.Module):
    def __init__(self, input_shape, num_actions):
        super(Behavior, self).__init__()
        self.classifier = torch.nn.Sequential(
            nn.Linear(input_shape, HIDDEN), 
            nn.ReLU(),
            nn.Linear(HIDDEN, HIDDEN), 
            nn.ReLU(),
            nn.Linear(HIDDEN, num_actions)
        )        

    def forward(self, x):
        return self.classifier(x)

In [20]:
env = gym.make('CartPole-v1')

In [21]:
loss_object = torch.nn.BCEWithLogitsLoss().to(device)

d = env.observation_space.shape[0]
    
model = Behavior(input_shape=d+2, num_actions=1).to(device)
optimizer = torch.optim.Adam(model.parameters())

In [22]:
rb = ReplayBuffer(max_size=100, last_few=50)

# Random rollout
trajectories, mean_reward, length = rollout(episodes=50, env=env, render=False)
rb.add(trajectories)

print(f"Mean Episode Reward: {mean_reward}")

# Keep track of steps used during random rollout!
epoch, model, optimizer, loss, steps = load_model(MODEL_NAME, model, optimizer, device, train=True)
steps += length
save_model(MODEL_NAME, epoch, model, optimizer, loss, steps)

# Plot initial values
writer.add_scalar('Steps/reward', mean_reward, steps)              

Mean Episode Reward: 22.74
No checkpoint found. Creating new model.


In [23]:
batch_size = 1024

In [24]:
def train_step(inputs, targets):
    optimizer.zero_grad()    
    predictions = model(inputs)
    targets = targets.float()
    loss = loss_object(predictions, targets.unsqueeze(1))
    
    loss.backward()
    optimizer.step()
    
    return loss

def action_fn(model, inputs, sample_action=True):
    action_probs = model(inputs)
    action_probs = torch.sigmoid(action_probs)
    
    if random.random() < 0.01: # Random action
        return env.action_space.sample()
    if sample_action:
        m = torch.distributions.bernoulli.Bernoulli(probs=action_probs)            
        action = int(m.sample().squeeze().cpu().numpy())
    else:
        action = int(np.round(action_probs.detach().squeeze().numpy()))
    return action
    

In [25]:
loss_sum = 0
loss_count = 0

In [26]:
epoch, model, optimizer, loss, steps = load_model(MODEL_NAME, model, optimizer, device, train=True)
SOLVED_MEAN_REWARD = 350 # officially 195 - be we sample less episode
MAX_STEPS = 10**7

for i in count(start=epoch):
    x, y = rb.sample(batch_size, device)    
    loss = train_step(x, y)
    loss_sum += loss
    loss_count += 1
    writer.add_scalar('Loss/loss', loss, i)
    
    (dh, dr) = rb.sample_command()
    writer.add_scalar('Epoch/dh', dh, i)
    writer.add_scalar('Epoch/dr', dr, i)

    n_episodes_per_iter = 10
    n_updates_per_iter = 100
   
    if i % n_updates_per_iter == 0:        
        trajectories, mean_reward, length = rollout(n_episodes_per_iter, env=env, model=model, sample_action=True, replay_buffer=rb, 
                              device=device, action_fn=action_fn)
        rb.add(trajectories)
        
        print(f"Average Episode Reward: {mean_reward}")  
        
        steps += length
        avg_loss = loss_sum/loss_count
        save_model(MODEL_NAME, i, model, optimizer, avg_loss, steps)        
        print(f"Average Episode Reward: {mean_reward}")        
        writer.add_scalar('Steps/reward', mean_reward, steps)
        
        mean_length = length*1.0/n_episodes_per_iter
        writer.add_scalar('Steps/length', mean_length, steps)
        
        if mean_reward >= SOLVED_MEAN_REWARD:
            print("Task considered solved! Stopping.")
            break
        
        if steps >= MAX_STEPS:
            print(f"Steps {steps} exceeds max env steps {MAX_STEPS}. Stopping.")
            break

    if i % 200 == 0:
        avg_loss = loss_sum/loss_count
        print(f'i: {i}, s: {steps}, Loss: {avg_loss}')
        save_model(MODEL_NAME, i, model, optimizer, avg_loss, steps)    

Existing model found. Loading from epoch 0, steps 1137 with loss: 0.0
Average Episode Reward: 23.6
Average Episode Reward: 23.6
i: 0, s: 1373, Loss: 0.6993221044540405
Average Episode Reward: 16.7
Average Episode Reward: 16.7
Average Episode Reward: 20.6
Average Episode Reward: 20.6
i: 200, s: 1746, Loss: 0.6847312450408936
Average Episode Reward: 21.0
Average Episode Reward: 21.0
Average Episode Reward: 30.6
Average Episode Reward: 30.6
i: 400, s: 2262, Loss: 0.6721463203430176
Average Episode Reward: 29.2
Average Episode Reward: 29.2
Average Episode Reward: 32.1
Average Episode Reward: 32.1
i: 600, s: 2875, Loss: 0.6585131883621216
Average Episode Reward: 33.5
Average Episode Reward: 33.5
Average Episode Reward: 35.7
Average Episode Reward: 35.7
i: 800, s: 3567, Loss: 0.6484062075614929
Average Episode Reward: 36.6
Average Episode Reward: 36.6
Average Episode Reward: 38.4
Average Episode Reward: 38.4
i: 1000, s: 4317, Loss: 0.6400537490844727
Average Episode Reward: 39.9
Average Epis

KeyboardInterrupt: 

In [8]:
rb.sample_command()
  # dh       dr

(22.36, 24.437429575704112)

In [17]:
dh = 100
dr = 400
cmd = (dh, dr)
# cmd = rb.sample_command()

env = gym.make('CartPole-v1')
e, model, _, l, steps = load_model(MODEL_NAME, train=False, model=model, optimizer=optimizer, device=device)
print(f"Loaded model at epoch: {e} with loss {l}")
_, mean_reward, length = rollout(episodes=3, env=env, model=model, sample_action=False, cmd=cmd, render=True, device=device, action_fn=action_fn)


print(f"Average Episode Reward: {mean_reward}, Mean Length: {length}")

Existing model found. Loading from epoch 59900, steps 686074 with loss: 0.411252498626709
Loaded model at epoch: 59900 with loss 0.411252498626709
Average Episode Reward: 393.6666666666667, Mean Length: 1181
