In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch.nn as nn
import torch
from itertools import count
from torch.utils.tensorboard import SummaryWriter
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
HIDDEN = 16
MODEL_NAME = 'model_cart_pole_v1'

writer = SummaryWriter()


class Behavior(torch.nn.Module):
    def __init__(self, input_shape, num_actions):
        super(Behavior, self).__init__()
        self.classifier = torch.nn.Sequential(
            nn.Linear(input_shape, HIDDEN), 
            nn.ReLU(),
            nn.Linear(HIDDEN, HIDDEN), 
            nn.ReLU(),
            nn.Linear(HIDDEN, num_actions)
        )        

    def forward(self, x):
        return self.classifier(x)

In [3]:
env = gym.make('CartPole-v1')

In [4]:
loss_object = torch.nn.BCEWithLogitsLoss().to(device)

d = env.observation_space.shape[0]
    
model = Behavior(input_shape=d+2, num_actions=1).to(device)
optimizer = torch.optim.Adam(model.parameters())

In [5]:
rb = ReplayBuffer(max_size=500, last_few=200)

# Random rollout
trajectories, mean_reward, length = rollout(episodes=1, env=env, render=False)
rb.add(trajectories)

print(f"Mean Episode Reward: {mean_reward}")

# Keep track of steps used during random rollout!
epoch, model, optimizer, loss, steps = load_model(MODEL_NAME, model, optimizer, device, train=True)
steps += length
save_model(MODEL_NAME, epoch, model, optimizer, loss, steps)

# Plot initial values
writer.add_scalar('Steps/reward', mean_reward, steps)              

Mean Episode Reward: 25.0
No checkpoint found. Creating new model.


In [6]:
batch_size = 512

In [7]:
def train_step(inputs, targets):
    optimizer.zero_grad()    
    predictions = model(inputs)
    targets = targets.float()
    loss = loss_object(predictions, targets.unsqueeze(1))
    
    loss.backward()
    optimizer.step()
    
    return loss

def action_fn(model, inputs, sample_action=True):
    action_probs = model(inputs)
    action_probs = torch.sigmoid(action_probs)

    if sample_action:
        m = torch.distributions.bernoulli.Bernoulli(probs=action_probs)            
        action = int(m.sample().squeeze().cpu().numpy())
    else:
        action = int(np.round(action_probs.detach().squeeze().numpy()))
    return action
    

In [8]:
loss_sum = 0
loss_count = 0

In [9]:
epoch, model, optimizer, loss, steps = load_model(MODEL_NAME, model, optimizer, device, train=True)
SOLVED_MEAN_REWARD = 250 # officially 195 - be we sample less episode
MAX_STEPS = 10**7

for i in count(start=epoch):
    x, y = rb.sample(batch_size, device)    
    loss = train_step(x, y)
    loss_sum += loss
    loss_count += 1
    writer.add_scalar('Loss/loss', loss, i)
    
    (dh, dr) = rb.sample_command()
    writer.add_scalar('Epoch/dh', dh, i)
    writer.add_scalar('Epoch/dr', dr, i)

    n_episodes_per_iter = 10
    n_updates_per_iter = 100
   
    if i % n_updates_per_iter == 0:        
        trajectories, mean_reward, length = rollout(n_episodes_per_iter, env=env, model=model, sample_action=True, replay_buffer=rb, 
                              device=device, action_fn=action_fn)
        rb.add(trajectories)
        
        print(f"Average Episode Reward: {mean_reward}")  
        
        steps += length
        avg_loss = loss_sum/loss_count
        save_model(MODEL_NAME, i, model, optimizer, avg_loss, steps)        
        print(f"Average Episode Reward: {mean_reward}")        
        writer.add_scalar('Steps/reward', mean_reward, steps)
        
        mean_length = length*1.0/n_episodes_per_iter
        writer.add_scalar('Steps/length', mean_length, steps)
        
        if mean_reward >= SOLVED_MEAN_REWARD:
            print("Task considered solved! Stopping.")
            break
        
        if steps >= MAX_STEPS:
            print(f"Steps {steps} exceeds max env steps {MAX_STEPS}. Stopping.")
            break

    if i % 200 == 0:
        avg_loss = loss_sum/loss_count
        print(f'i: {i}, s: {steps}, Loss: {avg_loss}')
        save_model(MODEL_NAME, i, model, optimizer, avg_loss, steps)    

Existing model found. Loading from epoch 0, steps 25 with loss: 0.0
Average Episode Reward: 17.8
Average Episode Reward: 17.8
i: 0, s: 203, Loss: 0.6872869729995728
Average Episode Reward: 18.3
Average Episode Reward: 18.3
Average Episode Reward: 20.0
Average Episode Reward: 20.0
i: 200, s: 586, Loss: 0.6831081509590149
Average Episode Reward: 22.7
Average Episode Reward: 22.7
Average Episode Reward: 19.7
Average Episode Reward: 19.7
i: 400, s: 1010, Loss: 0.6758297085762024
Average Episode Reward: 15.3
Average Episode Reward: 15.3
Average Episode Reward: 23.6
Average Episode Reward: 23.6
i: 600, s: 1399, Loss: 0.6710043549537659
Average Episode Reward: 18.0
Average Episode Reward: 18.0
Average Episode Reward: 20.4
Average Episode Reward: 20.4
i: 800, s: 1783, Loss: 0.6650773882865906
Average Episode Reward: 19.5
Average Episode Reward: 19.5
Average Episode Reward: 19.3
Average Episode Reward: 19.3
i: 1000, s: 2171, Loss: 0.6596022248268127
Average Episode Reward: 20.7
Average Episode 

KeyboardInterrupt: 

In [6]:
rb.sample_command()

(43.32, 46.80113515870184)

In [7]:
cmd = (500, 500) #rb.sample_command()
cmd = rb.sample_command()

env = gym.make('CartPole-v1')
e, model, _, l, steps = load_model(MODEL_NAME, train=False, model=model, optimizer=optimizer, device=device)
print(f"Loaded model at epoch: {e} with loss {l}")
_, mean_reward, length = rollout(episodes=3, env=env, model=model, sample_action=False, 
                      replay_buffer=rb, render=True, device=device, action_fn=action_fn)


print(f"Average Episode Reward: {mean_reward}, Mean Length: {length}")

Existing model found. Loading from epoch 0, steps 22908 with loss: 0.0
Loaded model at epoch: 0 with loss 0.0


NameError: name 'action_fn' is not defined