In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model, Behavior
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# Create the LunarLander-v2 environment; it is passed to rollout() and its
# observation/action spaces are used to size the model.
env = gym.make('LunarLander-v2')

In [3]:
# Replay buffer that the training cells sample batches from.
rb = ReplayBuffer(last_few=200, max_size=500)

# Random rollout: no model is handed to rollout() here, and the resulting
# trajectories become the buffer's initial contents.
trajectories, avg_reward = rollout(env=env, episodes=100, render=False)
rb.add(trajectories)

# Baseline score of that initial rollout.
print(f"Average Episode Reward: {avg_reward}")

Average Episode Reward: -186.4984420478953


In [4]:
# Behavior network: its input width is the observation size plus two extra
# features, and it is told how many discrete actions the environment has.
# NOTE(review): the "+2" presumably covers the 2-element command
# (cf. rb.sample_command()) — confirm against experiment.Behavior.
model = Behavior(
    num_actions=env.action_space.n,
    input_shape=env.observation_space.shape[0] + 2,
).to(device)

# Adam with its default hyperparameters over all model parameters.
optimizer = torch.optim.Adam(model.parameters())

# Cross-entropy between the model's outputs and the target action indices.
loss_object = torch.nn.CrossEntropyLoss().to(device)

In [5]:
# Batch size handed to rb.sample(...) on every training step.
batch_size = 1024

In [6]:
def train_step(inputs, targets):
    """Run one optimisation step on a batch and return its loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``loss_object``.

    Args:
        inputs: Batch fed to ``model``.
        targets: Targets compared against the model output by ``loss_object``.

    Returns:
        The batch loss as a tensor detached from the autograd graph, so that
        callers can accumulate or store it without keeping each step's
        computation history alive.
    """
    optimizer.zero_grad()
    predictions = model(inputs)
    loss = loss_object(predictions, targets)

    loss.backward()
    optimizer.step()

    # The gradient has already been consumed by optimizer.step(); hand back
    # only the value, not the graph that produced it.
    return loss.detach()

def action_fn(model, inputs, sample_action=True):
    """Choose a discrete action index from the model's logits for ``inputs``.

    Args:
        model: Callable that returns action logits for ``inputs``.
        inputs: Model input. The sampling branch requires the logits to
            describe a single state (exactly one sampled element).
        sample_action: If True, sample from the categorical distribution
            defined by the logits; otherwise take the highest-scoring action.

    Returns:
        The chosen action as a Python ``int``.
    """
    # Pure inference: no autograd bookkeeping is needed for action selection.
    with torch.no_grad():
        action_logits = model(inputs)

    if sample_action:
        m = torch.distributions.categorical.Categorical(logits=action_logits)
        return int(m.sample().item())

    # Take the argmax on the raw logits. Squashing them through a sigmoid first
    # saturates to exactly 1.0 in float32 for large logits, which creates ties
    # and selects the wrong action. Staying in torch also works for tensors on
    # any device, with no .cpu()/.numpy() round trip.
    return int(torch.argmax(action_logits).item())
    

In [11]:
# Greedy-action training run: alternate supervised updates on replay-buffer
# batches with periodic evaluation rollouts (sample_action=False) whose
# trajectories are added back to the buffer.
loss_sum = 0
loss_count = 0

epochs = 1000000
# load_model returns the step index to continue from along with the
# (possibly restored) model and optimizer.
epoch, model, optimizer, loss = load_model('lunar_lander', model, optimizer, device, train=True)

for i in range(epoch, epochs+epoch):
    x, y = rb.sample(batch_size, device)
    loss = train_step(x, y)
    # Accumulate a plain Python float. Adding the tensor itself would chain
    # every step's autograd history onto loss_sum and grow memory without bound.
    loss_sum += float(loss)
    loss_count += 1

    # Every 100 steps: evaluate the current policy and store the new experience.
    # NOTE(review): rollout() is also given replay_buffer=rb — confirm it does
    # not already insert the trajectories that rb.add() stores below.
    if i % 100 == 0:
        trajectories, mean_reward = rollout(100, env=env, model=model, sample_action=False, replay_buffer=rb,
                                            device=device, action_fn=action_fn)
        rb.add(trajectories)

        print(f"Average Episode Reward: {mean_reward}")

    # Every 200 steps: report the running mean loss and write a checkpoint.
    if i % 200 == 0:
        print(f'i: {i}, Loss: {loss_sum/loss_count}')
        save_model('lunar_lander', i, model, optimizer, loss)

Existing model found. Loading.
Average Episode Reward: -233.39486951164068
i: 4200, Loss: 0.3868766725063324
Average Episode Reward: -220.85241934248944
Average Episode Reward: -252.0326491306753
i: 4400, Loss: 0.38298773765563965
Average Episode Reward: -227.8380643559971
Average Episode Reward: -221.24491058297906
i: 4600, Loss: 0.37908604741096497
Average Episode Reward: -210.4813589573771
Average Episode Reward: -210.09931983224575
i: 4800, Loss: 0.37552937865257263
Average Episode Reward: -150.9117766633395
Average Episode Reward: -196.14485699291697
i: 5000, Loss: 0.3737042248249054
Average Episode Reward: -209.79150816466688
Average Episode Reward: -169.40483862612749
i: 5200, Loss: 0.3730125427246094
Average Episode Reward: -196.75464899237966
Average Episode Reward: -195.9273203382398
i: 5400, Loss: 0.37170934677124023
Average Episode Reward: -289.39427515579365
Average Episode Reward: -158.10820553537317
i: 5600, Loss: 0.3699265122413635
Average Episode Reward: -182.128211007

KeyboardInterrupt: 

In [None]:
# SAMPLE ACTIONS
# Same training procedure as the greedy run, but evaluation rollouts sample
# their actions (sample_action=True) and checkpoints are stored under
# 'lunar_lander_sample_actions'.
loss_sum = 0
loss_count = 0

epochs = 1000000
# load_model returns the step index to continue from along with the
# (possibly restored) model and optimizer.
epoch, model, optimizer, loss = load_model('lunar_lander_sample_actions', model, optimizer, device, train=True)

for i in range(epoch, epochs+epoch):
    x, y = rb.sample(batch_size, device)
    loss = train_step(x, y)
    # Accumulate a plain Python float. Adding the tensor itself would chain
    # every step's autograd history onto loss_sum and grow memory without bound.
    loss_sum += float(loss)
    loss_count += 1

    # Every 100 steps: evaluate the current policy and store the new experience.
    # NOTE(review): rollout() is also given replay_buffer=rb — confirm it does
    # not already insert the trajectories that rb.add() stores below.
    if i % 100 == 0:
        trajectories, mean_reward = rollout(100, env=env, model=model, sample_action=True, replay_buffer=rb,
                                            device=device, action_fn=action_fn)
        rb.add(trajectories)

        print(f"Average Episode Reward: {mean_reward}")

    # Every 200 steps: report the running mean loss and write a checkpoint.
    if i % 200 == 0:
        print(f'i: {i}, Loss: {loss_sum/loss_count}')
        save_model('lunar_lander_sample_actions', i, model, optimizer, loss)

Existing model found. Loading.
Average Episode Reward: -301.9474068640721
i: 23600, Loss: 0.9494513869285583
Average Episode Reward: -246.3624809472592


In [9]:
# Draw a command from the replay buffer; the bare expression makes it the
# cell's displayed output (a 2-tuple of floats in the saved run).
rb.sample_command()

(221.975, 234.03588327452988)

In [12]:
# Render three episodes with the 'lunar_lander' checkpoint and report the mean reward.
# NOTE(review): cmd is assigned here but never passed to rollout() or anything
# else in this cell — confirm whether rollout should receive it.
cmd = (500, 500) #rb.sample_command()
# Fresh environment instance for the rendered evaluation.
env = gym.make('LunarLander-v2')
# Reload with train=False. The third return value is discarded into `_`, which
# also rebinds IPython's last-output variable.
# NOTE(review): in the saved run load_model reported that no checkpoint was
# found, so the reward printed below is for a newly created model.
e, model, _, l = load_model(name='lunar_lander', train=False, model=model, optimizer=optimizer, device=device)
print(f"Loaded model at epoch: {e} with loss {l}")
# Only the mean reward is kept; the returned trajectories are dropped.
_, mean_reward = rollout(episodes=3, env=env, model=model, sample_action=True, 
                      replay_buffer=rb, render=True, device=device, action_fn=action_fn)


print(f"Average Episode Reward: {mean_reward}")

No checkpoint found. Creating new model.
Loaded model at epoch: 0 with loss 0.0
Average Episode Reward: -850.1558946955619
