In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model, Behavior
# Run on the first CUDA GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Environment shared by the data-collection and training cells below.
env = gym.make('LunarLander-v2')

In [3]:
# Replay buffer that the training loop later samples batches and commands from.
# NOTE(review): the meaning of max_size / last_few is defined in experiment.ReplayBuffer — confirm there.
rb = ReplayBuffer(max_size=500, last_few=200)

# Random rollout: no model is passed here, so rollout() presumably falls back to
# random actions (verify in experiment.rollout). The episodes seed the buffer.
trajectories, avg_reward = rollout(episodes=500, env=env, render=False)
rb.add(trajectories)

print(f"Average Episode Reward: {avg_reward}")

Average Episode Reward: -182.55923739961167


In [4]:
# Classification loss over the discrete actions; used as a global by train_step().
loss_object = torch.nn.CrossEntropyLoss().to(device)
# The network input is the observation vector plus 2 extra values.
# NOTE(review): the +2 presumably matches the 2-element command tuple (see rb.sample_command()) — confirm in experiment.Behavior.
model_sample = Behavior(input_shape=env.observation_space.shape[0]+2, num_actions=env.action_space.n).to(device)
# Adam with torch's default hyper-parameters; used as a global by train_step().
optimizer = torch.optim.Adam(model_sample.parameters())

In [5]:
# Number of samples requested from the replay buffer for each training step.
batch_size = 1024

In [6]:
def train_step(model, inputs, targets):
    """Run a single optimisation step and return the batch loss.

    Relies on the notebook-level ``optimizer`` and ``loss_object`` globals
    (the optimizer must be the one built over ``model``'s parameters).

    Args:
        model: Network mapping ``inputs`` to action logits.
        inputs: Batch of model inputs.
        targets: Batch of target action indices for the cross-entropy loss.

    Returns:
        The scalar loss as a 0-dim tensor that is detached from the autograd
        graph, so callers can accumulate or log it without retaining history.
    """
    optimizer.zero_grad()
    predictions = model(inputs)
    loss = loss_object(predictions, targets)

    loss.backward()
    optimizer.step()

    # Detach before returning: a graph-attached loss that is summed in the
    # training loop keeps every step's autograd history alive.
    return loss.detach()

def action_fn(model, inputs, sample_action=True):
    """Choose a discrete action from the model's logits for one input.

    Args:
        model: Callable returning action logits for ``inputs``.
        inputs: Model input for a single step (the result is squeezed to a
            scalar, so a batch of more than one sample is not supported when
            sampling).
        sample_action: When True, draw the action from the categorical
            distribution defined by the logits; when False, take the
            highest-scoring action.

    Returns:
        The chosen action index as a Python ``int``.
    """
    # Action selection needs no gradients; skipping autograd also lets the
    # result be converted without an explicit detach.
    with torch.no_grad():
        action_logits = model(inputs)

        if sample_action:
            dist = torch.distributions.categorical.Categorical(logits=action_logits)
            chosen = dist.sample().squeeze()
        else:
            # Softmax is monotonic, so the argmax of the logits is the argmax
            # of the probabilities.
            chosen = action_logits.squeeze().argmax()

    # .item() copies the scalar to the host, so this works for CPU and CUDA
    # tensors alike (a bare .numpy() fails on CUDA tensors).
    return int(chosen.item())
    

In [7]:
# SAMPLE ACTIONS
# Train the behaviour model on replay-buffer batches, periodically collecting
# fresh episodes with the current model (sampling actions) and checkpointing.

loss_sum = 0
loss_count = 0

epochs = 1000000
# Resume from the saved checkpoint when there is one. This rebinds the global
# `optimizer`, which train_step() reads.
epoch, model_sample, optimizer, loss = load_model('lunar_lander_sample_actions', model_sample, optimizer, device, train=True)

for i in range(epoch, epochs+epoch):
    x, y = rb.sample(batch_size, device)
    loss = train_step(model_sample, x, y)
    # Accumulate a detached value: summing graph-attached losses would keep
    # the autograd history of every step alive for the whole run.
    loss_sum += loss.detach()
    loss_count += 1

    if i % 500 == 0:
        # Collect new experience with the current model and add it to the buffer.
        trajectories, mean_reward = rollout(100, env=env, model=model_sample, sample_action=True, replay_buffer=rb,
                              device=device, action_fn=action_fn)
        rb.add(trajectories)

        print(f"Average Episode Reward: {mean_reward}")

    if i % 200 == 0:
        # Running mean of the loss over every step of this run (never reset).
        avg_loss = loss_sum/loss_count
        print(f'i: {i}, Loss: {avg_loss}')
        save_model('lunar_lander_sample_actions', i, model_sample, optimizer, avg_loss)

No checkpoint found. Creating new model.
Average Episode Reward: -569.996092661935
i: 0, Loss: 1.4224317073822021
i: 200, Loss: 1.375794529914856
i: 400, Loss: 1.3584486246109009
Average Episode Reward: -172.69898111223407
i: 600, Loss: 1.3515361547470093
i: 800, Loss: 1.3487199544906616
Average Episode Reward: -148.04904080918777
i: 1000, Loss: 1.3450168371200562
i: 1200, Loss: 1.3410649299621582
i: 1400, Loss: 1.3375409841537476
Average Episode Reward: -156.3782369754889
i: 1600, Loss: 1.334271788597107
i: 1800, Loss: 1.3311759233474731
Average Episode Reward: -164.7910605259152
i: 2000, Loss: 1.3282464742660522
i: 2200, Loss: 1.3258823156356812
i: 2400, Loss: 1.3237124681472778
Average Episode Reward: -135.16169996185627
i: 2600, Loss: 1.3215991258621216
i: 2800, Loss: 1.3197011947631836
Average Episode Reward: -155.8358534970011
i: 3000, Loss: 1.3178629875183105
i: 3200, Loss: 1.3162224292755127
i: 3400, Loss: 1.3147048950195312
Average Episode Reward: -127.78925134038276
i: 3600, 

KeyboardInterrupt: 

In [8]:
# Display a command drawn from the replay buffer (shown below as a 2-tuple).
# NOTE(review): presumably (desired return, horizon) — confirm in experiment.ReplayBuffer.
rb.sample_command()

(434.665, 323.77477264604414)

In [15]:
# Evaluate the saved checkpoint on rendered episodes using a fixed command.
# NOTE(review): cmd is presumably (desired return, horizon), like the tuples
# returned by rb.sample_command() — confirm in experiment.rollout.
cmd = (280.085, 318.91295076177306)
# The result of this call is discarded; the hard-coded cmd above is what is used.
rb.sample_command()
env = gym.make('LunarLander-v2')
e, model, _, l = load_model(name='lunar_lander_sample_actions', train=False, model=model_sample, optimizer=optimizer, device=device)

# Roll out the network returned by load_model (not the pre-load reference), so
# the evaluated model is always the one just restored from the checkpoint.
# For a greedy evaluation, pass sample_action=False instead.
_, mean_reward = rollout(episodes=5, env=env, model=model, sample_action=True,
                      cmd=cmd, render=True, device=device, action_fn=action_fn)


print(f"Average Episode Reward: {mean_reward}")

Existing model found. Loading from epoch 405800 with loss: 0.8198422789573669
Average Episode Reward: 233.37975548816704
