In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Checkpoint name handed to load_model / save_model throughout the notebook.
MODEL_NAME = 'model_v0_lunar_lander_v2'
# Width of every hidden layer in the Behavior network defined below.
HIDDEN = 64

# TensorBoard writer that receives every scalar logged in this notebook.
writer = SummaryWriter()

class Behavior(torch.nn.Module):
    """Fully connected network that maps an input vector to one logit per action.

    The stack is four ``HIDDEN``-wide ``Linear`` + ``ReLU`` stages followed by a
    linear output layer. It is stored in ``self.classifier`` as a ``Sequential``
    so the sub-modules are addressed as ``classifier.0`` ... ``classifier.8``
    (``Linear`` at the even indices, ``ReLU`` at the odd ones).

    Args:
        input_shape: size of the last dimension of the input tensor.
        num_actions: number of output logits.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        # Layer widths in order: input, four hidden layers, output.
        widths = [input_shape] + [HIDDEN] * 4 + [num_actions]

        # LayerNorm / Dropout(0.1) entries between the layers were tried in
        # earlier revisions (left commented out there); none are active.
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # No activation after the output layer: forward() returns raw logits.
        layers.pop()

        self.classifier = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) action logits for ``x``."""
        return self.classifier(x)

In [3]:
# Gym environment used by the warm-up and training rollouts below.
env = gym.make('LunarLander-v2')

In [4]:
# Cross-entropy between the predicted action logits and the targets drawn from the replay buffer.
loss_object = torch.nn.CrossEntropyLoss().to(device)
# The network input is the observation vector plus 2 extra features.
# NOTE(review): the +2 presumably holds the (dh, dr) command returned by
# rb.sample_command() - confirm against experiment.rollout / ReplayBuffer.sample.
model_sample = Behavior(input_shape=env.observation_space.shape[0]+2, num_actions=env.action_space.n).to(device)
optimizer = torch.optim.Adam(model_sample.parameters(), lr=0.001)

In [5]:
# Replay buffer that stores the collected trajectories.
# NOTE(review): the meaning of max_size / last_few is defined in
# experiment.ReplayBuffer - confirm there.
rb = ReplayBuffer(max_size=150, last_few=100)

n_warmup_episodes = 10
# Random rollout: no model is passed here, so the buffer is filled before any training happens.
trajectories, mean_reward, length = rollout(episodes=n_warmup_episodes, env=env, render=False)
rb.add(trajectories)

# Keep track of steps used during random rollout!
# The checkpoint is loaded (or created), its step counter is advanced by the
# warm-up steps, and it is written straight back. Re-running this cell adds the
# warm-up steps to the stored counter again.
epoch, model_sample, optimizer, loss, steps = load_model(MODEL_NAME, model_sample, optimizer, device, train=True)
steps += length
save_model(MODEL_NAME, epoch, model_sample, optimizer, loss, steps)

# Plot initial values
writer.add_scalar('Steps/reward', mean_reward, steps)        

No checkpoint found. Creating new model.


In [6]:
# Number of samples requested from the replay buffer for each training step.
batch_size = 1024

In [7]:
def train_step(model, inputs, targets):
    """Run one optimisation step and return the batch loss.

    Uses the notebook-level ``optimizer`` and ``loss_object``.

    Args:
        model: network whose parameters ``optimizer`` updates.
        inputs: batch fed to ``model``.
        targets: targets passed to ``loss_object`` together with the predictions.

    Returns:
        The scalar loss tensor, detached from the autograd graph so callers
        can log or accumulate it without extending the graph of this step.
    """
    optimizer.zero_grad()
    predictions = model(inputs)
    loss = loss_object(predictions, targets)

    loss.backward()
    optimizer.step()

    # Detach before returning: from here on the value is only logged/averaged,
    # and summing graph-attached losses would keep growing an autograd chain.
    return loss.detach()

def action_fn(model, inputs, sample_action=True):
    """Choose an action index from the model's logits for ``inputs``.

    Args:
        model: callable returning action logits, with the action dimension last.
        inputs: tensor passed to ``model``. NOTE(review): the squeeze-to-scalar
            below assumes it describes a single state - confirm against rollout.
        sample_action: if True, sample from the categorical distribution
            defined by the logits; otherwise take the highest-scoring action.

    Returns:
        The selected action as a Python ``int``.
    """
    # Action selection is pure inference, so skip autograd bookkeeping.
    with torch.no_grad():
        action_logits = model(inputs)

        if sample_action:
            m = torch.distributions.categorical.Categorical(logits=action_logits)
            action = m.sample()
        else:
            # argmax of the logits equals argmax of softmax(logits). Staying in
            # torch avoids the NumPy conversion, which fails for CUDA tensors.
            action = torch.argmax(action_logits.squeeze())

    # .item() copies the scalar to the host regardless of the tensor's device.
    return int(action.squeeze().item())
    

In [8]:
# SAMPLE ACTIONS
# Alternates gradient updates on replay-buffer batches with fresh rollouts of
# the current policy. The loop is long-running and is meant to be interrupted
# by hand; checkpoints are written periodically so a later run can resume.

loss_sum = 0.0
loss_count = 0

epochs = 1000000
n_episodes_per_iter = 10    # episodes collected in each rollout round
n_updates_per_iter = 100    # gradient updates between two rollout rounds
log_every = 200             # iterations between progress print-outs

epoch, model_sample, optimizer, loss, steps = load_model(MODEL_NAME, model_sample, optimizer, device, train=True)
print(steps)

# eval_every = 2000

for i in range(epoch, epochs+epoch):
    x, y = rb.sample(batch_size, device)
    loss = train_step(model_sample, x, y)
    # Accumulate a plain Python float: summing the loss tensors themselves
    # would chain every step into one ever-growing autograd graph.
    loss_value = float(loss)
    loss_sum += loss_value
    loss_count += 1

    writer.add_scalar('Loss/loss', loss_value, i)

    # Log the command the buffer currently produces.
    (dh, dr) = rb.sample_command()
    writer.add_scalar('Epoch/dh', dh, i)
    writer.add_scalar('Epoch/dr', dr, i)

    saved_this_iter = False
    if i % n_updates_per_iter == 0:
        # Collect new experience with the current policy and add it to the buffer.
        trajectories, mean_reward, length = rollout(n_episodes_per_iter, env=env, model=model_sample, sample_action=True, replay_buffer=rb,
                              device=device, action_fn=action_fn)
        rb.add(trajectories)

        steps += length
        avg_loss = loss_sum/loss_count
        save_model(MODEL_NAME, i, model_sample, optimizer, avg_loss, steps)
        saved_this_iter = True
        print(f"Average Episode Reward: {mean_reward}")
        writer.add_scalar('Steps/reward', mean_reward, steps)

        mean_length = length / n_episodes_per_iter
        writer.add_scalar('Steps/length', mean_length, steps)


#     if i % eval_every == 0:
#         eval_episodes = 10
#         _, mean_reward, length = rollout(eval_episodes, env=env, model=model_sample,
#                             sample_action=True, replay_buffer=rb,
#                             device=device, action_fn=action_fn)

#         writer.add_scalar('Epoch/reward', mean_reward, i)
#         mean_length = length*1.0/eval_episodes
#         writer.add_scalar('Epoch/length', mean_length, i)

    if i % log_every == 0:
        # avg_loss is the running mean over every update since this cell started.
        avg_loss = loss_sum/loss_count
        print(f'i: {i}, s: {steps}, Loss: {avg_loss}')

        # The rollout branch above already wrote an identical checkpoint for
        # this iteration when it ran; only save here if it did not.
        if not saved_this_iter:
            save_model(MODEL_NAME, i, model_sample, optimizer, avg_loss, steps)

Existing model found. Loading from epoch 0, steps 864 with loss: 0.0
864
Average Episode Reward: -192.79306413717734
i: 0, s: 1740, Loss: 1.5553230047225952
Average Episode Reward: -170.74703283161512
Average Episode Reward: -166.28558643487798
i: 200, s: 3528, Loss: 1.3806486129760742
Average Episode Reward: -210.00028865272316
Average Episode Reward: -197.42022935332525
i: 400, s: 5378, Loss: 1.3783584833145142
Average Episode Reward: -234.72498951710517
Average Episode Reward: -109.93945381486685
i: 600, s: 7166, Loss: 1.3772085905075073
Average Episode Reward: -226.87068049324913
Average Episode Reward: -201.72852621232246
i: 800, s: 9048, Loss: 1.3757730722427368
Average Episode Reward: -193.07927051363947
Average Episode Reward: -127.57049004650852
i: 1000, s: 10812, Loss: 1.3746272325515747
Average Episode Reward: -109.93802857672922
Average Episode Reward: -166.0632920331154
i: 1200, s: 12635, Loss: 1.3737704753875732
Average Episode Reward: -158.2054344121453
Average Episode R

KeyboardInterrupt: 

In [9]:
# Show the command the replay buffer currently produces; the training loop logs its two values as dh and dr.
rb.sample_command()

(384.11, 217.64070418127295)

In [11]:
# Evaluate the saved checkpoint with rendering, conditioned on a fixed command.
# The command is the pair that rb.sample_command() displayed in the cell above.
cmd = (384.11, 217.64070418127295)
# NOTE(review): the result of this call is discarded; it is kept only in case
# sample_command has side effects - confirm and delete.
rb.sample_command()
env = gym.make('LunarLander-v2')
e, model, _, l,_ = load_model(name=MODEL_NAME, train=False, model=model_sample, optimizer=optimizer, device=device)

try:
    _, mean_reward, _ = rollout(episodes=5, env=env, model=model_sample, sample_action=True,
                          cmd=cmd, render=True, device=device, action_fn=action_fn)
finally:
    # render=True presumably opens a viewer; close the environment even when
    # the rollout is interrupted so that its resources are released.
    env.close()


print(f"Average Episode Reward: {mean_reward}")

Existing model found. Loading from epoch 239800, steps 283882 with loss: 0.9338444471359253
Average Episode Reward: -305.3344898485258
