In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch
from torch import nn
from itertools import count
from torch.utils.tensorboard import SummaryWriter
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Name handed to load_model/save_model to identify this experiment's checkpoint.
MODEL_NAME = 'model_v0_lunar_lander_v2'
# Width of the hidden layers of the Behavior network defined below.
HIDDEN = 32

# TensorBoard writer that receives every scalar logged by this notebook.
writer = SummaryWriter()

# class Behavior(torch.nn.Module):
#     def __init__(self, input_shape, num_actions):
#         super(Behavior, self).__init__()
#         self.classifier = torch.nn.Sequential(
#             nn.Linear(input_shape, HIDDEN), 
#             nn.ReLU(),
#             nn.Linear(HIDDEN, HIDDEN),
#             nn.ReLU(),            
#             nn.Linear(HIDDEN, HIDDEN), 
#             nn.ReLU(),            
#             nn.Linear(HIDDEN, num_actions)
#         )        

#     def forward(self, x):
#         return self.classifier(x)


class Behavior(nn.Module):
    """Command-conditioned policy network that outputs action logits.

    The state and the command are embedded by separate linear layers. The
    command embedding is squashed with a sigmoid and multiplied element-wise
    with the state embedding (a multiplicative gate); two further linear
    layers turn the gated embedding into one logit per action.

    Args:
        state_shape: Number of features in the state vector.
        cmd_shape: Number of features in the command vector.
        num_actions: Number of discrete actions (size of the output).
        hidden_size: Width of the hidden layers. When omitted, the
            notebook-level ``HIDDEN`` constant is used, which reproduces the
            previous behaviour exactly.
    """

    def __init__(self, state_shape, cmd_shape, num_actions, hidden_size=None):
        super().__init__()
        if hidden_size is None:
            # Only fall back to the global when the caller gave no width.
            hidden_size = HIDDEN

        # Layers are created in the same order as before so that parameter
        # names (checkpoint keys) and random initialisation are unchanged.
        self.fc_state = nn.Linear(state_shape, hidden_size)
        self.fc_cmd = nn.Linear(cmd_shape, hidden_size)

        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_actions)

    def forward(self, x):
        """Compute action logits.

        Args:
            x: Indexable pair where ``x[0]`` is the state tensor and ``x[1]``
                is the command tensor.

        Returns:
            Tensor of unnormalised action logits.
        """
        state_embedding = self.fc_state(x[0])
        command_gate = torch.sigmoid(self.fc_cmd(x[1]))

        gated = state_embedding * command_gate

        hidden = torch.relu(self.fc1(gated))
        return self.fc2(hidden)
    
#         self.classifier = torch.nn.Sequential(
#             nn.Linear(input_shape, HIDDEN), 
# #             nn.Dropout(0.1),
#             torch.nn.LayerNorm(HIDDEN),
#             nn.ReLU(),
#             nn.Linear(HIDDEN, HIDDEN),
# #             torch.nn.LayerNorm(HIDDEN),
# #             nn.Dropout(0.1),
#             nn.ReLU(),            
# #             nn.Linear(HIDDEN, HIDDEN), 
# #             torch.nn.LayerNorm(HIDDEN),
# #             nn.Dropout(0.1),
# #             nn.ReLU(),            
# #             nn.Linear(HIDDEN, HIDDEN), 
# #             torch.nn.LayerNorm(HIDDEN),
# #             nn.Dropout(0.1),
# #             nn.ReLU(),
#             nn.Linear(HIDDEN, num_actions)
#         )        
    

In [3]:
# Environment instance passed to every rollout call in the cells below.
env = gym.make('LunarLander-v2')

In [4]:
# The policy is trained as a classifier over discrete actions, hence cross-entropy.
loss_object = torch.nn.CrossEntropyLoss().to(device)
# cmd_shape=2: the command has two components; train_step/action_fn slice them off
# as the last two input columns, and the training loop names them (dh, dr).
model_sample = Behavior(state_shape=env.observation_space.shape[0], cmd_shape=2, num_actions=env.action_space.n).to(device)
optimizer = torch.optim.Adam(model_sample.parameters(), lr=0.005)

In [5]:
# Replay buffer that stores rollout trajectories; later cells draw training
# batches (rb.sample) and commands (rb.sample_command) from it.
# NOTE(review): the exact meaning of max_size / last_few is defined in
# experiment.ReplayBuffer — confirm there.
rb = ReplayBuffer(max_size=50, last_few=50)

n_warmup_episodes = 30
# Random rollout
# (no model is passed to rollout here); its trajectories seed the replay buffer.
trajectories, mean_reward, length = rollout(episodes=n_warmup_episodes, env=env, render=False)
rb.add(trajectories)

# Keep track of steps used during random rollout!
# load_model supplies the current step counter; the warm-up `length` is added
# to it and the result is written back to the checkpoint right away.
epoch, model_sample, optimizer, loss, steps = load_model(MODEL_NAME, model_sample, optimizer, device, train=True)
steps += length
save_model(MODEL_NAME, epoch, model_sample, optimizer, loss, steps)

# Plot initial values
writer.add_scalar('Steps/reward', mean_reward, steps)        

No checkpoint found. Creating new model.


In [6]:
# Batch size requested from rb.sample(...) for each gradient step of the training loop.
batch_size = 1024

In [7]:
def train_step(model, inputs, targets, optim=None, loss_fn=None):
    """Run one gradient update of ``model`` on a single batch.

    Args:
        model: Network invoked as ``model([state, command])``.
        inputs: 2-D tensor; the last two columns are the command and all
            preceding columns are the state.
        targets: Targets handed to the loss together with the predictions.
        optim: Optimizer to use. Defaults to the notebook-level ``optimizer``.
        loss_fn: Loss callable. Defaults to the notebook-level ``loss_object``.

    Returns:
        The batch loss as a tensor detached from the autograd graph, so that
        callers can log or accumulate it without keeping the graph alive.
    """
    # Fall back to the notebook globals only when nothing was passed in.
    if optim is None:
        optim = optimizer
    if loss_fn is None:
        loss_fn = loss_object

    optim.zero_grad()
    predictions = model([inputs[:, :-2], inputs[:, -2:]])
    loss = loss_fn(predictions, targets)

    loss.backward()
    optim.step()

    # Returning the attached tensor made `loss_sum += loss` in the training
    # loop chain every iteration's graph together; detach to prevent that.
    return loss.detach()

def action_fn(model, inputs, sample_action, epsilon):
    """Choose an action for the given model input.

    Args:
        model: Network invoked as ``model([state, command])`` that returns
            action logits.
        inputs: 2-D tensor; the last two columns are the command and all
            preceding columns are the state.
        sample_action: When true, sample from the categorical distribution
            defined by the logits; otherwise take the highest-scoring action.
        epsilon: Probability of ignoring the model and returning a random
            action drawn from ``env.action_space``.

    Returns:
        The chosen action as a Python ``int`` (or whatever
        ``env.action_space.sample()`` returns on the random branch).
    """
    # Decide on exploration first: the model output is not needed for it.
    if random.random() < epsilon: # Random action
        return env.action_space.sample()

    # Action selection needs no gradients.
    with torch.no_grad():
        action_logits = model([inputs[:, :-2], inputs[:, -2:]])

    if sample_action:
        distribution = torch.distributions.categorical.Categorical(logits=action_logits)
        return int(distribution.sample().squeeze().item())

    # argmax over logits equals argmax over softmax(logits); .item() works on
    # any device, unlike the previous .numpy() call that lacked .cpu().
    return int(torch.argmax(action_logits.squeeze()).item())
    

In [9]:
# SAMPLE ACTIONS
# Training loop: one gradient step per iteration on a replay-buffer batch,
# with periodic data collection, evaluation and checkpointing.

loss_sum = 0
loss_count = 0
SOLVED_MEAN_REWARD = 200
MAX_STEPS = 10**7
rewards = []

# Resume epoch and step counters from the stored checkpoint.
epoch, model_sample, optimizer, loss, steps = load_model(MODEL_NAME, model_sample, optimizer, device, train=True)
print(steps)

EVAL_EVERY = 1000

# Loop-invariant settings (previously re-assigned on every iteration).
n_episodes_per_iter = 10
n_updates_per_iter = 50
eval_episodes = 10

for i in count(start=epoch):
    x, y = rb.sample(batch_size, device)
    # Detach before accumulating: summing attached loss tensors would chain
    # the autograd history of every iteration and grow memory without bound.
    loss = train_step(model_sample, x, y).detach()
    loss_sum += loss
    loss_count += 1

    writer.add_scalar('Loss/loss', loss, i)

    (dh, dr) = rb.sample_command()
    writer.add_scalar('Epoch/dh', dh, i)
    writer.add_scalar('Epoch/dr', dr, i)

    # Every n_updates_per_iter gradient steps, collect fresh episodes with the
    # current model and add them to the replay buffer.
    if i % n_updates_per_iter == 0:
        trajectories, mean_reward, length = rollout(n_episodes_per_iter, env=env, model=model_sample, sample_action=True, replay_buffer=rb,
                                                    device=device, action_fn=action_fn, epsilon=0.01)
        rb.add(trajectories)
        rewards.append(mean_reward)
        rewards = rewards[-50:] # Keep only last  rewards

        steps += length
        avg_loss = loss_sum/loss_count
        save_model(MODEL_NAME, i, model_sample, optimizer, avg_loss, steps)
        print(f"Average Episode Reward: {mean_reward}")
        writer.add_scalar('Steps/reward', mean_reward, steps)

        mean_length = length*1.0/n_episodes_per_iter
        writer.add_scalar('Steps/length', mean_length, steps)

        if np.mean(rewards) >= SOLVED_MEAN_REWARD:
            print("Task considered solved! Stopping.")
            break

        if steps >= MAX_STEPS:
            print(f"Steps {steps} exceeds max env steps {MAX_STEPS}. Stopping.")
            break

    if i % EVAL_EVERY == 0:
        # Separate names keep the evaluation results from clobbering the
        # training rollout's mean_reward / length.
        _, eval_reward, eval_length = rollout(eval_episodes, env=env, model=model_sample,
                                              sample_action=True, replay_buffer=rb,
                                              device=device, action_fn=action_fn, evaluation=True)

        writer.add_scalar('Eval/reward', eval_reward, i)
        # Average over the number of evaluation episodes (the original divided
        # by n_episodes_per_iter, which only worked because both equal 10).
        eval_mean_length = eval_length*1.0/eval_episodes
        writer.add_scalar('Eval/length', eval_mean_length, i)

    if i % 200 == 0:
        avg_loss = loss_sum/loss_count
        print(f'i: {i}, s: {steps}, Loss: {avg_loss}')

        save_model(MODEL_NAME, i, model_sample, optimizer, avg_loss, steps)
    

Existing model found. Loading from epoch 7950, steps 183797 with loss: 1.1867789030075073
183797
Average Episode Reward: -126.80167492436576
Average Episode Reward: -138.8016162009057
i: 8000, s: 186864, Loss: 1.089489459991455
Average Episode Reward: -113.61700519753927
Average Episode Reward: -97.97472374535371
Average Episode Reward: -59.4700936034032
Average Episode Reward: -152.55199852132515
i: 8200, s: 195005, Loss: 1.0847513675689697
Average Episode Reward: -94.85693066370214
Average Episode Reward: -148.20512978051391
Average Episode Reward: -142.56741812042114
Average Episode Reward: -123.03359636150851
i: 8400, s: 202200, Loss: 1.0807380676269531
Average Episode Reward: -86.51822589946589
Average Episode Reward: -140.3833830914096
Average Episode Reward: -114.6457021306021
Average Episode Reward: -105.3505705885005
i: 8600, s: 210493, Loss: 1.0775346755981445
Average Episode Reward: -101.77017538437215
Average Episode Reward: -141.83507439152976
Average Episode Reward: -85.0

KeyboardInterrupt: 

In [7]:
# Display one command drawn from the replay buffer; the training loop unpacks this pair as (dh, dr).
rb.sample_command()

(94.6, -154.08065847387493)

In [12]:
       # dh ,dr
# Draw the command once; the second, discarded rb.sample_command() call was removed.
cmd = rb.sample_command()
env = gym.make('LunarLander-v2')
e, model, _, l,_ = load_model(name=MODEL_NAME, train=False, model=model_sample, optimizer=optimizer, device=device)

# Roll out the model returned by load_model (previously bound but unused), and
# always close the environment so the render window is released.
try:
    _, mean_reward, _ = rollout(episodes=5, env=env, model=model, sample_action=True,
                                cmd=cmd, render=True, device=device, action_fn=action_fn, epsilon=0.0)
finally:
    env.close()


print(f"Average Episode Reward: {mean_reward}")

Existing model found. Loading from epoch 5700, steps 107461 with loss: 1.1478462219238281
Average Episode Reward: -73.02952937640885
