In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import random
import torch
from torch import nn
from itertools import count
from torch.utils.tensorboard import SummaryWriter
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Checkpoint name handed to load_model / save_model.
MODEL_NAME = 'model_v0_lunar_lander_v2'
# Width of every hidden layer in the Behavior network.
HIDDEN = 32

# TensorBoard writer that receives all add_scalar calls in this notebook.
writer = SummaryWriter()

# class Behavior(torch.nn.Module):
#     def __init__(self, input_shape, num_actions):
#         super(Behavior, self).__init__()
#         self.classifier = torch.nn.Sequential(
#             nn.Linear(input_shape, HIDDEN), 
#             nn.ReLU(),
#             nn.Linear(HIDDEN, HIDDEN),
#             nn.ReLU(),            
#             nn.Linear(HIDDEN, HIDDEN), 
#             nn.ReLU(),            
#             nn.Linear(HIDDEN, num_actions)
#         )        

#     def forward(self, x):
#         return self.classifier(x)


class Behavior(nn.Module):
    """Command-conditioned policy network that outputs action logits.

    The state and the command are embedded separately into ``HIDDEN``-sized
    vectors. The command embedding is squashed with a sigmoid and multiplied
    element-wise with the state embedding; the product then goes through one
    ReLU hidden layer and a final linear layer.
    """

    def __init__(self, state_shape, cmd_shape, num_actions):
        """Build the layers.

        Args:
            state_shape: number of input features of the state vector.
            cmd_shape: number of input features of the command vector.
            num_actions: number of logits produced by the output layer.
        """
        super().__init__()
        # One embedding per input stream.
        self.fc_state = nn.Linear(in_features=state_shape, out_features=HIDDEN)
        self.fc_cmd = nn.Linear(in_features=cmd_shape, out_features=HIDDEN)

        # Layers applied to the gated state embedding.
        self.fc1 = nn.Linear(in_features=HIDDEN, out_features=HIDDEN)
        self.fc2 = nn.Linear(in_features=HIDDEN, out_features=num_actions)

    def forward(self, x):
        """Compute action logits.

        Args:
            x: indexable pair where ``x[0]`` is the state tensor and ``x[1]``
                is the command tensor.

        Returns:
            The un-normalised logits produced by ``fc2``.
        """
        state, command = x[0], x[1]
        state_embedding = self.fc_state(state)
        # Sigmoid keeps the gate in (0, 1) before it scales the state embedding.
        command_gate = self.fc_cmd(command).sigmoid()

        hidden = self.fc1(state_embedding * command_gate).relu()
        return self.fc2(hidden)
    
#         self.classifier = torch.nn.Sequential(
#             nn.Linear(input_shape, HIDDEN), 
# #             nn.Dropout(0.1),
#             torch.nn.LayerNorm(HIDDEN),
#             nn.ReLU(),
#             nn.Linear(HIDDEN, HIDDEN),
# #             torch.nn.LayerNorm(HIDDEN),
# #             nn.Dropout(0.1),
#             nn.ReLU(),            
# #             nn.Linear(HIDDEN, HIDDEN), 
# #             torch.nn.LayerNorm(HIDDEN),
# #             nn.Dropout(0.1),
# #             nn.ReLU(),            
# #             nn.Linear(HIDDEN, HIDDEN), 
# #             torch.nn.LayerNorm(HIDDEN),
# #             nn.Dropout(0.1),
# #             nn.ReLU(),
#             nn.Linear(HIDDEN, num_actions)
#         )        
    

In [3]:
# Environment used by the warm-up, training and evaluation rollouts below.
env = gym.make('LunarLander-v2')

In [4]:
# Cross-entropy between the predicted action logits and the target actions.
loss_object = torch.nn.CrossEntropyLoss().to(device)
# State size and action count come from the environment; the command has two features
# (the training loop unpacks rb.sample_command() as (dh, dr)).
model_sample = Behavior(state_shape=env.observation_space.shape[0], cmd_shape=2, num_actions=env.action_space.n).to(device)
# Adam over all model parameters with a fixed learning rate of 0.005.
optimizer = torch.optim.Adam(model_sample.parameters(), lr=0.005)

In [5]:
# Replay buffer that feeds training batches and commands.
# NOTE(review): presumably max_size is the number of stored trajectories and last_few the
# number of recent ones used by sample_command - confirm against experiment.ReplayBuffer.
rb = ReplayBuffer(max_size=50, last_few=50)

n_warmup_episodes = 30
# Random rollout: no model is passed, and the resulting trajectories seed the buffer.
trajectories, mean_reward, length = rollout(episodes=n_warmup_episodes, env=env, render=False)
rb.add(trajectories)

# Keep track of steps used during random rollout: load the checkpoint (or a fresh model),
# add the warm-up step count and write the checkpoint back.
# NOTE(review): re-running this cell adds `length` to the stored step count again.
epoch, model_sample, optimizer, loss, steps = load_model(MODEL_NAME, model_sample, optimizer, device, train=True)
steps += length
save_model(MODEL_NAME, epoch, model_sample, optimizer, loss, steps)

# Plot initial values: mean reward of the random rollout at the current step count.
writer.add_scalar('Steps/reward', mean_reward, steps)        

No checkpoint found. Creating new model.


In [6]:
# Number of samples requested from the replay buffer for each training step.
batch_size = 1024

In [7]:
def train_step(model, inputs, targets):
    """Run one optimisation step on a batch and return its loss.

    Uses the notebook-level ``optimizer`` and ``loss_object``.

    Args:
        model: network called with ``[state, command]``.
        inputs: 2-D tensor; the last two columns are fed to the model as the
            command and the remaining columns as the state.
        targets: targets passed to ``loss_object`` together with the logits.

    Returns:
        The loss tensor produced by ``loss_object`` for this batch.
    """
    state, command = inputs[:, :-2], inputs[:, -2:]

    optimizer.zero_grad()
    logits = model([state, command])
    batch_loss = loss_object(logits, targets)

    batch_loss.backward()
    optimizer.step()

    return batch_loss

def action_fn(model, inputs, sample_action=True):
    """Pick a discrete action for a single observation.

    Args:
        model: callable taking ``[state, command]`` and returning action logits.
        inputs: 2-D tensor holding one row; the last two columns are passed to
            the model as the command and the remaining columns as the state.
        sample_action: when True, sample from the categorical distribution
            defined by the logits; when False, take the highest-scoring action.

    Returns:
        The chosen action index as a Python ``int``.
    """
    # Action selection is inference only, so no autograd graph is needed.
    with torch.no_grad():
        action_logits = model([inputs[:, :-2], inputs[:, -2:]])

        if sample_action:
            m = torch.distributions.categorical.Categorical(logits=action_logits)
            action = m.sample()
        else:
            # Softmax is monotonic, so the argmax of the logits is the argmax of
            # the probabilities; staying in torch also works for CUDA tensors.
            action = torch.argmax(action_logits, dim=-1)

    # .item() copies the single selected index to the host on any device.
    return int(action.squeeze().item())
    

In [8]:
# SAMPLE ACTIONS
# Training loop: one supervised update per iteration on a replay-buffer batch,
# with periodic data-collection rollouts, evaluation rollouts and checkpoints.

SOLVED_MEAN_REWARD = 200  # stop once the mean of the recent rollout rewards reaches this
MAX_STEPS = 10**7         # stop once this many environment steps have been used
EVAL_EVERY = 1000         # training iterations between evaluation rollouts
n_episodes_per_iter = 10  # episodes per data-collection rollout
n_updates_per_iter = 50   # training iterations between data-collection rollouts
eval_episodes = 10        # episodes per evaluation rollout

loss_sum = 0
loss_count = 0
rewards = []

# Resume from the checkpoint so iteration and step counters continue where they stopped.
epoch, model_sample, optimizer, loss, steps = load_model(MODEL_NAME, model_sample, optimizer, device, train=True)
print(steps)

for i in count(start=epoch):
    x, y = rb.sample(batch_size, device)
    # Convert to a plain float immediately: summing the loss tensor itself would
    # chain autograd nodes onto loss_sum for the whole run and would hand a
    # tensor to save_model.
    loss = float(train_step(model_sample, x, y))
    loss_sum += loss
    loss_count += 1

    writer.add_scalar('Loss/loss', loss, i)

    # Log the command the buffer currently proposes.
    (dh, dr) = rb.sample_command()
    writer.add_scalar('Epoch/dh', dh, i)
    writer.add_scalar('Epoch/dr', dr, i)

    if i % n_updates_per_iter == 0:
        # Collect fresh trajectories with the current policy and add them to the buffer.
        trajectories, mean_reward, length = rollout(n_episodes_per_iter, env=env, model=model_sample, sample_action=True, replay_buffer=rb,
                              device=device, action_fn=action_fn)
        rb.add(trajectories)
        rewards.append(mean_reward)
        rewards = rewards[-50:] # Keep only last  rewards

        steps += length
        avg_loss = loss_sum/loss_count
        save_model(MODEL_NAME, i, model_sample, optimizer, avg_loss, steps)
        print(f"Average Episode Reward: {mean_reward}")
        writer.add_scalar('Steps/reward', mean_reward, steps)

        mean_length = length*1.0/n_episodes_per_iter
        writer.add_scalar('Steps/length', mean_length, steps)

        if np.mean(rewards) >= SOLVED_MEAN_REWARD:
            print("Task considered solved! Stopping.")
            break

        if steps >= MAX_STEPS:
            print(f"Steps {steps} exceeds max env steps {MAX_STEPS}. Stopping.")
            break

    if i % EVAL_EVERY == 0:
        # Evaluation rollout: trajectories are discarded and `steps` is not advanced.
        # Separate names keep the training rollout's mean_reward/length intact.
        _, eval_reward, eval_length = rollout(eval_episodes, env=env, model=model_sample,
                            sample_action=True, replay_buffer=rb,
                            device=device, action_fn=action_fn, evaluation=True)

        writer.add_scalar('Eval/reward', eval_reward, i)
        # Average over the number of evaluation episodes, not the training ones.
        eval_mean_length = eval_length*1.0/eval_episodes
        writer.add_scalar('Eval/length', eval_mean_length, i)

    if i % 200 == 0:
        avg_loss = loss_sum/loss_count
        print(f'i: {i}, s: {steps}, Loss: {avg_loss}')

        save_model(MODEL_NAME, i, model_sample, optimizer, avg_loss, steps)
        
    

Existing model found. Loading from epoch 0, steps 3676 with loss: 0.0
3676
Average Episode Reward: -156.60252454335253
i: 0, s: 4603, Loss: 1.3939533233642578
Average Episode Reward: -140.97323759529982
Average Episode Reward: -144.61891011867456
Average Episode Reward: -126.9948052598018
Average Episode Reward: -113.97513915697087
i: 200, s: 8095, Loss: 1.3779411315917969
Average Episode Reward: -105.62407844282163
Average Episode Reward: -147.17655294931035
Average Episode Reward: -114.46937486127533
Average Episode Reward: -97.74574513351453
i: 400, s: 11807, Loss: 1.3705745935440063
Average Episode Reward: -105.22635558690949
Average Episode Reward: -106.18434431837773
Average Episode Reward: -70.79296625740696
Average Episode Reward: -93.05582327119569
i: 600, s: 16132, Loss: 1.3629987239837646
Average Episode Reward: -123.43925537953706
Average Episode Reward: -67.5068043105255
Average Episode Reward: -113.11475356968904
Average Episode Reward: -95.63659784455245
i: 800, s: 19965

KeyboardInterrupt: 

In [None]:
# Display the command the replay buffer currently proposes; the training loop
# unpacks this value as (dh, dr).
rb.sample_command()

In [17]:
       # dh ,dr
# Render a few episodes of the saved policy, conditioned on one sampled command.
cmd = rb.sample_command()
# Reload the checkpoint in non-training mode before rolling out.
e, model, _, l,_ = load_model(name=MODEL_NAME, train=False, model=model_sample, optimizer=optimizer, device=device)

# Use a dedicated environment for rendering so the training `env` is left alone,
# and always close it so the render window is released even if the rollout fails.
eval_env = gym.make('LunarLander-v2')
try:
    _, mean_reward, _ = rollout(episodes=5, env=eval_env, model=model_sample, sample_action=True,
                          cmd=cmd, render=True, device=device, action_fn=action_fn)
finally:
    eval_env.close()


print(f"Average Episode Reward: {mean_reward}")

Existing model found. Loading from epoch 1223800, steps 851509 with loss: 0.35655415058135986
Average Episode Reward: -107.85978643501417
