In [1]:
%load_ext autoreload
%autoreload 2
import gym
import os
import numpy as np
import math
import random
import torch
import torch.nn as nn
from experiment import rollout, ReplayBuffer, Trajectory, load_model, save_model
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [2]:
# Environment instance shared by the data-collection, training and evaluation cells below.
env = gym.make('LunarLander-v2')

In [3]:
class Behavior(nn.Module):
    """Fully connected network that maps an input vector to action logits.

    Architecture: input -> 64 -> 64 -> ``num_actions`` with ReLU after each
    of the two hidden layers. The final layer is linear, so the output is
    raw (unnormalised) scores.

    Args:
        input_shape: Number of input features (an int, despite the name).
        num_actions: Number of output scores, one per action.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        # Layer attribute names are part of the checkpoint's state_dict keys.
        self.fc1 = nn.Linear(input_shape, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, num_actions)

    def forward(self, x):
        """Return the action logits for ``x`` (last dim = ``input_shape``)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

In [4]:
# The network input is the observation vector plus two extra scalars.
# NOTE(review): presumably these are the two command values that
# rb.sample_command() yields - confirm against experiment.rollout.
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
model_sample = Behavior(input_shape=obs_dim + 2, num_actions=n_actions).to(device)

# Training objective and optimiser used by train_step.
loss_object = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_sample.parameters(), lr=0.01)

In [5]:
# Replay buffer shared by training and evaluation.
# NOTE(review): the exact meaning of max_size / last_few lives in
# experiment.ReplayBuffer - confirm there.
rb = ReplayBuffer(last_few=50, max_size=50)

# Seed the buffer with 10 episodes collected without a model (random rollout).
trajectories, avg_reward = rollout(env=env, episodes=10, render=False)
rb.add(trajectories)

print(f"Average Episode Reward: {avg_reward}")

Average Episode Reward: -240.4364181122449


In [6]:
# Size passed to rb.sample() for every training batch.
batch_size = 1024

# Epsilon-greedy schedule parameters (start, end, decay); no active code
# visible in this notebook reads them.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200

In [7]:
def train_step(model, inputs, targets):
    """Run one optimisation step of ``model`` on a single batch.

    Relies on the notebook-level ``optimizer`` and ``loss_object`` globals,
    so ``optimizer`` must be the one that owns ``model``'s parameters.

    Args:
        model: Callable network; its output is fed to ``loss_object``.
        inputs: Batch passed straight to ``model``.
        targets: Targets in the form ``loss_object`` expects.

    Returns:
        The batch loss as a tensor detached from the autograd graph, so
        callers can accumulate or log it without keeping graph history alive.
    """
    optimizer.zero_grad()
    predictions = model(inputs)
    loss = loss_object(predictions, targets)

    loss.backward()
    optimizer.step()

    # Detach before handing the value back: a caller doing `total += loss`
    # with an attached tensor chains autograd nodes from every step together
    # and memory grows for the whole run.
    return loss.detach()



def action_fn(model, inputs, sample_action=True):
    """Pick a discrete action from the logits ``model`` produces for ``inputs``.

    Args:
        model: Callable returning action logits for ``inputs``.
        inputs: Input for a single decision; the logits must describe exactly
            one action distribution (e.g. shape ``(n,)`` or ``(1, n)``).
        sample_action: If true, sample from the categorical distribution
            defined by the logits; otherwise take the highest-scoring action.

    Returns:
        The chosen action index as a plain ``int``.
    """
    # Action selection never backpropagates, so skip building an autograd graph.
    with torch.no_grad():
        action_logits = model(inputs)

        if sample_action:
            policy = torch.distributions.categorical.Categorical(logits=action_logits)
            action = policy.sample()
        else:
            # Softmax is monotonic, so the argmax over logits is the argmax
            # over the action probabilities.
            action = torch.argmax(action_logits.squeeze())

    return int(action.item())
    

In [None]:
# SAMPLE ACTIONS
# Alternate between fitting the network on replay-buffer batches and
# collecting new sampled-action rollouts with the updated network.

# Running totals behind the average loss that is printed and checkpointed;
# they are never reset, so the average covers the whole run.
loss_sum, loss_count = 0, 0

epochs = 1000000
# Resume from a saved checkpoint when load_model finds one.
epoch, model_sample, optimizer, loss = load_model(
    'lunar_lander_sample_actions', model_sample, optimizer, device, train=True
)

steps_done = 0

for i in range(epoch, epoch + epochs):
    # 50 supervised updates per iteration on batches drawn from the buffer.
    for _ in range(50):
        inputs, targets = rb.sample(batch_size, device)
        loss = train_step(model_sample, inputs, targets)
        loss_sum += loss
        loss_count += 1

    # The first iteration uses the same episode count as every later one.
    n_episodes_per_iter = 10

    trajectories, mean_reward = rollout(
        n_episodes_per_iter,
        env=env,
        model=model_sample,
        sample_action=True,
        replay_buffer=rb,
        device=device,
        action_fn=action_fn,
    )
    rb.add(trajectories)

    # NOTE(review): this is true for every i that is NOT a multiple of 20;
    # confirm whether `i % 20 == 0` was intended.
    if i % 20 != 0:
        steps_done += 1

    # Report progress and checkpoint every 10th iteration.
    if i % 10 == 0:
        print(f"Average Episode Reward: {mean_reward}")
        avg_loss = loss_sum / loss_count
        print(f'i: {i}, Loss: {avg_loss}')
        save_model('lunar_lander_sample_actions', i, model_sample, optimizer, avg_loss)

No checkpoint found. Creating new model.
Average Episode Reward: -204.67461045033377
i: 0, Loss: 1.3611431121826172
Average Episode Reward: -118.59136830172977
i: 10, Loss: 1.3151540756225586
Average Episode Reward: -83.52454752224455
i: 20, Loss: 1.2771064043045044
Average Episode Reward: -66.55097213741024
i: 30, Loss: 1.2326279878616333
Average Episode Reward: -55.06935499634218
i: 40, Loss: 1.1914715766906738
Average Episode Reward: -56.87821724604019
i: 50, Loss: 1.1493827104568481
Average Episode Reward: -49.57860491065807
i: 60, Loss: 1.1125949621200562
Average Episode Reward: -51.07264866424924
i: 70, Loss: 1.0807398557662964
Average Episode Reward: -43.71636288326839
i: 80, Loss: 1.04901123046875
Average Episode Reward: -35.72996024661258
i: 90, Loss: 1.0212032794952393
Average Episode Reward: -33.46904584220375
i: 100, Loss: 0.9997531175613403


In [6]:
# (total_return, length) of every trajectory currently held in the replay buffer.
[(traj.total_return, traj.length) for traj in rb.buffer]

[(-101.11198134691003, 58),
 (-103.64699052774198, 68),
 (-127.58791857723014, 84),
 (-131.89672730059758, 155),
 (-163.5911319642045, 124),
 (-210.31447991516205, 82),
 (-275.18922879202216, 102),
 (-305.9108282899752, 96),
 (-351.6562497742343, 109),
 (-410.746399174143, 93)]

In [17]:
import pickle

# Persist the replay buffer to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open("buffer.p", "wb") as buffer_file:
    pickle.dump(rb, buffer_file)

In [18]:
# Reload the pickled replay buffer under a separate name for inspection.
# WARNING: unpickling executes arbitrary code - only load files you created yourself.
with open("buffer.p", "rb") as buffer_file:
    rbbb = pickle.load(buffer_file)

In [22]:
# (total_return, length) of every trajectory in the buffer reloaded from disk.
[(traj.total_return, traj.length) for traj in rbbb.buffer]

[(310.66825831675976, 381),
 (307.26196994702906, 356),
 (303.24897175129075, 434),
 (302.4825435649981, 422),
 (302.08153217515587, 443),
 (301.62343453588, 450),
 (301.336795776998, 374),
 (301.0824957187764, 401),
 (300.30239077159666, 393),
 (300.00413043635956, 376),
 (299.5551268182258, 403),
 (299.4571607913125, 401),
 (299.3745826281496, 439),
 (299.27842296736947, 832),
 (298.9785312032605, 431),
 (298.9744925975774, 368),
 (298.8005255482776, 377),
 (298.2905025908745, 374),
 (298.25199484161163, 327),
 (298.1082917441395, 426),
 (298.01858655378317, 380),
 (297.86267734638704, 387),
 (297.44062399672396, 222),
 (297.167313218092, 372),
 (296.8036351123287, 346),
 (296.40188358973296, 677),
 (296.3219972655071, 392),
 (296.0394593078145, 377),
 (295.79209420786935, 187),
 (295.5912289208195, 415),
 (295.2668316842348, 389),
 (295.1391259645944, 284),
 (295.12430763923635, 409),
 (295.10259048325526, 365),
 (294.76372007622194, 365),
 (294.67945424467996, 425),
 (294.222648258

In [8]:
# (total_return, length) of every trajectory currently held in the replay buffer.
[(traj.total_return, traj.length) for traj in rb.buffer]

[(305.6909235090742, 225),
 (302.0056402212026, 208),
 (301.7944143213541, 269),
 (300.71844923634444, 224),
 (300.65080857502187, 224),
 (298.2907459656101, 222),
 (296.1363344579555, 217),
 (295.11447143257334, 237),
 (295.01556728710614, 219),
 (293.1993229567172, 212),
 (293.05844646098535, 348),
 (289.4628917571782, 190),
 (289.20918684615805, 184),
 (282.6850227305037, 196),
 (280.199684161463, 162),
 (279.1794201095174, 193),
 (278.78865415341886, 211),
 (272.22855976041046, 209),
 (271.5107908143556, 215),
 (269.0895324255778, 209),
 (268.8088351096112, 190),
 (266.1338566930086, 227),
 (265.524661876984, 203),
 (262.22279884775435, 343),
 (260.5030123683403, 209),
 (258.43169595399274, 225),
 (254.9498661720394, 195),
 (254.78018048064902, 386),
 (253.8927482723219, 276),
 (253.66807584372114, 190),
 (253.6187685539958, 180),
 (253.49713835959105, 172),
 (252.58063975061393, 183),
 (252.29265489999935, 228),
 (250.91490513950285, 205),
 (250.41154765153993, 183),
 (248.5060591

In [9]:
# Draw one command from the replay buffer; the output below shows it is a pair of numbers.
rb.sample_command()

(394.48, 297.9185157945218)

In [14]:
# Command the policy is conditioned on during evaluation.
# NOTE(review): which element is which (e.g. horizon vs. target return) is
# defined in experiment.ReplayBuffer - confirm there.
cmd = rb.sample_command()

# Restore the latest checkpoint into model_sample (train=False).
e, model, _, l = load_model(name='lunar_lander_sample_actions', train=False,
                            model=model_sample, optimizer=optimizer, device=device)

# Evaluate the model that load_model returned, picking the highest-scoring
# action at every step instead of sampling.
_, mean_reward = rollout(episodes=100, env=env, model=model, sample_action=False,
                         cmd=cmd, render=False, device=device, action_fn=action_fn)

print(f"Average Episode Reward: {mean_reward}")

Existing model found. Loading from epoch 1820 with loss: 0.600623369216919
Average Episode Reward: 29.77999965700948
