In [1]:
%load_ext autoreload
%autoreload 2
import gym

import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from experiment import rollout_random, ReplayBuffer, Trajectory

### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [2]:
# Create the CartPole-v1 environment used by the warm-up rollouts and the training episodes below.
env = gym.make('CartPole-v1')

In [13]:
# Build a fresh replay buffer and fill it with episodes collected by `rollout_random`.
# NOTE(review): the meaning of the two positional ReplayBuffer arguments (500, 100) is defined in
# the `experiment` module, which is not visible here — confirm before tuning them.
rb = ReplayBuffer(500, 100)
avg_reward = rollout_random(num_episodes=500, env=env, replay_buffer=rb, render=False)

print(f"Average Episode Reward: {avg_reward}")

Average Episode Reward: 22.1


### A1-2 Initialize a behavior function

In [14]:
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Noam learning-rate schedule.

    Every base learning rate of the optimizer is multiplied by

        warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    where ``step`` is ``max(1, last_epoch)``. The multiplier therefore rises linearly while
    ``step < warmup_steps``, equals 1.0 at ``step == warmup_steps`` and then falls off
    proportionally to the inverse square root of ``step``.

    Parameters
    ----------
    optimizer:
        The wrapped optimizer; forwarded unchanged to the base scheduler.
    warmup_steps: ``int``, required.
        The number of steps over which the learning rate is increased linearly.
    """
    def __init__(self, optimizer, warmup_steps):
        # Assigned before the base constructor runs so that `get_lr` can already
        # read it if the base class queries the schedule during initialisation.
        self.warmup_steps = warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        """Return one scaled learning rate per entry of ``self.base_lrs``."""
        # Step 0 is treated as step 1 so the negative power below is well defined.
        step = max(1, self.last_epoch)
        decay_term = step ** (-0.5)
        warmup_term = step * self.warmup_steps ** (-1.5)
        scale = self.warmup_steps ** 0.5 * min(decay_term, warmup_term)

        scaled_lrs = []
        for base_lr in self.base_lrs:
            scaled_lrs.append(base_lr * scale)
        return scaled_lrs

In [22]:
class Behavior(nn.Module):
    """Fully connected behavior network.

    Four 512-unit linear layers, each followed by a ReLU, feed a final linear
    layer with ``num_actions`` outputs. The output is returned without any
    activation applied.

    Parameters
    ----------
    input_shape:
        Size of the last dimension of the input tensor.
    num_actions:
        Number of output units.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        width = 512
        # Attribute names (fc1..fc5) are part of the state_dict layout; keep them stable.
        self.fc1 = nn.Linear(input_shape, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, width)
        self.fc5 = nn.Linear(width, num_actions)

    def forward(self, x):
        """Apply the ReLU-activated hidden layers and return the raw output of ``fc5``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        return self.fc5(hidden)


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The network input is the observation vector plus the two extra scalars that
# `to_training` appends (dr and dh), hence `d + 2`.
d = env.observation_space.shape[0]
# A single output unit: it is treated as one logit (see the BCE-with-logits loss below),
# rather than one output per entry of env.action_space.n.
model = Behavior(input_shape=d + 2, num_actions=1)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
# Optional learning-rate schedule, currently disabled:
# lr_scheduler = NoamLR(optimizer, 5000)

# Binary cross-entropy on raw logits (CrossEntropyLoss was the earlier alternative).
loss_object = nn.BCEWithLogitsLoss().to(device)

In [23]:
# Number of segments requested from the replay buffer (`rb.sample`) for each training step.
batch_size = 1024

In [24]:
### A1-3: while stopping criterion is not reached do:
### A1-4:   Improve the behavior function by training on replay buffer

In [25]:
# Running totals behind the mean training loss printed by the training loop.
# They live in their own cell, so re-running only the training cell keeps accumulating into them.
loss_sum = 0
loss_count = 0

In [26]:
def to_training(s, dr, dh):
    """Return the model input row for one step.

    The state `s` (anything exposing ``.tolist()``, e.g. a NumPy array) is
    converted to a Python list and `dr` followed by `dh` are appended to it.
    """
    return s.tolist() + [dr, dh]

def segments_to_training(segments):
    """Convert sampled segments into a pair of float tensors on `device`.

    Each element of `segments` is unpacked as ``((s, dr, dh), action)``. The
    first part becomes one input row via `to_training`; the action becomes the
    matching target.

    Returns
    -------
    (x, y)
        `x` holds the input rows and `y` the actions, both cast to float and
        moved to `device`.
    """
    # Walk `segments` exactly once, then split the (row, action) pairs.
    pairs = [(to_training(s, dr, dh), action) for (s, dr, dh), action in segments]
    rows = [row for row, _ in pairs]
    actions = [action for _, action in pairs]

    x = torch.tensor(rows).float().to(device)
    y = torch.tensor(actions).float().to(device)
    return x, y
        
# accuracy_m = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def train_step(inputs, targets):
    """Run one optimisation step of `model` on a batch and return its loss.

    Parameters
    ----------
    inputs:
        Batch passed straight to `model`.
    targets:
        Target actions; a trailing dimension is added so they line up with the
        model's single-logit output before the loss is computed.

    Returns
    -------
    torch.Tensor
        The scalar batch loss, detached from the autograd graph so that callers
        can log or accumulate it without keeping graph history alive.
    """
    optimizer.zero_grad()

    predictions = model(inputs)
    loss = loss_object(predictions, targets.unsqueeze(1))

    loss.backward()
    optimizer.step()

    # The graph is no longer needed once the optimizer has stepped.
    return loss.detach()


def generate_episode(cmd):
    """Play one episode with the current `model` and add it to the replay buffer.

    Parameters
    ----------
    cmd : tuple
        Command unpacked as ``(dh, dr)``. Both values are fed to the model next
        to the current state (via ``to_training(s, dr, dh)``). After every
        environment step `dh` is reduced by 1 and `dr` by the reward received.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    s = env.reset()
    dh, dr = cmd
    done = False
    ep_reward = 0.0

    t = Trajectory()
    while not done:
        # Acting only: no gradients are needed, so skip building an autograd graph.
        with torch.no_grad():
            inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)
            # torch.sigmoid instead of the deprecated F.sigmoid (matches `test`).
            action_probs = torch.sigmoid(model(inputs))

        # The single output is the parameter of a Bernoulli distribution over the action.
        m = torch.distributions.bernoulli.Bernoulli(probs=action_probs)
        action = int(m.sample().item())

        s_old = s
        s, reward, done, _ = env.step(action)
        t.add(s_old, action, reward, s)

        ep_reward += reward
        # NOTE(review): dh is not clamped and can reach zero or below on long
        # episodes — confirm against the reference algorithm whether a lower bound is intended.
        dh = dh - 1
        dr = dr - reward

    rb.add(t)
    return ep_reward

In [27]:
epochs = 1000000
eval_every = 100      # training steps between evaluation rounds
eval_episodes = 100   # episodes generated per evaluation round
log_every = 200       # training steps between loss reports

for i in range(1, epochs+1):
    # `segments_to_training` only iterates over the sample, so it is passed as-is.
    # Wrapping the nested (state, dr, dh)/action tuples in np.array builds a ragged
    # array, which recent NumPy versions reject.
    segments = rb.sample(batch_size)
    x, y = segments_to_training(segments)
    loss = train_step(x, y)
    # Accumulate a plain Python float so the running total never holds tensors
    # (and with them autograd history) across iterations.
    loss_sum += float(loss)
    loss_count += 1

    if i % eval_every == 0:
        # Generate episodes with commands sampled from the buffer; `generate_episode`
        # also adds every episode to the replay buffer.
        rewards = []
        for e in range(eval_episodes):
            cmd = rb.sample_command()
            rewards.append(generate_episode(cmd))

        print(f"Average Episode Reward: {np.mean(rewards)}")

    if i % log_every == 0:
        print(f'i: {i}, Loss: {loss_sum/loss_count}') #'\t Accuracy: {accuracy_m.result()}')
    

Average Episode Reward: 12.84
Average Episode Reward: 42.36
i: 200, Loss: 0.6799904704093933
Average Episode Reward: 41.29
Average Episode Reward: 27.37
i: 400, Loss: 0.6696462631225586
Average Episode Reward: 27.85
Average Episode Reward: 38.56
i: 600, Loss: 0.6629749536514282
Average Episode Reward: 49.75
Average Episode Reward: 49.94
i: 800, Loss: 0.6577674150466919
Average Episode Reward: 66.81
Average Episode Reward: 36.6
i: 1000, Loss: 0.6537543535232544
Average Episode Reward: 38.87
Average Episode Reward: 60.49
i: 1200, Loss: 0.6502493023872375
Average Episode Reward: 49.2
Average Episode Reward: 70.0
i: 1400, Loss: 0.6467254757881165
Average Episode Reward: 69.01
Average Episode Reward: 70.05
i: 1600, Loss: 0.6434688568115234
Average Episode Reward: 73.83
Average Episode Reward: 65.46
i: 1800, Loss: 0.6404635310173035
Average Episode Reward: 64.6
Average Episode Reward: 76.73
i: 2000, Loss: 0.6378771066665649
Average Episode Reward: 49.24
Average Episode Reward: 66.57
i: 2200,

Average Episode Reward: 111.55
Average Episode Reward: 304.21
i: 17400, Loss: 0.6091704964637756
Average Episode Reward: 278.23
Average Episode Reward: 255.31
i: 17600, Loss: 0.6091257333755493
Average Episode Reward: 142.03
Average Episode Reward: 243.95
i: 17800, Loss: 0.6090912818908691
Average Episode Reward: 280.91
Average Episode Reward: 222.87
i: 18000, Loss: 0.6090420484542847
Average Episode Reward: 289.27
Average Episode Reward: 220.54
i: 18200, Loss: 0.6090273261070251
Average Episode Reward: 119.34
Average Episode Reward: 232.89
i: 18400, Loss: 0.6090124249458313
Average Episode Reward: 213.58
Average Episode Reward: 319.23
i: 18600, Loss: 0.6089856624603271
Average Episode Reward: 170.62
Average Episode Reward: 124.15
i: 18800, Loss: 0.6089608073234558
Average Episode Reward: 166.42
Average Episode Reward: 268.75
i: 19000, Loss: 0.6089322566986084
Average Episode Reward: 123.06
Average Episode Reward: 202.59
i: 19200, Loss: 0.6089104413986206
Average Episode Reward: 233.29

Average Episode Reward: 175.74
Average Episode Reward: 234.69
i: 34400, Loss: 0.6077231764793396
Average Episode Reward: 82.99
Average Episode Reward: 210.21
i: 34600, Loss: 0.6077201962471008
Average Episode Reward: 326.8
Average Episode Reward: 222.86
i: 34800, Loss: 0.6077219247817993
Average Episode Reward: 302.57
Average Episode Reward: 248.81
i: 35000, Loss: 0.6077096462249756
Average Episode Reward: 306.71
Average Episode Reward: 194.31
i: 35200, Loss: 0.6076945662498474
Average Episode Reward: 245.97
Average Episode Reward: 262.98
i: 35400, Loss: 0.6076852679252625
Average Episode Reward: 324.76
Average Episode Reward: 278.44
i: 35600, Loss: 0.6076691150665283
Average Episode Reward: 295.2
Average Episode Reward: 237.51
i: 35800, Loss: 0.607656717300415
Average Episode Reward: 314.27
Average Episode Reward: 312.83
i: 36000, Loss: 0.6076412796974182
Average Episode Reward: 322.2
Average Episode Reward: 270.88
i: 36200, Loss: 0.6076250076293945
Average Episode Reward: 257.23
Aver

Average Episode Reward: 251.57
Average Episode Reward: 368.25
i: 51400, Loss: 0.6068745255470276
Average Episode Reward: 252.52
Average Episode Reward: 275.9
i: 51600, Loss: 0.6068617701530457
Average Episode Reward: 203.01
Average Episode Reward: 279.7
i: 51800, Loss: 0.6068536043167114
Average Episode Reward: 424.9
Average Episode Reward: 293.9
i: 52000, Loss: 0.6068450808525085
Average Episode Reward: 329.67
Average Episode Reward: 241.4
i: 52200, Loss: 0.6068375110626221
Average Episode Reward: 208.53
Average Episode Reward: 360.07
i: 52400, Loss: 0.6068310141563416
Average Episode Reward: 266.32
Average Episode Reward: 287.48
i: 52600, Loss: 0.6068215370178223
Average Episode Reward: 343.44
Average Episode Reward: 252.05
i: 52800, Loss: 0.6068099141120911
Average Episode Reward: 293.24
Average Episode Reward: 280.11
i: 53000, Loss: 0.6067960262298584
Average Episode Reward: 296.07
Average Episode Reward: 262.5
i: 53200, Loss: 0.6067923307418823
Average Episode Reward: 380.88
Avera

KeyboardInterrupt: 

In [28]:
# Inspect one command drawn from the replay buffer; the value is shown as the cell output.
rb.sample_command()

(500.0, 500.0)

In [29]:
## A1-6: Generate episodes using Alg. 2 and add them to the replay buffer

In [30]:
# Fixed command for the evaluation run below; `test` unpacks it as (dh, dr).
cmd = (500, 500) #rb.sample_command()

In [37]:
# Create a new environment instance for the rendered evaluation run.
env = gym.make('CartPole-v1')
avg_rewards = []  # NOTE(review): not referenced in any of the visible cells
import time  # used by `test` to pause briefly after each rendered frame

def test(cmd):
    """Play one rendered episode with the current `model` and return its total reward.

    Unlike `generate_episode`, the episode is rendered, a short sleep follows
    every frame, nothing is added to the replay buffer and the episode reward
    is printed.

    Parameters
    ----------
    cmd : tuple
        Command unpacked as ``(dh, dr)``. After every environment step `dh` is
        reduced by 1 and `dr` by the reward received.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    s = env.reset()
    dh, dr = cmd
    done = False
    ep_reward = 0.0

    while not done:
        # Evaluation only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)
            action_probs = torch.sigmoid(model(inputs))

        # Actions are sampled rather than taken greedily (rounding the probability).
        m = torch.distributions.bernoulli.Bernoulli(probs=action_probs)
        action = int(m.sample().item())

        env.render()
        time.sleep(0.01)

        s, reward, done, _ = env.step(action)

        ep_reward += reward
        dh = dh - 1
        dr = dr - reward

    print(f'Episode reward: {ep_reward}')
    return ep_reward

# Run 20 rendered evaluation episodes with the fixed command and report the mean reward.
rewards = []
try:
    for e in range(20):
        rewards.append(test(cmd))
finally:
    # Close the environment even if an episode raises or the cell is interrupted.
    env.close()

print(f"Average Episode Reward: {np.mean(rewards)}")

Episode reward: 32.0
Episode reward: 500.0
Episode reward: 16.0
Episode reward: 378.0
Episode reward: 255.0
Episode reward: 56.0
Episode reward: 500.0
Episode reward: 91.0
Episode reward: 90.0
Episode reward: 47.0
Episode reward: 500.0
Episode reward: 153.0
Episode reward: 226.0
Episode reward: 37.0
Episode reward: 91.0
Episode reward: 500.0
Episode reward: 41.0
Episode reward: 500.0
Episode reward: 296.0
Episode reward: 500.0
Average Episode Reward: 240.45
