In [1]:
import gym

import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class Trajectory(object):
    """One episode stored as an ordered list of transitions.

    ``add`` keeps three attributes in sync: ``trajectory`` (a list of
    ``(state, action, reward, state_prime)`` tuples), ``total_return`` (the
    running sum of rewards) and ``length`` (the number of stored transitions).
    """

    def __init__(self):
        self.trajectory = []
        self.total_return = 0
        self.length = 0

    def add(self, state, action, reward, state_prime):
        """Append one transition and update the running return and length."""
        transition = (state, action, reward, state_prime)
        self.trajectory.append(transition)
        self.total_return += reward
        self.length += 1

    def sample_segment(self):
        """Draw a random contiguous segment and turn it into a training example.

        A 1-based start step is drawn uniformly from ``[1, T]`` and an end step
        uniformly from ``[start, T]``, where ``T`` is the number of stored
        transitions.

        Returns:
            ``((state, d_r, d_h), action)`` where ``state`` and ``action`` are
            those recorded at the start step, ``d_r`` is the sum of the rewards
            from start to end (inclusive) and ``d_h`` is the number of steps in
            the segment, as a float.
        """
        num_steps = len(self.trajectory)

        # np.random.randint excludes its upper bound, hence the "+ 1".
        start = np.random.randint(1, num_steps + 1)
        end = np.random.randint(start, num_steps + 1)

        # Steps are 1-based, list indices 0-based.
        first_state, first_action = self.trajectory[start - 1][:2]

        reward_sum = sum((step[2] for step in self.trajectory[start - 1:end]), 0.0)
        horizon = end - start + 1.0

        return ((first_state, reward_sum, horizon), first_action)
    
class ReplayBuffer(object):
    """Bounded store that keeps the highest-return trajectories seen so far.

    ``buffer`` is kept sorted by ``total_return`` in descending order, so the
    best episodes sit at the front of the list and the worst ones are the
    first to be dropped once ``max_size`` is exceeded.
    """

    def __init__(self, max_size, last_few):
        """
        @param max_size: Maximum number of trajectories retained in the buffer.
        @param last_few: Number of highest-return episodes (the head of the
        descending-sorted buffer) used for sampling exploratory commands.
        """
        self.max_size = max_size
        self.cur_size = 0
        self.buffer = []

        self.last_few = last_few

    def add(self, trajectory):
        """Insert a trajectory, keeping only the ``max_size`` best by return."""
        self.buffer.append(trajectory)

        # list.sort is stable, so among equal returns older entries stay ahead
        # of the newcomer and the newcomer is the one dropped on overflow.
        self.buffer.sort(key=lambda x: x.total_return, reverse=True)
        del self.buffer[self.max_size:]

        # Keep the public size counter in sync with the actual contents.
        self.cur_size = len(self.buffer)

    def sample(self, batch_size):
        """Return ``batch_size`` training segments.

        Trajectories are drawn uniformly with replacement and each one
        contributes the result of its own ``sample_segment()`` call.
        """
        # Sample positions rather than the objects themselves: this avoids
        # converting the whole buffer into a NumPy object array on every call.
        indices = np.random.choice(len(self.buffer), size=batch_size, replace=True)

        return [self.buffer[i].sample_segment() for i in indices]

    def sample_command(self):
        """Sample an exploratory command from the best ``last_few`` episodes.

        Returns:
            ``(dh_0, dr_0)`` — the desired horizon (mean episode length) and a
            desired return drawn uniformly from ``[mean, mean + std]`` of the
            selected episodes' returns. Note the horizon comes first.

        Raises:
            ValueError: if no episodes are available to derive a command from.
        """
        eps = self.buffer[:self.last_few]
        if not eps:
            # Without this guard the means below would silently become NaN.
            raise ValueError("cannot sample a command from an empty replay buffer")

        returns = [e.total_return for e in eps]

        dh_0 = np.mean([e.length for e in eps])

        m = np.mean(returns)
        s = np.std(returns)

        dr_0 = np.random.uniform(m, m + s)

        return dh_0, dr_0
        

### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [47]:
# CartPole-v1 environment shared by the cells below (warm-up, training rollouts, evaluation).
env = gym.make('CartPole-v1')

In [117]:
# Replay buffer holding at most 5000 episodes; exploratory commands are later
# derived from its 100 highest-return episodes.
rb = ReplayBuffer(5000, 100)

# Per-episode returns of the random policy (only averaged when printed below).
avg_rewards = []

# Warm-up: fill the buffer with 5000 episodes of uniformly random actions.
for _ in range(5000):
    s = env.reset()
    done = False
    ep_reward = 0.0
    t = Trajectory()
    while not done:
        s_old = s
        action = env.action_space.sample()
        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)
        ep_reward += reward
    avg_rewards.append(ep_reward)
    rb.add(t)

# The environment is intentionally NOT closed here: later cells keep calling
# env.reset()/env.step() on this same object, so closing it mid-notebook would
# leave them operating on a closed environment.
print(f"Average Episode Reward: {np.mean(avg_rewards)}")

Average Episode Reward: 22.2368


### A1-2 Initialize a behavior function

In [136]:
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Noam-style learning rate schedule.

    The base learning rate of every parameter group is multiplied by a factor
    that grows linearly during the first ``warmup_steps`` scheduler steps,
    reaches 1 at ``warmup_steps``, and afterwards decays proportionally to the
    inverse square root of the step number. No model-dimension scaling is
    applied by this implementation.

    Parameters
    ----------
    optimizer:
        The wrapped optimizer.
    warmup_steps: ``int``, required.
        The number of steps to linearly increase the learning rate.
    """
    def __init__(self, optimizer, warmup_steps):
        # Must be set before the base constructor runs, because it already
        # performs an initial step that calls get_lr().
        self.warmup_steps = warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        """Return the scaled learning rate for each parameter group."""
        # Step 0 is treated like step 1 so the inverse square root is defined.
        step = max(1, self.last_epoch)
        decay_term = step ** (-0.5)
        warmup_term = step * self.warmup_steps ** (-1.5)
        scale = self.warmup_steps ** 0.5 * min(decay_term, warmup_term)

        scaled_lrs = []
        for base_lr in self.base_lrs:
            scaled_lrs.append(base_lr * scale)
        return scaled_lrs

In [150]:
class Behavior(nn.Module):
    """Fully connected behavior network.

    Four 512-unit hidden layers with ReLU activations followed by a linear
    output layer that emits ``num_actions`` raw values (no activation applied).
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        hidden_units = 512
        self.fc1 = nn.Linear(input_shape, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, hidden_units)
        self.fc4 = nn.Linear(hidden_units, hidden_units)
        self.fc5 = nn.Linear(hidden_units, num_actions)

    def forward(self, x):
        """Map a batch of inputs to the raw outputs of the final layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        return self.fc5(hidden)


# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Length of the environment's observation vector.
d = env.observation_space.shape[0]
# Network input = observation plus the two command scalars appended by
# to_training(); a single output is used as the logit of a binary action.
model = Behavior(input_shape=d+2, num_actions=1).to(device) # env.action_space.n
optimizer = torch.optim.Adam(model.parameters())
# NOTE(review): NoamLR multiplies the optimizer's base learning rate by
# step / warmup_steps during warm-up, so with 50000 warm-up steps the early
# updates are very small — confirm this is intended.
lr_scheduler = NoamLR(optimizer, 50000)

# Binary cross-entropy computed directly on the raw model output.
loss_object = torch.nn.BCEWithLogitsLoss().to(device) #CrossEntropyLoss().to(device)

In [151]:
# Number of segments drawn from the replay buffer for each training step.
batch_size = 1024

In [152]:
### A1-3: while stopping criterion is not reached do:
### A1-4:   Improve the behavior function by training on replay buffer

In [153]:
# Running totals used by the training loop to report the mean loss over every
# training step executed since this cell was last run.
loss_sum = 0
loss_count = 0

In [166]:
def to_training(s, dr, dh):
    """Build one flat model-input row: the state values followed by dr and dh.

    ``s`` must provide ``tolist()`` (e.g. a NumPy array); the result is a
    plain Python list with ``dr`` and ``dh`` as its last two entries.
    """
    return [*s.tolist(), dr, dh]

def segments_to_training(segments):
    """Convert sampled segments into float tensors on ``device``.

    Each segment has the form ``((state, d_r, d_h), action)``. Returns ``(x, y)``
    where every row of ``x`` is ``to_training(state, d_r, d_h)`` and ``y`` holds
    the corresponding actions.
    """
    feature_rows, action_labels = [], []
    for (state, desired_return, desired_horizon), taken_action in segments:
        feature_rows.append(to_training(state, desired_return, desired_horizon))
        action_labels.append(taken_action)

    features = torch.tensor(feature_rows).float().to(device)
    labels = torch.tensor(action_labels).float().to(device)

    return features, labels
        
# accuracy_m = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def train_step(inputs, targets):
    """Run one optimizer update on a batch and return its loss.

    Args:
        inputs: batch of model inputs, one row per training example.
        targets: 1-D float tensor of actions; a trailing dimension is added so
            it lines up with the model's single-column output.

    Returns:
        The batch loss as a 0-dim tensor detached from the autograd graph, so
        callers can log or accumulate it without keeping graph history alive.
    """
    optimizer.zero_grad()

    predictions = model(inputs)

    loss = loss_object(predictions, targets.unsqueeze(1))

    loss.backward()
    optimizer.step()

    # Detach before handing the value out: the caller sums losses across
    # iterations, and an attached tensor would chain every step's graph nodes.
    return loss.detach()


def generate_episode(cmd):
    """Roll out one episode with the current model and store it in ``rb``.

    Args:
        cmd: ``(dh, dr)`` — desired horizon first, desired return second
            (the order produced by ``rb.sample_command()``).

    Returns:
        The total reward collected during the episode.
    """
    s = env.reset()
    done = False
    ep_reward = 0.0
    dh, dr = cmd

    t = Trajectory()
    while not done:
        inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)

        # Pure inference: no gradients are needed while acting.
        with torch.no_grad():
            action_logits = model(inputs)

        # The model emits a raw logit; Bernoulli applies the sigmoid itself.
        m = torch.distributions.bernoulli.Bernoulli(logits=action_logits)
        action = int(m.sample().item())

        s_old = s

        s, reward, done, _ = env.step(action)
        t.add(s_old, action, reward, s)

        ep_reward += reward
        # Update the command for the next step.
        # NOTE(review): dh is decremented without a lower bound, so it can
        # reach 0 or go negative when the episode outlasts the commanded
        # horizon, whereas training segments always have d_h >= 1 — confirm
        # whether it should be clamped.
        dh = dh - 1
        dr = dr - reward

    rb.add(t)
    return ep_reward
    
    
# Number of training iterations (one sampled batch per iteration).
epochs = 1000000

for i in range(1, epochs+1):
    # rb.sample already returns a list of ((state, d_r, d_h), action) tuples.
    # It is passed on as-is: wrapping this ragged structure in np.array is
    # unnecessary and raises on NumPy versions that reject ragged input.
    segments = rb.sample(batch_size)
    x, y = segments_to_training(segments)
    loss = train_step(x, y)
    # Accumulate a plain Python float so no autograd history is retained
    # across iterations.
    loss_sum += float(loss)
    loss_count += 1

    lr_scheduler.step()

    # Every 1000 iterations: generate 1000 episodes with exploratory commands.
    # generate_episode also adds each episode to the replay buffer.
    if i % 1000 == 0:
        rewards = []
        for _ in range(1000):
            cmd = rb.sample_command()
            rewards.append(generate_episode(cmd))

        print(f"Average Episode Reward: {np.mean(rewards)}")

    # The reported loss is the mean over all steps since loss_sum/loss_count
    # were last reset, not the loss of the most recent batches.
    if i % 200 == 0:
        print(f'i: {i}, Loss: {loss_sum/loss_count}')
    
    

i: 200, Loss: 0.6266692876815796
i: 400, Loss: 0.6266453266143799
i: 600, Loss: 0.6266225576400757
i: 800, Loss: 0.626600980758667
Average Episode Reward: 330.419
i: 1000, Loss: 0.6265760660171509
i: 1200, Loss: 0.6265539526939392
i: 1400, Loss: 0.6265305280685425
i: 1600, Loss: 0.6265065670013428
i: 1800, Loss: 0.6264835000038147
Average Episode Reward: 363.18
i: 2000, Loss: 0.6264581680297852
i: 2200, Loss: 0.6264347434043884
i: 2400, Loss: 0.626413106918335
i: 2600, Loss: 0.6263912916183472
i: 2800, Loss: 0.6263686418533325
Average Episode Reward: 344.886
i: 3000, Loss: 0.6263457536697388
i: 3200, Loss: 0.6263212561607361
i: 3400, Loss: 0.6262988448143005
i: 3600, Loss: 0.6262755990028381
i: 3800, Loss: 0.6262524127960205
Average Episode Reward: 272.719
i: 4000, Loss: 0.6262301802635193
i: 4200, Loss: 0.6262065768241882
i: 4400, Loss: 0.626183271408081
i: 4600, Loss: 0.6261605620384216
i: 4800, Loss: 0.6261381506919861
Average Episode Reward: 183.407
i: 5000, Loss: 0.626114666461944

i: 40200, Loss: 0.6228013634681702
i: 40400, Loss: 0.6227853894233704
i: 40600, Loss: 0.6227695941925049
i: 40800, Loss: 0.6227543950080872
Average Episode Reward: 318.292
i: 41000, Loss: 0.6227377653121948
i: 41200, Loss: 0.6227222084999084
i: 41400, Loss: 0.6227061748504639
i: 41600, Loss: 0.6226914525032043
i: 41800, Loss: 0.6226764917373657
Average Episode Reward: 268.188
i: 42000, Loss: 0.6226606965065002
i: 42200, Loss: 0.6226471662521362
i: 42400, Loss: 0.6226323843002319
i: 42600, Loss: 0.6226170063018799
i: 42800, Loss: 0.6226010322570801
Average Episode Reward: 379.893
i: 43000, Loss: 0.6225854754447937
i: 43200, Loss: 0.6225701570510864
i: 43400, Loss: 0.6225555539131165
i: 43600, Loss: 0.6225407719612122
i: 43800, Loss: 0.6225256323814392
Average Episode Reward: 314.976
i: 44000, Loss: 0.6225089430809021
i: 44200, Loss: 0.6224940419197083
i: 44400, Loss: 0.6224792003631592
i: 44600, Loss: 0.6224632859230042
i: 44800, Loss: 0.6224494576454163
Average Episode Reward: 298.757


Average Episode Reward: 393.073
i: 80000, Loss: 0.6201555728912354
i: 80200, Loss: 0.6201431751251221
i: 80400, Loss: 0.6201311945915222
i: 80600, Loss: 0.6201194524765015
i: 80800, Loss: 0.6201089024543762
Average Episode Reward: 407.947
i: 81000, Loss: 0.6200975179672241
i: 81200, Loss: 0.6200874447822571
i: 81400, Loss: 0.6200760006904602
i: 81600, Loss: 0.6200655102729797
i: 81800, Loss: 0.6200532913208008
Average Episode Reward: 320.949
i: 82000, Loss: 0.6200428605079651
i: 82200, Loss: 0.620033323764801
i: 82400, Loss: 0.6200231313705444
i: 82600, Loss: 0.6200125813484192
i: 82800, Loss: 0.6200008988380432
Average Episode Reward: 298.726
i: 83000, Loss: 0.619989812374115
i: 83200, Loss: 0.6199785470962524
i: 83400, Loss: 0.6199671626091003
i: 83600, Loss: 0.6199563145637512
i: 83800, Loss: 0.6199458837509155
Average Episode Reward: 346.167
i: 84000, Loss: 0.6199352741241455
i: 84200, Loss: 0.6199237108230591
i: 84400, Loss: 0.6199122071266174
i: 84600, Loss: 0.6199022531509399
i:

i: 119200, Loss: 0.6182425618171692
i: 119400, Loss: 0.6182339787483215
i: 119600, Loss: 0.618225634098053
i: 119800, Loss: 0.6182166337966919
Average Episode Reward: 340.0
i: 120000, Loss: 0.6182078719139099
i: 120200, Loss: 0.6181988716125488
i: 120400, Loss: 0.6181899905204773
i: 120600, Loss: 0.6181817650794983
i: 120800, Loss: 0.6181718111038208
Average Episode Reward: 430.643
i: 121000, Loss: 0.6181629300117493
i: 121200, Loss: 0.6181545853614807
i: 121400, Loss: 0.6181464791297913
i: 121600, Loss: 0.6181380152702332
i: 121800, Loss: 0.6181297898292542
Average Episode Reward: 335.011
i: 122000, Loss: 0.6181210279464722
i: 122200, Loss: 0.6181128025054932
i: 122400, Loss: 0.6181047558784485
i: 122600, Loss: 0.618096649646759
i: 122800, Loss: 0.6180891394615173
Average Episode Reward: 338.396
i: 123000, Loss: 0.6180803179740906
i: 123200, Loss: 0.6180713772773743
i: 123400, Loss: 0.6180635094642639
i: 123600, Loss: 0.6180549263954163
i: 123800, Loss: 0.618046760559082
Average Episo

i: 158200, Loss: 0.6167581677436829
i: 158400, Loss: 0.6167512536048889
i: 158600, Loss: 0.6167430877685547
i: 158800, Loss: 0.6167362928390503
Average Episode Reward: 198.089
i: 159000, Loss: 0.6167297959327698
i: 159200, Loss: 0.6167231202125549
i: 159400, Loss: 0.6167165637016296
i: 159600, Loss: 0.6167100667953491
i: 159800, Loss: 0.6167024374008179
Average Episode Reward: 400.372
i: 160000, Loss: 0.6166959404945374
i: 160200, Loss: 0.6166881918907166
i: 160400, Loss: 0.6166815757751465
i: 160600, Loss: 0.6166753172874451
i: 160800, Loss: 0.6166682839393616
Average Episode Reward: 341.861
i: 161000, Loss: 0.6166609525680542
i: 161200, Loss: 0.616655170917511
i: 161400, Loss: 0.6166481375694275
i: 161600, Loss: 0.616642415523529
i: 161800, Loss: 0.6166349649429321
Average Episode Reward: 427.633
i: 162000, Loss: 0.6166291832923889
i: 162200, Loss: 0.6166225671768188
i: 162400, Loss: 0.6166152954101562
i: 162600, Loss: 0.6166080236434937
i: 162800, Loss: 0.616601824760437
Average Epi

KeyboardInterrupt: 

In [161]:
# Inspect the (horizon, return) command currently derived from the replay buffer.
rb.sample_command()

(500.0, 500.0)

In [162]:
## A1-6: Generate episodes using Algorithm 2 and add them to the replay buffer

In [163]:
# Fixed evaluation command in (dh, dr) order: desired horizon, desired return.
cmd = (500, 500) #rb.sample_command()

In [167]:
# NOTE(review): rebinds avg_rewards to an empty list; nothing else in this cell reads or fills it.
avg_rewards = []

def test(cmd):
    """Roll out one evaluation episode with the current model.

    Unlike ``generate_episode``, nothing is recorded or added to the replay
    buffer.

    Args:
        cmd: ``(dh, dr)`` — desired horizon first, desired return second.

    Returns:
        The total reward collected during the episode.
    """
    s = env.reset()
    done = False
    ep_reward = 0.0
    dh, dr = cmd

    while not done:
        inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)

        # Pure inference: no gradients are needed while acting.
        with torch.no_grad():
            action_logits = model(inputs)

        # The model emits a raw logit; Bernoulli applies the sigmoid itself.
        m = torch.distributions.bernoulli.Bernoulli(logits=action_logits)
        action = int(m.sample().item())

        s, reward, done, _ = env.step(action)

        ep_reward += reward
        # Update the command for the next step.
        # NOTE(review): dh is decremented without a lower bound and can reach
        # 0 or go negative on long episodes — confirm whether it should be
        # clamped.
        dh = dh - 1
        dr = dr - reward

    return ep_reward

# Evaluate the fixed command over 1000 episodes and report the mean return.
rewards = [test(cmd) for _ in range(1000)]

# The environment is left open (no env.close() call here).
print(f"Average Episode Reward: {np.mean(rewards)}")

Average Episode Reward: 372.187
