In [1]:
import gym

import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class Trajectory(object):
    """Ordered record of one episode's transitions plus running statistics.

    `trajectory` holds (state, action, reward, state_prime) tuples in the order
    they were added, `total_return` is the sum of the added rewards and
    `length` is the number of stored transitions.
    """

    def __init__(self):
        self.trajectory = []
        self.total_return = 0
        self.length = 0

    def add(self, state, action, reward, state_prime):
        """Append one transition and update the return and length counters."""
        transition = (state, action, reward, state_prime)
        self.trajectory.append(transition)
        self.total_return += reward
        self.length += 1

    def sample_segment(self):
        """Draw a random contiguous segment and summarise it for training.

        A 1-based start step is drawn uniformly from [1, T] and an end step
        uniformly from [start, T], where T is the number of stored transitions.

        Returns ((state, d_r, d_h), action): `state` and `action` are the ones
        stored at the start step, `d_r` is the sum of rewards from the start
        step to the end step (inclusive) and `d_h` is the number of steps in
        the segment, as a float.
        """
        num_steps = len(self.trajectory)

        # Draw order matters for reproducibility: start first, then end.
        start = np.random.randint(1, num_steps + 1)
        end = np.random.randint(start, num_steps + 1)

        first_state, first_action = self.trajectory[start - 1][:2]

        # Accumulate left to right over the 0-based slice [start - 1, end).
        d_r = 0.0
        for _, _, step_reward, _ in self.trajectory[start - 1:end]:
            d_r += step_reward

        d_h = end - start + 1.0

        return ((first_state, d_r, d_h), first_action)
    
class ReplayBuffer(object):
    """Bounded store of the highest-return trajectories seen so far.

    The buffer is kept sorted by `total_return` in descending order, so the
    best episodes sit at the front and the lowest-return ones are dropped
    first once `max_size` is exceeded.
    """

    def __init__(self, max_size, last_few):
        """
        @param max_size: Maximum number of trajectories retained.
        @param last_few: Number of highest-return episodes (the front of the
        descending-sorted buffer) used for sampling exploratory commands.
        """
        self.max_size = max_size
        self.cur_size = 0
        self.buffer = []

        self.last_few = last_few

    def add(self, trajectory):
        """Insert a trajectory, re-sort by return and drop any overflow."""
        self.buffer.append(trajectory)

        self.buffer = sorted(self.buffer, key=lambda x: x.total_return, reverse=True)
        self.buffer = self.buffer[:self.max_size]
        # Keep the size counter in sync with the buffer contents.
        self.cur_size = len(self.buffer)

    def sample(self, batch_size):
        """Return `batch_size` segments, each from a trajectory drawn uniformly
        with replacement.

        @raise ValueError: if the buffer is empty and batch_size > 0.
        """
        # Draw indices rather than handing the trajectory objects to numpy:
        # this avoids building an object array on every call and keeps working
        # even if trajectories ever become sequence-like.
        indices = np.random.choice(len(self.buffer), batch_size, replace=True)

        return [self.buffer[i].sample_segment() for i in indices]

    def sample_command(self):
        """Sample an exploratory command from the best `last_few` episodes.

        @return: (dh_0, dr_0) where dh_0 is the mean episode length and dr_0 is
        drawn uniformly from [mean return, mean return + std of returns).
        @raise ValueError: if there are no episodes to derive a command from.
        """
        eps = self.buffer[:self.last_few]
        if not eps:
            # Without this guard np.mean([]) would silently yield NaN commands.
            raise ValueError("cannot sample a command from an empty replay buffer")

        returns = [e.total_return for e in eps]

        dh_0 = np.mean([e.length for e in eps])

        m = np.mean(returns)
        s = np.std(returns)

        dr_0 = np.random.uniform(m, m+s)

        return dh_0, dr_0
        

### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [47]:
# Shared CartPole-v1 environment used by the warm-up, training and evaluation cells.
env = gym.make('CartPole-v1')

In [117]:
NUM_WARMUP_EPISODES = 5000   # random-policy episodes used to seed the buffer
BUFFER_CAPACITY = 5000       # maximum number of trajectories kept in the buffer
NUM_COMMAND_EPISODES = 100   # top episodes used by rb.sample_command()

rb = ReplayBuffer(BUFFER_CAPACITY, NUM_COMMAND_EPISODES)

avg_rewards = []

for _ in range(NUM_WARMUP_EPISODES):
    state = env.reset()
    done = False
    trajectory = Trajectory()
    while not done:
        # Warm-up uses uniformly random actions; no model is involved yet.
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        trajectory.add(state, action, reward, next_state)
        state = next_state
    # The trajectory already accumulates the episode return, so no separate
    # per-episode counter is needed.
    avg_rewards.append(trajectory.total_return)
    rb.add(trajectory)

# NOTE: env is deliberately not closed here; generate_episode() and test()
# in later cells keep stepping the same environment. Close it once, after the
# final evaluation.
print(f"Average Episode Reward: {np.mean(avg_rewards)}")

Average Episode Reward: 22.2368


### A1-2. Initialize a behavior function

In [136]:
class NoamLR(torch.optim.lr_scheduler._LRScheduler):
    """
    Noam-style learning-rate schedule with a linear warm-up.

    Every base learning rate is multiplied by
    ``warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``
    with ``step = max(1, last_epoch)``. The factor grows linearly until
    ``step == warmup_steps``, where it equals 1, and afterwards decays
    proportionally to the inverse square root of the step number.

    Parameters
    ----------
    optimizer:
        The optimizer whose learning rates are scheduled.
    warmup_steps: ``int``, required.
        The number of steps to linearly increase the learning rate.
    """
    def __init__(self, optimizer, warmup_steps):
        # Stored before delegating to the base class so that get_lr() can be
        # evaluated as soon as the base constructor needs it.
        self.warmup_steps = warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        """Return the scaled learning rate for each base learning rate."""
        # Step 0 is treated as step 1 so the inverse square root is defined.
        step = max(1, self.last_epoch)
        inverse_sqrt_decay = step ** (-0.5)
        linear_warmup = step * self.warmup_steps ** (-1.5)
        scale = self.warmup_steps ** 0.5 * min(inverse_sqrt_decay, linear_warmup)
        scaled_lrs = []
        for base_lr in self.base_lrs:
            scaled_lrs.append(base_lr * scale)
        return scaled_lrs

In [150]:
class Behavior(nn.Module):
    """Fully connected network mapping an input vector to `num_actions` raw outputs.

    Four 512-unit hidden layers with ReLU activations are followed by a linear
    output layer; no activation is applied to the final output.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        hidden_units = 512
        # Attribute names and creation order are kept as-is so parameter
        # names (state_dict keys) and random initialisation do not change.
        self.fc1 = nn.Linear(input_shape, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, hidden_units)
        self.fc4 = nn.Linear(hidden_units, hidden_units)
        self.fc5 = nn.Linear(hidden_units, num_actions)

    def forward(self, x):
        """Apply the four ReLU hidden layers, then the linear output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        return self.fc5(hidden)


# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
# Length of one environment observation vector.
d = env.observation_space.shape[0]
# The network input is the observation followed by the two command values that
# to_training appends (dr, dh), hence d+2. A single output is used as the
# logit of a binary action choice, which is what BCEWithLogitsLoss expects.
model = Behavior(input_shape=d+2, num_actions=1).to(device) # env.action_space.n
optimizer = torch.optim.Adam(model.parameters())
# Learning-rate schedule with 50000 warm-up steps.
lr_scheduler = NoamLR(optimizer, 50000)

# Binary cross-entropy computed directly on the raw logit.
loss_object = torch.nn.BCEWithLogitsLoss().to(device) #CrossEntropyLoss().to(device)

In [151]:
# Number of segments drawn from the replay buffer for each training step.
batch_size = 1024

In [152]:
### A1-3: while stopping criterion is not reached do:
### A1-4:   Improve the behavior function by training on replay buffer

In [153]:
# Running totals used by the training loop to report the mean loss so far.
loss_sum = 0
loss_count = 0

In [None]:
def to_training(s, dr, dh):
    """Build one flat feature list: the elements of `s`, then `dr`, then `dh`.

    `s` must provide `tolist()` (e.g. a NumPy array); `dr` and `dh` are
    appended unchanged.
    """
    features = s.tolist()
    features.extend((dr, dh))
    return features

def segments_to_training(segments):
    """Convert sampled segments into float input and target tensors on `device`.

    Each segment is ((state, d_r, d_h), action). Inputs are the rows produced
    by to_training(state, d_r, d_h); targets are the corresponding actions.
    """
    feature_rows = []
    action_labels = []
    for (state, d_r, d_h), action in segments:
        feature_rows.append(to_training(state, d_r, d_h))
        action_labels.append(action)

    inputs = torch.tensor(feature_rows).float().to(device)
    targets = torch.tensor(action_labels).float().to(device)

    return inputs, targets
        
# accuracy_m = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def train_step(inputs, targets):
    """Run one optimisation step of the behavior function on a batch.

    @param inputs: batch of feature rows fed to `model`.
    @param targets: one target action per row; a trailing dimension is added
    so the targets line up with the model's single-output predictions.
    @return: the batch loss as a tensor detached from the autograd graph.
    """
    optimizer.zero_grad()

    predictions = model(inputs)

    loss = loss_object(predictions, targets.unsqueeze(1))

    loss.backward()
    optimizer.step()

    # Detach before returning: callers accumulate this value across many
    # iterations, and an attached tensor would keep every step's autograd
    # history alive.
    return loss.detach()


def generate_episode(cmd):
    """Roll out one episode with the current behavior function and store it.

    @param cmd: (desired horizon, desired return) pair, in the order returned
    by rb.sample_command().
    @return: the total reward collected during the episode.

    Side effect: the finished trajectory is added to the replay buffer `rb`.
    """
    dh, dr = cmd
    s = env.reset()
    done = False
    ep_reward = 0.0

    t = Trajectory()
    while not done:
        inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)

        # Acting needs no gradients; skip building an autograd graph per step.
        with torch.no_grad():
            logits = model(inputs)

        # The model emits a single logit, so hand it to Bernoulli directly
        # instead of applying a separate sigmoid first.
        m = torch.distributions.bernoulli.Bernoulli(logits=logits)
        action = int(m.sample().item())

        # env.render()
        s_old = s

        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)

        ep_reward += reward
        # Update the command for the next step. The horizon is clamped at 1
        # because training segments always have d_h >= 1 (see
        # Trajectory.sample_segment); a zero or negative horizon would be an
        # input the model never saw during training.
        dh = max(dh - 1, 1)
        dr = dr - reward

    # print(f'Episode reward: {ep_reward}')
    rb.add(t)
    return ep_reward
    
    
# Total number of gradient steps performed by the loop below.
epochs = 100000

for i in range(1, epochs+1):
    # rb.sample returns a plain list of ((state, d_r, d_h), action) tuples.
    # It is passed on as-is: wrapping this ragged nested structure in
    # np.array raises ValueError on NumPy >= 1.24 and adds nothing, since
    # segments_to_training only iterates over it.
    segments = rb.sample(batch_size)
    x, y = segments_to_training(segments)
    loss = train_step(x, y)
    # Accumulate a Python float, not the tensor, so the running sum does not
    # hold on to autograd history from every iteration.
    loss_sum += loss.item()
    loss_count += 1

    # The learning-rate schedule advances once per gradient step.
    lr_scheduler.step()

    # Every 10000 steps: generate 100 new episodes from exploratory commands
    # (each one is added to the replay buffer) and report their mean reward.
    if i % 10000 == 0:
        rewards = []
        for e in range(100):
            cmd = rb.sample_command()
            rewards.append(generate_episode(cmd))

        print(f"Average Episode Reward: {np.mean(rewards)}")

    # Every 20 steps: report the mean loss over all steps so far.
    if i % 20 == 0:
        print(f'i: {i}, Loss: {loss_sum/loss_count}') #'\t Accuracy: {accuracy_m.result()}')
    
    

i: 20, Loss: 0.6966964602470398
i: 40, Loss: 0.6962608695030212
i: 60, Loss: 0.695766806602478
i: 80, Loss: 0.6953386068344116
i: 100, Loss: 0.6949957609176636
i: 120, Loss: 0.6947757601737976
i: 140, Loss: 0.6945681571960449
i: 160, Loss: 0.6943745613098145
i: 180, Loss: 0.6942179203033447
i: 200, Loss: 0.6940911412239075
i: 220, Loss: 0.6939490437507629
i: 240, Loss: 0.6938275098800659
i: 260, Loss: 0.6936827898025513
i: 280, Loss: 0.6935606002807617
i: 300, Loss: 0.6934188008308411
i: 320, Loss: 0.6932811737060547
i: 340, Loss: 0.6931300163269043
i: 360, Loss: 0.6929775476455688
i: 380, Loss: 0.6928224563598633
i: 400, Loss: 0.6926590800285339
i: 420, Loss: 0.6924958825111389
i: 440, Loss: 0.6923493146896362
i: 460, Loss: 0.6921654343605042
i: 480, Loss: 0.6919947862625122
i: 500, Loss: 0.6918227076530457
i: 520, Loss: 0.6916201114654541
i: 540, Loss: 0.6913687586784363
i: 560, Loss: 0.6911411285400391
i: 580, Loss: 0.6909313797950745
i: 600, Loss: 0.6906948685646057
i: 620, Loss: 0

i: 4900, Loss: 0.6678755879402161
i: 4920, Loss: 0.667838454246521
i: 4940, Loss: 0.6677879095077515
i: 4960, Loss: 0.6677395701408386
i: 4980, Loss: 0.6676943302154541
i: 5000, Loss: 0.6676409840583801
i: 5020, Loss: 0.6675950288772583
i: 5040, Loss: 0.6675431728363037
i: 5060, Loss: 0.667497992515564
i: 5080, Loss: 0.6674628257751465
i: 5100, Loss: 0.667426347732544
i: 5120, Loss: 0.6673836708068848
i: 5140, Loss: 0.6673491597175598
i: 5160, Loss: 0.6673089265823364
i: 5180, Loss: 0.6672724485397339
i: 5200, Loss: 0.6672255992889404
i: 5220, Loss: 0.667184054851532
i: 5240, Loss: 0.6671608686447144
i: 5260, Loss: 0.6671088933944702
i: 5280, Loss: 0.6670717000961304
i: 5300, Loss: 0.667034387588501
i: 5320, Loss: 0.6669964790344238
i: 5340, Loss: 0.6669535040855408
i: 5360, Loss: 0.6669206619262695
i: 5380, Loss: 0.6668850779533386
i: 5400, Loss: 0.6668460369110107
i: 5420, Loss: 0.6668053865432739
i: 5440, Loss: 0.6667670607566833
i: 5460, Loss: 0.6667313575744629
i: 5480, Loss: 0.66

In [None]:
# Inspect one exploratory command drawn from the replay buffer.
rb.sample_command()

In [91]:
## A1-6: Generate episodes using Alg 2 and add to replay buffer

In [106]:
# Fixed evaluation command; test() unpacks it as (dh, dr).
cmd = (500, 500)#rb.sample_command()

In [113]:
# Re-initialised here, but not appended to in this cell (evaluation results go into `rewards`).
avg_rewards = []

def test(cmd):
    """Run one evaluation episode with the behavior function.

    Unlike generate_episode, nothing is stored in the replay buffer.

    @param cmd: (desired horizon, desired return) pair.
    @return: the total reward collected during the episode.
    """
    dh, dr = cmd
    s = env.reset()
    done = False
    ep_reward = 0.0

    while not done:
        inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)

        # Evaluation needs no gradients; skip building an autograd graph.
        with torch.no_grad():
            logits = model(inputs)

        # The model emits a single logit, so hand it to Bernoulli directly
        # instead of applying a separate sigmoid first.
        m = torch.distributions.bernoulli.Bernoulli(logits=logits)
        action = int(m.sample().item())

        #env.render()
        s, reward, done, info = env.step(action)

        ep_reward += reward
        # Update the command for the next step. The horizon is clamped at 1
        # because training segments always have d_h >= 1 (see
        # Trajectory.sample_segment); a zero or negative horizon would be an
        # input the model never saw during training.
        dh = max(dh - 1, 1)
        dr = dr - reward

    # print(f'Episode reward: {ep_reward}')
    return ep_reward

# Evaluate on 100 episodes with the fixed command `cmd` and report the mean reward.
rewards = [] 
for e in range(100):
    rewards.append(test(cmd))

# env.close()
print(f"Average Episode Reward: {np.mean(rewards)}")

Average Episode Reward: 268.09
