In [1]:
import gym

import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class Trajectory(object):
    """Ordered record of one episode's transitions plus running totals.

    `trajectory` holds (state, action, reward, state_prime) tuples in the
    order they were added, `total_return` is the sum of the rewards added so
    far and `length` is the number of transitions added.
    """

    def __init__(self):
        self.trajectory = []
        self.total_return = 0
        self.length = 0

    def add(self, state, action, reward, state_prime):
        """Append one transition and update the running return and length."""
        transition = (state, action, reward, state_prime)
        self.trajectory.append(transition)
        self.total_return += reward
        self.length += 1

    def sample_segment(self):
        """Draw a random contiguous segment and summarise it as a training pair.

        A 1-based start step is drawn uniformly from [1, T] and an end step
        uniformly from [start, T], where T is the number of stored transitions.

        Returns:
            ((state, d_r, d_h), action) where state and action are those of
            the start step, d_r is the sum of rewards from the start step to
            the end step inclusive, and d_h is the number of steps in the
            segment (as a float).
        """
        n_steps = len(self.trajectory)

        # randint's upper bound is exclusive, hence the +1 to include step T.
        start = np.random.randint(1, n_steps + 1)
        end = np.random.randint(start, n_steps + 1)

        first_state, first_action = self.trajectory[start - 1][:2]

        # Seed the sum with 0.0 so the result is a float accumulated left to right.
        segment_return = sum(
            (step[2] for step in self.trajectory[start - 1:end]), 0.0
        )
        segment_len = end - start + 1.0

        return ((first_state, segment_return, segment_len), first_action)
    
class ReplayBuffer(object):
    """Fixed-capacity store of trajectories kept sorted by total return.

    `buffer` is ordered from highest to lowest `total_return`. When more than
    `max_size` trajectories are present, the lowest-return ones are dropped.
    Stored objects only need `total_return`, `length` and `sample_segment()`.
    """

    def __init__(self, max_size, last_few):
        """
        @param max_size: Maximum number of trajectories retained.
        @param last_few: Number of highest-return episodes (the front of the
        descending-sorted buffer) used for sampling exploratory commands.
        """
        self.max_size = max_size
        self.cur_size = 0
        self.buffer = []

        self.last_few = last_few

    def add(self, trajectory):
        """Insert a trajectory, then keep only the `max_size` best by return."""
        self.buffer.append(trajectory)

        # The sort is stable, so equal-return episodes keep their insertion order.
        self.buffer.sort(key=lambda x: x.total_return, reverse=True)
        del self.buffer[self.max_size:]

        # Keep the advertised size attribute in sync with the actual contents.
        self.cur_size = len(self.buffer)

    def sample(self, batch_size):
        """Sample `batch_size` segments, choosing trajectories with replacement.

        @return: List of `sample_segment()` results, one per chosen trajectory.
        @raise ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        # Sample indices rather than the objects themselves so NumPy never has
        # to convert the list of trajectories into an array.
        picks = np.random.choice(len(self.buffer), batch_size, replace=True)

        return [self.buffer[i].sample_segment() for i in picks]

    def sample_command(self):
        """Draw an exploratory command from the highest-return episodes.

        The horizon is the mean length of the top `last_few` episodes and the
        return is drawn uniformly from [mean, mean + std] of their returns.

        @return: Tuple (dh_0, dr_0) -- horizon first, then return.
        @raise ValueError: If there are no episodes to derive a command from.
        """
        eps = self.buffer[:self.last_few]
        if not eps:
            # Without this guard np.mean([]) would silently yield NaN commands.
            raise ValueError("cannot sample a command: no episodes available")

        returns = [e.total_return for e in eps]

        dh_0 = np.mean([e.length for e in eps])

        m = np.mean(returns)
        s = np.std(returns)

        dr_0 = np.random.uniform(m, m + s)

        return dh_0, dr_0
        

### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [47]:
# Single CartPole-v1 environment instance; the cells below reuse this `env` object.
env = gym.make('CartPole-v1')

In [117]:
# Warm-up: fill the replay buffer with episodes collected by sampling actions
# from the environment's action space, before any training takes place.
rb = ReplayBuffer(5000, 100)

avg_rewards = []  # summed reward of each warm-up episode

for _ in range(5000):
    s = env.reset()
    done = False
    ep_reward = 0.0
    t = Trajectory()
    while not done:
        s_old = s
        action = env.action_space.sample()
        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)
        ep_reward += reward
    avg_rewards.append(ep_reward)
    rb.add(t)

# `env` is intentionally left open: later cells keep calling env.reset() and
# env.step() on this same instance, which must not happen after close().
print(f"Average Episode Reward: {np.mean(avg_rewards)}")

Average Episode Reward: 22.2368


### A1-2 Initialize a behavior function

In [123]:
class Behavior(nn.Module):
    """Fully connected network: two 512-unit ReLU hidden layers and a linear head.

    Maps inputs whose last dimension is `input_shape` to `num_actions` raw
    outputs; no activation is applied to the final layer.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        hidden_units = 512
        self.fc1 = nn.Linear(input_shape, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, num_actions)

    def forward(self, x):
        """Apply fc1 and fc2 with ReLU, then return the un-activated fc3 output."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# The network input is the observation vector plus two extra command scalars.
d = env.observation_space.shape[0]
model = Behavior(input_shape=d + 2, num_actions=1)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())

# A single output logit is trained with binary cross-entropy on logits.
loss_object = torch.nn.BCEWithLogitsLoss().to(device)

In [124]:
# Number of segments drawn from the replay buffer for each training step.
batch_size = 1024

In [125]:
### A1-3: while the stopping criterion is not reached do:
### A1-4:   Improve the behavior function by training on the replay buffer

In [126]:
# Running total of training losses and the number of steps contributing to it;
# their ratio is printed as the mean loss during training.
loss_sum = 0
loss_count = 0

In [None]:
def to_training(s, dr, dh):
    """Build one network input row: the state's values followed by dr and dh.

    `s` must provide `.tolist()` (for example a NumPy array). A new list is
    returned each call.
    """
    return [*s.tolist(), dr, dh]

def segments_to_training(segments):
    """Convert sampled segments into float input and target tensors on `device`.

    Each segment is ((s, dr, dh), action). The inputs are the rows produced by
    `to_training(s, dr, dh)` and the targets are the corresponding actions.

    Returns:
        (x, y): float tensors moved to `device`.
    """
    # Walk the segments once, pairing each input row with its action label.
    pairs = [(to_training(s, dr, dh), action) for (s, dr, dh), action in segments]
    rows = [row for row, _ in pairs]
    labels = [label for _, label in pairs]

    x = torch.tensor(rows).float().to(device)
    y = torch.tensor(labels).float().to(device)

    return x, y
        
# accuracy_m = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def train_step(inputs, targets):
    """Run one optimisation step on a batch and return the loss tensor.

    `targets` gets an extra dimension at index 1 before being compared with
    the model output by `loss_object`.
    """
    optimizer.zero_grad()

    logits = model(inputs)
    column_targets = targets.unsqueeze(1)
    batch_loss = loss_object(logits, column_targets)

    batch_loss.backward()
    optimizer.step()

    return batch_loss


def generate_episode(cmd):
    """Roll out one episode with the current model and add it to `rb`.

    Args:
        cmd: (desired horizon, desired return) pair, in that order, e.g. the
            value returned by `rb.sample_command()`.

    Returns:
        The sum of rewards collected during the episode.
    """
    dh, dr = cmd
    s = env.reset()
    done = False
    ep_reward = 0.0

    t = Trajectory()
    while not done:
        inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)

        # Acting needs no gradients; skip building an autograd graph per step.
        with torch.no_grad():
            logits = model(inputs)

        # The single logit parameterises a Bernoulli whose sample is the action.
        m = torch.distributions.bernoulli.Bernoulli(logits=logits)
        action = int(m.sample().item())

        s_old = s

        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)

        ep_reward += reward
        # Training segments always have a horizon of at least one step, so never
        # feed the model a zero or negative horizon (max(d_h - 1, 1) as in
        # Algorithm 2 of the upside-down RL paper).
        dh = max(dh - 1, 1)
        dr = dr - reward

    rb.add(t)
    return ep_reward
    
    
epochs = 100000  # number of training steps; each step uses one sampled batch

for i in range(1, epochs+1):
    # The sampled list is consumed directly. Wrapping these ragged nested tuples
    # in np.array() is unnecessary and raises ValueError on NumPy >= 1.24.
    segments = rb.sample(batch_size)
    x, y = segments_to_training(segments)
    loss = train_step(x, y)
    # .item() extracts a plain Python float so the running total does not keep
    # every step's autograd history alive (a memory leak over many steps).
    loss_sum += loss.item()
    loss_count += 1

    # Periodically roll out the current model and add the episodes to the buffer.
    if i % 10000 == 0:
        rewards = []
        for e in range(100):
            cmd = rb.sample_command()
            rewards.append(generate_episode(cmd))

        print(f"Average Episode Reward: {np.mean(rewards)}")

    # Report the mean loss over all steps accumulated in loss_sum / loss_count.
    if i % 20 == 0:
        print(f'i: {i}, Loss: {loss_sum/loss_count}')
    
    

i: 20, Loss: 0.6575998663902283
i: 40, Loss: 0.6575884819030762
i: 60, Loss: 0.6575772166252136
i: 80, Loss: 0.6575629711151123
i: 100, Loss: 0.6575509905815125
i: 120, Loss: 0.6575313210487366
i: 140, Loss: 0.6575155258178711
i: 160, Loss: 0.6575053334236145
i: 180, Loss: 0.6574890613555908
i: 200, Loss: 0.6574710607528687
i: 220, Loss: 0.6574550867080688
i: 240, Loss: 0.6574425101280212
i: 260, Loss: 0.6574320197105408
i: 280, Loss: 0.6574203372001648
i: 300, Loss: 0.6574078798294067
i: 320, Loss: 0.6573948860168457
i: 340, Loss: 0.6573821902275085
i: 360, Loss: 0.6573679447174072
i: 380, Loss: 0.657350480556488
i: 400, Loss: 0.6573327779769897
i: 420, Loss: 0.657323956489563
i: 440, Loss: 0.6573086380958557
i: 460, Loss: 0.6572924852371216
i: 480, Loss: 0.6572871804237366
i: 500, Loss: 0.6572737693786621
i: 520, Loss: 0.6572664380073547
i: 540, Loss: 0.6572550535202026
i: 560, Loss: 0.6572429537773132
i: 580, Loss: 0.6572306156158447
i: 600, Loss: 0.6572176218032837
i: 620, Loss: 0.

i: 4900, Loss: 0.6550154089927673
i: 4920, Loss: 0.6550066471099854
i: 4940, Loss: 0.655002236366272
i: 4960, Loss: 0.6549966335296631
i: 4980, Loss: 0.654992401599884
i: 5000, Loss: 0.6549856662750244
i: 5020, Loss: 0.6549742221832275
i: 5040, Loss: 0.6549717783927917
i: 5060, Loss: 0.6549655199050903
i: 5080, Loss: 0.6549561023712158
i: 5100, Loss: 0.6549476385116577
i: 5120, Loss: 0.6549401879310608
i: 5140, Loss: 0.6549288630485535
i: 5160, Loss: 0.6549204587936401
i: 5180, Loss: 0.6549116969108582
i: 5200, Loss: 0.6549047231674194
i: 5220, Loss: 0.6548959016799927
i: 5240, Loss: 0.6548879742622375
i: 5260, Loss: 0.6548815369606018
i: 5280, Loss: 0.6548735499382019
i: 5300, Loss: 0.6548663973808289
i: 5320, Loss: 0.6548570394515991
i: 5340, Loss: 0.654848039150238
i: 5360, Loss: 0.6548413634300232
i: 5380, Loss: 0.6548348069190979
i: 5400, Loss: 0.6548274755477905
i: 5420, Loss: 0.6548191905021667
i: 5440, Loss: 0.6548115015029907
i: 5460, Loss: 0.6548041105270386
i: 5480, Loss: 0.

In [90]:
# Display one exploratory command drawn from the buffer: (horizon, return).
rb.sample_command()

(500.0, 500.0)

In [91]:
## A1-6: Generate episodes using Algorithm 2 and add them to the replay buffer

In [106]:
# Fixed evaluation command in (desired horizon, desired return) order.
cmd = (500, 500)#rb.sample_command()

In [113]:
# NOTE(review): nothing in this cell appends to avg_rewards; the evaluation
# loop below collects its results in `rewards` instead -- confirm whether this is still needed.
avg_rewards = []

def test(cmd):
    """Roll out one evaluation episode with the current model.

    Unlike `generate_episode`, the episode is not stored in the replay buffer.

    Args:
        cmd: (desired horizon, desired return) pair, in that order.

    Returns:
        The sum of rewards collected during the episode.
    """
    dh, dr = cmd
    s = env.reset()
    done = False
    ep_reward = 0.0

    while not done:
        inputs = torch.tensor([to_training(s, dr, dh)]).float().to(device)

        # Evaluation needs no gradients; skip building an autograd graph per step.
        with torch.no_grad():
            logits = model(inputs)

        # The single logit parameterises a Bernoulli whose sample is the action.
        m = torch.distributions.bernoulli.Bernoulli(logits=logits)
        action = int(m.sample().item())

        s, reward, done, info = env.step(action)

        ep_reward += reward
        # Training segments always have a horizon of at least one step, so never
        # feed the model a zero or negative horizon (max(d_h - 1, 1) as in
        # Algorithm 2 of the upside-down RL paper).
        dh = max(dh - 1, 1)
        dr = dr - reward

    return ep_reward

# Evaluate the model over 100 episodes under the fixed command `cmd`
# and report the mean episode reward.
rewards = [test(cmd) for _ in range(100)]

# env.close()
print(f"Average Episode Reward: {np.mean(rewards)}")

Average Episode Reward: 268.09
