In [1]:
import gym

import numpy as np
import random

import torch

In [2]:
class Trajectory(object):
    
    def __init__(self):
        self.trajectory = []
        self.total_return = 0
        self.length = 0
        
    def add(self, state, action, reward, state_prime):
        self.trajectory.append((state, action, reward, state_prime))
        self.total_return += reward
        self.length += 1
        
    def sample_segment(self):
        T = len(self.trajectory)

        t1 = np.random.randint(1, T+1)
        t2 = np.random.randint(t1, T+1)

        state = self.trajectory[t1-1][0]
        action = self.trajectory[t1-1][1]

        d_r = 0.0
        for i in range(t1, t2 + 1):
            d_r += self.trajectory[i-1][2]

        d_h = t2 - t1 + 1.0

        return ((state,d_r,d_h),action)
    
class ReplayBuffer(object):
    
    def __init__(self, max_size, last_few):
        """
        @param last_few: Number of episodes from the end of the replay buffer
        used for sampling exploratory commands.
        """
        self.max_size = max_size
        self.cur_size = 0
        self.buffer = []
        
        self.last_few = last_few
        
    def add(self, trajectory):
        self.buffer.append(trajectory)
        
        self.buffer = sorted(self.buffer, key=lambda x: x.total_return, reverse=True)
        self.buffer = self.buffer[:self.max_size]
        
    def sample(self, batch_size):
        trajectories = np.random.choice(self.buffer, batch_size, replace=True)
        
        segments = []
        
        for t in trajectories:
            segments.append(t.sample_segment())
            
        return segments
    
    def sample_command(self):
        eps = self.buffer[:self.last_few]
        
        dh_0 = np.mean([e.length for e in eps])
        
        m = np.mean([e.total_return for e in eps])
        s = np.std([e.total_return for e in eps])
        
        dr_0 = np.random.uniform(m, m+s)
        
        return dh_0, dr_0
        

### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [3]:
env = gym.make('MountainCar-v0')

In [4]:
rb = ReplayBuffer(1000, 100)

avg_rewards = []

for _ in range(1000):
    s = env.reset()
    done = False
    ep_reward = 0.0
    t = Trajectory()
    while not done:
#         env.render()
        s_old = s
        action = env.action_space.sample()
        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)
        ep_reward += reward
    avg_rewards.append(ep_reward)    
#     print(f'Episode reward: {ep_reward}')
    rb.add(t)
    
    
env.close()
print(f"Average Episode Reward: {np.mean(avg_rewards)}")

Average Episode Reward: -200.0


### A1-2 Initialize a behavior function

In [11]:
import tensorflow as tf
class Behavior(tf.keras.Model):
    def __init__(self, input_shape, num_actions):
        super().__init__()        
        print(input_shape)
        self.d = tf.keras.layers.Dense(32, input_shape=input_shape, activation='relu')
        self.d2 = tf.keras.layers.Dense(32, activation='relu')
        self.classifier = tf.keras.layers.Dense(num_actions, activation='softmax')

    def call(self, inputs):
        x = self.d(inputs)
        x = self.d2(x)
        x = self.classifier(x)
        return x

(d,) = env.observation_space.shape
model = Behavior(input_shape=(d+2,), num_actions=env.action_space.n)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
batch_size = 16

loss_m = tf.keras.metrics.Mean(name='loss')

model.run_eagerly = True

(4,)


In [12]:
### A1-3: while stopping criteria is not reached do:
### A1-4:   Improve the behavior function by training on replay buffer

In [25]:
def to_training(s, dr, dh):
    l = s.tolist()
    l.append(dr)
    l.append(dh)
    return l

def segments_to_training(segments):
    x = []
    y = []
    for (s, dr, dh), action in segments:
        l = to_training(s, dr, dh)
        x.append(l)
        y.append(action)
    x = tf.constant(x, dtype=tf.float32)
    y = tf.constant(y)
    
    return x, y
        
# accuracy_m = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        predictions = model(inputs)
#         print(predictions)
        loss = loss_object(targets, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    loss_m(loss)
#     accuracy_m(targets, predictions)
    
    
epochs = 1000
for i in range(1, epochs+1):
    segments = rb.sample(batch_size)
    segments = np.array(segments)
    x, y = segments_to_training(segments)
    train_step(x, y)
    if i % 20 == 0:
        print(f'Loss: {loss_m.result()}') #'\t Accuracy: {accuracy_m.result()}')
    
    

Loss: 1.1326848268508911
Loss: 1.132562279701233
Loss: 1.132436990737915
Loss: 1.1323360204696655
Loss: 1.132219672203064
Loss: 1.1321113109588623
Loss: 1.1319878101348877
Loss: 1.1318769454956055
Loss: 1.1317698955535889
Loss: 1.1316936016082764
Loss: 1.1315803527832031
Loss: 1.1314586400985718
Loss: 1.1313440799713135
Loss: 1.1312594413757324
Loss: 1.1311520338058472
Loss: 1.1310617923736572
Loss: 1.1309516429901123
Loss: 1.1308298110961914
Loss: 1.1307485103607178
Loss: 1.1306666135787964
Loss: 1.1305646896362305
Loss: 1.1305086612701416
Loss: 1.1304153203964233
Loss: 1.130315899848938
Loss: 1.1302450895309448
Loss: 1.1301460266113281
Loss: 1.1300417184829712
Loss: 1.1299362182617188
Loss: 1.1298493146896362
Loss: 1.1297376155853271
Loss: 1.1296405792236328
Loss: 1.1295446157455444
Loss: 1.129443645477295
Loss: 1.1293468475341797
Loss: 1.1292541027069092
Loss: 1.1291635036468506
Loss: 1.1290628910064697
Loss: 1.1289750337600708
Loss: 1.1288809776306152
Loss: 1.1287952661514282
Loss:

In [15]:
### A1:5 Sample exploratory commands based on replay buffer
cmd = rb.sample_command()

In [16]:
## A1:6 Generate episodes using Alg 2 and add to replay buffer

In [26]:
avg_rewards = []

def generate_episode(cmd):
    s = env.reset()
    done = False
    ep_reward = 0.0
    
    t = Trajectory()
    while not done:
        (dh, dr) = cmd
        inputs = tf.constant([to_training(s, dr, dh)], dtype=tf.float32)
        
        action_probs = model(inputs)
        action = np.argmax(action_probs)
        
#         env.render()
        s_old = s
        
        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)
        ep_reward += reward
        dh = dh - 1
        dr = dr - reward
        cmd = (dh, dr)
    
#     print(f'Episode reward: {ep_reward}')
    rb.add(t)
    return ep_reward

rewards = [] 
for e in range(10):
    rewards.append(generate_episode(cmd))

# env.close()
print(f"Average Episode Reward: {np.mean(rewards)}")

Average Episode Reward: -200.0


In [14]:
model.summary()

Model: "behavior_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  160       
_________________________________________________________________
dense_4 (Dense)              multiple                  1056      
_________________________________________________________________
dense_5 (Dense)              multiple                  99        
Total params: 1,315
Trainable params: 1,315
Non-trainable params: 0
_________________________________________________________________
