In [24]:
%load_ext autoreload
%autoreload 2
import gym

import numpy as np
import random
import time
import tensorflow as tf
import tensorflow_probability as tfp
from experiment import rollout_random, ReplayBuffer, Trajectory

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [25]:
# Single shared environment instance; the rollout helpers in this notebook read it as a global.
env = gym.make('CartPole-v1')

In [26]:
# Replay buffer shared by the rest of the notebook (read as the global `rb`).
# NOTE(review): the meaning of `max_size` / `last_few` is defined in experiment.py — confirm there.
rb = ReplayBuffer(max_size=5000, last_few=100)
# Warm-up: run random-action episodes and store them in `rb`; the call returns the average episode reward.
avg_reward = rollout_random(num_episodes=5000, env=env, replay_buffer=rb, render=False)

print(f"Average Episode Reward: {avg_reward}")

Average Episode Reward: 22.2266


### A1-2 Initialize a behavior function

In [27]:
class Behavior(tf.keras.Model):
    """Fully connected behavior function.

    Four 512-unit ReLU layers followed by a linear layer with ``num_actions``
    units. The final layer has no activation, so ``call`` returns
    unnormalized values.
    """

    def __init__(self, input_shape, num_actions):
        """Create the five dense layers.

        Args:
            input_shape: Shape tuple forwarded to the first dense layer.
            num_actions: Number of units in the output layer.
        """
        super().__init__()
        hidden_kwargs = dict(units=512, activation='relu')
        # Layers are created in the same order as before so their
        # auto-generated names (dense, dense_1, ...) do not change.
        self.d1 = tf.keras.layers.Dense(input_shape=input_shape, **hidden_kwargs)
        self.d2 = tf.keras.layers.Dense(**hidden_kwargs)
        self.d3 = tf.keras.layers.Dense(**hidden_kwargs)
        self.d4 = tf.keras.layers.Dense(**hidden_kwargs)
        self.d5 = tf.keras.layers.Dense(num_actions)

    def call(self, x):
        """Feed ``x`` through d1..d5 in order and return the output of d5."""
        activations = x
        for layer in (self.d1, self.d2, self.d3, self.d4, self.d5):
            activations = layer(activations)
        return activations

# The tuple unpack requires a 1-D observation space and fails loudly otherwise.
(d,) = env.observation_space.shape
# Network input = observation (d values) + desired return + desired horizon, hence d+2
# (see to_training, which appends those two scalars to the state).
# A single output unit is used as the logit of a Bernoulli over the action.
model = Behavior(input_shape=(d+2,), num_actions=1)
model.run_eagerly = True
optimizer = tf.keras.optimizers.Adam()
# from_logits=True because Behavior's last layer has no activation.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [28]:
# Number of segments requested from the replay buffer for each training step.
batch_size = 1024

In [29]:
### A1-3: While the stopping criterion is not reached, do:
### A1-4:   Improve the behavior function by training on replay buffer

In [37]:
def to_training(s, dr, dh):
    """Build one behavior-function input vector.

    Args:
        s: State; must provide ``tolist()`` (e.g. a NumPy array).
        dr: Desired return, appended after the state values.
        dh: Desired horizon, appended last.

    Returns:
        A NumPy array laid out as ``[*state, dr, dh]``.
    """
    features = [*s.tolist(), dr, dh]
    return np.array(features)

def segments_to_training(segments):
    """Convert sampled segments into a pair of float32 tensors.

    Args:
        segments: Iterable of ``((s, dr, dh), action)`` pairs.

    Returns:
        ``(x, y)``: ``x`` stacks ``to_training(s, dr, dh)`` for every segment,
        ``y`` holds the matching actions; both are ``tf.float32`` constants.
    """
    # Single pass over `segments`, so one-shot iterables behave as before.
    pairs = [
        (to_training(state, desired_return, desired_horizon), action)
        for (state, desired_return, desired_horizon), action in segments
    ]
    features = [row for row, _ in pairs]
    labels = [label for _, label in pairs]

    x = tf.constant(features, dtype=tf.float32)
    y = tf.constant(labels, dtype=tf.float32)
    return x, y

def train_step(inputs, targets):
    """Run one optimizer update of the global ``model`` on a batch.

    Args:
        inputs: Batch fed to ``model``.
        targets: Per-sample targets; a trailing axis is added before the loss
            is computed so they line up with the model output.

    Returns:
        The loss value computed by ``loss_object`` for this batch.
    """
    # The targets do not depend on trainable variables, so they can be
    # reshaped outside the tape.
    labels = tf.expand_dims(targets, axis=-1)

    with tf.GradientTape() as tape:
        outputs = model(inputs)
        loss = loss_object(y_true=labels, y_pred=outputs)

    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return loss

def generate_episode(cmd, render=False, add_to_replay_buffer=True):
    """Roll out one episode in the global ``env``, conditioning the behavior function on ``cmd``.

    Args:
        cmd: Pair ``(desired_horizon, desired_return)`` — the horizon comes first.
        render: When true, call ``env.render()`` before every step and
            ``env.close()`` once the rollout ends (also if it raises).
        add_to_replay_buffer: When true, add the collected trajectory to the
            global replay buffer ``rb``.

    Returns:
        The sum of the rewards collected during the episode.
    """
    dh, dr = cmd
    s = env.reset()
    done = False
    ep_reward = 0.0

    t = Trajectory()
    try:
        while not done:
            inputs = tf.constant([to_training(s, dr, dh)], dtype=tf.float32)

            # The model emits a logit; hand it to Bernoulli directly instead of
            # going through an explicit sigmoid.
            logits = model(inputs)
            m = tfp.distributions.Bernoulli(logits=logits)
            # Squeeze every axis so int() receives a scalar rather than a
            # one-element array.
            action = int(tf.squeeze(m.sample()).numpy())
            if render:
                env.render()
            s_old = s

            s, reward, done, info = env.step(action)
            t.add(s_old, action, reward, s)

            ep_reward += reward
            # Update the command for the next step. The horizon is floored at 1
            # so it never reaches zero or turns negative on long episodes
            # (the update used in Algorithm 2 of the UDRL paper).
            dh = max(dh - 1, 1)
            dr = dr - reward
    finally:
        # Release the viewer even when the rollout is interrupted.
        if render:
            env.close()

    if add_to_replay_buffer:
        rb.add(t)
    return ep_reward

In [62]:
# Main UDRL loop: alternate between fitting the behavior function on the replay
# buffer and generating new episodes with it. There is no automatic stopping
# criterion; interrupt the kernel to stop.
iterations = 1000000
epochs_per_iteration = 200
num_new_ep = 100

for iteration in range(1, iterations + 1):
    # Fresh metric every iteration so the printed loss reflects this
    # iteration only instead of a running mean over the whole run.
    loss_m = tf.keras.metrics.Mean(name='loss')

    # Train: exactly `epochs_per_iteration` gradient steps on sampled batches.
    start = time.time()
    for epoch in range(epochs_per_iteration):
        segments = rb.sample(batch_size)
        x, y = segments_to_training(segments)
        loss = train_step(x, y)
        loss_m(loss)

    # Assigned (not accumulated) so the cell also runs on a fresh kernel.
    training_time = time.time() - start

    print(f'Iteration: {iteration:3}\tLoss: {loss_m.result():.10}')
    print(f'\t\t\t Training time: {training_time}')

    # Generate more episodes with freshly sampled exploratory commands;
    # generate_episode adds them to the replay buffer by default.
    rewards = []
    start = time.time()
    for e in range(num_new_ep):
        cmd = rb.sample_command()
        rewards.append(generate_episode(cmd))

    time_to_gen = (time.time() - start)
    print(f"Average Episode Reward: {np.mean(rewards)}")
    print(f'\t\t\t Time to generate {num_new_ep} episodes: {time_to_gen}')

Iteration:   1	Loss: 0.6411684752
			 Training time: 6.38103175163269
Average Episode Reward: 144.44
			 Time to generate 100 episodes: 40.60236620903015
Iteration:   2	Loss: 0.6405643225
			 Training time: 6.258647203445435
Average Episode Reward: 94.93
			 Time to generate 100 episodes: 26.90075445175171
Iteration:   3	Loss: 0.6407071352
			 Training time: 6.327107667922974
Average Episode Reward: 56.73
			 Time to generate 100 episodes: 16.073679447174072
Iteration:   4	Loss: 0.640366137
			 Training time: 6.333270788192749
Average Episode Reward: 61.53
			 Time to generate 100 episodes: 17.665672540664673
Iteration:   5	Loss: 0.6400870085
			 Training time: 6.417681694030762
Average Episode Reward: 53.86
			 Time to generate 100 episodes: 15.630410432815552
Iteration:   6	Loss: 0.6398815513
			 Training time: 6.583188533782959
Average Episode Reward: 105.3
			 Time to generate 100 episodes: 30.926119804382324
Iteration:   7	Loss: 0.6396022439
			 Training time: 6.497132062911987
Av

KeyboardInterrupt: 

In [66]:
### A1-5: Sample exploratory commands based on replay buffer
# NOTE(review): generate_episode unpacks a command as (horizon, return) —
# confirm that sample_command returns its pair in the same order.
cmd = rb.sample_command()
cmd

(224.52, 229.06108738513825)

In [64]:
### A1-6: Generate episodes using Algorithm 2 and add them to the replay buffer

In [68]:
# Evaluate the current behavior function with a fixed command; the episodes
# are rendered and are not added to the replay buffer.
cmd = (200, 200)

avg_rewards = []

rewards = [
    generate_episode(cmd, render=True, add_to_replay_buffer=False)
    for _ in range(10)
]


print(f"Average Episode Reward: {np.mean(rewards)}")

Average Episode Reward: 81.1


In [12]:
# Print the layer / parameter overview of the behavior network.
model.summary()

Model: "behavior"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  3584      
_________________________________________________________________
dense_1 (Dense)              multiple                  262656    
_________________________________________________________________
dense_2 (Dense)              multiple                  262656    
_________________________________________________________________
dense_3 (Dense)              multiple                  262656    
_________________________________________________________________
dense_4 (Dense)              multiple                  513       
Total params: 792,065
Trainable params: 792,065
Non-trainable params: 0
_________________________________________________________________
