In [1]:
%load_ext autoreload
%autoreload 2
import gym

import numpy as np
import random
import time
import tensorflow as tf
import tensorflow_probability as tfp
from experiment import rollout_random, ReplayBuffer, Trajectory

### A1-1. Initialize replay buffer with warm-up episodes using random actions

In [2]:
env = gym.make('CartPole-v1')

In [3]:
rb = ReplayBuffer(max_size=5000, last_few=100)
avg_reward = rollout_random(num_episodes=5000, env=env, replay_buffer=rb, render=False)
print(env.action_space.n)
print(f"Average Episode Reward: {avg_reward}")

2
Average Episode Reward: 22.318


### A1-2 Initialize a behavior function

In [4]:
class Behavior(tf.keras.Model):
    def __init__(self, input_shape, num_actions):
        super().__init__()        
        self.d1 = tf.keras.layers.Dense(512, input_shape=input_shape, activation='relu')
        self.d2 = tf.keras.layers.Dense(512, activation='relu')
        self.d3 = tf.keras.layers.Dense(512, activation='relu')
        self.d4 = tf.keras.layers.Dense(512, activation='relu')
        self.d5 = tf.keras.layers.Dense(num_actions, activation='relu')

    def call(self, x):
        output = self.d1(x)
        output = self.d2(output)
        output = self.d3(output)
        output = self.d4(output)
        output = self.d5(output)
        return output

(d,) = env.observation_space.shape
model = Behavior(input_shape=(d+2,), num_actions=env.action_space.n)
model.run_eagerly = True
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [5]:
batch_size = 1024

In [6]:
### A1-3: while stopping criteria is not reached do:
### A1-4:   Improve the behavior function by training on replay buffer

In [11]:
def to_training(s, dr, dh):
    l = s.tolist()
    l.append(dr)
    l.append(dh)
    return np.array(l)

def segments_to_training(segments):
    x = []
    y = []
    for (s, dr, dh), action in segments:
        l = to_training(s, dr, dh)
        x.append(l)
        y.append(action)
        
    x = tf.constant(x, dtype=tf.float32)
    y = tf.constant(y, dtype=tf.int32)
    return x, y

def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        
#         targets = tf.expand_dims(targets, axis=-1)
#         print(targets)
#         print(predictions)
#         print(tf.shape(predictions))
#         print(tf.shape(targets))
        loss = loss_object(y_true=targets, y_pred=predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss

def generate_episode(cmd, render=False, add_to_replay_buffer=True):
    s = env.reset()
    done = False
    ep_reward = 0.0
    
    t = Trajectory()
    while not done:
        (dh, dr) = cmd
        inputs = tf.constant([to_training(s, dr, dh)], dtype=tf.float32)
        
        action_logits = model(inputs)
        action_probs = tf.keras.activations.sigmoid(action_logits[0])
        m = tfp.distributions.Bernoulli(probs=action_probs[0])
        action = int((m.sample()).numpy())
        if render:
            env.render()
        s_old = s
        
        s, reward, done, info = env.step(action)
        t.add(s_old, action, reward, s)
        
        ep_reward += reward
        dh = dh - 1
        dr = dr - reward
        cmd = (dh, dr)
    
    if render:
        env.close()
    # print(f'Episode reward: {ep_reward}')
    if add_to_replay_buffer:
        rb.add(t)
    return ep_reward

In [19]:
loss_m = tf.keras.metrics.Mean(name='loss')
start = time.time()

epochs = 1000000
for i in range(1, epochs+1):
    segments = rb.sample(batch_size)
    x, y = segments_to_training(segments)
    loss = train_step(x, y)
    loss_m(loss)
    
    if i % 1000 == 0:
        rewards = [] 
        for e in range(100):
            cmd = rb.sample_command()
            rewards.append(generate_episode(cmd))
        
        print(f"Average Episode Reward: {np.mean(rewards)}")
    
    if i % 200 == 0:
        time_per_epoch = (time.time() - start) / 200.0
        start = time.time()
        print(f'i: {i:5}\tLoss: {loss_m.result():.10}\tTime/epoch: {time_per_epoch:.4}')
        

i:   200	Loss: 0.6931462288	Time/epoch: 0.02979
i:   400	Loss: 0.6931459904	Time/epoch: 0.02926
i:   600	Loss: 0.6931459308	Time/epoch: 0.02922
i:   800	Loss: 0.6931481957	Time/epoch: 0.02909
Average Episode Reward: 22.41
i:  1000	Loss: 0.6931537986	Time/epoch: 0.06424
i:  1200	Loss: 0.6931575537	Time/epoch: 0.02923
i:  1400	Loss: 0.6931602359	Time/epoch: 0.02918
i:  1600	Loss: 0.6931575537	Time/epoch: 0.02899
i:  1800	Loss: 0.6931528449	Time/epoch: 0.0293
Average Episode Reward: 21.71
i:  2000	Loss: 0.6931490898	Time/epoch: 0.06283
i:  2200	Loss: 0.69314605	Time/epoch: 0.02936
i:  2400	Loss: 0.693143487	Time/epoch: 0.02896
i:  2600	Loss: 0.6931412816	Time/epoch: 0.02916
i:  2800	Loss: 0.6931394339	Time/epoch: 0.02907
Average Episode Reward: 22.0
i:  3000	Loss: 0.6931378841	Time/epoch: 0.06288
i:  3200	Loss: 0.6931364536	Time/epoch: 0.0296
i:  3400	Loss: 0.6931352019	Time/epoch: 0.02927


KeyboardInterrupt: 

In [20]:
### A1:5 Sample exploratory commands based on replay buffer
cmd = rb.sample_command()
cmd

(71.32, 72.13954899978405)

In [21]:
## A1:6 Generate episodes using Alg 2 and add to replay buffer

In [22]:
avg_rewards = []

rewards = [] 
for e in range(10):
    rewards.append(generate_episode(cmd, render=True, add_to_replay_buffer=False))


print(f"Average Episode Reward: {np.mean(rewards)}")

Average Episode Reward: 22.2


In [12]:
model.summary()

Model: "behavior"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  3584      
_________________________________________________________________
dense_1 (Dense)              multiple                  262656    
_________________________________________________________________
dense_2 (Dense)              multiple                  262656    
_________________________________________________________________
dense_3 (Dense)              multiple                  262656    
_________________________________________________________________
dense_4 (Dense)              multiple                  513       
Total params: 792,065
Trainable params: 792,065
Non-trainable params: 0
_________________________________________________________________
