In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt

import sklearn
import sklearn.preprocessing

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, Sequential

In [3]:
env = gym.envs.make("Pendulum-v0") 

def get_critic_model():
    n_inputs = 2
    n_hidden1 = 40  
    n_hidden2 = 40
    n_outputs = 1

    init_xavier = tf.keras.initializers.GlorotUniform()

    x = Input(n_inputs)
    hidden1 = Dense( n_hidden1, tf.nn.elu, init_xavier)(x)
    hidden2 = Dense( n_hidden2, tf.nn.elu, init_xavier)(hidden1)
    V = Dense( n_outputs, None, init_xavier)(hidden2)
    return Model( inputs=x, outputs=[ V ] )

def get_actor_model():
    n_inputs = 2
    n_hidden1 = 20
    n_hidden2 = 20
    n_outputs = 2

    init_xavier = tf.keras.initializers.GlorotUniform()
        
    x = Input(n_inputs)
    hidden1 = Dense( n_hidden1, tf.nn.elu, init_xavier)(x)
    hidden2 = Dense( n_hidden2, tf.nn.elu, init_xavier)(hidden1)
    params = Dense( n_outputs, None, init_xavier)(hidden2)
    return Model( inputs=x, outputs=[ params ] )

def value_function(critic_model, state):
    return critic_model(state)

def policy_network(actor_model, env, state):
    output = actor_model(state)
    mu = output[:,0]
    sigma = tf.nn.softplus(output[:,1]) + 1e-5    
    norm_dist = tfp.distributions.Normal(mu, sigma)
    actions = tf.squeeze(norm_dist.sample(1), axis=0)
    actions = tf.clip_by_value(
        actions, env.action_space.low[0], 
        env.action_space.high[0])
    return actions, norm_dist

#sample from state space for state normalization                                    
state_space_samples = np.array(
    [env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)

#function to normalize states
def scale_state(state):                 #requires input shape=(2,)
    scaled = scaler.transform([state])
    return scaled                       #returns shape =(1,2)   

###################################################################

lr_actor = 0.00002  #set learning rates
lr_critic = 0.001

training_op_critic = tf.optimizers.Adam(learning_rate=lr_critic, name='critic_optimizer')
training_op_actor = tf.optimizers.Adam(learning_rate=lr_actor, name='actor_optimizer')

critic_model = get_critic_model()
actor_model = get_actor_model()

################################################################
#Training loop
gamma = 0.99        #discount factor
num_episodes = 300


episode_history = []
actor_losses = []
critic_losses = []
actions = []
for episode in range(num_episodes):
    #receive initial state from E
    state = env.reset()   # state.shape -> (2,)
    scaled_state = scale_state(state)
    reward_total = 0 
    steps = 0
    done = False
    while (not done):
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(actor_model.trainable_variables)
            tape.watch(critic_model.trainable_variables)            

            action, norm_dist = policy_network(actor_model, env, scaled_state)                
            actions.append(action)
            next_state, reward, done, _ = env.step(action) 
            scaled_next_state = scale_state(next_state)
            steps +=1
            reward_total += reward

            V = critic_model(scaled_state)
            V_of_next_state = critic_model(scaled_next_state)

            target = reward + gamma * V_of_next_state
            td_error = target - V

            # define actor (policy) loss function
            loss_actor = -tf.math.log(norm_dist.prob(action) + 1e-5) * tf.stop_gradient(td_error)
            
            # define critic (state-value) loss function
            loss_critic = tf.reduce_mean(tf.math.squared_difference(V, tf.stop_gradient(target)))

        grads_actor = tape.gradient( loss_actor, actor_model.trainable_variables )            
        grads_critic = tape.gradient( loss_critic, critic_model.trainable_variables )
        del tape
        
        actor_losses.append( loss_actor.numpy()[0] )
        critic_losses.append( loss_critic.numpy() )
        
        # Apply gradients      
        training_op_actor.apply_gradients( zip( grads_actor, actor_model.trainable_variables ) ) 
        training_op_critic.apply_gradients( zip( grads_critic, critic_model.trainable_variables ) )        

        scaled_state = scaled_next_state
        #end while

    episode_history.append(reward_total)
    print("Episode: {}, Number of Steps : {}, Cumulative reward: {:0.2f}".format(
        episode, steps, reward_total))

    if np.mean(episode_history[-100:]) > 90 and len(episode_history) >= 101:
        print("****************Solved***************")
        print("Mean cumulative reward over 100 episodes:{:0.2f}" .format(
            np.mean(episode_history[-100:])))



W1209 16:14:45.958248 4557145536 base_layer.py:1814] Layer dense_3 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



InvalidArgumentError: Matrix size-incompatible: In[0]: [1,3], In[1]: [2,20] [Op:MatMul]

In [None]:
def run_ac():

    training_op_critic = tf.optimizers.Adam(learning_rate=lr_critic, name='critic_optimizer')
    training_op_actor = tf.optimizers.Adam(learning_rate=lr_actor, name='actor_optimizer')

    critic_model = get_critic_model()
    actor_model = get_actor_model()

    ################################################################
    #Training loop
    gamma = 0.99        #discount factor

    state = env.reset()   # state.shape -> (2,)
    scaled_state = scale_state(state)
    reward_total = 0 
    steps = 0
    done = False
    while not done:
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(actor_model.trainable_variables)
            tape.watch(critic_model.trainable_variables)            

            action, norm_dist = policy_network(actor_model, env, scaled_state)                
            next_state, reward, done, _ = env.step(action) 
            scaled_next_state = scale_state(next_state)
            steps +=1
            reward_total += reward

            V = critic_model(scaled_state)
            V_of_next_state = critic_model(scaled_next_state)

            target = reward + gamma * V_of_next_state
            td_error = target - V

            # define actor (policy) loss function
            loss_actor = -tf.math.log(norm_dist.prob(action) + 1e-5) * tf.stop_gradient(td_error)

            # define critic (state-value) loss function
            loss_critic = tf.reduce_mean(tf.square(V - tf.stop_gradient(target)))

        grads_actor = tape.gradient( loss_actor, actor_model.trainable_variables )            
        grads_critic = tape.gradient( loss_critic, critic_model.trainable_variables )
        del tape

        # Apply gradients      
        training_op_actor.apply_gradients( zip( grads_actor, actor_model.trainable_variables ) ) 
        training_op_critic.apply_gradients( zip( grads_critic, critic_model.trainable_variables ) )        

        scaled_state = scaled_next_state
        #end while
        
    return reward_total


In [None]:
scores = [ run_ac() for _ in range(20) ]
print(scores)

In [None]:
plt.hist(scores)
len( [ x for x in scores if x > 0 ] )

In [None]:
tf.square(9)