In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

class AC():

    def __init__(self, GAMMA=0.99, UB=1, LB = -1, CLR = 1e-3, ALR = 1e-3, BATCH = 1):

        self.num_states = 3
        self.num_actions = 1
        self.GAMMA = GAMMA
        self.UPPER_BOUND = UB
        self.LOWER_BOUND = LB
        self.CRITIC_LR = CLR
        self.ACTOR_LR = ALR

    def getActor(self):
        print('Creating actor..')
        
        
        inputs = tf.keras.layers.Input(shape=(self.num_states,))
        
        out = tf.keras.layers.Dense(1024, activation="tanh")(inputs)
        out = tf.keras.layers.Dense(1024, activation="relu")(out)
        
        
        outputs = tf.keras.layers.Dense(self.num_actions,
                                       kernel_initializer = tf.keras.initializers.HeNormal())(out)

        model = tf.keras.Model(inputs, outputs)
        
        return model
    
    def getCritic(self):
        
        print('Creating critic..')
        state_input = tf.keras.layers.Input(shape=(self.num_states))
        
        state_out = tf.keras.layers.Dense(1024, activation="relu")(state_input)
        state_out = tf.keras.layers.Dense(1024, activation="relu")(state_out)

        outputs = tf.keras.layers.Dense(1)(state_out)

        model = tf.keras.Model(state_input, outputs)
        
        return model
    
    
    def make_action(self, actor_model, observations):
          
        logits = actor_model(observations)
                
        dist = tfp.distributions.Categorical(logits)
        
        action = dist.sample()
                
        return action[0].numpy()
    

    def initialize(self):
        
        print('Initializing..')
        
        actor = self.getActor()
        
        critic = self.getCritic()
        
        critic_optimizer = tf.keras.optimizers.RMSprop(self.CRITIC_LR)
        
        actor_optimizer = tf.keras.optimizers.RMSprop(self.ACTOR_LR)
        
        self.getBuffer()
        
        return  [actor, critic, critic_optimizer, actor_optimizer]
    
    def getBuffer(self):
        self.buffer_counter = 0

        self.states_buffer = []
        self.actions_buffer = []
        self.rewards_buffer = []
        self.next_states_buffer = []
        self.G_buffer = []
        self.dones_buffer = []
        

    def record(self, obs_tuple):

        self.states_buffer.append(obs_tuple[0])
        self.actions_buffer.append(obs_tuple[1])
        self.rewards_buffer.append(obs_tuple[2])
        self.next_states_buffer.append(obs_tuple[3])
        self.dones_buffer.append(obs_tuple[4])

        self.buffer_counter += 1
        
    def forget(self):
        
        self.states_buffer = []
        self.actions_buffer = []
        self.rewards_buffer = []
        self.G_buffer = []
        self.next_states_buffer = []
        self.dones_buffer = []
        
        self.buffer_counter = 0
    

    def process_rewards(self):
        G = []
        total_r = 0

        #iterate rewards from Gt to G0
        for r in reversed(self.rewards_buffer):
            total_r = r + total_r * self.GAMMA
            G.insert(0, total_r)

        #whitening rewards
        self.G_buffer = (G - np.array(G).mean())/np.array(G).std()
        
    
    def update(self, actor_model, critic_model, actor_optimizer, critic_optimizer,
               state_batch, action_batch, reward_batch, next_state_batch, done_batch):

        with tf.GradientTape() as tape:
            
            b = critic_model(state_batch, training = True)
                        
            b1 = critic_model(next_state_batch, training = True)
            
            y = reward_batch + self.GAMMA *b1*(1-done_batch)
                        
            critic_loss = tf.math.reduce_mean(tf.math.square(tf.convert_to_tensor(y - b)))
            
        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)

        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )        
    
        with tf.GradientTape() as tape:
            
            logits = actor_model(state_batch, training = True)
                        
            b = critic_model(state_batch, training = True)            
        
            log_probs =  tfp.distributions.Categorical(logits).log_prob(action_batch)
                                    
            actor_loss = tf.math.reduce_mean(-log_probs * tf.convert_to_tensor(b - reward_batch))
                        
        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
            
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    def learn(self, actor_model, critic_model, actor_optimizer, critic_optimizer):    
        
        #self.process_rewards()
        
        state_batch = tf.convert_to_tensor(self.states_buffer)
        action_batch = tf.convert_to_tensor(self.actions_buffer)
        reward_batch = tf.convert_to_tensor(self.rewards_buffer)
        next_state_batch = tf.convert_to_tensor(self.next_states_buffer)
        done_batch = tf.convert_to_tensor(self.dones_buffer)
        
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        done_batch = tf.cast(done_batch, dtype=tf.float32)

        self.update(actor_model, critic_model, actor_optimizer, critic_optimizer, 
                    state_batch, action_batch, reward_batch, next_state_batch, done_batch)



2022-04-20 09:09:17.192491: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/giulio/rl_prj/devel/lib:/home/giulio/project_ws/devel/lib:/opt/ros/noetic/lib:/opt/ros/noetic/lib/x86_64-linux-gnu
2022-04-20 09:09:17.192549: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp


problem = 'CartPole-v1'
env = gym.make(problem)
LOAD = False

agent = AC(ALR = 1e-3, CLR = 1e-3 , GAMMA = 0.99)
agent.num_states = 4
agent.num_actions = 2
agent.BATCH = 1

actor_model, critic_model, critic_optimizer, actor_optimizer = agent.initialize()

MAX_EPISODES = 1000

Gt, score, ep= 0, 0, 0
scores = []

if LOAD == True:
    print('Loading last weights')
    actor_model.load_weights('cartpole_actor.h5')
    critic_model.load_weights('cartpole_critic.h5')
else:
    print('Initializing networks...')

while ep < MAX_EPISODES:
    state = env.reset()
    Gt, step = 0, 0
    agent.forget()
    while True:
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
                
        action = agent.make_action(actor_model, tf_prev_state)
        
        next_state, reward, done, _ = env.step(action)
        
        if done:
            Gt += -200
        else:
            Gt += reward
            
        agent.record([state, action, Gt, next_state, done])

        env.render()
        
        step += 1
        if done: 
            ep += 1
            agent.learn(actor_model, critic_model, critic_optimizer, actor_optimizer)
            print("Episode {} is finish. The avg reward-to-go is {} ".format(ep, Gt))
            last = 0
            break
            
        state = next_state

print('Learning is completed. Saving results..')
actor_model.save_weights("cartpole_actor.h5")
critic_model.save_weights("cartpole_critic.h5")


Initializing..
Creating actor..
Creating critic..
Initializing networks...
Episode 1 is finish. The avg reward-to-go is -187.0 
Episode 2 is finish. The avg reward-to-go is -167.0 
Episode 3 is finish. The avg reward-to-go is -192.0 
Episode 4 is finish. The avg reward-to-go is -191.0 
Episode 5 is finish. The avg reward-to-go is -192.0 
Episode 6 is finish. The avg reward-to-go is -193.0 
Episode 7 is finish. The avg reward-to-go is -186.0 
Episode 8 is finish. The avg reward-to-go is -184.0 
Episode 9 is finish. The avg reward-to-go is -188.0 
Episode 10 is finish. The avg reward-to-go is -189.0 
Episode 11 is finish. The avg reward-to-go is -186.0 
Episode 12 is finish. The avg reward-to-go is -186.0 
Episode 13 is finish. The avg reward-to-go is -192.0 
Episode 14 is finish. The avg reward-to-go is -184.0 
Episode 15 is finish. The avg reward-to-go is -179.0 
Episode 16 is finish. The avg reward-to-go is -188.0 
Episode 17 is finish. The avg reward-to-go is -187.0 
Episode 18 is fi

Episode 152 is finish. The avg reward-to-go is -47.0 
Episode 153 is finish. The avg reward-to-go is 118.0 
Episode 154 is finish. The avg reward-to-go is 45.0 
Episode 155 is finish. The avg reward-to-go is 299.0 
Episode 156 is finish. The avg reward-to-go is 273.0 
Episode 157 is finish. The avg reward-to-go is 299.0 
Episode 158 is finish. The avg reward-to-go is 117.0 
