In [2]:
class SARSA():

    def __init__(self, UB=1, LB = -1, LR = 1e-3, EPS = 0.4, GAMMA = 0.99):

        super(SARSA, self).__init__()
        self.num_states = 3
        self.num_actions = 1
        self.UPPER_BOUND = UB
        self.LR = LR
        self.EPS = EPS
        self.GAMMA = GAMMA

    def Critic(self):
        state_input = tf.keras.layers.Input(shape=(self.num_states))
        action_input = tf.keras.layers.Input(shape=(1))
        
        state_hid = tf.keras.layers.Dense(16, activation="relu")(state_input)
        action_hid = tf.keras.layers.Dense(16, activation="relu")(action_input)
        
        common = tf.keras.layers.Add()([state_hid, action_hid])
        
        outputs = tf.keras.layers.Dense(self.num_actions)(common)

        model = tf.keras.Model([state_input, action_input], outputs)

        return model

    def epsilon_greedy_policy(self, model, state, action, epsilon=0):
        
        if np.random.rand() < epsilon:
            return np.random.randint(self.num_actions)
        
        else:
            
            Q_values = model([state,action])
            
            return np.argmax(Q_values)

    def initialize(self):
        
        critic = self.Critic()
        
        critic_optimizer = tf.keras.optimizers.Adam(self.LR)

        
        return critic, critic_optimizer
    
    def update(self, model, optimizer, state, action, reward, next_state):

        with tf.GradientTape() as tape:
            
            q0 = model([state, action], training=True)
            
            q1 = model([next_state, action], training=True)
            
            mq0 = tf.experimental.numpy.amax(q0, axis=1)
            
            mq1 = tf.experimental.numpy.amax(q1, axis=1)
            
            y = tf.math.add(reward, self.GAMMA*mq0)

            critic_loss = tf.math.reduce_mean(tf.math.square(y-mq1))

        critic_grad = tape.gradient(critic_loss, model.trainable_variables)

        optimizer.apply_gradients(
            zip(critic_grad, model.trainable_variables)
        )


In [3]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp

problem = "CartPole-v1"
env = gym.make(problem)
env.seed(42)

agent = SARSA(LR = 1e-3)
agent.num_states = 4
agent.num_actions = 2

model, optimizer = agent.initialize()

MAX_EPISODES = 40000

states, next_states, rewards, actions, score = [], [], [], [], 0

for ep in range(MAX_EPISODES):
    state = env.reset()
    action = tf.expand_dims(tf.convert_to_tensor(np.array([1])), 0)
    epsilon = max(1- ep/MAX_EPISODES, 0.01)
    while True:
        
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
                
        action = agent.epsilon_greedy_policy(model, tf_prev_state, action, epsilon)
                     
        next_state, reward, done, info = env.step(action)
        
        score += reward
        
        actions.append(action)
                
        rewards.append(reward)
                
        states.append(state)
        
        next_states.append(next_state)
                        
        state = next_state
        
        env.render()
        # End this episode when `done` is True
        if done:
            
            agent.update(model, optimizer, 
                         tf.convert_to_tensor(states),
                         tf.convert_to_tensor(actions),
                             tf.cast(tf.convert_to_tensor(rewards), tf.float32),
                             tf.convert_to_tensor(next_states))
            
            

            if score > 190:
                print('Goal is reched!!')
                actor_model.save_weights("cart_actor.h5")
                critic_model.save_weights("cart_critic.h5")
            else:
                if ep % 10 == 0:
                    print('Gaol not reched yet. For the episode {} the final reward-to-go is {}'.format(ep, score))
            
            states, next_states, actions, rewards, score = [], [], [], [], 0            
            break

2022-04-14 14:25:12.036097: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-14 14:25:12.036859: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-04-14 14:25:12.064335: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-14 14:25:12.064685: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 3.82GiB deviceMemoryBandwidth: 178.84GiB/s
2022-04-14 14:25:12.064854: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such

Gaol not reched yet. For the episode 0 the final reward-to-go is 45.0
Gaol not reched yet. For the episode 10 the final reward-to-go is 26.0
Gaol not reched yet. For the episode 20 the final reward-to-go is 13.0
Gaol not reched yet. For the episode 30 the final reward-to-go is 20.0
Gaol not reched yet. For the episode 40 the final reward-to-go is 23.0
Gaol not reched yet. For the episode 50 the final reward-to-go is 35.0
Gaol not reched yet. For the episode 60 the final reward-to-go is 48.0


ValueError: Input 0 of layer dense_1 is incompatible with the layer: : expected min_ndim=2, found ndim=0. Full shape received: ()