copying cartpole from openaigym
---

[keras example](https://keras.io/examples/rl/actor_critic_cartpole/#references)

In [5]:
import gym 
import tensorflow as tf
import numpy as np
from tensorflow import keras
from keras import layers

seed = 2022
gamma = 0.99
max_steps_per_episode = 10000
env = gym.make('CartPole-v0')
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item()

In [6]:
num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape = (num_inputs,))
common = layers.Dense(num_hidden, activation ='relu')(inputs)
action = layers.Dense(num_actions, activation = 'softmax')(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs = inputs, outputs = [action, critic])

In [13]:
optimizer = tf.optimizers.Adam(learning_rate=0.01)
huber = tf.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1,max_steps_per_episode):
            #env.render()
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)
            
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0,0])
            
            action = np.random.choice(num_actions, p = np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0,action]))
            
            state, reward, done, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward
            
            if done:
                break
                
        running_reward = 0.05 * episode_reward + (1-0.05) * running_reward

        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma*discounted_sum
            returns.insert(0, discounted_sum)

        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            diff = ret - value
            actor_losses.append(-log_prob*diff)

            critic_losses.append(
                huber(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()
    
    episode_count += 1
    if episode_count % 10 == 0:
        template = 'running reward: {:.2f} at episode {}'
        print(template.format(running_reward, episode_count))
    if running_reward > 195:
        print('Solved at episode {}'.format(episode_count))
        break
    

running reward: 11.78 at episode 10
running reward: 17.98 at episode 20
running reward: 17.73 at episode 30
running reward: 14.78 at episode 40
running reward: 13.26 at episode 50
running reward: 11.88 at episode 60
running reward: 11.11 at episode 70
running reward: 10.83 at episode 80
running reward: 10.72 at episode 90
running reward: 11.28 at episode 100
running reward: 12.02 at episode 110
running reward: 11.52 at episode 120
running reward: 11.94 at episode 130
running reward: 11.14 at episode 140
running reward: 10.73 at episode 150
running reward: 11.23 at episode 160
running reward: 11.35 at episode 170
running reward: 12.03 at episode 180
running reward: 15.24 at episode 190
running reward: 18.84 at episode 200
running reward: 16.57 at episode 210
running reward: 24.77 at episode 220
running reward: 62.44 at episode 230
running reward: 85.22 at episode 240
running reward: 81.44 at episode 250
running reward: 101.63 at episode 260
running reward: 141.10 at episode 270
running 

In [14]:
state

array([-0.29101545, -0.34158212, -0.02133838,  0.22399257], dtype=float32)

In [15]:
type(state)

numpy.ndarray

In [16]:
type(reward)

float