In [5]:
import numpy as np
import sys
import gym
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
#from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam

from collections import deque
import random

# ***Policy Gradient***

In [6]:
class PolicyGradientAgent:
    def __init__(self, states, actions):
        # To see Cartpole learning, change to True
        self.render = False
        self.load_model = False

        # Define size of state and actions
        self.states = states
        self.actions = actions

        # Hyper parameters for the DQN

        self.learning_rate = 0.01
        self.gamma = 0.99

        self.model = self.build_model()


    def build_model(self):
        model = Sequential()
        model.add(Dense(128, input_dim=self.states, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.actions, activation='softmax'))
        model.compile(optimizer=Adam(learning_rate=self.learning_rate),
                    loss='categorical_crossentropy')
        #model.summary()
        return model
        
    def choose_action(self, state):
       # Sample an action from the policy
        probabilities = self.model.predict(state, verbose=0)[0]
        action = np.random.choice(self.actions, p=probabilities)
        return action

    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
        sum_reward = 0
        for t in reversed(range(len(rewards))):
            running_add = rewards[t] + self.gamma * sum_reward 
            discounted_rewards[t] = running_add
        # Normalize the discounted rewards
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)
        return discounted_rewards

    def train(self, states, actions, rewards):
        discounted_rewards = self.discount_rewards(rewards)
        actions_one_hot = tf.one_hot(actions, self.actions, dtype=tf.float32)

        with tf.GradientTape() as tape:
            # Compute the log probability of the taken actions
            action_probabilities = self.model(states, training=True)
            chosen_action_probabilities = tf.reduce_sum(actions_one_hot * action_probabilities, axis=1)
            log_probabilities = tf.math.log(chosen_action_probabilities)

            # Compute the loss (negative log-likelihood multiplied by discounted rewards)
            loss = -tf.reduce_sum(log_probabilities * discounted_rewards)

        # Update the model using the gradient of the loss with respect to the model parameters
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))



In [8]:
if __name__ == "__main__":
    env = gym.make('CartPole-v1', render_mode='human')
    states = env.observation_space.shape[0]
    actions = env.action_space.n

    EPISODES = 300

    agent = PolicyGradientAgent(states, actions)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state, _ = env.reset()
        state = np.reshape(state, [1, states])

        states_batch, actions_batch, rewards_batch = [], [], []

        while not done:
            if agent.render == 1:
                env.render()

            action = agent.choose_action(state)
            next_state, reward, done, _, _ = env.step(action)
            next_state = np.reshape(next_state, [1, states])

            states_batch.append(state)
            actions_batch.append(action)
            rewards_batch.append(reward)

            score += reward
            state = next_state

        # Train the model at the end of the episode
        states_batch = np.vstack(states_batch)
        actions_batch = np.array(actions_batch)
        rewards_batch = np.array(rewards_batch)
        agent.train(states_batch, actions_batch, rewards_batch)

        scores.append(score)
        episodes.append(e)

        print("episode:", e, "  score:", score)

        if np.mean(scores[-min(10, len(scores)):]) > 490:
            break

    env.close()

  if not isinstance(terminated, (bool, np.bool8)):
  discounted_rewards /= np.std(discounted_rewards)


episode: 0   score: 22.0


ValueError: probabilities contain NaN