<a href="https://colab.research.google.com/github/dude123studios/AdvancedReinforcementLearning/blob/main/Policy_Gradient_Method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
import gym

In [None]:
# Create the CartPole-v0 environment used by every later cell.
env = gym.make('CartPole-v0')
# Length of the first axis of the observation space's shape, i.e. the number
# of features per observation; used as the policy network's input width.
state_shape = env.observation_space.shape[0]
# `.n` of the action space; used as the width of the policy network's output
# layer and of the one-hot action vectors.
action_shape = env.action_space.n

In [None]:
# Discount factor applied to future rewards.
gamma = 0.95


def discount_and_normalize_rewards(episode_rewards):
    """Return the normalized discounted reward-to-go for each time step.

    For step ``t`` the reward-to-go is
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``. The resulting
    vector is then shifted to zero mean and scaled to unit standard
    deviation.

    Args:
        episode_rewards: Sequence of per-step rewards for one episode
            (list or 1-D array, integer or floating point).

    Returns:
        A 1-D floating-point ``numpy.ndarray`` with the same length as
        ``episode_rewards``. An empty input yields an empty array. When
        the discounted rewards have (numerically) zero spread, e.g. for a
        single-step episode, the centered values are returned without
        scaling instead of dividing by zero.
    """
    rewards = np.asarray(episode_rewards)
    # Integer rewards would make the in-place normalization below fail with
    # a casting error, so always work in floating point.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    discounted_rewards = np.zeros_like(rewards)

    # Accumulate from the last step backwards so every step reuses the
    # already discounted sum of the steps that follow it.
    reward_to_go = 0.0
    for i in reversed(range(len(rewards))):
        reward_to_go = reward_to_go * gamma + rewards[i]
        discounted_rewards[i] = reward_to_go

    if discounted_rewards.size == 0:
        # Nothing to normalize; mean/std of an empty array would be NaN.
        return discounted_rewards

    discounted_rewards -= np.mean(discounted_rewards)

    # Skip the scaling when the spread is zero (up to rounding noise):
    # dividing would produce NaN or amplify the noise.
    std = np.std(discounted_rewards)
    if std > 1e-8:
        discounted_rewards /= std

    return discounted_rewards

In [None]:
# Graph inputs. The leading `None` dimension lets each placeholder accept
# either a single step (action sampling) or a whole episode (training).
state_ph = tf.placeholder(
    dtype=tf.float32, shape=(None, state_shape), name='state_ph')
# Fed with one-hot encoded actions, one row per time step.
action_ph = tf.placeholder(
    dtype=tf.int32, shape=(None, action_shape), name='action_ph')
# Fed with one discounted (and normalized) return per time step.
discounted_rewards_ph = tf.placeholder(
    dtype=tf.float32, shape=(None,), name='discounted_rewards')

In [None]:
# Policy network: one ReLU hidden layer of 32 units followed by a linear
# layer that emits one logit per action.
layer1 = tf.layers.dense(inputs=state_ph, units=32, activation=tf.nn.relu)
layer2 = tf.layers.dense(inputs=layer1, units=action_shape)
# Softmax over the logits gives the action probabilities pi(a|s).
prob_dist = tf.nn.softmax(logits=layer2)

In [None]:
# Softmax cross-entropy computes -sum_a label[a] * log(pi(a|s)). With a
# one-hot label every term except the taken action is multiplied by 0, so
# the result is simply -log(pi(action taken|s)).
neg_loss_policy = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=action_ph,
    logits=layer2,
)

# Policy gradient: weight each step's negative log-probability by its
# discounted return and average over the fed steps.
loss = tf.reduce_mean(discounted_rewards_ph * neg_loss_policy)

train = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(loss)

In [None]:
# Number of episodes to train on; one gradient step is taken per episode.
num_iterations = 1000

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    for i in range(num_iterations):

        # Trajectory buffers for the current episode.
        episode_states, episode_actions, episode_rewards = [], [], []

        done = False

        # Undiscounted sum of rewards, used only for progress reporting.
        Return = 0

        state = env.reset()

        while not done:

            # Add a batch dimension for the network. Use the environment's
            # observation size instead of a hard-coded 4 so the loop keeps
            # working if the environment is swapped.
            state = state.reshape([1, state_shape])

            pi = sess.run(prob_dist, feed_dict={state_ph: state})

            # Sample an action index from the current policy distribution.
            a = np.random.choice(pi.shape[1], p=pi.ravel())

            next_state, reward, done, info = env.step(a)

            #env.render()

            Return += reward

            # One-hot encode the action to match action_ph.
            action = np.zeros(action_shape)
            action[a] = 1

            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            state = next_state

        discounted_rewards = discount_and_normalize_rewards(episode_rewards)

        # Stack the per-step (1, state_shape) states and (action_shape,)
        # one-hot actions into one row per time step.
        feed_dict = {state_ph: np.vstack(episode_states),
                     action_ph: np.vstack(episode_actions),
                     discounted_rewards_ph: discounted_rewards}

        # One policy-gradient update on the whole episode.
        loss_, _ = sess.run([loss, train], feed_dict=feed_dict)

        if i % 10 == 0:
            print('Iteration: {}, Return: {}'.format(i, Return))

Iteration: 0, Return: 55.0
Iteration: 10, Return: 22.0
Iteration: 20, Return: 47.0
Iteration: 30, Return: 48.0
Iteration: 40, Return: 31.0
Iteration: 50, Return: 28.0
Iteration: 60, Return: 135.0
Iteration: 70, Return: 158.0
Iteration: 80, Return: 165.0
Iteration: 90, Return: 200.0
Iteration: 100, Return: 200.0
Iteration: 110, Return: 181.0
Iteration: 120, Return: 129.0
Iteration: 130, Return: 54.0
Iteration: 140, Return: 200.0
Iteration: 150, Return: 200.0
Iteration: 160, Return: 137.0
Iteration: 170, Return: 163.0
Iteration: 180, Return: 197.0
Iteration: 190, Return: 163.0
Iteration: 200, Return: 200.0
Iteration: 210, Return: 200.0
Iteration: 220, Return: 157.0
Iteration: 230, Return: 200.0
Iteration: 240, Return: 200.0
Iteration: 250, Return: 200.0
Iteration: 260, Return: 200.0
Iteration: 270, Return: 200.0
Iteration: 280, Return: 200.0
Iteration: 290, Return: 200.0
Iteration: 300, Return: 200.0
Iteration: 310, Return: 200.0
Iteration: 320, Return: 200.0
Iteration: 330, Return: 200.