# Importing Libraries

In [1]:
import tensorflow as tf
import numpy as np
import gym

# Environment Setup

In [2]:
# Build the CartPole environment and keep the underlying (unwrapped) instance.
# NOTE(review): unwrapping presumably drops gym's per-episode step cap for
# CartPole-v0, so episodes only end on failure -- confirm for your gym version.
env = gym.make('CartPole-v0')
env = env.unwrapped  # was `e.unwrapped`: `e` is undefined and raised NameError

# Seed the environment; kept as the last expression so the cell still shows its result.
env.seed(1)

[1]

# Hyperparameters

In [3]:
# Environment-derived sizes: read them from the spaces instead of hard-coding,
# so the network input/output widths always match the environment in use.
state_size = env.observation_space.shape[0]  # 4 for CartPole
action_size = env.action_space.n
print("Possible Actions: ", action_size)

# Training hyperparameters.
max_episodes = 300    # number of training episodes
learning_rate = 0.01  # step size handed to the optimizer
gamma = 0.95          # discount factor applied to future rewards

Possible Actions:  2


In [None]:
def discount_and_normalized_rewards(episode_rewards, discount_factor=None):
    """Compute discounted returns for one episode and standardize them.

    For every step ``t`` the discounted return is
    ``G_t = r_t + d * r_{t+1} + d**2 * r_{t+2} + ...``; the resulting vector is
    then shifted to zero mean and scaled to unit standard deviation.

    Args:
        episode_rewards: Sequence (list or array) of per-step rewards, in the
            order they were received.
        discount_factor: Discount ``d`` applied per step. When ``None``
            (default) the module-level ``gamma`` is used, which is the
            original behavior.

    Returns:
        A floating-point ``np.ndarray`` with the same length as
        ``episode_rewards`` holding the standardized discounted returns. If
        all discounted returns are (numerically) equal, the centered values
        (all zeros) are returned instead of dividing by a zero standard
        deviation. An empty input yields an empty array.
    """
    if discount_factor is None:
        discount_factor = gamma

    # Integer rewards would otherwise give an integer buffer below and
    # silently truncate every discounted value.
    rewards = np.asarray(episode_rewards)
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    discounted_episode_rewards = np.zeros_like(rewards)
    cum = 0.

    # Walk backwards so each return reuses the one computed for the next step.
    for i in reversed(range(len(rewards))):
        cum = cum * discount_factor + rewards[i]
        discounted_episode_rewards[i] = cum

    if discounted_episode_rewards.size == 0:
        return discounted_episode_rewards

    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = discounted_episode_rewards - mean

    # Only rescale when the spread is meaningfully non-zero; a single-step or
    # constant-return episode would otherwise produce NaN/inf.
    tiny = np.finfo(rewards.dtype).eps * max(1.0, abs(float(mean)))
    if std > tiny:
        discounted_episode_rewards = discounted_episode_rewards / std

    return discounted_episode_rewards

# Model Architecture

In [None]:
with tf.name_scope("inputs"):
    # --- Placeholders fed at run time -------------------------------------
    # Batch of observations, one row per time step.
    inputs_ = tf.placeholder(dtype=tf.float32,
                             shape=(None, state_size),
                             name='input')
    # One-hot encoding of the action taken at each time step.
    actions = tf.placeholder(dtype=tf.int32,
                             shape=(None, action_size),
                             name='actions')
    # Standardized discounted return for each time step.
    discounted_episode_rewards_ = tf.placeholder(dtype=tf.float32,
                                                 shape=(None,),
                                                 name='discounted_episode_rewards')

    # Scalar used only for the TensorBoard summary.
    mean_reward_ = tf.placeholder(dtype=tf.float32, name='mean_reward')

    # --- Policy network: state -> 10 (ReLU) -> action_size (ReLU) -> logits
    with tf.name_scope('fc1'):
        fc1 = tf.contrib.layers.fully_connected(
            inputs=inputs_,
            num_outputs=10,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
        )

    with tf.name_scope('fc2'):
        fc2 = tf.contrib.layers.fully_connected(
            inputs=fc1,
            num_outputs=action_size,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
        )

    with tf.name_scope('fc3'):
        # No activation here: fc3 holds the raw logits.
        fc3 = tf.contrib.layers.fully_connected(
            inputs=fc2,
            num_outputs=action_size,
            activation_fn=None,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
        )

    with tf.name_scope('softmax'):
        # Probability of each action under the current policy.
        action_distribution = tf.nn.softmax(logits=fc3)

    with tf.name_scope('loss'):
        # Cross-entropy against the one-hot action is -log pi(action | state);
        # each step's term is weighted by its discounted return.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(labels=actions,
                                                                  logits=fc3)
        loss = tf.reduce_mean(tf.multiply(neg_log_prob, discounted_episode_rewards_))

    with tf.name_scope('train'):
        train_opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

## Tensorboard Config

In [None]:
# Write event files next to the notebook (relative path, like the checkpoint
# directory) instead of at the filesystem root, which is normally not writable
# for non-root users. View with: tensorboard --logdir=./tensorboard/pg/1
writer = tf.summary.FileWriter('./tensorboard/pg/1')

# Scalars tracked per episode.
tf.summary.scalar('loss', loss)
tf.summary.scalar('reward_mean', mean_reward_)

# Single op that evaluates every summary registered in the graph.
write_op = tf.summary.merge_all()

# Training

In [None]:
# Running statistics across episodes.
all_rewards = []
total_rewards = 0
maximumRewardRecorded = 0
episode = 0
# Per-episode trajectory buffers; emptied after every policy update.
episode_states, episode_actions, episode_rewards = [], [], []

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for episode in range(1, max_episodes+1):
        episode_rewards_sum = 0

        state = env.reset()
        env.render()

        while True:
            # Query the policy for the current state. The reshape uses
            # state_size (not a literal 4) so it matches the input placeholder.
            action_probability_distribution = sess.run(
                action_distribution,
                feed_dict={inputs_: state.reshape([1, state_size])})

            # Sample an action according to the predicted probabilities.
            action = np.random.choice(range(action_probability_distribution.shape[1]),
                                      p=action_probability_distribution.ravel())

            new_state, reward, done, info = env.step(action)

            # Record the transition; the action is stored one-hot to match
            # the `actions` placeholder.
            episode_states.append(state)

            action_ = np.zeros(action_size)
            action_[action] = 1
            episode_actions.append(action_)

            episode_rewards.append(reward)

            if done:
                episode_rewards_sum = np.sum(episode_rewards)
                all_rewards.append(episode_rewards_sum)
                total_rewards = np.sum(all_rewards)
                mean_reward = np.divide(total_rewards, episode)

                maximumRewardRecorded = np.amax(all_rewards)

                print("======================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Mean reward: ", mean_reward)
                print("Max reward so far: ", maximumRewardRecorded)

                discounted_episode_rewards = discount_and_normalized_rewards(episode_rewards)

                # Build the episode batch once and reuse it for both the
                # update and the summary (mean_reward_ is only read by the
                # summary op; feeding it to the training run is harmless).
                episode_feed = {
                    inputs_: np.vstack(np.array(episode_states)),
                    actions: np.vstack(np.array(episode_actions)),
                    discounted_episode_rewards_: discounted_episode_rewards,
                    mean_reward_: mean_reward,
                }

                # One gradient step per finished episode.
                loss_, _ = sess.run([loss, train_opt], feed_dict=episode_feed)

                # Summaries are evaluated after the update, as before.
                summary = sess.run(write_op, feed_dict=episode_feed)

                writer.add_summary(summary, episode)
                writer.flush()

                # Start the next episode with empty buffers.
                episode_states, episode_actions, episode_rewards = [], [], []

                break

            state = new_state

        # Checkpoint every 10 episodes.
        if episode%10 == 0:
            saver.save(sess, './models/model.ckpt')
            print("Model Saved")

# The `with` block has already closed the session, so no explicit
# sess.close() is needed here.
env.close()

# Testing - Agent plays the game

In [None]:
test_episodes = 10

with tf.Session() as sess:
    env.reset()
    rewards = []

    # Load the weights written by the training cell.
    saver.restore(sess, './models/model.ckpt')

    for episode in range(1, test_episodes+1):
        state = env.reset()
        env.render()
        step = 0
        done = False
        total_rewards = 0
        print("===============================")
        print("Episode: ", episode)

        # Play until the environment reports the episode is over.
        while not done:
            action_probability_distribution = sess.run(
                action_distribution,
                feed_dict={inputs_: state.reshape([1, state_size])})

            # Actions are sampled from the policy, not taken greedily.
            action = np.random.choice(
                range(action_probability_distribution.shape[1]),
                p=action_probability_distribution.ravel())
            new_state, reward, done, info = env.step(action)
            total_rewards += reward

            # Advance only while the episode is still running.
            if not done:
                state = new_state

        rewards.append(total_rewards)
        print("Score: ", total_rewards)

    env.close()
    print("Average Score: " + str(sum(rewards)/test_episodes))