# Vanilla Policy Gradient Cart Pole

### Cart Pole

Train a simple Gym discrete cart pole environment using a vanilla policy gradient algorithm

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from ludus.policies import VPGTrainer
from ludus.env import EnvController, make_cart_pole
from ludus.utils import preprocess_atari

In [4]:
from ludus.policies import PPOTrainer

In [9]:
env = make_cart_pole() # This instance of the environment is only used
                       # to get action and observation dimensions
    
# Creating a fully connected NN for the policy
obs_op = Input(shape=env.observation_space.shape)
dense1 = Dense(16, activation='tanh')(obs_op)
dense2 = Dense(16, activation='tanh')(dense1)
act_probs_op = Dense(env.action_space.n, activation='softmax')(dense2) # Prob dist over possible actions


vdense1 = Dense(16, activation='tanh')(obs_op)
vdense2 = Dense(16, activation='tanh')(vdense1)
value_op = Dense(1, activation='softmax')(vdense2)

# Wrap a Vanilla Policy Gradient Trainer on top of the network
network = PPOTrainer(obs_op, act_probs_op, value_op, act_type='discrete')

In [10]:
n_episodes = 5000 # Total episodes of data to collect
max_steps = 200 # Max number of frames per game
batch_size = 10 # Smaller = faster, larger = stabler
print_freq = 10 # How many training updates between printing progress

In [11]:
# Create the environment controller for generating game data
ec = EnvController(make_cart_pole, n_threads=4)

In [12]:
update_rewards = []

for i in range(int(n_episodes / batch_size)):
    ec.sim_episodes(network, batch_size, max_steps) # Simualate env to generate data
    update_rewards.append(ec.get_avg_reward()) # Append rewards to reward tracker list
    dat = ec.get_data() # Get all the data gathered
    network.train(dat) # Train the network with VPG
    if i != 0 and i % print_freq == 0:
        print(f'Update #{i}, Avg Reward: {np.mean(update_rewards[-print_freq:])}') # Print an update

Update #10, Avg Reward: 23.03
Update #20, Avg Reward: 27.28
Update #30, Avg Reward: 34.839999999999996
Update #40, Avg Reward: 45.970000000000006
Update #50, Avg Reward: 55.03000000000001
Update #60, Avg Reward: 61.09000000000001
Update #70, Avg Reward: 74.15999999999998
Update #80, Avg Reward: 97.59
Update #90, Avg Reward: 122.24000000000001


KeyboardInterrupt: 

In [None]:
ec.render_episodes(network, 5, max_steps) # Render an episode to see the result