# Vanilla Policy Gradient Cart Pole

### Cart Pole

Train a simple Gym discrete cart pole environment using a vanilla policy gradient algorithm

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed
from tensorflow.keras.models import Model
from ludus.policies import VPGTrainer
from ludus.env import EnvController, make_cart_pole
from ludus.utils import preprocess_atari, reshape_train_var
from ludus.policies import BaseTrainer

In [2]:
n_episodes = 5000 # Total episodes of data to collect
max_steps = 200 # Max number of frames per game
batch_size = 10 # Smaller = faster, larger = stabler
print_freq = 10 # How many training updates between printing progress

In [3]:
class RNNVPGTrainer(BaseTrainer):
    def __init__(self, in_op, out_op, act_type='discrete', sess=None):
        """
        Create a wrapper for RL networks for easy training.
        Args:
            in_op (tf.Placeholder): Observation input to architecture
            out_op (tf.Variable): Action output of architecture
            act_type (string): 'discrete' for a discrete actions space or 'continuous'
                               for a continuous actions space
            sess (tf.Session): A session if you would like to use a custom session,
                               if left none it will be automatically created
        """
        super().__init__(in_op, out_op, act_type, sess)
        
    def _create_discrete_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a discrete action space
        """
        self.act_holders = tf.placeholder(tf.int32, shape=[None])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])
        self.f_out_op = tf.reshape(self.out_op, [-1, self.out_op.shape[2].value])
        
        self.act_masks = tf.one_hot(self.act_holders, self.f_out_op.shape[1].value, dtype=tf.float32)
        self.log_probs = tf.log(self.f_out_op)
        
        self.resp_acts = tf.reduce_sum(self.act_masks *  self.log_probs, axis=1)
        self.loss = -tf.reduce_mean(self.resp_acts * self.reward_holders)
        
        self.optimizer = optimizer
        self.update = self.optimizer.minimize(self.loss)
        
        def update_func (train_data):
            print(reshape_train_var(train_data[:, 0]).shape)
            print(train_data[:, 1].reshape(-1).shape)
            print(train_data[:, 2].reshape(-1).shape)
            
            train_obs = np.array([[y[0] for y in x] for x in base[:,:,0]])
            
            self.sess.run(self.update, feed_dict={self.in_op: train_obs,
                                                  self.act_holders: train_data[:, 1].reshape(-1),
                                                  self.reward_holders: train_data[:, 2].reshape(-1)})
        
        self.sess.run(tf.global_variables_initializer())
        
        return update_func

In [4]:
env = make_cart_pole() # This instance of the environment is only used
                       # to get action and observation dimensions
    
# Creating a fully connected NN for the policy
obs_op = Input(shape=[max_steps]+list(env.observation_space.shape))
dense1 = LSTM(16, activation='tanh', return_sequences=True)(obs_op)
dense2 = LSTM(16, activation='tanh', return_sequences=True)(dense1)
act_probs_op = TimeDistributed(Dense(env.action_space.n, activation='softmax'))(dense2) # Prob dist over possible actions
model = Model(inputs=obs_op, outputs=act_probs_op)

# Wrap a Vanilla Policy Gradient Trainer on top of the network
network = RNNVPGTrainer(obs_op, act_probs_op, act_type='discrete')

  result = entry_point.load(False)


In [5]:
# Creating a fully connected NN for the policy
obs_op_o = Input(shape=[1]+list(env.observation_space.shape), batch_size=1)
dense1_o = LSTM(16, activation='tanh', return_sequences=True, stateful=True)(obs_op_o)
dense2_o = LSTM(16, activation='tanh', stateful=True)(dense1_o)
act_probs_op_o = Dense(env.action_space.n, activation='softmax')(dense2_o) # Prob dist over possible actions
model_o = Model(inputs=obs_op_o, outputs=act_probs_op_o)

# Wrap a Vanilla Policy Gradient Trainer on top of the network
network_o = VPGTrainer(obs_op_o, act_probs_op_o, act_type='discrete')

In [6]:
def update_weights():
    model_o.set_weights(model.get_weights())

In [7]:
update_weights()

In [8]:
# Create the environment controller for generating game data
ec = EnvController(make_cart_pole, n_threads=1)

In [9]:
ec.set_pre_episode_steps(model.reset_states)
ec.set_obs_transform(lambda x: [x])

In [10]:
# base = np.empty([len(dat), max_steps, 4], dtype=object)
# stock = [np.zeros([4]), 0, 0, np.zeros([4])]
# base.fill(stock)

# for x in range(len(base)):
#     base[x][:len(dat[x])] = dat[x]

In [11]:
base[0, 0, 0]

NameError: name 'base' is not defined

In [71]:
base[:,:,0].shape

(10, 200)

In [73]:
np.array([[y[0] for y in x] for x in base[:,:,0]]).shape

(10, 200, 4)

In [61]:
bp.array(list(base[:,:,0].reshape(-1))

array([list([array([-0.03056409,  0.04171738,  0.03255501, -0.03062589])]),
       list([array([-0.02972975, -0.15385593,  0.03194249,  0.27214813])]),
       list([array([-0.03280687, -0.34941877,  0.03738545,  0.57473224])]),
       ...,
       list([array([0., 0., 0., 0.]), 0, 0, array([0., 0., 0., 0.])]),
       list([array([0., 0., 0., 0.]), 0, 0, array([0., 0., 0., 0.])]),
       list([array([0., 0., 0., 0.]), 0, 0, array([0., 0., 0., 0.])])],
      dtype=object)

In [None]:
[]

In [55]:
np.concatenate(list(base[:,:,0].reshape(-1)))

ValueError: all the input arrays must have same number of dimensions

In [56]:
list(base[:,:,0].reshape(-1))

[[array([-0.03056409,  0.04171738,  0.03255501, -0.03062589])],
 [array([-0.02972975, -0.15385593,  0.03194249,  0.27214813])],
 [array([-0.03280687, -0.34941877,  0.03738545,  0.57473224])],
 [array([-0.03979524, -0.15484033,  0.0488801 ,  0.29405728])],
 [array([-0.04289205,  0.03955189,  0.05476124,  0.01718225])],
 [array([-0.04210101, -0.1563109 ,  0.05510489,  0.32662779])],
 [array([-0.04522723, -0.35217233,  0.06163744,  0.63616663])],
 [array([-0.05227067, -0.5480973 ,  0.07436078,  0.94760593])],
 [array([-0.06323262, -0.35405114,  0.09331289,  0.67918299])],
 [array([-0.07031364, -0.55033695,  0.10689655,  0.99972499])],
 [array([-0.08132038, -0.7467127 ,  0.12689105,  1.32397502])],
 [array([-0.09625464, -0.55340119,  0.15337055,  1.07354499])],
 [array([-0.10732266, -0.36060194,  0.17484145,  0.83265517])],
 [array([-0.1145347 , -0.16824492,  0.19149456,  0.5996605 ])],
 [array([-0.1178996 ,  0.02375524,  0.20348777,  0.37288061])],
 [array([0., 0., 0., 0.]), 0, 0, array([

In [25]:
reshape_train_var(base[:,0]).shape

(10, 4)

# update_rewards = []

for i in range(int(n_episodes / batch_size)):
    ec.sim_episodes(network_o, batch_size, max_steps) # Simualate env to generate data
    update_rewards.append(ec.get_avg_reward()) # Append rewards to reward tracker list
    dat = ec.get_data(sep_rollouts=True) # Get all the data gathered
    
    base = np.empty([len(dat), max_steps, 4], dtype=object)
    stock = [np.zeros([4]), 0, 0, np.zeros([4])]
    base.fill(stock)

    for x in range(len(base)):
        base[x][:len(dat[x])] = dat[x]
    
    #print(dat)
    network.train(base) # Train the network with VPG
    if i != 0 and i % print_freq == 0:
        print(f'Update #{i}, Avg Reward: {np.mean(update_rewards[-print_freq:])}') # Print an update

In [None]:
ec.render_episodes(network, 5, max_steps) # Render an episode to see the result