# PPO Lunar Lander Example

### Lunar Lander

Train an agent on the Gym continuous-action Lunar Lander environment using the Proximal Policy Optimization (PPO) algorithm.

Because PPO by itself does not explore much, this example will often get stuck in a local minimum.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
import gym
from ludus.policies import PPOTrainer
from ludus.env import EnvController, make_lunar_lander_c

In [None]:
# Throwaway environment instance: only its observation/action space shapes are read here
env = make_lunar_lander_c()
obs_shape = env.observation_space.shape
n_action_dims = env.action_space.shape[0]

# Both branches below are small fully connected (Dense) stacks, not conv nets
hidden_units = 32
hidden_activation = 'tanh'

# Policy branch: observation -> two hidden layers -> one linear output per action dimension.
# NOTE(review): presumably PPOTrainer turns these outputs into the continuous
# action distribution -- confirm against ludus.
obs_op = Input(shape=obs_shape)
dense1 = Dense(hidden_units, activation=hidden_activation)(obs_op)
dense2 = Dense(hidden_units, activation=hidden_activation)(dense1)
act_probs_op = Dense(n_action_dims)(dense2)

# Value branch: separate hidden layers on the same observation input,
# ending in a single scalar estimate of the observed state's value
vdense1 = Dense(hidden_units, activation=hidden_activation)(obs_op)
vdense2 = Dense(hidden_units, activation=hidden_activation)(vdense1)
value_op = Dense(1)(vdense2)

# Wrap a Proximal Policy Optimization trainer around the policy and value outputs
network = PPOTrainer(
    obs_op,
    act_probs_op,
    value_op,
    act_type='continuous',
    ppo_iters=40,
    entropy_coef=1.,
)

In [None]:
# Total episodes of data to collect over the whole run
n_episodes = 10_000
# Max number of frames per game
max_steps = 400
# Episodes gathered per training update (smaller = faster, larger = stabler)
batch_size = 8
# How many training updates between printing progress
print_freq = 10

In [None]:
# Create the environment controller for generating game data.
# It receives the environment factory itself (not an instance) and n_threads=4;
# NOTE(review): presumably it builds its own environments for parallel
# rollouts -- confirm against ludus.
ec = EnvController(make_lunar_lander_c, n_threads=4)
# Set the transform applied to actions (not observations): every action
# component is clipped into [-1, 1].
# NOTE(review): presumably applied to the policy output before it reaches the
# environment -- confirm against ludus.
ec.set_act_transform(lambda x: np.clip(x, -1, 1))

In [None]:
# Average reward recorded once per training update
update_rewards = []
# One training update per batch of episodes
n_updates = int(n_episodes / batch_size)

for i in range(n_updates):
    # Simulate batch_size episodes (at most max_steps frames each) with the current network
    ec.sim_episodes(network, batch_size, max_steps)
    # Track this batch's average reward (read before the data is fetched)
    update_rewards.append(ec.get_avg_reward())
    # Train the network with PPO on all the data gathered
    network.train(ec.get_data())

    # Only report every print_freq updates, and never on the very first one
    if i == 0 or i % print_freq != 0:
        continue
    # Mean of the most recent print_freq per-update average rewards
    print(f'Update #{i}, Avg Reward: {np.mean(update_rewards[-print_freq:])}')

In [None]:
# Render a handful of episodes with the network to see the result
n_render_episodes = 5
ec.render_episodes(network, n_render_episodes, max_steps)