## PPO

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from time import time
import numpy as np
import gym 

from PPO_agent import ReinforceAgent
from utils import RunningVariance, get_advantages

In [None]:
# Learning rates for the value network (critic) and the policy network (actor).
critic_lr = 1e-3
actor_lr = 1e-3

# Clipping value handed to the agent as LOSS_CLIPPING.
# NOTE(review): the original comment said this value is "recommended by the
# paper"; the PPO paper reports a clip range of 0.2, so confirm 0.01 is intended.
LOSS_CLIPPING = 1e-2

# Entropy coefficient handed to the agent as ENTROPY_LOSS; 0.0 as written here
# (5e-4 was noted as an alternative value in the original comment).
ENTROPY_LOSS = 0.0

## Explorando Lunar Lander

In [4]:
# Standalone LunarLander-v2 environment, used in this notebook to inspect the
# observation space.
# NOTE(review): the agent is given the environment name separately and
# presumably builds its own instance - confirm.
env = gym.make('LunarLander-v2')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


## Instancio los agentes

In [5]:
# Wall-clock reference; the training loop logs elapsed time relative to it.
initial_time = time()
# Accumulator fed with the advantages of every training iteration.
running_variance = RunningVariance()

# Agent configuration, gathered in one place and unpacked into the constructor.
agent_kwargs = dict(
    ENV='LunarLander-v2',
    n_experience_episodes=10,
    EPISODES=1000,
    epochs=10,
    LOSS_CLIPPING=LOSS_CLIPPING,
    ENTROPY_LOSS=ENTROPY_LOSS,
    lr=actor_lr,
    algorithm='PPO',
    gif_to_board=True,
    batch_size=64,
    gamma=0.99,
)
reinforce_agent = ReinforceAgent(**agent_kwargs)

# Critic model with a single output, built by the agent.
# NOTE(review): reinforce_agent.nS is presumably the observation dimension - confirm.
critic_kwargs = dict(
    lr=critic_lr,
    hidden_layer_neurons=128,
    input_shape=[reinforce_agent.nS],
    output_shape=1,
)
critic_model = reinforce_agent.get_critic_model(**critic_kwargs)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m







In [15]:
# Display the environment's observation space.
env.observation_space

Box(8,)

## Preentreno V(s) para que no arranque con predicciones basura ##

In [6]:

# Collect a batch of episodes before any policy training has taken place.
(obs, actions, preds, disc_sum_rews,
 rewards, ep_returns, ep_len, time_steps) = reinforce_agent.get_experience_episodes(return_ts=True)

# Drop the last observation of every episode (it has no associated reward)
# and stack the rest into a single array.
observations = np.vstack(
    [obs[i][:-1] for i in range(reinforce_agent.n_experience_episodes)]
)

# Fit V(s) on the stacked disc_sum_rews targets
# (presumably discounted reward sums - confirm against the agent).
critic_targets = np.vstack(disc_sum_rews)
history_critic = critic_model.fit(
    observations,
    critic_targets,
    verbose=0,
    epochs=reinforce_agent.epochs,
    batch_size=reinforce_agent.batch_size,
)












## Ciclo de entrenamiento del modelo     ##

In [7]:
# Added to the advantage standard deviation so the normalization below never
# divides by zero (e.g. when every advantage in a batch is identical).
ADVANTAGE_EPS = 1e-8

# Value passed to get_advantages as `lmbda`.
# NOTE(review): if get_advantages implements GAE, 0.1 is unusually low
# (values around 0.95 are common) - confirm this is intended.
ADVANTAGE_LAMBDA = 0.1

while reinforce_agent.episode < reinforce_agent.EPISODES:

    # Collect a batch of episodes with the policy that is being trained.
    obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

    # Stack the actions of all episodes into a single array.
    actions = np.vstack(actions)

    # Stack the predictions made while collecting experience; they are fed to
    # the training model as the "old" predictions, while the new predictions
    # are the network's output.
    old_prediction = np.vstack(preds)

    # Per episode: compute advantages from the critic's values and keep the
    # observations without the final one.
    advantage = []
    observations = []
    for i in range(reinforce_agent.n_experience_episodes):
        values = critic_model.predict(obs[i])

        advantage.append(get_advantages(values, rewards[i],
                                        gamma=reinforce_agent.gamma,
                                        lmbda=ADVANTAGE_LAMBDA))
        observations.append(obs[i][:-1])

    advantage = np.vstack(advantage)
    observations = np.vstack(observations)

    # Feed the raw (not yet normalized) advantages to the running variance tracker.
    for ad in advantage:
        running_variance.add(ad)

    # Normalize the advantages. The epsilon keeps the result finite when the
    # standard deviation is zero, instead of producing NaN/inf values that
    # would be passed into the policy fit.
    advantage = (advantage - advantage.mean()) / (advantage.std() + ADVANTAGE_EPS)

    # Policy training.
    history_loss = reinforce_agent.model_train.fit([observations, advantage, old_prediction],
                                                   actions,
                                                   verbose=0,
                                                   epochs=reinforce_agent.epochs,
                                                   batch_size=reinforce_agent.batch_size)

    # V(s) training.
    history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews),
                                      verbose=0,
                                      epochs=reinforce_agent.epochs,
                                      batch_size=reinforce_agent.batch_size)

    # Log results.
    # NOTE(review): the history entries are read at index 0 (presumably the
    # first fit epoch), and only the last element of ep_returns is logged -
    # confirm both are intended.
    reinforce_agent.log_data(reinforce_agent.episode,
                      history_loss.history['loss'][0],
                      np.mean(ep_len),
                      reinforce_agent.get_entropy(old_prediction),
                      running_variance.get_variance(),
                      history_loss.history['actor_loss'][0],
                      time() - initial_time, np.mean(ep_returns[-1]),
                      history_critic.history['loss'][0])

# Close the agent's writer once the training loop has finished.
reinforce_agent.writer.close()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
correr en linea de comando: tensorboard --logdir logs/
Episode: 51
Model on episode 52 improved from -inf to -102.44621711879805. Saved!
add_video needs package moviepy
Episode: 103
Model on episode 104 did not improved -124.05945068471156. Best saved: -102.44621711879805
Episode: 155
Model on episode 156 did not improved -190.48929802919614. Best saved: -102.44621711879805
Episode: 207
Model on episode 208 improved from -102.44621711879805 to -69.98005340370891. Saved!
add_video needs package moviepy
Episode: 259
Model on episode 260 improved from -69.98005340370891 to -44.97927679578034. Saved!
add_video needs package moviepy
Episode: 311
Model on episode 312 did not improved -45.300654645070686. Best saved: -44.97927679578034
Episode: 363
Model on episode 364 improved from -44.97927679578034 to -43.053678909076176. Saved!
add_video needs package moviepy
Episode: 415
Model on episode 416 imp

In [8]:
# Display the log directory of this run.
reinforce_agent.logdir

'logs/LunarLander-v2/PPO/10_10_64_0.99_0.001_1577719517'