In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()
import os
os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)
import moviepy.editor as mpy

In [3]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [4]:
class ReinforceAgent(BaseAgent):
    def proximal_policy_optimization_loss(self, advantage, old_prediction):
        def loss(y_true, y_pred):
            prob = y_true * y_pred
            old_prob = y_true * old_prediction
            r = prob/(old_prob + 1e-10)
            return -K.mean(K.minimum(r * advantage, K.clip(r, min_value=1 - self.LOSS_CLIPPING, max_value=1 + self.LOSS_CLIPPING) * advantage) 
                           + self.ENTROPY_LOSS * -(prob * K.log(prob + 1e-10)))
        return loss

    def get_policy_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        ## Defino métrica - loss sin el retorno multiplicando
        def actor_loss(y_true, y_pred):
            prob = y_true * y_pred
            old_prob = y_true * old_prediction
            r = prob/(old_prob + 1e-10)
            return K.max(r)
        
        state_input = Input(shape=input_shape)
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(output_shape,))

        x = Dense(hidden_layer_neurons, activation='relu')(state_input)
        
        out_actions = Dense(output_shape, activation='softmax', name='output')(x)

        model_train = Model(inputs=[state_input, advantage, old_prediction], outputs=[out_actions])
        model_predict = Model(inputs=[state_input], outputs=[out_actions])
        
        model_train.compile(Adam(lr), loss=[self.proximal_policy_optimization_loss(advantage, old_prediction)], metrics=[actor_loss])
        return model_train, model_predict
    
    def get_action(self, eval=False):
        obs = self.scaler.transform(self.observation.reshape(1, self.nS))
        obs = self.observation.reshape(1, self.nS)
        p = self.model_predict.predict(obs)
        if eval is False:
            action = np.random.choice(self.nA, p=p[0]) #np.nan_to_num(p[0])
        else:
            action = np.argmax(p[0])
        action_one_hot = np.zeros(self.nA)
        action_one_hot[action] = 1
        return action, action_one_hot, p
    
    def get_entropy(self, preds, epsilon=1e-12):
        entropy = np.mean(-np.sum(np.log(preds+epsilon)*preds, axis=1)/np.log(self.nA))
        return entropy
    
    def get_critic_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=1):
        model = Sequential()
        model.add(Dense(hidden_layer_neurons, input_shape=input_shape, activation='relu'))
#         model.add(Dense(hidden_layer_neurons, input_shape=input_shape, activation='selu'))
        model.add(Dense(output_shape, activation='linear'))
        model.compile(Adam(lr), loss=['mse'])
        return model
    
    def get_discounted_rewards(self, r):
        # Por si es una lista
        r = np.array(r, dtype=float)
        """Take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r 

In [5]:
reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=10, EPISODES=2000, epochs=1, 
                                 lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)

Instructions for updating:
Colocations handled automatically by placer.


In [6]:
reinforce_agent.model_predict.summary()
reinforce_agent.model_predict.input.shape

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1152      
_________________________________________________________________
output (Dense)               (None, 4)                 516       
Total params: 1,668
Trainable params: 1,668
Non-trainable params: 0
_________________________________________________________________


TensorShape([Dimension(None), Dimension(8)])

In [7]:
reinforce_agent.model_train.summary()
reinforce_agent.model_train.input

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1152      
_________________________________________________________________
output (Dense)               (None, 4)                 516       
Total params: 1,668
Trainable params: 1,668
Non-trainable params: 0
_________________________________________________________________


[<tf.Tensor 'input_1:0' shape=(?, 8) dtype=float32>,
 <tf.Tensor 'input_2:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'input_3:0' shape=(?, 4) dtype=float32>]

In [8]:
def get_advantages(values, rewards, gamma=0.999, lmbda=0.95):
    #GAE
    returns = []
    gae = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] - values[i]
        gae = delta + gamma * lmbda * gae
        returns.insert(0, gae + values[i])

    adv = np.array(returns) - values[:-1]
    return adv

In [9]:
def compute_n_step_targets(rewards, values, gamma=0.999, n_steps = 5):
    out = []
    ep_len = len(rewards)
    gammas = np.power(gamma, range(ep_len+1)) # El +1 es un hack para que no cuelgue cuando le pasamos n_steps = len(rewards)
    padded_values = np.vstack([values, np.zeros([n_steps, 1])])
    for t in range(ep_len):
        # t desde donde comienzo, por ejemplo si t=0 sumo desde 0 a n_steps-1
        rewards_left = min([0, ep_len-t-n_steps])
        first_term = (gammas[:(n_steps+rewards_left)]*rewards[t:t+n_steps]).sum()
        A_t =  first_term - padded_values[t] + gammas[n_steps]*padded_values[t+n_steps]
        out.append(A_t)
    return np.array(out)

In [10]:
def get_AC_Advantages(rewards, gamma, values):
    return rewards.reshape(-1,1) + gamma*values[1:] - values[:-1]

In [11]:
reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=3, EPISODES=2000, epochs=1, 
                                 lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

In [12]:
# reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=3, EPISODES=2000, epochs=1, 
#                                  lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)
# obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

In [13]:
# from matplotlib import pyplot as plt
# plt.plot(get_advantages(values_, rewards[i], gamma=reinforce_agent.gamma, lmbda=0.1))

In [17]:
critic_lr = 0.001
actor_lr =  0.001
LOSS_CLIPPING = 0.2 # Recomendado por el Paper
ENTROPY_LOSS = 1 #5e-4

reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=10, EPISODES=1000, epochs=10, 
                                 LOSS_CLIPPING=LOSS_CLIPPING,
                                 ENTROPY_LOSS=ENTROPY_LOSS,
                                 lr=actor_lr, algorithm='PPO', gif_to_board=False, batch_size=64, gamma=0.99)

# reinforce_agent = ReinforceAgent('CartPole-v0', n_experience_episodes=1, EPISODES=2000, epochs=1, 
#                                  lr=actor_lr, algorithm='PPO', gif_to_board=False, batch_size=64)

initial_time = time()
running_variance = RunningVariance()
critic_model = reinforce_agent.get_critic_model(lr=critic_lr, 
                                           hidden_layer_neurons=128,
                                           input_shape=[reinforce_agent.nS],
                                           output_shape=1)

###########################################
## Entreno V(s) para que no tenga basura ##
###########################################
# Corro episodios con policy random
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

# Les saco la ultima observación por que no tiene reward
observations = []
for i in range(reinforce_agent.n_experience_episodes):
    observations.append(obs[i][:-1])
observations = np.vstack(observations)

# Entreno V(s)
history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews), verbose=0, 
                                      epochs=reinforce_agent.epochs,
                                      batch_size=reinforce_agent.batch_size)


###########################################
## Ciclo de entrenamiento del modelo     ##
###########################################

while reinforce_agent.episode < reinforce_agent.EPISODES:
    # Corro episodio con policy que se irá entrenando
    obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)
    actions = np.vstack(actions) # Pongo todas las acciones de los distintos episodios juntas
    # Pongo las predicciones juntas y las guardo como las viejas para pasarselas al modelo
    # Las nuevas predicciones será la salida de la red neuronal
    old_prediction = np.vstack(preds) 
    
    # Calculo advantages y guardo observaciones sin la última observación
    advantage = []
    observations = []
    for i in range(reinforce_agent.n_experience_episodes):
        values = critic_model.predict(obs[i]) 
#         values_ = np.vstack([rewards[i].reshape(-1,1) + reinforce_agent.gamma*values[1:], 0])
        
        advantage.append(get_advantages(values, rewards[i], gamma=reinforce_agent.gamma, lmbda=0.1))
#         advantage.append(get_AC_Advantages(rewards[i], reinforce_agent.gamma, values))
        observations.append(obs[i][:-1])
        
    advantage = np.vstack(advantage)
    observations = np.vstack(observations)
    
    # Calculo de varianza
    for ad in advantage:
        running_variance.add(ad)

    # Normalización de advantage
    advantage = (advantage-advantage.mean()) / advantage.std()
    
    # Entrenamiento de Policy
    history_loss = reinforce_agent.model_train.fit([observations, advantage, old_prediction], 
                                                   actions, verbose=0, 
                                                   epochs=reinforce_agent.epochs, 
                                                   batch_size=reinforce_agent.batch_size)
    
#     disc_sum_rews = (disc_sum_rews - disc_sum_rews.mean()) / disc_sum_rews.std()
    # Entrenamiento de V(s)
    history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews), verbose=0, 
                                      epochs=reinforce_agent.epochs,
                                      batch_size=reinforce_agent.batch_size)
    
    # Logue de resultados
    reinforce_agent.log_data(reinforce_agent.episode, 
                      history_loss.history['loss'][0], 
                      np.mean(ep_len), 
                      reinforce_agent.get_entropy(old_prediction), 
                      running_variance.get_variance(), 
                      history_loss.history['actor_loss'][0], 
                      time() - initial_time, np.mean(ep_returns[-1]), 
                      history_critic.history['loss'][0])
    
reinforce_agent.writer.close()

correr en linea de comando: tensorboard --logdir logs/
Episode: 51
Model on episode 52 improved from -inf to -122.7101183944498. Saved!
Episode: 103
Model on episode 104 improved from -122.7101183944498 to 13.252529696117886. Saved!
Episode: 155
Model on episode 156 did not improved -21.29135259240424. Best saved: 13.252529696117886
Episode: 207
Model on episode 208 did not improved -19.773217154317233. Best saved: 13.252529696117886
Episode: 259
Model on episode 260 did not improved -19.81648339495491. Best saved: 13.252529696117886
Episode: 311
Model on episode 312 improved from 13.252529696117886 to 22.46259275266471. Saved!
Episode: 363
Model on episode 364 improved from 22.46259275266471 to 24.73041717698794. Saved!
Episode: 415
Model on episode 416 did not improved -5.102755587407028. Best saved: 24.73041717698794
Episode: 467
Model on episode 468 did not improved 19.191241586085194. Best saved: 24.73041717698794
Episode: 519
Model on episode 520 improved from 24.73041717698794 t

KeyboardInterrupt: 