In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()
import os
os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)
import moviepy.editor as mpy

In [35]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

In [52]:
LOSS_CLIPPING = 0.2 # Only implemented clipping for the surrogate loss, paper said it was best
ENTROPY_LOSS = 5e-4

class ReinforceAgent(BaseAgent):
    # def __init__(self):
    def proximal_policy_optimization_loss(self, advantage, old_prediction):
        def loss(y_true, y_pred):
            prob = K.sum(y_true * y_pred, axis=-1)
            old_prob = K.sum(y_true * old_prediction, axis=-1)
            r = prob/(old_prob + 1e-10)
            return -K.mean(K.minimum(r * advantage, K.clip(r, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantage) + ENTROPY_LOSS * -(prob * K.log(prob + 1e-10)))
        return loss

    def get_policy_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        ## Defino métrica - loss sin el retorno multiplicando
        def loss_metric(y_true, y_pred):
            y_true_norm = K.sign(y_true)
            return K.categorical_crossentropy(y_true_norm, y_pred)
        
        state_input = Input(shape=input_shape)
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(output_shape,))

        x = Dense(hidden_layer_neurons, activation='relu')(state_input)
        
        out_actions = Dense(output_shape, activation='softmax', name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction], outputs=[out_actions])
        
        model.compile(Adam(lr), loss=[self.proximal_policy_optimization_loss(advantage, old_prediction)], metrics=[loss_metric])
        return model
    
    def get_action(self, eval=False):
        DUMMY_ACTION, DUMMY_VALUE = np.zeros((1, self.nA)), np.zeros((1, 1))
        p = self.model.predict([self.observation.reshape(1, self.nS), DUMMY_VALUE, DUMMY_ACTION])
        if eval is False:
            action = np.random.choice(self.nA, p=p[0]) #np.nan_to_num(p[0])
        else:
            action = np.argmax(p[0])
        action_one_hot = np.zeros(self.nA)
        action_one_hot[action] = 1
        return action, action_one_hot, p
    
    def get_entropy(self, preds, epsilon=1e-12):
        entropy = np.mean(-np.sum(np.log(preds+epsilon)*preds, axis=1)/np.log(self.nA))
        return entropy
    
    def get_critic_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=1):
        model = Sequential()
        model.add(Dense(hidden_layer_neurons, input_shape=input_shape, activation='tanh'))
        model.add(Dense(output_shape, activation='linear'))
        model.compile(Adam(lr), loss=['mse'])
        return model
    
    def get_discounted_rewards(self, r):
        # Por si es una lista
        r = np.array(r, dtype=float)
        """Take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r 

In [53]:
reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=10, EPISODES=2000, epochs=1, 
                                 lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)


In [54]:
reinforce_agent.model.summary()

Model: "model_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_82 (InputLayer)        (None, 8)                 0         
_________________________________________________________________
dense_72 (Dense)             (None, 128)               1152      
_________________________________________________________________
output (Dense)               (None, 4)                 516       
Total params: 1,668
Trainable params: 1,668
Non-trainable params: 0
_________________________________________________________________


In [55]:
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

In [56]:
def get_advantages(values, rewards, gamma=0.999, lmbda=0.95):
    #GAE
    returns = []
    gae = 0
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] - values[i]
        gae = delta + gamma * lmbda * gae
        returns.insert(0, gae + values[i])

    adv = np.array(returns) - values[:-1]
    return np.array(returns), adv

In [70]:
lr = 0.01
LOSS_CLIPPING = 0.2 # Only implemented clipping for the surrogate loss, paper said it was best
ENTROPY_LOSS = 5e-1

reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=1, EPISODES=2000, epochs=1, 
                                 lr=lr, algorithm='PPO', gif_to_board=False, batch_size=32)

initial_time = time()
running_variance = RunningVariance()
critic_model = reinforce_agent.get_critic_model(lr=lr, 
                                           hidden_layer_neurons=128,
                                           input_shape=[reinforce_agent.nS],
                                           output_shape=1)


while reinforce_agent.episode < reinforce_agent.EPISODES:
    obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)
    old_prediction = preds
    
    all_obs = np.vstack([obs, [last_obs]])
    values = critic_model.predict(all_obs)
           
#     _, advantage = get_advantages(values, rewards, gamma=reinforce_agent.gamma, lmbda=0.1)
    advantage = disc_sum_rews.reshape(-1, 1) - values[:-1]

    for ad in advantage:
        running_variance.add(ad)

    history_loss = reinforce_agent.model.fit([obs, advantage, old_prediction], actions, verbose=0, 
                                             epochs=reinforce_agent.epochs, batch_size=reinforce_agent.batch_size)
    history_critic = critic_model.fit(obs, disc_sum_rews, verbose=0, epochs=reinforce_agent.epochs, batch_size=reinforce_agent.batch_size)
    
    
    reinforce_agent.log_data(reinforce_agent.episode, 
                      history_loss.history['loss'][0], 
                      np.mean(ep_len), 
                      reinforce_agent.get_entropy(preds), 
                      running_variance.get_variance(), 
                      history_loss.history['loss_metric'][0], 
                      time() - initial_time, np.mean(ep_returns[-1]), 
                      history_critic.history['loss'][0])
    
reinforce_agent.writer.close()

correr en linea de comando: tensorboard --logdir logs/
Episode: 50
Model on episode 51 improved from -inf to -784.3085494406373. Saved!
Episode: 102
Model on episode 103 improved from -784.3085494406373 to -190.97967106071556. Saved!
Episode: 154
Model on episode 155 improved from -190.97967106071556 to -141.57707614932536. Saved!
Episode: 362
Model on episode 363 improved from -141.57707614932536 to -118.50776152294408. Saved!
Episode: 414
Model on episode 415 improved from -118.50776152294408 to -21.755517304337207. Saved!
Episode: 2000