In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to helper .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pyvirtualdisplay import Display

# Start an invisible 1024x768 virtual display for this session.
display = Display(visible=0, size=(1024, 768))
display.start()

# Export DISPLAY as ":<display>.<screen>" using the values reported by the
# virtual display that was just started.
os.environ["DISPLAY"] = "".join([":", str(display.display), ".", str(display.screen)])

# Kept after the DISPLAY assignment, in the same position as the original cell.
import moviepy.editor as mpy

In [3]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [29]:
class ReinforceAgent(BaseAgent):
    """REINFORCE agent whose policy and state-value baseline share one network.

    The methods read ``self.model``, ``self.observation``, ``self.nS``,
    ``self.nA`` and ``self.gamma``. None of them is set in this class; they
    are expected to be provided by ``BaseAgent`` or assigned by the caller.
    """

    def get_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        """Build and compile the two-headed policy/value network.

        A single tanh hidden layer feeds two heads: ``policy`` (softmax over
        ``output_shape`` units) and ``value`` (one linear unit).

        Args:
            lr: learning rate handed to the Adam optimizer.
            hidden_layer_neurons: number of units in the hidden layer.
            input_shape: shape of one observation, without the batch axis.
                The default list is never mutated here.
            output_shape: number of units in the policy head.

        Returns:
            A compiled Keras ``Model`` whose outputs are ``[pi, v]``.
        """
        # Metric: cross-entropy against the sign of the target, i.e. the loss
        # without the magnitude of the return multiplying it.
        # NOTE(review): defined but never passed to compile() below.
        def loss_metric(y_true, y_pred):
            y_true_norm = K.sign(y_true)
            return K.categorical_crossentropy(y_true_norm, y_pred)

        inp = Input(shape=input_shape)
        # Single shared hidden layer. It used to be constructed twice from
        # `inp`, with the first result overwritten; only one layer ever fed
        # the heads, so the dead duplicate is dropped.
        x = Dense(hidden_layer_neurons, activation='tanh')(inp)
        pi = Dense(output_shape, name='policy', activation='softmax')(x)
        v = Dense(1, name='value', activation='linear')(x)

        # Why plain categorical cross-entropy works for the policy head:
        # Keras computes it as -sum(y_true * log(y_pred)). When the target is
        # a one-hot action scaled by the advantage, that is
        # -advantage * log pi(a|s), the policy-gradient surrogate loss.
        model = Model(inp, [pi, v])
        model.compile(Adam(lr),
                      loss={'policy': 'categorical_crossentropy', 'value': 'mse'},
                      # The value MSE is heavily down-weighted relative to the policy loss.
                      loss_weights={'policy': 1, 'value': 0.001}
                     )
        return model

    def get_action(self, eval=False):
        """Choose an action for the current observation.

        Args:
            eval: when exactly ``False`` (identity check) the action is
                sampled from the predicted distribution; any other value
                selects the most probable action.

        Returns:
            Tuple ``(action, action_one_hot, p)``: the chosen action index, a
            length-``nA`` array with a 1 at that index, and the policy output
            returned by ``predict`` for the single-observation batch.
        """
        # The value head's prediction is not needed to pick an action.
        p, _ = self.model.predict([self.observation.reshape(1, self.nS)])
        if eval is False:
            action = np.random.choice(self.nA, p=p[0]) #np.nan_to_num(p[0])
        else:
            action = np.argmax(p[0])
        action_one_hot = np.zeros(self.nA)
        action_one_hot[action] = 1
        return action, action_one_hot, p

    def get_entropy(self, preds, epsilon=1e-12):
        """Return the mean normalized entropy of a batch of distributions.

        Args:
            preds: array with one probability distribution per row.
            epsilon: small constant added inside the log to avoid ``log(0)``.

        Returns:
            Mean over rows of ``-sum(p * log(p + epsilon)) / log(nA)``.
            Dividing by ``log(nA)`` makes a uniform distribution over ``nA``
            actions score approximately 1.
        """
        entropy = np.mean(-np.sum(np.log(preds+epsilon)*preds, axis=1)/np.log(self.nA))
        return entropy

    def get_discounted_rewards(self, r):
        """Take 1D float array of rewards and compute discounted reward.

        Args:
            r: sequence of per-step rewards; a plain list is accepted.

        Returns:
            Float array of the same size where entry ``t`` equals
            ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.
        """
        # In case a list is passed in.
        r = np.array(r, dtype=float)
        discounted_r = np.zeros_like(r)
        running_add = 0
        # Accumulate from the last step backwards so each entry reuses the
        # discounted sum of everything after it.
        for t in reversed(range(0, r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

In [30]:
# Create the agent for LunarLander-v2.
reinforce_agent = ReinforceAgent(
    'LunarLander-v2',
    n_experience_episodes=10,
    EPISODES=2000,
    epochs=1,
    lr=0.001,
    algorithm='REINFORCE_V_s_ONE_NET',
    gif_to_board=True,
    batch_size=32,
)

# Attach a network built from the agent's own settings.
reinforce_agent.model = reinforce_agent.get_model(
    lr=reinforce_agent.lr,
    hidden_layer_neurons=reinforce_agent.hidden_layer_neurons,
    input_shape=[reinforce_agent.nS],
    output_shape=reinforce_agent.nA,
)

In [31]:
# Print the layer-by-layer summary of the network attached to the agent.
reinforce_agent.model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 8)            0                                            
__________________________________________________________________________________________________
dense_21 (Dense)                (None, 128)          1152        input_16[0][0]                   
__________________________________________________________________________________________________
policy (Dense)                  (None, 4)            516         dense_21[0][0]                   
__________________________________________________________________________________________________
value (Dense)                   (None, 1)            129         dense_21[0][0]                   
Total params: 1,797
Trainable params: 1,797
Non-trainable params: 0
_______________________

In [32]:
# Recreate the agent with the same settings so the next cells start from a
# freshly constructed instance.
reinforce_agent = ReinforceAgent(
    'LunarLander-v2',
    n_experience_episodes=10,
    EPISODES=2000,
    epochs=1,
    lr=0.001,
    algorithm='REINFORCE_V_s_ONE_NET',
    gif_to_board=True,
    batch_size=32,
)

# Attach a network built from the agent's own settings.
reinforce_agent.model = reinforce_agent.get_model(
    lr=reinforce_agent.lr,
    hidden_layer_neurons=reinforce_agent.hidden_layer_neurons,
    input_shape=[reinforce_agent.nS],
    output_shape=reinforce_agent.nA,
)

In [33]:
# Collect one batch of experience; with return_ts=True the call yields the
# nine values unpacked below.
(
    obs,
    actions,
    preds,
    disc_sum_rews,
    rewards,
    ep_returns,
    ep_len,
    last_obs,
    time_steps,
) = reinforce_agent.get_experience_episodes(return_ts=True)

In [34]:
# Alternative environments tried previously, kept for reference:
# reinforce_agent = ReinforceAgent('Acrobot-v1', n_experience_episodes=1, EPISODES=2000, epochs=1, lr=0.001, algorithm='REINFORCE_V_s')
# reinforce_agent = ReinforceAgent('CartPole-v1', n_experience_episodes=1, 
#                                  EPISODES=2000, epochs=1, lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True)
reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=10, EPISODES=2000, epochs=1, 
                                 lr=0.001, algorithm='REINFORCE_V_s_ONE_NET', gif_to_board=False, batch_size=64)

reinforce_agent.model = reinforce_agent.get_model(lr=reinforce_agent.lr, 
                                                  hidden_layer_neurons=reinforce_agent.hidden_layer_neurons, 
                                                  input_shape=[reinforce_agent.nS] ,output_shape=reinforce_agent.nA)

initial_time = time()
running_variance = RunningVariance()

# The loop is wrapped in try/finally so the writer is closed even when the
# run is stopped early (e.g. with a KeyboardInterrupt from the notebook).
try:
    while reinforce_agent.episode < reinforce_agent.EPISODES:
        obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

        # Second model output is the value head's prediction for each observation.
        _, values = reinforce_agent.model.predict(obs)

        # Advantage: return minus the predicted value, as a column so it
        # lines up with `values`.
        advantage = disc_sum_rews.reshape(-1, 1) - values

        for ad in advantage:
            running_variance.add(ad)

        # Scale the `actions` array by the advantage (broadcast across each
        # row) to form the target for the policy head.
        # assumes `actions` holds one one-hot row per step - TODO confirm
        pseudolabels = actions*advantage

        # Policy head is trained on the pseudolabels, value head on the returns.
        history_loss = reinforce_agent.model.fit(obs, [pseudolabels, disc_sum_rews], verbose=0, epochs=reinforce_agent.epochs, batch_size=reinforce_agent.batch_size)

        # NOTE(review): np.mean(ep_returns[-1]) only looks at the last entry of
        # ep_returns, whereas ep_len is averaged over all entries - confirm
        # whether np.mean(ep_returns) was intended.
        reinforce_agent.log_data(reinforce_agent.episode, 
                          history_loss.history['policy_loss'][0], 
                          np.mean(ep_len), 
                          reinforce_agent.get_entropy(preds), 
                          running_variance.get_variance(), 
                          history_loss.history['loss'][0], 
                          time() - initial_time, np.mean(ep_returns[-1]), 
                          history_loss.history['value_loss'][0])
finally:
    reinforce_agent.writer.close()

correr en linea de comando: tensorboard --logdir logs/
Episode: 51
Model on episode 52 improved from -inf to -327.96935510333986. Saved!
Episode: 103
Model on episode 104 improved from -327.96935510333986 to -108.50052783201814. Saved!
Episode: 155
Model on episode 156 improved from -108.50052783201814 to -103.54112750346765. Saved!
Episode: 259
Model on episode 260 improved from -103.54112750346765 to -97.6893305129451. Saved!
Episode: 1227

KeyboardInterrupt: 

In [10]:
# Display the loss values recorded by the most recent model.fit call.
# Depends on `history_loss` still being defined from the training-loop cell.
history_loss.history

{'loss': [1137.3878158632099],
 'policy_loss': [-136.50657394220892],
 'value_loss': [12738.943538584925]}