In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# conda install swig # needed to build Box2D in the pip install
# pip install box2d-py # a repackaged version of pybox2d

# Advantage - Con baseline dependiente del estado

### 1. Muestrear $\{s_t, a_t\}$ de $\pi_{\theta}(a|s)$ - Correr M trayectorias usando la policy
### 2. Entrenar $\hat{V}^{\pi}_{\phi}(s_t)$ con la suma de las recompensas acumuladas
### 3. Calcular el Advantage $\hat{A}^{\pi}(s_t, a_t) = \sum_{t'=t}^{T}R(s_{t'}, a_{t'}) - \hat{V}^{\pi}_{\phi}(s_t) = \hat{Q}^{\pi}(s_t, a_t)- \hat{V}^{\pi}_{\phi}(s_t)$ 
### 4. Calcular el gradiente $ \nabla_{\theta} J_{\theta} \approx  \sum_{t} \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) \hat{A}^{\pi}(s_t, a_t)$
### 5. Entrenar $\pi_{\theta}$ $\quad \quad\theta = \theta + \alpha \nabla_{\theta} J_{\theta}$

### 2. Estimar el retorno:
$$ R(\tau_i, t)  \approx \sum_{t'=t}^{T}R(s_{t'}^i, a_{t'}^i)$$
### 3. Entrenar un modelo: $$ \nabla_{\theta} J_{\theta} \approx \frac{1}{M} \sum_{i=1}^{M} \sum_{t=0}^T \nabla_{\theta} \log \pi_{\theta}(a_t^i|s_t^i) R(\tau_i, t)$$
### $$ \nabla_{\theta} J_{\theta} \approx \frac{1}{M} \sum_{i=1}^{M} \sum_{t=0}^T \nabla_{\theta} \log \pi_{\theta}(a_t^i|s_t^i) \hat{Q}(s_t^i, a_t^i)$$
$$\large \theta = \theta + \alpha \nabla_{\theta} J_{\theta}$$

In [3]:
# Create and start a virtual display before anything else in the notebook runs.
# NOTE(review): presumably required so environment frames can be rendered on a
# headless machine (e.g. for the GIFs enabled by `gif_to_board=True`) -- confirm.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()
# Alternative kept for reference: export the DISPLAY variable by hand.
# import os
# os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)
# Imported after the display is started; `mpy` is not used in the visible cells.
import moviepy.editor as mpy

In [4]:
# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(1400, 900))
# display.start()

In [5]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

In [6]:
# import gym
# import matplotlib.pyplot as plt

# env = gym.make('CartPole-v1') # insert your favorite environment
# render = lambda : plt.imshow(env.render(mode='rgb_array'))
# env.reset()
# render()

In [7]:
# reinforce_agent.env.reset()

In [8]:
# reinforce_agent.env.render(mode = 'rgb_array', close=True)

In [9]:
class ReinforceAgent(BaseAgent):
    """REINFORCE agent with a softmax policy network and a state-value critic.

    The methods below read ``self.model``, ``self.observation``, ``self.nS``,
    ``self.nA`` and ``self.gamma``. None of them is defined in this class;
    they are presumably provided by ``BaseAgent`` (verify in REINFORCE_helper).
    """

    def get_policy_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        """Build and compile the policy network (one hidden ReLU layer + softmax).

        Args:
            lr: learning rate handed to the Adam optimizer.
            hidden_layer_neurons: width of the hidden layer.
            input_shape: shape of a single observation (not mutated here).
            output_shape: number of discrete actions (softmax units).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        def loss_metric(y_true, y_pred):
            # Metric: the loss without the return/advantage weighting.
            # The training labels are one-hot actions multiplied by a signed
            # weight, so sign() alone would produce -1 targets for negative
            # weights and flip the metric's sign. abs(sign()) recovers the
            # plain 0/1 action mask regardless of the weight's sign.
            action_mask = K.abs(K.sign(y_true))
            return K.categorical_crossentropy(action_mask, y_pred)

        model = Sequential()
        model.add(Dense(hidden_layer_neurons, input_shape=input_shape, activation='relu'))
        model.add(Dense(output_shape, activation='softmax'))
        # Why categorical cross-entropy works as the policy-gradient loss:
        # with labels y = one_hot(a) * w, -sum(y * log(p)) == -w * log p(a|s),
        # which is the REINFORCE surrogate objective for weight w.
        model.compile(Adam(lr), loss=['categorical_crossentropy'], metrics=[loss_metric])
        return model

    def get_action(self, eval=False):
        """Choose an action for the current observation.

        Args:
            eval: when truthy the most probable action is taken (greedy);
                otherwise the action is sampled from the policy distribution.

        Returns:
            Tuple ``(action, action_one_hot, p)``: the action index, its
            one-hot encoding of length ``self.nA`` and the raw model output.
        """
        p = self.model.predict([self.observation.reshape(1, self.nS)])
        # Truthiness test instead of `eval is False`, so falsy values such as
        # 0 or numpy.bool_(False) do not silently select the greedy branch.
        if eval:
            action = np.argmax(p[0])
        else:
            action = np.random.choice(self.nA, p=p[0])
        action_one_hot = np.zeros(self.nA)
        action_one_hot[action] = 1
        return action, action_one_hot, p

    def get_entropy(self, preds, epsilon=1e-12):
        """Return the mean entropy of the rows of ``preds``, divided by log(nA).

        Args:
            preds: 2-D array whose rows are action probability vectors.
            epsilon: small constant added inside the log to avoid log(0).
        """
        entropy = np.mean(-np.sum(np.log(preds+epsilon)*preds, axis=1)/np.log(self.nA))
        return entropy

    def get_critic_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=1):
        """Build and compile the critic network (one hidden tanh layer, linear output).

        Args:
            lr: learning rate handed to the Adam optimizer.
            hidden_layer_neurons: width of the hidden layer.
            input_shape: shape of a single observation (not mutated here).
            output_shape: number of linear output units.

        Returns:
            The compiled Keras ``Sequential`` model, trained with MSE loss.
        """
        model = Sequential()
        model.add(Dense(hidden_layer_neurons, input_shape=input_shape, activation='tanh'))
        model.add(Dense(output_shape, activation='linear'))
        model.compile(Adam(lr), loss=['mse'])
        return model

    def get_discounted_rewards(self, r):
        """Take a 1D sequence of rewards and compute the discounted reward-to-go.

        Element ``t`` of the result is ``sum_k self.gamma**k * r[t + k]``.

        Args:
            r: list or 1-D array of rewards.

        Returns:
            Float array with the same shape as ``r``.
        """
        # Accept plain lists as well as arrays.
        r = np.array(r, dtype=float)
        discounted_r = np.zeros_like(r)
        running_add = 0
        # Walk backwards so every step reuses the discounted sum of its successors.
        for t in reversed(range(0, r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

In [10]:
# Learning rate of the critic; the policy network receives its own value
# (lr=0.001) through the agent constructor below.
lr = 0.01
reinforce_agent = ReinforceAgent('Acrobot-v1', n_experience_episodes=1, EPISODES=2000, epochs=1, lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True)
# Alternative configurations:
# reinforce_agent = ReinforceAgent('CartPole-v1', n_experience_episodes=1, 
#                                  EPISODES=2000, epochs=1, lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True)
# reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=10, EPISODES=2000, epochs=1, 
#                                  lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)

initial_time = time()
running_variance = RunningVariance()
critic_model = reinforce_agent.get_critic_model(lr=lr,
                                                hidden_layer_neurons=128,
                                                input_shape=[reinforce_agent.nS],
                                                output_shape=1)

# try/finally guarantees the writer is closed even when the (long) loop is
# interrupted by hand or fails midway.
try:
    while reinforce_agent.episode < reinforce_agent.EPISODES:
        # 1. Collect experience with the current policy.
        obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

        # 2. Fit V(s) on the discounted reward sums.
        history_critic = critic_model.fit(obs, disc_sum_rews, verbose=0, epochs=1)

        # 3. Advantage = discounted reward sum minus the critic's prediction.
        values = critic_model.predict(obs)
        advantage = disc_sum_rews.reshape(-1, 1) - values

        for ad in advantage:
            running_variance.add(ad)

        # 4-5. Labels are `actions` scaled row-wise by the advantage, so the
        # cross-entropy loss of the policy model is weighted by the advantage.
        pseudolabels = actions*advantage

        history_loss = reinforce_agent.model.fit(obs, pseudolabels, verbose=0, epochs=reinforce_agent.epochs, batch_size=reinforce_agent.batch_size)

        reinforce_agent.log_data(reinforce_agent.episode,
                                 history_loss.history['loss'][0],
                                 np.mean(ep_len),
                                 reinforce_agent.get_entropy(preds),
                                 running_variance.get_variance(),
                                 history_loss.history['loss_metric'][0],
                                 time() - initial_time, np.mean(ep_returns[-1]),
                                 history_critic.history['loss'][0])
finally:
    reinforce_agent.writer.close()

correr en linea de comando: tensorboard --logdir logs/
Episode: 50
Model on episode 51 improved from -inf to -393.62105513881454. Saved!
Episode: 102
Model on episode 103 improved from -393.62105513881454 to -393.62105513881454. Saved!
Episode: 154
Model on episode 155 improved from -393.62105513881454 to -393.62105513881454. Saved!
Episode: 206
Model on episode 207 improved from -393.62105513881454 to -76.92060216266377. Saved!
Episode: 258
Model on episode 259 improved from -76.92060216266377 to -62.96301113798029. Saved!
Episode: 518
Model on episode 519 improved from -62.96301113798029 to -62.96301113798029. Saved!
Episode: 570
Model on episode 571 improved from -62.96301113798029 to -62.025036174154444. Saved!
Episode: 794

KeyboardInterrupt: 