In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Start a virtual (off-screen) X display; visible=0 keeps it hidden.
# NOTE(review): presumably needed so environment rendering / GIF creation works
# on a machine without a physical screen - confirm against REINFORCE_helper.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()
import os
# Point the DISPLAY environment variable at the virtual display and screen just started.
os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)
import moviepy.editor as mpy

In [3]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [4]:
# Half-width of the clipping interval applied to the probability ratio in the
# PPO surrogate loss (the ratio is clipped to [1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING]).
LOSS_CLIPPING = 0.2 # Only implemented clipping for the surrogate loss, paper said it was best

# NOTE(review): NOISE is not referenced anywhere else in the visible notebook;
# the agent reads `self.noise` instead - confirm whether this constant is still needed.
NOISE = 2.0

class ReinforceAgent(BaseAgent):
    """Agent with a Gaussian policy trained with the clipped PPO surrogate loss.

    The policy network outputs the mean action (tanh activation); exploration
    noise with standard deviation ``self.noise`` is added in ``get_action``.
    Attributes such as ``noise``, ``gamma``, ``env``, ``nS``, ``observation``
    and ``model_predict`` are read here but set elsewhere (presumably by
    ``BaseAgent`` - confirm against REINFORCE_helper).
    """

    def _gaussian_prob_ratio(self, y_true, y_pred, old_prediction):
        """Return the ratio between new and old Gaussian action densities.

        Both densities use a fixed standard deviation ``self.noise`` and are
        evaluated at the taken action ``y_true``; their means are ``y_pred``
        (current network output) and ``old_prediction`` respectively.

        NOTE(review): ``get_action`` multiplies the network output by
        ``action_space.high`` before sampling, whereas ``y_pred`` here is the
        raw tanh output. If the ``preds`` fed back as ``old_prediction`` are the
        scaled values, the two means are on different scales whenever
        ``action_space.high != 1`` - confirm and rescale if needed.
        """
        var = K.square(self.noise)
        denom = K.sqrt(2 * np.pi * var)
        prob = K.exp(- K.square(y_true - y_pred) / (2 * var)) / denom
        old_prob = K.exp(- K.square(y_true - old_prediction) / (2 * var)) / denom
        # Small constant keeps the division finite when the old density underflows to 0.
        return prob / (old_prob + 1e-10)

    def proximal_policy_optimization_loss_continuous(self, advantage, old_prediction):
        """Build the clipped PPO surrogate loss for a continuous action space.

        Args:
            advantage: tensor with the advantage of each sample.
            old_prediction: tensor with the policy means used when the
                actions were sampled.

        Returns:
            A Keras-compatible ``loss(y_true, y_pred)`` function, where
            ``y_true`` holds the actions taken and ``y_pred`` the current
            policy means.
        """
        def loss(y_true, y_pred):
            r = self._gaussian_prob_ratio(y_true, y_pred, old_prediction)
            # NOTE(review): the module-level LOSS_CLIPPING is used here, not the
            # value passed to the agent constructor - confirm this is intended.
            clipped_r = K.clip(r, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING)
            return -K.mean(K.minimum(r * advantage, clipped_r * advantage))
        return loss

    def get_policy_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        """Create the policy network.

        Two models sharing the same layers are returned:

        * ``model_train`` takes ``[state, advantage, old_prediction]``; the two
          extra inputs are only consumed by the PPO loss and the metric.
        * ``model_predict`` takes only the state and outputs the policy mean.

        Args:
            lr: learning rate of the Adam optimizer.
            hidden_layer_neurons: units of the single hidden layer.
            input_shape: shape of one observation.
            output_shape: number of action dimensions.

        Returns:
            Tuple ``(model_train, model_predict)``.
        """
        state_input = Input(shape=input_shape)
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(output_shape,))

        def actor_loss(y_true, y_pred):
            # Metric: mean probability ratio. The function name is kept because
            # Keras uses it as the key in the training history.
            return K.mean(self._gaussian_prob_ratio(y_true, y_pred, old_prediction))

        x = Dense(hidden_layer_neurons, activation='relu')(state_input)
        out_actions = Dense(output_shape, activation='tanh', name='output')(x)

        model_train = Model(inputs=[state_input, advantage, old_prediction], outputs=[out_actions])
        model_predict = Model(inputs=[state_input], outputs=[out_actions])

        model_train.compile(Adam(lr), loss=[self.proximal_policy_optimization_loss_continuous(advantage, old_prediction)], metrics=[actor_loss])
        return model_train, model_predict

    def get_action(self, eval=False):
        """Pick an action for the current observation.

        Args:
            eval: when exactly ``False`` Gaussian exploration noise with scale
                ``self.noise`` is added to the policy mean; otherwise the mean
                itself is returned.

        Returns:
            Tuple ``(action, action_one_hot, p)`` where the first two are the
            same array and ``p`` is the scaled policy mean with a batch axis.
        """
        # NOTE(review): the original code computed self.scaler.transform(...) and
        # immediately overwrote the result, so the policy has always been fed the
        # raw observation. That behaviour is kept; confirm whether scaling was intended.
        obs = self.observation.reshape(1, self.nS)
        # This needs to be fixed if the action space is not symmetric.
        p = self.model_predict.predict(obs)*self.env.action_space.high
        if eval is False:
            action = action_one_hot = p[0] + np.random.normal(loc=0, scale=self.noise, size=p[0].shape)
        else:
            action = action_one_hot = p[0]

        return action, action_one_hot, p

    def get_critic_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=1):
        """Create and compile the state-value network V(s).

        Args:
            lr: learning rate of the Adam optimizer.
            hidden_layer_neurons: units of the single hidden layer.
            input_shape: shape of one observation.
            output_shape: number of outputs of the linear head.

        Returns:
            A compiled ``Sequential`` model trained with mean squared error.
        """
        model = Sequential()
        model.add(Dense(hidden_layer_neurons, input_shape=input_shape, activation='relu'))
        model.add(Dense(output_shape, activation='linear'))
        model.compile(Adam(lr), loss=['mse'])
        return model

    def get_discounted_rewards(self, r):
        """Take 1D float array of rewards and compute discounted reward.

        Args:
            r: sequence of per-step rewards (a list is accepted).

        Returns:
            Array of the same shape where element ``t`` is
            ``sum_k self.gamma**k * r[t + k]``.
        """
        # In case a list is passed.
        r = np.array(r, dtype=float)
        discounted_r = np.zeros_like(r)
        running_add = 0
        # Accumulate backwards so each step reuses the return of the next one.
        for t in reversed(range(0, r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

In [5]:
# Build an agent on MountainCarContinuous-v0 so the models it creates can be inspected below.
reinforce_agent = ReinforceAgent('MountainCarContinuous-v0', n_experience_episodes=10, EPISODES=2000, epochs=1, 
                                 lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)


Instructions for updating:
Colocations handled automatically by placer.


In [6]:
# Inspect the inference model: layer summary plus the shape of its single (state) input.
reinforce_agent.model_predict.summary()
reinforce_agent.model_predict.input.shape

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               384       
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 513
Trainable params: 513
Non-trainable params: 0
_________________________________________________________________


TensorShape([Dimension(None), Dimension(2)])

In [7]:
# Inspect the training model: layer summary plus its list of input tensors
# (state, advantage and old prediction, as declared in get_policy_model).
reinforce_agent.model_train.summary()
reinforce_agent.model_train.input

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               384       
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 513
Trainable params: 513
Non-trainable params: 0
_________________________________________________________________


[<tf.Tensor 'input_1:0' shape=(?, 2) dtype=float32>,
 <tf.Tensor 'input_2:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'input_3:0' shape=(?, 1) dtype=float32>]

In [8]:
def get_advantages(values, rewards, gamma=0.999, lmbda=0.95):
    """Compute Generalized Advantage Estimation (GAE) advantages for one episode.

    Args:
        values: value estimates for every visited state *including* the final
            one, i.e. ``len(values) == len(rewards) + 1``. May be 1-D or a
            column array of shape ``(T + 1, 1)``.
        rewards: per-step rewards of length ``T``.
        gamma: discount factor.
        lmbda: GAE smoothing parameter.

    Returns:
        Float array with the same shape as ``values[:-1]`` holding the
        advantage of each step.

    Raises:
        ValueError: if ``values`` does not have exactly one more entry than
            ``rewards``.
    """
    values = np.asarray(values, dtype=float)
    n_steps = len(rewards)
    if len(values) != n_steps + 1:
        raise ValueError(
            "values must have len(rewards) + 1 entries, got %d values for %d rewards"
            % (len(values), n_steps))

    # Preallocate instead of inserting at the front of a list (which is O(n^2)).
    advantages = np.zeros_like(values[:-1])
    gae = 0.0
    # Walk backwards so each step reuses the accumulated advantage of the next one.
    for t in reversed(range(n_steps)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lmbda * gae
        # The original added values[t] to build a return and subtracted it again;
        # storing the GAE term directly is equivalent and avoids the rounding.
        advantages[t] = gae
    return advantages

In [9]:
def compute_n_step_targets(rewards, values, gamma=0.999, n_steps = 5):
    """Compute n-step advantage estimates for one episode.

    For every step ``t`` the result is::

        sum_{k < n_steps} gamma**k * r[t + k] - V[t] + gamma**n_steps * V[t + n_steps]

    where rewards past the end of the episode are dropped and values past the
    end of ``values`` are treated as zero.

    Args:
        rewards: per-step rewards of the episode.
        values: column array (one column) of state-value estimates.
        gamma: discount factor.
        n_steps: number of rewards summed before bootstrapping.

    Returns:
        Array with one row per reward holding the estimate for that step.
    """
    rewards = np.asarray(rewards, dtype=float)
    ep_len = len(rewards)
    # Size the discount table for both the episode and the horizon, so that
    # gammas[n_steps] exists even when the episode is shorter than n_steps
    # (the original table had ep_len + 1 entries and raised IndexError then).
    gammas = np.power(gamma, np.arange(max(ep_len, n_steps) + 1))
    bootstrap_discount = gammas[n_steps]
    # Zero-pad so that bootstrapping past the end of the episode contributes nothing.
    padded_values = np.vstack([values, np.zeros([n_steps, 1])])

    out = []
    for t in range(ep_len):
        # Rewards from t up to t + n_steps - 1, truncated at the end of the episode.
        window = rewards[t:t + n_steps]
        discounted_rewards = (gammas[:len(window)] * window).sum()
        A_t = discounted_rewards - padded_values[t] + bootstrap_discount * padded_values[t + n_steps]
        out.append(A_t)
    return np.array(out)

In [10]:
def get_AC_Advantages(rewards, gamma, values):
    """Return one-step actor-critic (TD) advantages.

    Computes ``r_t + gamma * V(s_{t+1}) - V(s_t)`` for every step, with the
    rewards reshaped into a column so they line up with column-shaped values.
    """
    reward_column = rewards.reshape(-1, 1)
    next_state_values = values[1:]
    current_state_values = values[:-1]
    bootstrapped_return = reward_column + gamma * next_state_values
    return bootstrapped_return - current_state_values

In [11]:
# Build an agent on Pendulum-v0 and collect one batch of episodes so the
# sampled actions can be compared with the action-space bounds below.
reinforce_agent = ReinforceAgent('Pendulum-v0', n_experience_episodes=3, EPISODES=2000, epochs=1, 
                                 lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32, noise=1.0)
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)



In [12]:
# Upper and lower bounds of the environment's action space.
reinforce_agent.env.action_space.high, reinforce_agent.env.action_space.low

(array([2.], dtype=float32), array([-2.], dtype=float32))

In [13]:
# Range of the actions actually sampled. They can fall outside the action-space
# bounds because get_action adds Gaussian noise to the policy mean without clipping.
np.max(actions), np.min(actions)

(3.7000082672385384, -2.9571554798133577)

In [14]:
# reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=3, EPISODES=2000, epochs=1, 
#                                  lr=0.001, algorithm='REINFORCE_V_s', gif_to_board=True, batch_size=32)
# obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

In [15]:
# from matplotlib import pyplot as plt
# plt.plot(get_advantages(values_, rewards[i], gamma=reinforce_agent.gamma, lmbda=0.1))

In [16]:
critic_lr = 0.01
actor_lr = 0.01
LOSS_CLIPPING = 0.2  # Recommended by the paper
ADVANTAGE_EPS = 1e-8  # Keeps advantage normalisation finite when all advantages are equal

reinforce_agent = ReinforceAgent('MountainCarContinuous-v0', n_experience_episodes=4, EPISODES=500, epochs=10, eval_period=12,
                                 LOSS_CLIPPING=LOSS_CLIPPING,
                                 lr=actor_lr, algorithm='PPO',
                                 gif_to_board=True, batch_size=64, gamma=0.99, noise=1.0)


initial_time = time()
running_variance = RunningVariance()
critic_model = reinforce_agent.get_critic_model(lr=critic_lr,
                                                hidden_layer_neurons=128,
                                                input_shape=[reinforce_agent.nS],
                                                output_shape=1)

###################################################
## Pre-train V(s) so it does not start as garbage ##
###################################################
# Run episodes with the initial (still untrained) policy
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

# Drop the last observation of every episode because it has no reward
observations = []
for i in range(reinforce_agent.n_experience_episodes):
    observations.append(obs[i][:-1])
observations = np.vstack(observations)

# Fit V(s) on the discounted returns
history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews), verbose=0,
                                  epochs=reinforce_agent.epochs,
                                  batch_size=reinforce_agent.batch_size)


###########################################
## Training loop                         ##
###########################################

while reinforce_agent.episode < reinforce_agent.EPISODES:
    # Run episodes with the policy that is being trained
    obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)
    actions = np.vstack(actions)  # Stack the actions of all episodes together
    # Stack the predictions and keep them as the "old" ones fed to the model;
    # the new predictions are the output of the network being trained
    old_prediction = np.vstack(preds)

    # Compute advantages and keep the observations without the last one of each episode
    advantage = []
    observations = []
    for i in range(reinforce_agent.n_experience_episodes):
        values = critic_model.predict(obs[i])

        advantage.append(get_advantages(values, rewards[i], gamma=reinforce_agent.gamma, lmbda=0.1))
        # Alternative one-step estimator:
        # advantage.append(get_AC_Advantages(rewards[i], reinforce_agent.gamma, values))
        observations.append(obs[i][:-1])

    advantage = np.vstack(advantage)
    observations = np.vstack(observations)

    # Track the variance of the (un-normalised) advantages
    for ad in advantage:
        running_variance.add(ad)

    # Normalise the advantages; the epsilon avoids NaN/inf when their std is zero
    advantage = (advantage - advantage.mean()) / (advantage.std() + ADVANTAGE_EPS)

    # Train the policy
    history_loss = reinforce_agent.model_train.fit([observations, advantage, old_prediction],
                                                   actions, verbose=0,
                                                   epochs=reinforce_agent.epochs,
                                                   batch_size=reinforce_agent.batch_size)

    # Train V(s)
    history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews), verbose=0,
                                      epochs=reinforce_agent.epochs,
                                      batch_size=reinforce_agent.batch_size)

    # Log results
    reinforce_agent.log_data(reinforce_agent.episode,
                      history_loss.history['loss'][0],
                      np.mean(ep_len),
                      None,
                      running_variance.get_variance(),
                      history_loss.history['actor_loss'][0],
                      time() - initial_time, np.mean(ep_returns[-1]),
                      history_critic.history['loss'][0])

reinforce_agent.writer.close()

Instructions for updating:
Use tf.cast instead.
correr en linea de comando: tensorboard --logdir logs/
Episode: 13
Model on episode 14 improved from -inf to -0.005498244665187006. Saved!


t:   1%|▏         | 14/1000 [00:00<00:07, 131.86it/s, now=None]

MoviePy - Building file /tmp/tmpszmesi8b.gif with imageio.




Episode: 27
Model on episode 28 improved from -0.005498244665187006 to -0.004996542638688931. Saved!


t:   1%|▏         | 14/1000 [00:00<00:07, 130.92it/s, now=None]

MoviePy - Building file /tmp/tmp7yx8go_y.gif with imageio.




Episode: 41
Model on episode 42 improved from -0.004996542638688931 to -0.0009536423694723672. Saved!


t:   1%|▏         | 14/1000 [00:00<00:07, 131.15it/s, now=None]

MoviePy - Building file /tmp/tmpjwf6q30v.gif with imageio.




Episode: 55
Model on episode 56 did not improved -0.008955267669767213. Best saved: -0.0009536423694723672
Episode: 69
Model on episode 70 did not improved -0.009671500552919297. Best saved: -0.0009536423694723672
Episode: 83
Model on episode 84 did not improved -0.002833434860154818. Best saved: -0.0009536423694723672
Episode: 97
Model on episode 98 did not improved -0.0077460746851736985. Best saved: -0.0009536423694723672
Episode: 111
Model on episode 112 did not improved -0.007675717791928941. Best saved: -0.0009536423694723672
Episode: 125
Model on episode 126 did not improved -0.06893483614877022. Best saved: -0.0009536423694723672
Episode: 139
Model on episode 140 did not improved -0.027616102505510148. Best saved: -0.0009536423694723672
Episode: 153
Model on episode 154 did not improved -0.06188588051432109. Best saved: -0.0009536423694723672
Episode: 167
Model on episode 168 improved from -0.0009536423694723672 to 4.406501848450448. Saved!


t:   5%|▍         | 14/308 [00:00<00:02, 130.97it/s, now=None]

MoviePy - Building file /tmp/tmprjleefmo.gif with imageio.




Episode: 181
Model on episode 182 improved from 4.406501848450448 to 7.180220412191438. Saved!


t:   5%|▍         | 12/256 [00:00<00:02, 118.33it/s, now=None]

MoviePy - Building file /tmp/tmpvcydcdk_.gif with imageio.




Episode: 195
Model on episode 196 improved from 7.180220412191438 to 19.93930902287264. Saved!


t:   9%|▉         | 14/151 [00:00<00:01, 130.37it/s, now=None]

MoviePy - Building file /tmp/tmpier9xnoi.gif with imageio.




Episode: 209

t:   0%|          | 0/85 [00:00<?, ?it/s, now=None]


Model on episode 210 improved from 19.93930902287264 to 41.334334464702515. Saved!
MoviePy - Building file /tmp/tmp2bxhqev2.gif with imageio.




Episode: 223
Model on episode 224 did not improved 14.118364358540443. Best saved: 41.334334464702515
Episode: 237

t:   0%|          | 0/78 [00:00<?, ?it/s, now=None]


Model on episode 238 improved from 41.334334464702515 to 43.44927389199169. Saved!
MoviePy - Building file /tmp/tmpxqrxgy27.gif with imageio.




Episode: 251
Model on episode 252 did not improved 38.43489637588831. Best saved: 43.44927389199169
Episode: 265
Model on episode 266 did not improved 25.32755289796818. Best saved: 43.44927389199169
Episode: 279
Model on episode 280 did not improved 28.994050205931856. Best saved: 43.44927389199169
Episode: 293
Model on episode 294 did not improved 28.527438278674193. Best saved: 43.44927389199169
Episode: 307
Model on episode 308 did not improved 29.271368625626206. Best saved: 43.44927389199169
Episode: 321
Model on episode 322 did not improved 40.781048181102996. Best saved: 43.44927389199169
Episode: 335
Model on episode 336 did not improved 38.73822708812725. Best saved: 43.44927389199169
Episode: 349
Model on episode 350 did not improved 39.95342217919006. Best saved: 43.44927389199169
Episode: 363
Model on episode 364 did not improved 26.893785018199146. Best saved: 43.44927389199169
Episode: 377
Model on episode 378 did not improved 27.93257457419002. Best saved: 43.4492738919

In [16]:
critic_lr = 0.001
actor_lr = 0.001
LOSS_CLIPPING = 0.2  # The paper recommends 0.2
ADVANTAGE_EPS = 1e-8  # Keeps advantage normalisation finite when all advantages are equal

reinforce_agent = ReinforceAgent('Pendulum-v0', n_experience_episodes=10, EPISODES=4000, epochs=10, eval_period=50,
                                 lr=actor_lr, algorithm='PPO', gif_to_board=True, batch_size=64, gamma=0.99, noise=0.5,
                                 LOSS_CLIPPING=LOSS_CLIPPING)

initial_time = time()
running_variance = RunningVariance()
critic_model = reinforce_agent.get_critic_model(lr=critic_lr,
                                                hidden_layer_neurons=128,
                                                input_shape=[reinforce_agent.nS],
                                                output_shape=1)

###################################################
## Pre-train V(s) so it does not start as garbage ##
###################################################
# Run episodes with the initial (still untrained) policy
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

# Drop the last observation of every episode because it has no reward
observations = []
for i in range(reinforce_agent.n_experience_episodes):
    observations.append(obs[i][:-1])
observations = np.vstack(observations)

# Fit V(s) on the discounted returns
history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews), verbose=0,
                                  epochs=reinforce_agent.epochs,
                                  batch_size=reinforce_agent.batch_size)


###########################################
## Training loop                         ##
###########################################

while reinforce_agent.episode < reinforce_agent.EPISODES:
    # Run episodes with the policy that is being trained
    obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)
    actions = np.vstack(actions)  # Stack the actions of all episodes together
    # Stack the predictions and keep them as the "old" ones fed to the model;
    # the new predictions are the output of the network being trained
    old_prediction = np.vstack(preds)

    # Compute advantages and keep the observations without the last one of each episode
    advantage = []
    observations = []
    for i in range(reinforce_agent.n_experience_episodes):
        values = critic_model.predict(obs[i])

        advantage.append(get_advantages(values, rewards[i], gamma=reinforce_agent.gamma, lmbda=0.1))
        # Alternative one-step estimator:
        # advantage.append(get_AC_Advantages(rewards[i], reinforce_agent.gamma, values))
        observations.append(obs[i][:-1])

    advantage = np.vstack(advantage)
    observations = np.vstack(observations)

    # Track the variance of the (un-normalised) advantages
    for ad in advantage:
        running_variance.add(ad)

    # Normalise the advantages; the epsilon avoids NaN/inf when their std is zero
    advantage = (advantage - advantage.mean()) / (advantage.std() + ADVANTAGE_EPS)

    # Train the policy
    history_loss = reinforce_agent.model_train.fit([observations, advantage, old_prediction],
                                                   actions, verbose=0,
                                                   epochs=reinforce_agent.epochs,
                                                   batch_size=reinforce_agent.batch_size)

    # Train V(s)
    history_critic = critic_model.fit(observations, np.vstack(disc_sum_rews), verbose=0,
                                      epochs=reinforce_agent.epochs,
                                      batch_size=reinforce_agent.batch_size)

    # Log results
    reinforce_agent.log_data(reinforce_agent.episode,
                      history_loss.history['loss'][0],
                      np.mean(ep_len),
                      None,
                      running_variance.get_variance(),
                      None,
                      time() - initial_time, np.mean(ep_returns[-1]),
                      history_critic.history['loss'][0])

reinforce_agent.writer.close()

Instructions for updating:
Use tf.cast instead.
correr en linea de comando: tensorboard --logdir logs/
Episode: 51
Model on episode 52 improved from -inf to -465.68147973825245. Saved!


t:   5%|▍         | 10/202 [00:00<00:02, 92.99it/s, now=None]

MoviePy - Building file /tmp/tmpsr3p86sn.gif with imageio.




Episode: 103
Model on episode 104 improved from -465.68147973825245 to -436.14260197375035. Saved!


t:   5%|▍         | 10/202 [00:00<00:02, 95.40it/s, now=None]

MoviePy - Building file /tmp/tmpiglalb2p.gif with imageio.




Episode: 155
Model on episode 156 improved from -436.14260197375035 to -428.0049757427229. Saved!


t:   5%|▍         | 10/202 [00:00<00:02, 94.46it/s, now=None]

MoviePy - Building file /tmp/tmppvyguixd.gif with imageio.




Episode: 207
Model on episode 208 did not improved -679.4313999738971. Best saved: -428.0049757427229
Episode: 259
Model on episode 260 did not improved -556.0535486669834. Best saved: -428.0049757427229
Episode: 311
Model on episode 312 did not improved -631.9602354049055. Best saved: -428.0049757427229
Episode: 363
Model on episode 364 improved from -428.0049757427229 to -360.83892673867075. Saved!


t:   5%|▍         | 10/202 [00:00<00:02, 93.77it/s, now=None]

MoviePy - Building file /tmp/tmpz7u7nuqx.gif with imageio.




Episode: 415
Model on episode 416 did not improved -611.6465408463341. Best saved: -360.83892673867075
Episode: 467
Model on episode 468 did not improved -642.907656693919. Best saved: -360.83892673867075
Episode: 519
Model on episode 520 did not improved -687.8188154098955. Best saved: -360.83892673867075
Episode: 571
Model on episode 572 did not improved -558.5294202997267. Best saved: -360.83892673867075
Episode: 623
Model on episode 624 did not improved -645.8386610442902. Best saved: -360.83892673867075
Episode: 675
Model on episode 676 did not improved -648.7655222725291. Best saved: -360.83892673867075
Episode: 727
Model on episode 728 did not improved -650.7627972459811. Best saved: -360.83892673867075
Episode: 779
Model on episode 780 did not improved -587.1545632857967. Best saved: -360.83892673867075
Episode: 831
Model on episode 832 did not improved -469.3759785536423. Best saved: -360.83892673867075
Episode: 883
Model on episode 884 did not improved -568.2531774999143. Bes

t:   5%|▍         | 10/202 [00:00<00:02, 91.94it/s, now=None]

MoviePy - Building file /tmp/tmp0bflxp_y.gif with imageio.




Episode: 1507
Model on episode 1508 did not improved -645.6825779901825. Best saved: -121.73531820733947
Episode: 1559
Model on episode 1560 improved from -121.73531820733947 to -3.2132599800796178. Saved!


t:   5%|▍         | 10/202 [00:00<00:01, 98.66it/s, now=None]

MoviePy - Building file /tmp/tmplu7vrc3m.gif with imageio.




Episode: 1611
Model on episode 1612 did not improved -541.1204099697152. Best saved: -3.2132599800796178
Episode: 1663
Model on episode 1664 did not improved -601.233195907849. Best saved: -3.2132599800796178
Episode: 1715
Model on episode 1716 did not improved -654.7230864777673. Best saved: -3.2132599800796178
Episode: 1767
Model on episode 1768 did not improved -650.370481032233. Best saved: -3.2132599800796178
Episode: 1819
Model on episode 1820 did not improved -645.8343341543684. Best saved: -3.2132599800796178
Episode: 1871
Model on episode 1872 did not improved -641.3875895614188. Best saved: -3.2132599800796178
Episode: 1923
Model on episode 1924 did not improved -539.9637729316637. Best saved: -3.2132599800796178
Episode: 1975
Model on episode 1976 did not improved -595.8623433649253. Best saved: -3.2132599800796178
Episode: 2027
Model on episode 2028 did not improved -664.2545147730386. Best saved: -3.2132599800796178
Episode: 2079
Model on episode 2080 did not improved -665

t:   5%|▍         | 10/202 [00:00<00:02, 95.96it/s, now=None]

MoviePy - Building file /tmp/tmptgks4wbe.gif with imageio.




Episode: 2183
Model on episode 2184 did not improved -626.5182051258429. Best saved: -1.5910330501839158
Episode: 2235
Model on episode 2236 did not improved -639.8613335180277. Best saved: -1.5910330501839158
Episode: 2287
Model on episode 2288 did not improved -705.3190147891942. Best saved: -1.5910330501839158
Episode: 2339
Model on episode 2340 did not improved -637.7986410299651. Best saved: -1.5910330501839158
Episode: 2391
Model on episode 2392 did not improved -2.545848427187707. Best saved: -1.5910330501839158
Episode: 2443
Model on episode 2444 improved from -1.5910330501839158 to -1.1701681561977368. Saved!


t:   5%|▍         | 10/202 [00:00<00:02, 91.56it/s, now=None]

MoviePy - Building file /tmp/tmp50nkoos3.gif with imageio.




Episode: 2495
Model on episode 2496 did not improved -576.2141755061385. Best saved: -1.1701681561977368
Episode: 2547
Model on episode 2548 did not improved -632.1587018600778. Best saved: -1.1701681561977368
Episode: 2599
Model on episode 2600 did not improved -632.622655718945. Best saved: -1.1701681561977368


KeyboardInterrupt: 