In [1]:
import gym
import numpy as np
import copy, random, os, subprocess, cv2
import tensorflow as tf
import keras.backend as K
from tensorflow.keras import layers, models, regularizers

  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):


In [2]:
MODEL_NAME = 'model08'
BUCKET = 'gs://etsuji-car-racing-v2'
os.environ['BUCKET'] = BUCKET
!gsutil mb -c regional -l us-west1 $BUCKET
!gsutil ls $BUCKET

Creating gs://etsuji-car-racing-v2/...
ServiceException: 409 A Cloud Storage bucket named 'etsuji-car-racing-v2' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
gs://etsuji-car-racing-v2/model04/
gs://etsuji-car-racing-v2/model05/
gs://etsuji-car-racing-v2/model06/
gs://etsuji-car-racing-v2/model07/


In [3]:
class ApplyL1Weight(layers.Layer):
    def __init__(self, l1=0.01, **kwargs):
        self.filter_shape = None
        self.l1 = l1
        super(ApplyL1Weight, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.kernel = self.add_weight(name='weights', shape=[input_shape[3]],
                                      regularizer=regularizers.L1(self.l1))
        self.filter_shape = input_shape

    def get_config(self):
        config = super().get_config()
        config.update({
            'l1': self.l1,
        })
        return config
    
    def call(self, inputs, **kwargs):
        return inputs * self.kernel
    
# Base model
class QValue:
    def __init__(self):
        self.model = self.build_model()

    def build_model(self):
        cnn_input = layers.Input(shape=(48, 48, 3), name='cnn_input')
        cnn1 = layers.Conv2D(16, (5, 5), padding='same',
                             use_bias=True, activation='relu',
                             name='cnn1')(cnn_input)
        pool1 = layers.MaxPooling2D((2, 2), name='pool1')(cnn1)
        cnn2 = layers.Conv2D(16, (5, 5), padding='same',
                             use_bias=True, activation='relu',
                             name='cnn2')(pool1)        
        pool2 = layers.MaxPooling2D((2, 2), name='pool2')(cnn2)
        weighted_filters = ApplyL1Weight(name='weighted_filters')(pool2)

        cnn_flatten = layers.Flatten(name='flatten')(weighted_filters)
        action_input = layers.Input(shape=(5,), name='action_input')
        combined = layers.concatenate([cnn_flatten, action_input], name='concat')
        hidden1 = layers.Dense(2048, activation='relu', name='dense1')(combined)
        hidden2 = layers.Dense(1024, activation='relu', name='dense2')(hidden1)
        hidden3 = layers.Dense(512, activation='relu', name='dense3')(hidden2)
        q_value = layers.Dense(1, name='output')(hidden3)

        model = models.Model(inputs=[cnn_input, action_input], outputs=q_value)
        model.compile(loss='mse')
        return model

    def get_action(self, state):
        states = []
        actions = []
        for a in range(5):
            states.append(np.array(state))
            action_onehot = np.zeros(5)
            action_onehot[a] = 1
            actions.append(action_onehot)
  
        q_values = self.model.predict([np.array(states), np.array(actions)])
        optimal_action = np.argmax(q_values)
        return optimal_action, q_values[optimal_action][0]

In [4]:
q_value = QValue()
q_value.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 cnn_input (InputLayer)         [(None, 48, 48, 3)]  0           []                               
                                                                                                  
 cnn1 (Conv2D)                  (None, 48, 48, 16)   1216        ['cnn_input[0][0]']              
                                                                                                  
 pool1 (MaxPooling2D)           (None, 24, 24, 16)   0           ['cnn1[0][0]']                   
                                                                                                  
 cnn2 (Conv2D)                  (None, 24, 24, 16)   6416        ['pool1[0][0]']                  
                                                                                              

2022-08-03 04:20:49.701007: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-08-03 04:20:49.701058: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tensorflow-2-8-20220801-173936): /proc/driver/nvidia/version does not exist
2022-08-03 04:20:49.702848: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def join_frames(o0, o1, o2):
    gray_image0 = cv2.cvtColor(cv2.resize(o0, (48, 48)), cv2.COLOR_RGB2GRAY)
    gray_image1 = cv2.cvtColor(cv2.resize(o1, (48, 48)), cv2.COLOR_RGB2GRAY)
    gray_image2 = cv2.cvtColor(cv2.resize(o2, (48, 48)), cv2.COLOR_RGB2GRAY)
    
    return np.array(
        [gray_image0.transpose(),
         gray_image1.transpose(),
         gray_image2.transpose()]).transpose()

In [6]:
def get_episode(environ, q_value, epsilon):
    episode = []
    o0 = environ.reset()
    o1 = copy.deepcopy(o0)
    o2 = copy.deepcopy(o0)
    total_r = 0

    if epsilon > 0:
        keep_count = 3
    else:
        keep_count = 1

    c = 0
    while True:
        if c % keep_count == 0: # Get new action
            if np.random.random() < epsilon:
                a = np.random.randint(5)
            else:
                a, _ = q_value.get_action(join_frames(o0, o1, o2))
        c += 1
        o_new, r, done, inf = environ.step(a)                
        total_r += r

        # Terminate episode when total reward becomes negative
        if total_r < 0:
            done = 1

        if done:
            # Terminal state is to achive more than 990 or get out of the field.
            if total_r > 990 or r < -99:
                episode.append((join_frames(o0, o1, o2), a, r, None))
            break
        else:
            episode.append((join_frames(o0, o1, o2), a, r, join_frames(o1, o2, o_new)))
        o0, o1, o2 = o1, o2, o_new

    print('epsilon={}, episode length={}, total rewards={}'.format(epsilon, len(episode), total_r))
    return episode, total_r

In [7]:
def train(environ, q_value, epsilon, checkpoint=0):
    if checkpoint > 0:
        filename = 'car-racing-v2-{}-{}.hd5'.format(checkpoint, MODEL_NAME)
        subprocess.run(['gsutil', 'cp', '{}/{}/{}'.format(BUCKET, MODEL_NAME, filename), './'])
        print('load model {}'.format(filename))
        q_value.model = models.load_model(filename)
        os.remove(filename)

    experience = []
    good_experience = []
    best_r = [-100, -100, -100]

    for n in range(checkpoint + 1, checkpoint + 1000):
        print('iteration {}'.format(n))

        total_len = 0
        if n % 3 == 0:
            print('Testing the current performance...')
            episode, total_r = get_episode(environ, q_value, epsilon=0)
            with open('result.txt', 'a') as f:
                f.write('{},{},{},{}\n'.format(n, epsilon, len(episode), total_r))
            filename = 'car-racing-v2-{}-{}.hd5'.format(n, MODEL_NAME)
            q_value.model.save(filename, save_format='h5')
            subprocess.run(['gsutil', '-m', 'cp',
                            '{}'.format(filename), '{}/{}/'.format(BUCKET, MODEL_NAME)])
            os.remove(filename)
            experience += episode
            total_len += len(episode)

        while total_len < 500:
            episode, total_r = get_episode(environ, q_value, epsilon)
            total_len += len(episode)
            experience += episode

            # Keep the top 3 episodes
            if total_r > min(best_r):
                best_r = best_r[1:] + [total_r]
                good_experience += episode
                if len(good_experience) > 999 * 3:
                    good_experience = good_experience[-999 * 3:]

            
        if len(experience) > 999 * 5: # remember last 5 episodes
            experience = experience[-999 * 5:]

        epsilon = (epsilon - 0.2) * 0.99 + 0.2

        print('Training the model...')
        # Use latest episode + past episodes (sampling) + top 3 episode (sampling)
        latest_experience = experience[-total_len:]
        past_experience = experience[:-total_len]
        examples = latest_experience + \
            random.sample(past_experience, min(len(past_experience), 999)) + \
            random.sample(good_experience, min(len(good_experience), 999))
        
        # Show some statistics
        print('experience length={}'.format(len(experience)))
        print('number of examples={}'.format(len(examples)))
        print('best total reward = ', best_r)
        np.random.shuffle(examples)
                        
        states, actions, labels = [], [], []
        for state, a, r, state_new in examples:
            states.append(np.array(state))

            action_onehot = np.zeros(5)
            action_onehot[a] = 1
            actions.append(action_onehot)
            
            if state_new is None:   # Terminal state
                q_new = 0
            else:
                _, q_new = q_value.get_action(state_new)
            labels.append(np.array(r + q_new))

        hist = q_value.model.fit(
            [np.array(states), np.array(actions)], np.array(labels),
            batch_size=50, epochs=10, verbose=0)
        print('loss = {}'.format(hist.history['loss']))

In [8]:
env = gym.make("CarRacing-v2", continuous=False)
q_value = QValue()
q_value.model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 cnn_input (InputLayer)         [(None, 48, 48, 3)]  0           []                               
                                                                                                  
 cnn1 (Conv2D)                  (None, 48, 48, 16)   1216        ['cnn_input[0][0]']              
                                                                                                  
 pool1 (MaxPooling2D)           (None, 24, 24, 16)   0           ['cnn1[0][0]']                   
                                                                                                  
 cnn2 (Conv2D)                  (None, 24, 24, 16)   6416        ['pool1[0][0]']                  
                                                                                            

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


In [None]:
train(env, q_value, epsilon=1.0, checkpoint=0)

iteration 1
epsilon=1.0, episode length=361, total rewards=-0.09891696750915568
epsilon=1.0, episode length=201, total rewards=-0.065771812080482
Training the model...
experience length=562
number of examples=1124
best total reward =  [-100, -0.09891696750915568, -0.065771812080482]
loss = [408149.6875, 34.685325622558594, 41.809669494628906, 18.42458724975586, 11.12362289428711, 8.753081321716309, 15.194820404052734, 7.518411159515381, 11.407185554504395, 6.876156330108643]
iteration 2
epsilon=0.992, episode length=94, total rewards=-0.00632911392403665
epsilon=0.992, episode length=153, total rewards=-0.07432950191567156
epsilon=0.992, episode length=68, total rewards=-0.0034482758620619502
epsilon=0.992, episode length=69, total rewards=-0.006993006992998846
epsilon=0.992, episode length=76, total rewards=-0.007692307692297201
epsilon=0.992, episode length=196, total rewards=-0.0921568627450775
Training the model...
experience length=1218
number of examples=2217
best total reward = 

Copying file://car-racing-v2-3-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.9840800000000001, episode length=294, total rewards=-0.08823529411758124
epsilon=0.9840800000000001, episode length=192, total rewards=-0.007395498392272354
Training the model...
experience length=1769
number of examples=2549
best total reward =  [-0.006993006992998846, -0.007692307692297201, -0.007395498392272354]
loss = [0.6142647862434387, 0.5402365326881409, 1.6553845405578613, 0.354451984167099, 0.48177987337112427, 0.4879624545574188, 0.48838910460472107, 0.47183340787887573, 0.4230238199234009, 0.44345706701278687]
iteration 4
epsilon=0.9762392000000002, episode length=145, total rewards=-0.05454545454542381
epsilon=0.9762392000000002, episode length=137, total rewards=-0.05429553264601972
epsilon=0.9762392000000002, episode length=185, total rewards=-0.012639405204429383
epsilon=0.9762392000000002, episode length=141, total rewards=-0.06572438162542515
Training the model...
experience length=2377
number of examples=2606
best total reward =  [-0.006993006992998846, -0.

Copying file://car-racing-v2-6-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.9607920399200003, episode length=139, total rewards=-0.0627177700348247
epsilon=0.9607920399200003, episode length=119, total rewards=-0.047808764940220544
epsilon=0.9607920399200003, episode length=436, total rewards=-0.06363636363623582
Training the model...
experience length=3956
number of examples=2821
best total reward =  [-0.006993006992998846, -0.007692307692297201, -0.007395498392272354]
loss = [0.30184879899024963, 0.2755516469478607, 0.27085748314857483, 0.26630309224128723, 0.2544651925563812, 0.26254957914352417, 0.25063762068748474, 0.24511770904064178, 0.2605138421058655, 0.2512945532798767]
iteration 7
epsilon=0.9531841195208004, episode length=208, total rewards=-0.06666666666661958
epsilon=0.9531841195208004, episode length=92, total rewards=-0.06923076923075233
epsilon=0.9531841195208004, episode length=125, total rewards=-0.06081504702193219
epsilon=0.9531841195208004, episode length=62, total rewards=-0.06947040498441859
epsilon=0.9531841195208004, episode

Copying file://car-racing-v2-9-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.9381957555423366, episode length=106, total rewards=-0.09929328621906852
epsilon=0.9381957555423366, episode length=196, total rewards=-0.0921568627450624
Training the model...
experience length=4995
number of examples=2543
best total reward =  [-0.007692307692297201, -0.007395498392272354, -0.006405693950143759]
loss = [0.2703229486942291, 0.24721290171146393, 0.24391482770442963, 0.229780375957489, 0.21262231469154358, 0.21273629367351532, 0.20505139231681824, 0.21652352809906006, 0.2032577246427536, 0.21023660898208618]
iteration 10
epsilon=0.9308137979869133, episode length=711, total rewards=-0.013559322033995985
Training the model...
experience length=4995
number of examples=2709
best total reward =  [-0.007692307692297201, -0.007395498392272354, -0.006405693950143759]
loss = [0.37331441044807434, 0.3393740653991699, 0.3177172541618347, 0.30817708373069763, 0.3014465868473053, 0.2936227321624756, 0.2964954376220703, 0.2707340121269226, 0.2723509669303894, 0.269093722105

Copying file://car-racing-v2-12-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2869
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.3564794659614563, 0.3118208348751068, 0.3169439733028412, 0.2849785089492798, 0.27095887064933777, 0.26540669798851013, 0.2655327022075653, 0.25878870487213135, 0.256984144449234, 0.2440417855978012]
iteration 13
epsilon=0.9091078973729041, episode length=206, total rewards=-0.01034482758620145
epsilon=0.9091078973729041, episode length=86, total rewards=-0.02947976878611344
epsilon=0.9091078973729041, episode length=180, total rewards=-0.08198198198193585
epsilon=0.9091078973729041, episode length=105, total rewards=-0.07368421052631047
Training the model...
experience length=4995
number of examples=2575
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.32605743408203125, 0.2891591489315033, 0.2544383406639099, 0.2365403175354004, 0.24487560987472534, 0.227535888552

Copying file://car-racing-v2-15-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.8949966502151834, episode length=93, total rewards=-0.054205607476629725
epsilon=0.8949966502151834, episode length=307, total rewards=-0.08327645051185262
Training the model...
experience length=4995
number of examples=2559
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.406855970621109, 0.33657950162887573, 0.3177379369735718, 0.28043031692504883, 0.2468646615743637, 0.2419213205575943, 0.239830881357193, 0.22241351008415222, 0.22950196266174316, 0.19739623367786407]
iteration 16
epsilon=0.8880466837130316, episode length=142, total rewards=-0.06512455516012516
epsilon=0.8880466837130316, episode length=714, total rewards=-0.0714285714283378
Training the model...
experience length=4995
number of examples=2854
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.44720742106437683, 0.33948397636413574, 0.3258145749568939, 0.2981969118118286, 0.28645631670951843, 0.2732222378

Copying file://car-racing-v2-18-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2762
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [4.158449649810791, 4.029313564300537, 3.932480812072754, 3.7519350051879883, 3.557628870010376, 3.44242525100708, 3.2559170722961426, 2.4873878955841064, 2.223126173019409, 1.4806220531463623]
iteration 19
epsilon=0.8676110091600708, episode length=279, total rewards=-0.06703910614523179
epsilon=0.8676110091600708, episode length=319, total rewards=-0.051118210862532765
Training the model...
experience length=4995
number of examples=2596
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.45833081007003784, 0.3671867251396179, 0.3332909345626831, 0.2820863127708435, 0.2669203579425812, 0.2383360117673874, 0.2224193811416626, 0.223308727145195, 0.18780922889709473, 0.19462911784648895]
iteration 20
epsilon=0.8609348990684702, episode length=738, total rewards=-0.099261992

Copying file://car-racing-v2-21-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.8543255500777855, episode length=383, total rewards=-0.0858237547891545
Training the model...
experience length=4995
number of examples=2567
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.4841994345188141, 0.35707277059555054, 0.3283551335334778, 0.2761216461658478, 0.21259933710098267, 0.21709072589874268, 0.20886798202991486, 0.2275943011045456, 0.19707676768302917, 0.1853857785463333]
iteration 22
epsilon=0.8477822945770077, episode length=494, total rewards=-0.030035335688885584
epsilon=0.8477822945770077, episode length=243, total rewards=-0.009756097560907423
Training the model...
experience length=4995
number of examples=2735
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.4257183074951172, 0.32799842953681946, 0.28152963519096375, 0.25019434094429016, 0.22053903341293335, 0.21470686793327332, 0.182926207780838, 0.18335889279842377, 0.160765141248703, 0.16036364

Copying file://car-racing-v2-24-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2789
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [4.135615825653076, 3.9976394176483154, 3.943281650543213, 3.8926806449890137, 3.8688693046569824, 3.7256717681884766, 3.7530412673950195, 3.671354055404663, 3.344362497329712, 3.157416582107544]
iteration 25
epsilon=0.8285425126457759, episode length=680, total rewards=-0.07278911564628854
Training the model...
experience length=4995
number of examples=2678
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.4974924921989441, 0.3104529082775116, 0.27913954854011536, 0.24091923236846924, 0.21612270176410675, 0.23092715442180634, 0.1758459359407425, 0.21051695942878723, 0.15606600046157837, 0.19375020265579224]
iteration 26
epsilon=0.8222570875193183, episode length=801, total rewards=-0.06062717770033754
Training the model...
experience length=4995
number of examples=2799

Copying file://car-racing-v2-27-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.8160345166441252, episode length=709, total rewards=-0.054054054054291756
Training the model...
experience length=4995
number of examples=3160
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.47035494446754456, 0.3155125081539154, 0.3759278357028961, 0.25174033641815186, 0.27354368567466736, 0.22592149674892426, 0.2618136703968048, 0.18999658524990082, 0.25933513045310974, 0.17329852283000946]
iteration 28
epsilon=0.809874171477684, episode length=181, total rewards=-0.08405797101444823
epsilon=0.809874171477684, episode length=312, total rewards=-0.04999999999991653
epsilon=0.809874171477684, episode length=658, total rewards=-0.0692789968652534
Training the model...
experience length=4995
number of examples=3149
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.49571287631988525, 0.32841554284095764, 0.3452061116695404, 0.3160904347896576, 0.38422733545303345, 0.24687938

Copying file://car-racing-v2-30-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.7977376754652781, episode length=271, total rewards=-0.08135593220332038
Training the model...
experience length=4995
number of examples=2648
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.40387967228889465, 0.30479928851127625, 0.2361879199743271, 0.21110209822654724, 0.1987556368112564, 0.17172172665596008, 0.17123323678970337, 0.16941170394420624, 0.1367843598127365, 0.1480874866247177]
iteration 31
epsilon=0.7917602987106254, episode length=684, total rewards=-0.09609120521156211
Training the model...
experience length=4995
number of examples=2682
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.4210302531719208, 0.3143061399459839, 0.2587120234966278, 0.23928846418857574, 0.2066017985343933, 0.16747109591960907, 0.18287518620491028, 0.1709468811750412, 0.15800583362579346, 0.14284391701221466]
iteration 32
epsilon=0.7858426957235192, episode length=430, total rewar

Copying file://car-racing-v2-33-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2707
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.40902894735336304, 0.28618934750556946, 0.25053805112838745, 0.22588852047920227, 0.21633434295654297, 0.1932445466518402, 0.17684611678123474, 0.16442115604877472, 0.17871129512786865, 0.16387516260147095]
iteration 34
epsilon=0.7741844260786213, episode length=317, total rewards=-0.05396825396819094
epsilon=0.7741844260786213, episode length=797, total rewards=-0.08985507246364333
Training the model...
experience length=4995
number of examples=3112
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.47109997272491455, 0.3569785952568054, 0.31168127059936523, 0.26000675559043884, 0.2615883946418762, 0.21852107346057892, 0.20627517998218536, 0.20432059466838837, 0.19689220190048218, 0.18546034395694733]
iteration 35
epsilon=0.7684425818178351, episode length=682, total

Copying file://car-racing-v2-36-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2696
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.4069449305534363, 0.28828945755958557, 0.2506836950778961, 0.22042174637317657, 0.20146653056144714, 0.19163882732391357, 0.16250860691070557, 0.16433797776699066, 0.15347528457641602, 0.14706140756607056]
iteration 37
epsilon=0.7571305744396604, episode length=459, total rewards=-0.09836065573758224
epsilon=0.7571305744396604, episode length=761, total rewards=-0.009523809524354138
Training the model...
experience length=4995
number of examples=3218
best total reward =  [-0.007395498392272354, -0.006405693950143759, -0.0015873015872955587]
loss = [0.45980435609817505, 0.3286195695400238, 0.3059435486793518, 0.2538340985774994, 0.2402474284172058, 0.21626046299934387, 0.21377471089363098, 0.18976899981498718, 0.18359291553497314, 0.17592325806617737]
iteration 38
epsilon=0.7515592686952639, episode length=233, total r

Copying file://car-racing-v2-39-model08.hd5 [Content-Type=application/octet-stream]...
- [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2675
best total reward =  [-0.006405693950143759, -0.0015873015872955587, -0.0035087719297035302]
loss = [0.3834252655506134, 0.2646559178829193, 0.24796733260154724, 0.21374845504760742, 0.20094630122184753, 0.19659484922885895, 0.16809512674808502, 0.16127076745033264, 0.1725781410932541, 0.14402182400226593]
iteration 40
epsilon=0.7405832392482283, episode length=253, total rewards=-0.0031746031745331915
epsilon=0.7405832392482283, episode length=999, total rewards=14.814814814813758
Training the model...
experience length=4995
number of examples=3250
best total reward =  [-0.0035087719297035302, -0.0031746031745331915, 14.814814814813758]
loss = [0.47757869958877563, 0.3721015751361847, 0.29441168904304504, 0.2684447765350342, 0.24448691308498383, 0.22707071900367737, 0.20915746688842773, 0.19403082132339478, 0.1796119064092636, 0.1890692263841629]
iteration 41
epsilon=0.7351774068557462, episode length=618, total rewa

Copying file://car-racing-v2-42-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2799
best total reward =  [-0.0035087719297035302, -0.0031746031745331915, 14.814814814813758]
loss = [0.5184290409088135, 0.3391527235507965, 0.2780553698539734, 0.5514646768569946, 0.2641904056072235, 0.2500668168067932, 0.21785615384578705, 0.1937199831008911, 0.21778850257396698, 0.1849837750196457]
iteration 43
epsilon=0.724527376459317, episode length=610, total rewards=-0.031297709923628875
Training the model...
experience length=4995
number of examples=2608
best total reward =  [-0.0035087719297035302, -0.0031746031745331915, 14.814814814813758]
loss = [0.43638405203819275, 0.3144887089729309, 0.23865336179733276, 0.23739075660705566, 0.19749824702739716, 0.17612534761428833, 0.1677696853876114, 0.1628779172897339, 0.1520271897315979, 0.13730217516422272]
iteration 44
epsilon=0.7192821026947238, episode length=735, total rewards=-0.07058823529434588
Training the model...
experience length=4995
number of examples=27

Copying file://car-racing-v2-45-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


epsilon=0.7140892816677766, episode length=626, total rewards=-0.004075235109685965
Training the model...
experience length=4995
number of examples=2800
best total reward =  [-0.0035087719297035302, -0.0031746031745331915, 14.814814814813758]
loss = [0.4386675953865051, 0.30856162309646606, 0.2607424259185791, 0.21739815175533295, 0.20892390608787537, 0.18327337503433228, 0.29377374053001404, 0.14934943616390228, 0.15246066451072693, 0.14797236025333405]
iteration 46
epsilon=0.708948388851099, episode length=157, total rewards=-0.051968503936985266
epsilon=0.708948388851099, episode length=999, total rewards=33.82899628252722
Training the model...
experience length=4995
number of examples=3154
best total reward =  [-0.0031746031745331915, 14.814814814813758, 33.82899628252722]
loss = [0.6406534910202026, 0.4101640582084656, 0.3519234359264374, 0.3139306902885437, 0.2786990702152252, 0.2501019835472107, 0.2353590875864029, 0.19916562736034393, 0.18539586663246155, 0.18553222715854645]
i

Copying file://car-racing-v2-48-model08.hd5 [Content-Type=application/octet-stream]...
/ [1/1 files][ 56.2 MiB/ 56.2 MiB] 100% Done                                    
Operation completed over 1 objects/56.2 MiB.                                     


Training the model...
experience length=4995
number of examples=2716
best total reward =  [-0.0031746031745331915, 14.814814814813758, 33.82899628252722]
loss = [0.5421455502510071, 0.39488789439201355, 0.3021458089351654, 0.33136940002441406, 0.2553878724575043, 0.23430569469928741, 0.21284975111484528, 0.20815305411815643, 0.18947002291679382, 0.18624867498874664]
iteration 49
epsilon=0.6938321127538326, episode length=352, total rewards=-0.043589743589682345
epsilon=0.6938321127538326, episode length=626, total rewards=-0.013432835820880146
Training the model...
experience length=4995
number of examples=2976
best total reward =  [-0.0031746031745331915, 14.814814814813758, 33.82899628252722]
