In [1]:
import gym
import numpy as np
import copy, random, os, subprocess
from tensorflow.keras import layers, models

  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):


In [2]:
BUCKET = 'gs://etsuji-car-racing-v2-model04v2'
os.environ['BUCKET'] = BUCKET
!gsutil mb -c regional -l us-west1 $BUCKET
!gsutil ls $BUCKET

Creating gs://etsuji-car-racing-v2-model04v2/...
ServiceException: 409 A Cloud Storage bucket named 'etsuji-car-racing-v2-model04v2' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
gs://etsuji-car-racing-v2-model04v2/model04v2/


In [3]:
# Base model
class QValue:
    def __init__(self):
        self.model = self.build_model()

    def build_model(self):
        cnn_input = layers.Input(shape=(96, 96, 6), name='cnn_input')
        cnn1 = layers.Conv2D(32, (5, 5), padding='same',
                         use_bias=True, activation='relu',
                        name='cnn1')(cnn_input)
        pool1 = layers.MaxPooling2D((2, 2), name='pool1')(cnn1)
        cnn2 = layers.Conv2D(64, (5, 5), padding='same',
                         use_bias=True, activation='relu',
                        name='cnn2')(pool1)
        pool2 = layers.MaxPooling2D((2, 2), name='pool2')(cnn2)

        cnn_flatten = layers.Flatten(name='flatten')(pool2)
        action_input = layers.Input(shape=(5,), name='action_input')
        combined = layers.concatenate([cnn_flatten, action_input], name='concat')
        hidden1 = layers.Dense(1024, activation='relu', name='dense1')(combined)
        hidden2 = layers.Dense(512, activation='relu', name='dense2')(hidden1)
        q_value = layers.Dense(1, name='output')(hidden2)

        model = models.Model(inputs=[cnn_input, action_input], outputs=q_value)
        model.compile(loss='mse')
        return model

    def get_action(self, state):
        states = []
        actions = []
        for a in range(5):
            states.append(np.array(state))
            action_onehot = np.zeros(5)
            action_onehot[a] = 1
            actions.append(action_onehot)
  
        q_values = self.model.predict([np.array(states), np.array(actions)])
        optimal_action = np.argmax(q_values)
        return optimal_action, q_values[optimal_action][0]

In [4]:
def join_frames(o0, o1):
    return np.r_[o0.transpose(), o1.transpose()].transpose() 

In [5]:
def get_episode(environ, q_value, epsilon):
    episode = []
    o0 = environ.reset()
    o1 = copy.deepcopy(o0)
    total_r = 0

    if epsilon > 0:
        keep_count = 1
    else:
        keep_count = 1

    c = 0
    while True:
        if c % keep_count == 0: # get new action
            if np.random.random() < epsilon:
                a = np.random.randint(5)
            else:
                a, _ = q_value.get_action(join_frames(o0, o1))
        c += 1
        o_new, r, done, inf = environ.step(a)                
        total_r += r

        if total_r < 0:
            done = 1

        if done:
            if total_r > 990 or r < -99:
                episode.append((join_frames(o0, o1), a, r, None))
            break
        else:
            episode.append((join_frames(o0, o1), a, r, join_frames(o1, o_new)))
        o0, o1 = o1, o_new

    return episode, total_r

In [6]:
def train(environ, q_value, epsilon, checkpoint=0):
    if checkpoint > 0:
        filename = 'car-racing-v2-model04v2-{}.hd5'.format(checkpoint)
        subprocess.run(['gsutil', 'cp', '{}/model04v2/{}'.format(BUCKET, filename), './'])
        print('load model {}'.format(filename))
        q_value.model = models.load_model(filename)
        os.remove(filename)

    experience = []
    good_experience = []
    best_r = [-100, -100, -100]

    for n in range(checkpoint + 1, checkpoint + 1000):
        print('iteration {}'.format(n))

        total_len = 0
        if n % 3 == 0:
            print('Testing the current performance...')
            episode, total_r = get_episode(environ, q_value, epsilon=0)
            print('epsilon={}, episode length={}, total rewards={}'.format(0, len(episode), total_r))
            with open('result.txt', 'a') as f:
                f.write('{},{},{},{}\n'.format(n, epsilon, len(episode), total_r))
            filename = 'car-racing-v2-model04v2-{}.hd5'.format(n)
            q_value.model.save(filename, save_format='h5')
            subprocess.run(['gsutil', '-m', 'cp',
                            '{}'.format(filename), '{}/model04v2/'.format(BUCKET)])
            os.remove(filename)
            total_len += len(episode)
            continue

        while total_len < 500:
            episode, total_r = get_episode(environ, q_value, epsilon)
            print('epsilon={}, episode length={}, total rewards={}'.format(epsilon, len(episode), total_r))
            total_len += len(episode)
            experience += episode

            # ベスト3のエピソードを別枠に保存
            if total_r > min(best_r):
                # Keep top 3 episodes
                best_r = best_r[1:] + [total_r]
                good_experience += episode
                if len(good_experience) > 999 * 3:
                    good_experience = good_experience[-333 * 3:]

        # Need enough examples to avoid the inital catastrophic forgetting.
        if len(experience) < 999 * 5:
            continue

        if len(experience) > 999 * 5: # remember last 5 iterations
            experience = experience[-999 * 5:]

        #epsilon = (epsilon - 0.2) * 0.99 + 0.2
        epsilon = (epsilon - 0.1) * 0.99 + 0.1

        print('Training the model...')
        # 直近のデータ + 過去のデータ（サンプリング）で学習
        latest_experience = experience[-total_len:]
        past_experience = experience[:-total_len]
        examples = latest_experience + \
            random.sample(past_experience, min(len(past_experience), 999)) + \
            random.sample(good_experience, min(len(good_experience), 999))
        
        # show some statistics
        print('experience length={}'.format(len(experience)))
        print('number of examples={}'.format(len(examples)))
        print('best total reward = ', best_r)
        np.random.shuffle(examples)
                        
        states, actions, labels = [], [], []
        for state, a, r, state_new in examples:
            states.append(np.array(state))

            action_onehot = np.zeros(5)
            action_onehot[a] = 1
            actions.append(action_onehot)
            
            if state_new is None:   # Terminal state
                q_new = 0
            else:
                _, q_new = q_value.get_action(state_new)
            labels.append(np.array(r + q_new))

        hist = q_value.model.fit(
            [np.array(states), np.array(actions)], np.array(labels),
            batch_size=50, epochs=10, verbose=0)
        print('loss = {}'.format(hist.history['loss']))

In [7]:
env = gym.make("CarRacing-v2", continuous=False)
q_value = QValue()
q_value.model.summary()

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
2022-07-28 04:15:11.232962: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 cnn_input (InputLayer)         [(None, 96, 96, 6)]  0           []                               
                                                                                                  
 cnn1 (Conv2D)                  (None, 96, 96, 32)   4832        ['cnn_input[0][0]']              
                                                                                                  
 pool1 (MaxPooling2D)           (None, 48, 48, 32)   0           ['cnn1[0][0]']                   
                                                                                                  
 cnn2 (Conv2D)                  (None, 48, 48, 64)   51264       ['pool1[0][0]']                  
                                                                                              

2022-07-28 04:15:11.245000: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-28 04:15:11.246014: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-28 04:15:11.249064: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-28 04:15:11.249633: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [8]:
!gsutil cp gs://etsuji-car-racing-v2-model04/model04/car-racing-v2-model04-351.hd5 ./

Copying gs://etsuji-car-racing-v2-model04/model04/car-racing-v2-model04-351.hd5...
| [1 files][292.5 MiB/292.5 MiB]                                                
Operation completed over 1 objects/292.5 MiB.                                    


In [9]:
q_value.model = models.load_model('car-racing-v2-model04-351.hd5')

In [None]:
train(env, q_value, epsilon=0.3, checkpoint=0)

iteration 1


2022-07-28 04:15:19.540026: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200


epsilon=0.3, episode length=999, total rewards=334.92063492063284
iteration 2
epsilon=0.3, episode length=999, total rewards=259.1549295774689
iteration 3
Testing the current performance...
epsilon=0, episode length=999, total rewards=750.5747126436667


Copying file://car-racing-v2-model04v2-3.hd5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1/1 files][292.5 MiB/292.5 MiB] 100% Done                                    
Operation completed over 1 objects/292.5 MiB.                                    


iteration 4
epsilon=0.3, episode length=999, total rewards=255.07246376811509
iteration 5
epsilon=0.3, episode length=999, total rewards=185.71428571428956
iteration 6
Testing the current performance...
epsilon=0, episode length=999, total rewards=632.6732673267219


Copying file://car-racing-v2-model04v2-6.hd5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1/1 files][292.5 MiB/292.5 MiB] 100% Done                                    
Operation completed over 1 objects/292.5 MiB.                                    


iteration 7
epsilon=0.3, episode length=999, total rewards=211.2582781456983
Training the model...
experience length=4995
number of examples=2997
best total reward =  [334.92063492063284, 259.1549295774689, 255.07246376811509]
loss = [6.191128730773926, 3.2850067615509033, 3.165754556655884, 2.596569299697876, 2.4409754276275635, 2.2982587814331055, 2.2565903663635254, 1.9980108737945557, 2.0028154850006104, 1.8345237970352173]
iteration 8
epsilon=0.298, episode length=999, total rewards=378.0876494023851
Training the model...
experience length=4995
number of examples=2997
best total reward =  [259.1549295774689, 255.07246376811509, 378.0876494023851]
loss = [4.0332207679748535, 2.8116612434387207, 2.586707353591919, 2.24554443359375, 2.123455286026001, 1.8113088607788086, 1.767446517944336, 1.6499706506729126, 1.427217960357666, 1.4644793272018433]
iteration 9
Testing the current performance...
epsilon=0, episode length=999, total rewards=297.1119133573887


Copying file://car-racing-v2-model04v2-9.hd5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1/1 files][292.5 MiB/292.5 MiB] 100% Done                                    
Operation completed over 1 objects/292.5 MiB.                                    


iteration 10
epsilon=0.29601999999999995, episode length=999, total rewards=200.69930069930217
Training the model...
experience length=4995
number of examples=2997
best total reward =  [259.1549295774689, 255.07246376811509, 378.0876494023851]
loss = [3.5345206260681152, 2.736424684524536, 2.3548033237457275, 2.366656541824341, 2.151902675628662, 1.8828498125076294, 1.893627405166626, 1.7918059825897217, 1.908003330230713, 1.505308985710144]
iteration 11
epsilon=0.2940598, episode length=498, total rewards=14.896273291927344
epsilon=0.2940598, episode length=999, total rewards=200.6535947712449
Training the model...
experience length=4995
number of examples=3495
best total reward =  [259.1549295774689, 255.07246376811509, 378.0876494023851]
loss = [10.679101943969727, 8.099839210510254, 6.551080703735352, 6.313767433166504, 5.661108493804932, 5.150407314300537, 4.694507122039795, 4.060843467712402, 3.460689067840576, 2.846008062362671]
iteration 12
Testing the current performance...
ep

Copying file://car-racing-v2-model04v2-12.hd5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1/1 files][292.5 MiB/292.5 MiB] 100% Done                                    
Operation completed over 1 objects/292.5 MiB.                                    


iteration 13
epsilon=0.29211920199999997, episode length=999, total rewards=273.23943661972027
Training the model...
experience length=4995
number of examples=2997
best total reward =  [255.07246376811509, 378.0876494023851, 273.23943661972027]
loss = [3.5307388305664062, 2.5795435905456543, 2.2319986820220947, 2.1156082153320312, 1.8708524703979492, 1.7658147811889648, 1.5781450271606445, 8.88652229309082, 1.2358659505844116, 1.7786980867385864]
iteration 14
epsilon=0.29019800998, episode length=999, total rewards=340.94488188975794
Training the model...
experience length=4995
number of examples=2997
best total reward =  [378.0876494023851, 273.23943661972027, 340.94488188975794]
loss = [3.7249019145965576, 2.6335253715515137, 2.4930052757263184, 2.134589910507202, 1.9162776470184326, 1.8154079914093018, 1.6538054943084717, 1.7192270755767822, 1.4830487966537476, 1.4555732011795044]
iteration 15
Testing the current performance...
epsilon=0, episode length=999, total rewards=281.270903

Copying file://car-racing-v2-model04v2-15.hd5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1/1 files][292.5 MiB/292.5 MiB] 100% Done                                    
Operation completed over 1 objects/292.5 MiB.                                    


iteration 16
epsilon=0.2882960298802, episode length=999, total rewards=251.72413793103803
Training the model...
experience length=4995
number of examples=2997
best total reward =  [378.0876494023851, 273.23943661972027, 340.94488188975794]
loss = [3.6817383766174316, 6.933650493621826, 2.3001656532287598, 2.2819108963012695, 2.1366336345672607, 1.9897379875183105, 2.0302510261535645, 1.736586332321167, 1.6495081186294556, 1.5896267890930176]
iteration 17
epsilon=0.286413069581398, episode length=999, total rewards=39.28571428571611
Training the model...
experience length=4995
number of examples=2997
best total reward =  [378.0876494023851, 273.23943661972027, 340.94488188975794]
loss = [2.782606601715088, 2.205042600631714, 5.346073627471924, 1.4071722030639648, 1.5357848405838013, 1.3847483396530151, 1.3282179832458496, 1.3137636184692383, 3.8869869709014893, 1.137036681175232]
iteration 18
Testing the current performance...
epsilon=0, episode length=999, total rewards=467.7419354838