<a href="https://colab.research.google.com/github/fjpena35226/q-learning/blob/main/car/ai_race_game_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the paths under it used later in this
# notebook (see ENV_URL) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# HIDE OUTPUT
!pip install swig pyvirtualdisplay
!pip install gymnasium gymnasium[box2d] tqdm
#!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

Collecting swig
  Downloading swig-4.1.1.post0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.8 MB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: swig, pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0 swig-4.1.1.post0
Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m11.6 MB/

In [None]:
# HIDE OUTPUT
#!apt-get update > /dev/null 2>&1
#!apt-get install cmake > /dev/null 2>&1
#!apt-get install xvfb > /dev/null 2>&1
#!pip install ez_setup > /dev/null 2>&1

In [3]:
import tensorflow as tf

# NOTE(review): the return value of this call is discarded, so as written the
# line has no visible effect in this cell.
tf.executing_eagerly()

# Record the TensorFlow version this notebook was run against.
print(tf.__version__)

2.14.0


In [4]:
import gymnasium as gym
from gym.wrappers import RecordVideo
from pyvirtualdisplay import Display
from gymnasium.utils.save_video import save_video

from IPython import display as ipythondisplay
from IPython.display import HTML, clear_output
import glob
import io
import base64
from pathlib import Path
import os
import pickle
import numpy as np
import cv2
import argparse
from collections import deque
from tqdm import tqdm
import matplotlib.pyplot as plt

# PARAMETERS

In [19]:
# Run configuration read by the cells below.
# NOTE(review): confirm where RENDER and STARTING_EPISODE are consumed.
RENDER                        = True
STARTING_EPISODE              = 1
# Upper bound of the training loop's episode range.
ENDING_EPISODE                = 200
# Each chosen action is repeated for SKIP_FRAMES + 1 environment steps.
SKIP_FRAMES                   = 2
# Replay minibatch size; training starts once the memory holds more than this.
TRAINING_BATCH_SIZE           = 64
# Weights are saved whenever the episode index is a multiple of this value.
SAVE_TRAINING_FREQUENCY       = 25
# The target network is synced whenever the episode index is a multiple of this value.
UPDATE_TARGET_MODEL_FREQUENCY = 5
# Exploration rate handed to the agent when it is constructed.
EPSILON                       = 0.1
# File-name prefix for saved weights.
MODEL_NAME                    = 'car-model'
# Checkpoint loaded before training; a falsy value skips loading.
MODEL_FILE                    = MODEL_NAME + '_25.h5'
# Base directory on the mounted Drive; '/save/' and the video folder are appended to it.
ENV_URL = '/content/drive/MyDrive/q-learning/car'

# UTILS

In [6]:
# Detect the GPUs visible to TensorFlow and pick the device used for training.
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
# list_physical_devices returns a (possibly empty) list, never None, so the
# check must be on truthiness; comparing against None always selected GPU:0
# even on a CPU-only runtime.
gpuName = 'device:GPU:0' if gpus else None

[]


In [7]:
def save_video_to(env, episode = 0, video_folder = "/video"):
    """Write the frames returned by ``env.render()`` to a video under ENV_URL.

    Args:
        env: Environment whose ``render()`` result is passed to ``save_video``
            and whose ``metadata["render_fps"]`` sets the frame rate.
        episode: Value used as the video's file-name prefix.
        video_folder: Sub-path appended to ``ENV_URL`` to form the output folder.
    """
    frames = env.render()
    target_folder = ENV_URL + video_folder
    frame_rate = env.metadata["render_fps"]
    save_video(
        frames,
        video_folder=target_folder,
        fps=frame_rate,
        name_prefix=episode,
    )

In [8]:
def render_env(env):
    """Show the environment's most recent frame inline, replacing earlier output.

    ``env.render()`` yields a single image in ``rgb_array`` mode but a list of
    images in ``rgb_array_list`` mode (the mode this notebook creates its
    environment with). ``plt.imshow`` only accepts one image, so when a list
    comes back its last frame is shown; an empty list draws nothing.

    Note: calling ``render()`` consumes whatever the environment returns, so in
    list mode those frames are no longer available to a later video export.
    """
    frame = env.render()
    if isinstance(frame, (list, tuple)):
        if not frame:
            return
        frame = frame[-1]

    clear_output(wait=True)
    plt.imshow(frame)
    plt.axis('off')
    plt.show()

In [9]:
def process_state_image(state):
    """Turn an RGB frame into a normalized single-channel float image.

    Args:
        state: Colour image of shape (height, width, 3) in RGB channel order,
            which is the order Gymnasium uses for image observations.

    Returns:
        Float array of shape (height, width, 1) with values scaled by 1/255.
    """
    # The frame is RGB, so the RGB->gray conversion must be used; the BGR
    # variant would apply the red and blue luminance weights the wrong way round.
    gray = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
    gray = gray.astype(float) / 255.0
    # Append the channel axis without assuming a 96x96 frame.
    return gray[..., np.newaxis]

def generate_state_frame_stack_from_queue(deque):
    """Stack queued frames into one array with the frames as the last axis.

    Args:
        deque: Sequence of equally shaped 2-D frames, oldest first.

    Returns:
        Array of shape (x, y, stack) built from the (stack, x, y) frame stack.
    """
    stacked_frames = np.array(deque)
    # Rotate the axes so the stack index ends up where image channels live.
    return stacked_frames.transpose((1, 2, 0))

# DEEP-Q-LEARNING-CAR-AGENT

In [11]:
import random
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

class CarRacingDQNAgent:
    """Deep Q-learning agent over a fixed, discretised CarRacing action set.

    The agent owns an online network (``model``), a target network
    (``target_model``) used to compute bootstrapped targets, and a bounded
    replay memory of ``(state, action_index, reward, next_state, done)`` tuples.
    """

    def __init__(
        self,
        observation_shape,
        action_space    = [
            (-1, 1, 0.2), (0, 1, 0.2), (1, 1, 0.2), #           Action Space Structure
            (-1, 1,   0), (0, 1,   0), (1, 1,   0), #        (Steering Wheel, Gas, Brake)
            (-1, 0, 0.2), (0, 0, 0.2), (1, 0, 0.2), # Range        -1~1       0~1   0~1
            (-1, 0,   0), (0, 0,   0), (1, 0,   0)
        ],
        frame_stack_num = 3,
        memory_size     = 5000,
        gamma           = 0.95,  # discount rate
        epsilon         = 1.0,   # exploration rate
        epsilon_min     = 0.1,
        epsilon_decay   = 0.9999,
        learning_rate   = 0.001,
    ):
        """Store the hyper-parameters and build the online and target networks.

        Args:
            observation_shape: Input shape of the convolutional network.
            action_space: Discrete actions the agent chooses between.
            frame_stack_num: Stored on the instance; not used by the methods here.
            memory_size: Maximum number of experiences kept for replay.
            gamma: Discount applied to the bootstrapped next-state value.
            epsilon: Probability of picking a random action in ``act``.
            epsilon_min: Decay stops once ``epsilon`` is no longer above this.
            epsilon_decay: Factor applied to ``epsilon`` after each replay pass.
            learning_rate: Learning rate of the Adam optimizer.
        """
        self.observation_shape=observation_shape
        # Copy so that instances never share (or mutate) the default list.
        self.action_space    = list(action_space)
        self.frame_stack_num = frame_stack_num
        self.memory          = []
        self.memory_size     = memory_size
        self.gamma           = gamma
        self.epsilon         = epsilon
        self.epsilon_min     = epsilon_min
        self.epsilon_decay   = epsilon_decay
        self.learning_rate   = learning_rate
        self.model           = self.build_model()
        self.target_model    = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Build and compile the convolutional Q-network (one output per action)."""
        # Neural Net for Deep-Q learning Model
        model = tf.keras.Sequential()
        model.add(Conv2D(filters=6, kernel_size=(7, 7), activation='relu', input_shape=self.observation_shape))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(filters=12, kernel_size=(4, 4), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(216, activation='relu'))
        model.add(Dense(len(self.action_space), activation=None))
        model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=self.learning_rate, epsilon=1e-7))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def clearMemory(self):
        """Drop every stored experience."""
        self.memory = []

    def memorize(self, state, action, reward, next_state, done):
        """Store one transition, evicting a random old one when the memory is full.

        Raises:
            ValueError: If ``action`` is not a member of ``action_space``.
        """
        # Resolve the index first so an unknown action cannot cost us an eviction.
        action_index = self.action_space.index(action)
        # Evict only once the buffer actually holds memory_size items; the
        # previous ``memory_size - 1`` threshold capped the buffer one short
        # and raised on an empty buffer when memory_size was 1.
        if self.memory and len(self.memory) >= self.memory_size:
            self.memory.pop(random.randrange(len(self.memory)))
        self.memory.append((state, action_index, reward, next_state, done))

    def act(self, state):
        """Return an action: random with probability ``epsilon``, otherwise greedy."""
        if np.random.rand() > self.epsilon:
            act_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)
            action_index = np.argmax(act_values[0])
        else:
            action_index = random.randrange(len(self.action_space))
        return self.action_space[action_index]

    def internalReplay(self, batch_size):
        """Fit the online network on one random minibatch and decay ``epsilon``.

        Raises:
            ValueError: If fewer than ``batch_size`` experiences are stored
                (raised by ``random.sample``).
        """
        minibatch = random.sample(self.memory, batch_size)
        states = np.array([exp[0] for exp in minibatch])
        next_states = np.array([exp[3] for exp in minibatch])

        targets = self.model.predict(states, verbose = 0)
        next_targets = self.target_model.predict(next_states, verbose = 0)

        for index, experience in enumerate(minibatch):
            _, action_index, reward, _, done = experience
            if done:
                targets[index][action_index] = reward
            else:
                # Regression target for Q(s, a) is the bootstrapped return
                # R(s, a) + gamma * max_a' Q_target(s', a'). The step size is
                # applied by the optimizer; scaling the TD error by the learning
                # rate here as well would apply it twice and would be
                # inconsistent with the terminal branch above.
                targets[index][action_index] = (
                    reward + self.gamma * np.amax(next_targets[index])
                )

        self.model.fit(states, targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def replay(self, batch_size):
        """Run exactly one replay pass, on the selected GPU when one is available."""
        if gpus:
            # One pass only: looping over every detected GPU repeated the same
            # update on the same device once per GPU.
            with tf.device(gpuName):
                self.internalReplay(batch_size)
        else:
            self.internalReplay(batch_size)

    def load(self, name):
        """Load weights into the online network, print its summary, sync the target."""
        self.model.load_weights(name)
        self.model.summary()
        self.update_target_model()

    def save(self, name):
        """Save the target network's weights to ``name``."""
        self.target_model.save_weights(name)

# TRAINING

In [20]:
# Create the environment and size the agent's network input from a real observation.
env = gym.make("CarRacing-v2", domain_randomize=True, render_mode="rgb_array_list")
init_state, info = env.reset()
#init_state = process_state_image(init_state)
agent = CarRacingDQNAgent(
    observation_shape=init_state.shape,
    epsilon=EPSILON,
    memory_size=1000,
)
# Resume from a saved checkpoint when one is configured.
if MODEL_FILE:
    agent.load(f"{ENV_URL}/save/{MODEL_FILE}")


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 90, 90, 6)         888       
                                                                 
 max_pooling2d_12 (MaxPooli  (None, 45, 45, 6)         0         
 ng2D)                                                           
                                                                 
 conv2d_13 (Conv2D)          (None, 42, 42, 12)        1164      
                                                                 
 max_pooling2d_13 (MaxPooli  (None, 21, 21, 12)        0         
 ng2D)                                                           
                                                                 
 flatten_6 (Flatten)         (None, 5292)              0         
                                                                 
 dense_12 (Dense)            (None, 216)              

In [None]:
# Train for episodes STARTING_EPISODE..ENDING_EPISODE (inclusive). Honouring
# STARTING_EPISODE keeps checkpoint numbering aligned when resuming, instead of
# always restarting the count at 0.
for e in tqdm(range(STARTING_EPISODE, ENDING_EPISODE + 1), leave=False):
    init_state, info = env.reset()

    total_reward = 0
    negative_reward_counter = 0
    current_state = init_state #process_state_image(init_state)
    time_frame_counter = 1
    done = False
    truncated = False

    while True:
        action = agent.act(current_state)

        # Repeat the chosen action for SKIP_FRAMES + 1 env steps, summing the rewards.
        reward = 0
        for _ in range(SKIP_FRAMES+1):
            next_state, r, done, truncated, info = env.step(action)
            reward += r
            # Stop stepping as soon as the episode ends for either reason.
            if done or truncated:
                break

        # Count consecutive negative-reward steps once past the first 100 time
        # frames; any non-negative step resets the streak.
        negative_reward_counter = negative_reward_counter + 1 if time_frame_counter > 100 and reward < 0 else 0

        # Extra bonus for the model if it uses full gas
        if action[1] == 1 and action[2] == 0:
            reward *= 1.5

        total_reward += reward

        # Only true termination is stored as `done`: a time-limit truncation is
        # not a terminal state, so its target should still bootstrap.
        agent.memorize(current_state, action, reward, next_state, done)

        # End the episode on termination, truncation, 25 consecutive negative
        # steps, or once the accumulated reward drops below zero.
        if done or truncated or negative_reward_counter >= 25 or total_reward < 0:
            break
        if len(agent.memory) > TRAINING_BATCH_SIZE:
            agent.replay(TRAINING_BATCH_SIZE)

        time_frame_counter += 1
        current_state = next_state

    if e % UPDATE_TARGET_MODEL_FREQUENCY == 0:
        agent.update_target_model()

    if e % SAVE_TRAINING_FREQUENCY == 0:
        agent.save(ENV_URL + '/save/' + MODEL_NAME +'_{}.h5'.format(e))

env.close()

  6%|▌         | 11/200 [18:39<6:37:25, 126.17s/it]

# "AUTONOMOUS" DRIVING

In [18]:
# Drive a single episode with the current agent and save it as a video.
# Note: agent.act still picks a random action with probability agent.epsilon.
init_state, info = env.reset()
print('Driving...')

total_reward = 0
current_state = init_state
time_frame_counter = 1

while True:
    action = agent.act(current_state)
    next_state, reward, done, truncated, info = env.step(action)

    total_reward += reward
    current_state = next_state

    # The episode is over on termination or on time-limit truncation.
    if done or truncated:
        break
    time_frame_counter += 1

# Use the last training episode index as the file-name prefix when the training
# loop has run in this kernel; fall back to 0 so this cell also works on its
# own after only loading a checkpoint (previously a NameError).
episode_label = globals().get('e', 0)
save_video_to(env=env, episode = episode_label, video_folder = "/video")

env.close()

  0%|          | 0/1 [00:00<?, ?it/s]

Episode: 0


  0%|          | 0/1 [01:26<?, ?it/s]

Moviepy - Building video /content/drive/MyDrive/q-learning/car/video/0-episode-0.mp4.
Moviepy - Writing video /content/drive/MyDrive/q-learning/car/video/0-episode-0.mp4




t:   0%|          | 0/1001 [00:00<?, ?it/s, now=None][A
t:   4%|▍         | 39/1001 [00:00<00:02, 384.86it/s, now=None][A
t:   8%|▊         | 78/1001 [00:00<00:03, 266.22it/s, now=None][A
t:  11%|█▏        | 113/1001 [00:00<00:03, 295.89it/s, now=None][A
t:  15%|█▌        | 152/1001 [00:00<00:02, 325.06it/s, now=None][A
t:  19%|█▉        | 188/1001 [00:00<00:02, 336.34it/s, now=None][A
t:  23%|██▎       | 227/1001 [00:00<00:02, 352.69it/s, now=None][A
t:  26%|██▋       | 264/1001 [00:00<00:02, 356.60it/s, now=None][A
t:  30%|███       | 301/1001 [00:00<00:01, 351.15it/s, now=None][A
t:  34%|███▍      | 338/1001 [00:00<00:01, 356.62it/s, now=None][A
t:  38%|███▊      | 376/1001 [00:01<00:01, 360.73it/s, now=None][A
t:  41%|████▏     | 413/1001 [00:01<00:01, 345.26it/s, now=None][A
t:  45%|████▍     | 448/1001 [00:01<00:01, 334.51it/s, now=None][A
t:  48%|████▊     | 482/1001 [00:01<00:01, 322.88it/s, now=None][A
t:  51%|█████▏    | 515/1001 [00:01<00:01, 315.10it/s, now=N

Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/q-learning/car/video/0-episode-0.mp4




In [None]:
# Attribution: print the link to the project credited as this notebook's original code.
print('Original code in https://github.com/andywu0913/OpenAI-GYM-CarRacing-DQN')