In [1]:
import os
import warnings
from collections import deque

import cv2 as cv
import gym
import numpy as np

from dqn import CarRacingAgent

warnings.filterwarnings('ignore')

2023-05-20 15:15:50.685719: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-20 15:15:50.753280: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-20 15:15:50.754552: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Hyperparameters

In [2]:
# --- Episode range and rendering -------------------------------------------
# The training loop iterates over START_EPISODE..END_EPISODE inclusive.
START_EPISODE, END_EPISODE = 1, 1000
# When True, env.render() is called on every iteration of the training loop.
RENDER = False

# --- Interaction with the environment --------------------------------------
# Each chosen action is applied for SKIP_FRAMES + 1 consecutive env steps.
SKIP_FRAMES = 2

# --- Learning schedule -----------------------------------------------------
# Number of transitions passed to agent.replay(); replay only starts once the
# agent's memory holds more than this many entries.
BATCH_SIZE = 64
# agent.update_target_model() runs on episodes divisible by this value.
UPDATE_TARGET_MODEL_EVERY = 5

# --- Early stopping on negative rewards ------------------------------------
# Negative rewards are only counted once the per-episode step counter exceeds
# the window; the episode ends when the count exceeds the limit.
NEGATIVE_REWARD_TOLERANCE_WINDOW = 100
NEGATIVE_REWARD_TOLERANCE_LIMIT = 25

# --- Checkpointing ---------------------------------------------------------
# A checkpoint is written on episodes divisible by SAVE_MODEL_EVERY; the
# placeholder in the path is filled with the episode number.
SAVE_MODEL_EVERY = 25
SAVE_MODEL_PATH = './save/episode_{}.h5'

In [3]:
# Build the gym environment registered under the id 'CarRacing-v1'.
env = gym.make('CarRacing-v1')
# Instantiate the agent from the local `dqn` module with its default
# constructor arguments (its configuration lives in that module, not here).
agent = CarRacingAgent()

In [4]:
def process_state(state):
    """Turn an RGB frame into a grayscale float32 image scaled by 1/255.

    Args:
        state: RGB image array accepted by ``cv.cvtColor``.

    Returns:
        A float32 array holding the grayscale frame divided by 255.0.
    """
    grayscale = cv.cvtColor(state, cv.COLOR_RGB2GRAY)
    # Cast first so the division is carried out in float32.
    return grayscale.astype(np.float32) / 255.0

In [5]:
def deque_to_stack(deque):
    """Pack the buffered frames into one array with a leading axis of size 1.

    Args:
        deque: Sequence of equally shaped frames (the caller passes a
            ``collections.deque``; the parameter name shadows that class
            inside this function only).

    Returns:
        ``np.array(deque)`` with a new axis inserted at position 0, i.e.
        shape ``(1, len(deque), *frame_shape)``.
    """
    stacked_frames = np.array(deque)
    # Indexing with np.newaxis prepends the singleton axis; the frames stay
    # in their original order along axis 1.
    return stacked_frames[np.newaxis, ...]

In [6]:
# Main training loop: one iteration of the outer loop is one episode.
for episode in range(START_EPISODE, END_EPISODE + 1):
    print(f"Starting Episode: {episode}/{END_EPISODE}")
    initial_state = env.reset()
    initial_state = process_state(initial_state)

    total_reward = 0
    negative_reward_count = 0
    # Seed the frame stack with copies of the first frame so that a full
    # stack is available from the very first step.
    state_stack = deque(
        [initial_state] * agent.frame_stack_num,
        maxlen=agent.frame_stack_num,
    )
    time_frame_counter = 1
    done = False
    #agent.memory = agent.build_memory()

    while True:
        if RENDER:
            env.render()

        current_state_stack = deque_to_stack(state_stack)
        action = agent.act(current_state_stack)

        # Repeat the chosen action for SKIP_FRAMES + 1 environment steps and
        # accumulate the reward; only the last observed frame is kept.
        reward = 0
        for _ in range(SKIP_FRAMES + 1):
            next_state, step_reward, done, _info = env.step(action)
            reward += step_reward
            if done:
                break

        # Count (cumulatively, not consecutively) the steps with a negative
        # reward once the tolerance window has passed.
        if time_frame_counter > NEGATIVE_REWARD_TOLERANCE_WINDOW and reward < 0:
            negative_reward_count += 1

        # we can increase rewards here for specific actions to encourage the agent to learn them

        total_reward += reward

        next_state = process_state(next_state)
        state_stack.append(next_state)
        next_state_stack = deque_to_stack(state_stack)

        # NOTE: `done` is the environment's flag; transitions that end the
        # episode through the early-stop checks below are stored with
        # done=False.
        agent.remember(current_state_stack, action, reward, next_state_stack, done)

        # End the episode when the environment is done, when too many
        # negative-reward steps were counted, or as soon as the cumulative
        # reward is below zero (this last check can trigger on the very
        # first step if that step's reward is negative).
        if done or negative_reward_count > NEGATIVE_REWARD_TOLERANCE_LIMIT or total_reward < 0:
            # Fixed-point formatting: a bare ".2" means two significant
            # digits and switches to exponent notation for larger values.
            print(f"Episode: {episode}/{END_EPISODE}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.2f}")
            break

        if len(agent.memory) == BATCH_SIZE:
            print("Starting Training")

        if len(agent.memory) > BATCH_SIZE:
            agent.replay(BATCH_SIZE)
        time_frame_counter += 1

        if time_frame_counter % 50 == 0:
            print(f"Episode: {episode}/{END_EPISODE}, Iteration:{time_frame_counter}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.2f}")

    if episode % UPDATE_TARGET_MODEL_EVERY == 0:
        agent.update_target_model()

    if episode % SAVE_MODEL_EVERY == 0:
        save_path = SAVE_MODEL_PATH.format(episode)
        # Make sure the checkpoint directory exists so the first save does
        # not fail after many episodes of training.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        agent.save_model(save_path)

env.close()

Starting Episode: 1/1000
Track generation: 1159..1453 -> 294-tiles track
Episode: 1/1000, Total Reward: -0.15, Epsilon: 1.0
Starting Episode: 2/1000
Track generation: 1122..1404 -> 282-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1084..1359 -> 275-tiles track
Starting Training


2023-05-20 15:16:01.870250: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25267200 exceeds 10% of free system memory.
2023-05-20 15:16:01.870299: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25267200 exceeds 10% of free system memory.
2023-05-20 15:16:01.873292: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 27992448 exceeds 10% of free system memory.
2023-05-20 15:16:01.896842: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25267200 exceeds 10% of free system memory.
2023-05-20 15:16:01.896905: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25267200 exceeds 10% of free system memory.


epsilon: 1.0
epsilon: 0.9999
epsilon: 0.9998000100000001
epsilon: 0.9997000299990001
epsilon: 0.9996000599960002
epsilon: 0.9995000999900007
epsilon: 0.9994001499800017
epsilon: 0.9993002099650037
epsilon: 0.9992002799440072
epsilon: 0.9991003599160128
epsilon: 0.9990004498800211
epsilon: 0.9989005498350332
epsilon: 0.9988006597800497
epsilon: 0.9987007797140718
epsilon: 0.9986009096361004
epsilon: 0.9985010495451367
epsilon: 0.9984011994401822
epsilon: 0.9983013593202382
Episode: 2/1000, Total Reward: -0.15, Epsilon: 1.0
Starting Episode: 3/1000
Track generation: 1093..1370 -> 277-tiles track
epsilon: 0.9982015291843062
epsilon: 0.9981017090313877
epsilon: 0.9980018988604846
epsilon: 0.9979020986705985
epsilon: 0.9978023084607315
epsilon: 0.9977025282298854
epsilon: 0.9976027579770624
epsilon: 0.9975029977012647
epsilon: 0.9974032474014946
epsilon: 0.9973035070767544
epsilon: 0.9972037767260468
epsilon: 0.9971040563483742
epsilon: 0.9970043459427393
epsilon: 0.9969046455081451
epsilon

KeyboardInterrupt: 