In [None]:
from dqn import CarRacingAgent
import gym
from collections import deque
import cv2 as cv
import numpy as np

import warnings
warnings.filterwarnings('ignore')

### Hyperparameters

In [None]:
# Flag for on-screen rendering of the environment during training.
RENDER = False
# First and last episode numbers of the training loop (both inclusive).
START_EPISODE = 1
END_EPISODE = 150
# Each chosen action is repeated for SKIP_FRAMES + 1 environment steps.
SKIP_FRAMES = 2
# Batch size handed to agent.replay(); replay only starts once agent.memory
# holds more than this many entries.
BATCH_SIZE = 64
# The model is saved whenever the episode number is a multiple of this value.
SAVE_MODEL_EVERY = 25
# agent.update_target_model() is called whenever the episode number is a
# multiple of this value.
UPDATE_TARGET_MODEL_EVERY = 5
# Negative-reward steps are only counted once the per-episode step counter
# has passed this value.
NEGATIVE_REWARD_TOLERANCE_WINDOW = 100
# The episode is cut short once more than this many negative-reward steps
# have been counted.
NEGATIVE_REWARD_TOLERANCE_LIMIT = 25
# Path template for saved models; '{}' is filled with the episode number.
SAVE_MODEL_PATH = './save/episode_{}.h5'

In [None]:
# Build the environment and the agent.
# RENDER (config cell) selects gym's on-screen 'human' render mode; when it is
# False, render_mode=None is passed, which is gym.make's default (no rendering).
env = gym.make('CarRacing-v2', render_mode='human' if RENDER else None)
agent = CarRacingAgent()

In [None]:
def process_state(state):
    """Convert an RGB frame into a grayscale float32 frame divided by 255.

    Args:
        state: RGB image as accepted by ``cv.cvtColor`` with ``COLOR_RGB2GRAY``.

    Returns:
        A float32 array holding the grayscale pixel values divided by 255.0
        (i.e. in [0, 1] if the input is 8-bit -- TODO confirm input dtype).
    """
    grayscale = cv.cvtColor(state, cv.COLOR_RGB2GRAY)
    return grayscale.astype(np.float32) / 255.0

In [None]:
def deque_to_stack(deque):
    """Pack a deque of equally shaped 2-D frames into one channels-last array.

    The frames are first gathered into a single array whose axis 0 is the
    frame index; that axis is then moved to the end, so ``n`` frames of shape
    ``(a, b)`` come back as one array of shape ``(a, b, n)``.
    """
    stacked_frames = np.array(deque)
    # Axis order (1, 2, 0): frame index goes last (channels-last layout).
    return stacked_frames.transpose(1, 2, 0)

In [None]:
# Train the agent for episodes START_EPISODE..END_EPISODE (inclusive) and record
# each episode's total reward in rewards_over_episodes.
rewards_over_episodes = []
for episode in range(START_EPISODE, END_EPISODE + 1):
    print(f"Starting Episode: {episode}/{END_EPISODE}")
    initial_state, obs_info = env.reset()
    initial_state = process_state(initial_state)

    total_reward = 0
    negative_reward_count = 0
    # Fill the whole stack with the first frame so a full stack exists from
    # the very first step; maxlen makes later appends drop the oldest frame.
    state_stack = deque([initial_state] * agent.frame_stack_num, maxlen=agent.frame_stack_num)
    time_frame_counter = 1
    done = False
    # NOTE(review): agent.memory is replaced at the start of every episode;
    # presumably build_memory() returns an empty buffer, so no transitions
    # carry over between episodes -- confirm against dqn.py.
    agent.memory = agent.build_memory()
    while True:
        current_state_stack = deque_to_stack(state_stack)
        # The agent is given the stack with a leading axis of size 1 added.
        action = agent.act(np.expand_dims(current_state_stack, 0))

        # Repeat the chosen action for SKIP_FRAMES + 1 env steps, summing the
        # rewards, and stop early as soon as the environment ends the episode.
        reward = 0
        for _ in range(SKIP_FRAMES + 1):
            next_state, step_reward, terminated, truncated, step_info = env.step(action)
            reward += step_reward
            # The episode is over on termination OR on truncation (time limit);
            # ignoring `truncated` would keep stepping past the time limit.
            done = terminated or truncated
            if done:
                break

        # Negative-reward steps only count once the tolerance window has passed.
        if time_frame_counter > NEGATIVE_REWARD_TOLERANCE_WINDOW and reward < 0:
            negative_reward_count += 1

        # we can increase rewards here for specific actions to encourage the agent to learn them

        total_reward += reward

        next_state = process_state(next_state)
        state_stack.append(next_state)
        next_state_stack = deque_to_stack(state_stack)

        # Only a real termination is stored as the transition's done flag
        # (same value as before); truncation just ends the loop below.
        agent.remember(current_state_stack, action, reward, next_state_stack, terminated)

        # End the episode when the env is done, when too many negative-reward
        # steps were counted, or when the running total drops below zero.
        if done or negative_reward_count > NEGATIVE_REWARD_TOLERANCE_LIMIT or total_reward < 0:
            # '.2f' = two decimals; a bare '.2' means two significant digits
            # and prints rewards >= 10 in scientific notation.
            print(f"Episode: {episode}/{END_EPISODE}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.2}")
            rewards_over_episodes.append(total_reward)
            break

        if len(agent.memory) == BATCH_SIZE:
            print("Starting Training")

        if len(agent.memory) > BATCH_SIZE:
            agent.replay(BATCH_SIZE)
        time_frame_counter += 1

        if time_frame_counter % 50 == 0:
            print(f"Episode: {episode}/{END_EPISODE}, Iteration:{time_frame_counter}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.2}")

    if episode % UPDATE_TARGET_MODEL_EVERY == 0:
        agent.update_target_model()

    if episode % SAVE_MODEL_EVERY == 0:
        agent.save(SAVE_MODEL_PATH.format(episode))

env.close()

# Stats

In [None]:
# Plot the total reward recorded for each training episode.
import matplotlib.pyplot as plt

print(rewards_over_episodes)
# Label the x axis with the real episode numbers (they start at START_EPISODE,
# not 0) so the plot lines up with the episode numbers printed during training.
episode_numbers = list(range(START_EPISODE, START_EPISODE + len(rewards_over_episodes)))
fig, ax = plt.subplots()
ax.plot(episode_numbers, rewards_over_episodes)
ax.set(title="Total reward per episode", xlabel="Episode", ylabel="Total reward")
plt.show()
