In [17]:
import os
import csv 
import cv2
import gym
import random
import datetime
import numpy as np
import highway_env
from tqdm import tqdm
import tensorflow as tf
from collections import deque

In [2]:
class Config:
    """
    Training hyper-parameters and file-system locations shared by the notebook.

    :param BATCH_SIZE: Number of transitions sampled from the replay memory per training step
    :param GENERATIONS: Number of training episodes; also drives the epsilon decay schedule
    :param NO_ACTIONS_SIZE: Size of the action space (fast, slow, up, down, stable)
    :param IM_W, IM_H: Width/height used for the network input and the environment screen
    :param DISCOUNT_FACTOR: How much the agent cares about future rewards relative to immediate ones
    :param NODE_HISTORY_SIZE: Maximum number of transitions kept in the replay memory
    :param SAVE_PATH: Checkpoint path template; ``{}`` is filled with the training step counter
    :param SAVE_DIR: Directory part of SAVE_PATH
    :param LOG_PATH: Directory that receives the CSV logs
    :param TF_WRITER: TensorBoard summary writer
    """

    BATCH_SIZE = 64
    GENERATIONS = 3000
    NO_ACTIONS_SIZE = 5
    IM_W, IM_H = 250, 160
    DISCOUNT_FACTOR = 0.99
    NODE_HISTORY_SIZE = 15000
    SAVE_PATH = 'files/training/model_files/cp-{}.ckpt'
    SAVE_DIR = os.path.dirname(SAVE_PATH)
    LOG_PATH = os.path.join("files", "training", "my_logs")
    TF_WRITER = tf.summary.create_file_writer("files/training/my_logs/tf_board")

    @staticmethod
    def create_log_file():
        """Create the directory that stores the CSV logs (no-op if it already exists)."""
        # The previous version referenced an undefined name (log_save_path) and
        # raised NameError; exist_ok avoids the check-then-create race.
        os.makedirs(Config.LOG_PATH, exist_ok=True)

In [3]:
def build_network():
    """
    Assemble the convolutional Q-network.

    The input is a stack of 4 frames of size (Config.IM_H, Config.IM_W); the
    output is one linear unit per action (Config.NO_ACTIONS_SIZE). All hidden
    layers use ReLU. The original author reports roughly 9M parameters
    (not re-verified here). The model summary is printed as a side effect.

    :return: an uncompiled ``tf.keras.Model``
    """
    layers = tf.keras.layers

    def conv(filters):
        # Every convolution in this network is 3x3 with ReLU activation.
        return layers.Conv2D(filters, (3, 3), activation='relu')

    inputs = layers.Input((Config.IM_H, Config.IM_W, 4))
    features = inputs

    # Three convolutional stages, each closed by 2x2 max-pooling and dropout.
    for stage_filters in ((32, 32, 64), (64, 128), (128, 256)):
        for filters in stage_filters:
            features = conv(filters)(features)
        features = layers.MaxPool2D((2, 2))(features)
        features = layers.Dropout(0.2)(features)

    # Final convolution, pooled and flattened for the dense head.
    features = conv(512)(features)
    features = layers.MaxPool2D((2, 2))(features)
    features = layers.Flatten()(features)

    # Dense head ending in one linear output per action.
    features = layers.Dense(512, activation='relu')(features)
    features = layers.Dropout(0.1)(features)
    features = layers.Dense(512, activation='relu')(features)
    q_values = layers.Dense(Config.NO_ACTIONS_SIZE, activation='linear')(features)

    network = tf.keras.Model(inputs=inputs, outputs=q_values)
    network.summary()
    return network

In [8]:
class DeepQnetwork:
    """
    Shared agent state, stored as class attributes (the notebook uses the class
    object itself rather than an instance).

    :param counter: Number of training steps performed so far
    :param epsilon: Current exploration rate for the epsilon-greedy policy
    :param min_epsilon: Lower bound that epsilon is clamped to while decaying
    :param decay: Amount subtracted from epsilon after each decaying episode
    :param train_network: Network whose weights are optimised
    :param predict_network: Network used to pick actions and to evaluate next states
    :param optimizer: Optimizer applied to train_network
    """
    counter = 0
    epsilon, min_epsilon = 1, 0.1
    # Init predicting and training network
    train_network = build_network()
    predict_network = build_network()
    # Both networks are built with independent random initialisations; start the
    # predicting network from the training network's weights so the first
    # bootstrapped targets are not produced by an unrelated network.
    predict_network.set_weights(train_network.get_weights())
    # Linear decay over the first half of training; max(..., 1) prevents a
    # division by zero when GENERATIONS is very small.
    decay = epsilon / max((Config.GENERATIONS // 2) - 1, 1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 160, 250, 4)]     0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 158, 248, 32)      1184      
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 156, 246, 32)      9248      
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 154, 244, 64)      18496     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 77, 122, 64)       0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 77, 122, 64)       0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 75, 120, 64)       3692

In [5]:
# Bind the class object itself (not an instance): all agent state lives in
# class attributes and is read/written through this name.
dqn = DeepQnetwork
# Replay memory: a bounded deque, so the oldest transitions are dropped once
# Config.NODE_HISTORY_SIZE entries are stored.
previous_memory = deque(maxlen=Config.NODE_HISTORY_SIZE) 
# Make environment
env = gym.make('highway-v0')

#### Set Environment Properties

In [9]:
# highway-env is parametrised through a configuration dictionary that covers
# both the observation type and the rendering settings.
configr = dict(
    offscreen_rendering=True,
    observation=dict(
        type="GrayscaleObservation",
        weights=[0.9, 0.1, 0.5],  # weights for RGB conversion
        stack_size=4,  # matches the 4 input channels of the network
        observation_shape=(Config.IM_W, Config.IM_H),
    ),
    screen_width=Config.IM_W,
    screen_height=Config.IM_H,
    scaling=5.75,
    lanes_count=4,
)
env.configure(configr)

#### Utility Functions for Training

In [13]:
def get_prediction(states):
    """
    Evaluate the predicting network on a batch of stacked frames.

    :param states: array whose first axis is the batch; the remaining elements
                   are reshaped to (Config.IM_H, Config.IM_W, 4)
    :return: output of ``dqn.predict_network`` for the scaled batch
    """
    batch_shape = (states.shape[0], Config.IM_H, Config.IM_W, 4)
    # NOTE(review): np.reshape only reinterprets the buffer, it does not move
    # axes. If the environment delivers (stack, width, height) frames a
    # transpose would be needed instead -- confirm against the observation layout.
    scaled_states = np.reshape(states, batch_shape) / 255  # pixel values divided by 255
    return dqn.predict_network(scaled_states)


def get_action(state):
    """
    Epsilon-greedy action selection.

    With probability governed by ``dqn.epsilon`` a uniformly random action is
    returned (exploration); otherwise the action with the highest value
    predicted by the predicting network is returned (exploitation).

    :param state: a single observation (no batch axis)
    :return: index of the chosen action
    """
    exploit = np.random.random() > dqn.epsilon
    if not exploit:
        return np.random.randint(0, Config.NO_ACTIONS_SIZE)

    # The network expects a batch, so add a leading axis of size one.
    q_values = get_prediction(np.expand_dims(state, axis=0))
    return np.argmax(q_values)

def save_log(step, quantity, filename):
    """
    Append one ``step,quantity`` row to a CSV log inside ``Config.LOG_PATH``.

    :param step: step or episode index written to the first column
    :param quantity: value written to the second column
    :param filename: name of the CSV file inside the log directory
    """
    # Nothing else in the notebook creates the log directory, so make sure it
    # exists; otherwise the very first call fails with FileNotFoundError.
    os.makedirs(Config.LOG_PATH, exist_ok=True)
    # newline='' is what the csv module requires so that no blank rows are
    # emitted on platforms that translate line endings.
    with open(os.path.join(Config.LOG_PATH, filename), 'a', newline='') as log_file:
        csv.writer(log_file, delimiter=',').writerow([step, quantity])


def predict(states):
    """
    Evaluate the training network on a batch of stacked frames.

    The result holds one row per state and one value per action. These are the
    network's linear outputs, not normalised probabilities,
    e.g. [0.85679, -0.00060, -0.00030, -0.00112, -0.00044].

    :param states: array whose first axis is the batch
    :return: output of ``dqn.train_network`` for the scaled batch
    """
    target_shape = (states.shape[0], Config.IM_H, Config.IM_W, 4)
    # NOTE(review): reshape does not reorder axes -- confirm the observation
    # layout really matches (height, width, stack).
    network_input = np.reshape(states, target_shape) / 255
    return dqn.train_network(network_input)


def update_q_value(rewards, current_q_list, next_q_list, actions, done):
    """
    Build the regression targets for one training batch.

    For every transition, the Q-value of the action that was taken is replaced
    by ``reward + (1 - done) * DISCOUNT_FACTOR * max(next Q-values)``; the
    values of all other actions are left as currently predicted.

    :param rewards: per-transition rewards
    :param current_q_list: current Q-values; must expose ``.numpy()``
    :param next_q_list: Q-values of the next states, one row per transition
    :param actions: index of the action taken in each transition
    :param done: 1 for terminal transitions, 0 otherwise
    :return: array of target Q-values with the same shape as current_q_list
    """
    targets = current_q_list.numpy()
    best_next_q = np.max(next_q_list, axis=1)
    # (1 - done) switches the bootstrap term off for terminal transitions.
    bootstrap = (np.ones(done.shape) - done) * Config.DISCOUNT_FACTOR * best_next_q
    td_targets = rewards + bootstrap
    # Overwrite only the entry of the action actually taken in each row.
    rows = np.arange(len(targets))
    targets[rows, actions] = td_targets
    return targets


def loss_f(ground_truth, prediction):
    """
    Mean squared error between the target values and the network prediction.

    :param ground_truth: target values
    :param prediction: values predicted by the network
    :return: result of ``tf.keras.losses.mean_squared_error``
    """
    return tf.keras.losses.mean_squared_error(ground_truth, prediction)


@tf.function
def train_step(states, actions):
    """
    Run one optimisation step on the training network.

    :param states: batch of network inputs
    :param actions: regression targets for the network output (despite the
                    name, train_network passes the updated Q-values here)
    :return: the loss returned by ``loss_f`` for this batch
    """
    network = dqn.train_network
    with tf.GradientTape() as grad_tape:
        q_estimates = network(states)
        loss = loss_f(actions, q_estimates)

    weights = network.trainable_variables
    # Clip every gradient tensor to an L2 norm of at most 10 before applying it.
    clipped = [tf.clip_by_norm(grad, 10) for grad in grad_tape.gradient(loss, weights)]
    dqn.optimizer.apply_gradients(zip(clipped, weights))
    return loss


def save_weigths(dqn):
    """
    Synchronise the predicting network with the training network and write a
    checkpoint of the training network.

    :param dqn: object exposing ``train_network``, ``predict_network`` and ``counter``
    """
    # Copy train -> predict. The previous version zipped the training network
    # with itself, so the predicting network was never updated.
    for train_var, pred_var in zip(dqn.train_network.trainable_variables,
                                   dqn.predict_network.trainable_variables):
        pred_var.assign(train_var)
    # Make sure the checkpoint directory exists before writing into it.
    if Config.SAVE_DIR:
        os.makedirs(Config.SAVE_DIR, exist_ok=True)
    dqn.train_network.save_weights(Config.SAVE_PATH.format(dqn.counter))
    print("artis oldu")

#### Utility Functions for the Network

In [14]:
def get_reward(observation, info):
    """
    Custom reward derived from the observation and the step info.

    Returns -1 when ``info['crashed']`` is truthy. Otherwise returns 0 when the
    sum of column 1 over all rows except the first is positive, and 5 when it
    is not. (The training loop in this notebook uses the reward returned by
    the environment rather than this function.)

    :param observation: 2-D indexable array
    :param info: mapping with a ``'crashed'`` entry
    :return: -1, 0 or 5
    """
    if info['crashed']:
        return -1
    column_is_occupied = np.sum(observation[1:, 1]) > 0
    return 0 if column_is_occupied else 5


def get_batch(sampling_size):
    """
    Draw a random mini-batch from the replay memory.

    :param sampling_size: number of transitions to sample (without replacement)
    :return: list ``[states, actions, next_states, rewards, done]`` where each
             entry is an array with one element per sampled transition
    """
    sampled = random.sample(previous_memory, sampling_size)
    # Transpose the list of 5-element transitions into 5 per-field tuples.
    states, chosen_actions, following_states, reward_values, done_flags = zip(*sampled)
    return [
        np.stack(states),
        np.array(chosen_actions),
        np.stack(following_states),
        np.array(reward_values),
        np.array(done_flags),
    ]


def train_network():
    """
    Perform one training step on a mini-batch sampled from the replay memory.

    Increments ``dqn.counter``, computes the target Q-values from the training
    network (current states) and the predicting network (next states), runs
    one optimisation step and logs the mean loss to TensorBoard and to
    ``loss.csv``.
    """
    batch = get_batch(Config.BATCH_SIZE)
    dqn.counter += 1
    states, taken_actions, following_states, batch_rewards, terminal_flags = batch

    online_qs = predict(states)
    next_state_qs = get_prediction(following_states)
    q_targets = update_q_value(batch_rewards, online_qs, next_state_qs, taken_actions, terminal_flags)

    # Same reshape and /255 scaling as predict() applies to its input.
    network_input = np.reshape(states, (Config.BATCH_SIZE, Config.IM_H, Config.IM_W, 4)) / 255

    loss = train_step(network_input, q_targets)
    mean_loss = np.mean(loss)

    with Config.TF_WRITER.as_default():
        tf.summary.scalar("loss", data=mean_loss, step=dqn.counter)

    save_log(dqn.counter, mean_loss, "loss.csv")


#### Run - Train 

In [None]:
 def run(episodes, train_frequency=2):
     """
     Train the agent for the given number of episodes.

     Each step is stored in the replay memory; every tenth step of an episode
     (starting with the first) the training network is updated once, provided
     the memory holds at least one batch. After each episode the average
     reward and epsilon are logged, epsilon is decayed during the first half
     of training, and every tenth episode the predicting network is refreshed
     and a checkpoint is written.

     :param episodes: number of episodes to play
     :param train_frequency: currently unused; kept so existing calls keep working
     """
     for episode in tqdm(range(episodes)):

         # Start environment reader
         observation = env.reset()
         reward_history = []
         step_counter = 0

         # Loop until the environment reports the end of the episode
         while True:
             # Our predicting network will predict actions for the given state
             action = get_action(observation)
             next_observation, reward, done, info = env.step(action)
             # Save the reward obtained by taking the action
             reward_history.append(reward)
             # Collect the transition to be saved in memory
             previous_memory.append([observation, action, next_observation, reward, 1 if done else 0])
             observation = next_observation

             # Every tenth iteration, the training network trains on the data
             # gathered in the memory, to avoid training too often.
             if step_counter % 10 == 0 and len(previous_memory) >= Config.BATCH_SIZE:
                 train_network()
             # The counter was previously never advanced, which made the check
             # above true on every single step.
             step_counter += 1
             if done:
                 break

         # Save result to analyze and show on tensorboard
         mean_reward = np.mean(reward_history)
         with Config.TF_WRITER.as_default():
             tf.summary.scalar("Episodic Average Rewards", data=mean_reward, step=episode)
             tf.summary.scalar("Epsilon", data=dqn.epsilon, step=episode)
         save_log(episode, mean_reward, "episodic_reward.csv")

         # Linear epsilon decay during the first half of training, clamped at min_epsilon
         if Config.GENERATIONS // 2 >= episode >= 1:
             dqn.epsilon = max(dqn.epsilon - dqn.decay, dqn.min_epsilon)

         # Every 10th episode, update the predicting network from the trained
         # model and save a checkpoint. save_weigths requires the agent as its
         # argument; calling it without one raised TypeError.
         if episode > 9 and episode % 10 == 0:
             save_weigths(dqn)

In [None]:
# Run training
# Plays Config.GENERATIONS episodes; the same constant also drives the epsilon decay schedule.
run(Config.GENERATIONS)

#### Run - Test 

In [19]:
# Create video capture to record game video
# NOTE(review): this opens capture device 0, but no frame is ever read from it
# in this notebook; it is only released again by test(). Kept because test()
# references `cap`.
cap = cv2.VideoCapture(0)
# The writer does not create missing directories, so make sure the output
# folder exists before opening the file.
os.makedirs("tmp", exist_ok=True)
# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(os.path.join("tmp", 'output2.avi'),fourcc, 20.0, (Config.IM_W, Config.IM_H))

In [23]:
def test(episodes, train_frequency=2):
    """
    Play the given number of episodes with the current policy and record every
    rendered frame through the module-level ``out`` video writer.

    Actions come from ``get_action``, so the current ``dqn.epsilon`` still
    controls how often a random action is taken. The capture device and the
    video writer are released when the function finishes, also on error.

    :param episodes: number of episodes to play
    :param train_frequency: unused; kept so existing calls keep working
    """
    try:
        for episode in tqdm(range(episodes)):
            # Start environment reader
            observation = env.reset()

            # Loop until the environment reports the end of the episode
            while True:
                # Our predicting network predicts actions for the given state
                action = get_action(observation)
                next_observation, reward, done, info = env.step(action)
                # Get frame to show and save video
                frame = env.render(mode='rgb_array')
                # Store the transition with the same 5-field layout used during
                # training; 4-field entries would break get_batch's unpacking.
                previous_memory.append([observation, action, next_observation, reward, 1 if done else 0])
                observation = next_observation

                # The frame is rendered as RGB while OpenCV writers expect BGR,
                # so convert before writing to keep the colours correct.
                out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

                # Break loop if the episode is completed
                if done:
                    break
    finally:
        # Always release the capture device and finalise the video file.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

In [None]:
# Run test
# Plays 100 episodes and writes every rendered frame to the `out` video writer.
test(episodes=100)