# Playing Atari 2600 Pong with Reinforcement Learning and a Deep Q-Network

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from collections import deque
from tensorflow.keras.layers import Conv2D, Dense, Input, Flatten
import cv2 as cv
import gym
import gnwrapper
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import os

In [None]:
# Create the directory that receives the saved models.
# exist_ok=True makes this a single call with no check-then-create race.
os.makedirs('./models', exist_ok=True)

## Experience Replay

In [None]:
class ExperienceReplay:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions.

    Attributes:
        memorySize: maximum number of transitions kept.
        sampleSize: number of transitions returned by ``sample_experience``.
        replay_memory: ``deque`` holding the transitions, oldest first.
    """

    def __init__(self, replay_memory_size, sample_size):
        self.memorySize = replay_memory_size       # How many transitions to store
        self.sampleSize = sample_size              # How many transitions to pick per mini-batch
        # A deque built with maxlen drops its oldest element by itself when a
        # new one is appended to a full buffer, so no manual eviction is needed.
        self.replay_memory = deque(maxlen=replay_memory_size)

    def add_experience(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest one when the buffer is full.

        ``state`` and ``next_state`` are converted with ``np.array`` and
        ``done`` is stored as the integer 1 (truthy) or 0 (falsy).
        """
        transition = (
            np.array(state),
            action,
            reward,
            np.array(next_state),
            1 if done else 0,
        )
        self.replay_memory.append(transition)

    # taken from lectures' code
    def sample_experience(self):
        """Return ``sampleSize`` transitions drawn uniformly with replacement.

        Returns:
            Five numpy arrays ``(states, actions, rewards, next_states, dones)``,
            each stacking the corresponding field of the sampled transitions.

        Raises:
            ValueError: if the buffer holds no transitions.
        """
        stored = len(self.replay_memory)
        if stored == 0:
            raise ValueError("cannot sample from an empty replay memory")

        # np.random.choice(n, size=k) draws k indices from range(n) with replacement.
        picked = np.random.choice(stored, size=self.sampleSize)
        batch = [self.replay_memory[idx] for idx in picked]

        states, actions, rewards, next_states, dones = [
            np.array([transition[field_index] for transition in batch])
            for field_index in range(5)
        ]

        return states, actions, rewards, next_states, dones


## DQN Agent

In [None]:
class DQNAgent():
    """DQN agent: an online Q-network, a target Q-network and an epsilon-greedy policy.

    Attributes set by ``__init__``:
        n_actions, img_width, img_height, frame_stack: network dimensions.
        batch_size, lrn_rate: batch size given to the Keras ``Input`` and the
            Adam learning rate.
        gamma: discount factor (0.99).
        epsilon, epsilon_max, epsilon_min: current exploration rate and its bounds.
        epsilon_decrease: number of steps (a float) over which epsilon goes
            linearly from ``epsilon_max`` to ``epsilon_min``.
        frame_count: initialised to 0; not updated anywhere in this class.
        model_Q, model_Q_target: online and target networks.
        loss, opt: Huber loss and Adam optimizer used by ``training``.
    """

    def __init__(self, n_actions, frame_stack, img_width, img_height, lrn_rate, batch_size, epsilon):

        self.n_actions           = n_actions
        self.img_width           = img_width
        self.img_height          = img_height
        self.frame_stack         = frame_stack

        self.batch_size          = batch_size
        self.lrn_rate            = lrn_rate
        self.gamma               = 0.99          # discount factor

        self.epsilon             = epsilon
        self.epsilon_max         = 1.0
        self.epsilon_min         = 0.01
        # NOTE: this instance attribute has the same name as the legacy method
        # `epsilon_decrease` below and hides it on instances. Callers read it
        # as a number, so it stays; use `decay_epsilon` for the schedule.
        self.epsilon_decrease    = 100000.0

        self.frame_count         = 0

        self.model_Q             = self.build_model()
        self.model_Q_target      = self.build_model()

        self.loss                = tf.keras.losses.Huber()
        self.opt                 = tf.keras.optimizers.Adam(learning_rate=self.lrn_rate)

    def build_model(self):
        """Build and return one Q-network as a Keras functional model.

        Layers: Input -> Conv2D(16, kernel 8, stride 4, relu)
        -> Conv2D(32, kernel 4, stride 2, relu) -> Flatten -> Dense(256, relu)
        -> Dense(n_actions, linear).

        NOTE(review): the declared input shape is
        (frame_stack, img_width, img_height, 1) plus the batch axis, i.e. one
        axis more than Conv2D's usual 4-D input. Presumably Keras handles the
        frame axis as an additional leading axis; confirm this is intended.
        """
        inputs = Input(shape=(
                            self.frame_stack,
                            self.img_width,
                            self.img_height,
                            1
                            ),
                            batch_size = self.batch_size,
                            name = "input")

        conv_1 = Conv2D(16,
                        8,
                        strides = 4,
                        activation = "relu")(inputs)

        conv_2 = Conv2D(32,
                        4,
                        strides = 2,
                        activation = "relu")(conv_1)

        flatten = Flatten()(conv_2)

        dense = Dense(256, activation="relu")(flatten)

        output = Dense(self.n_actions, activation="linear", name= "output")(dense)

        model = keras.Model(inputs=inputs, outputs=output)

        return model

    def policy_action_selection(self, state):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``[0, n_actions)`` is returned; otherwise the argmax of the online
        network's output for ``state`` (a batch axis is added first).
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, self.n_actions)

        q_values = self.model_Q(tf.expand_dims(state, 0), training=False)
        return tf.argmax(q_values[0]).numpy()

    def decay_epsilon(self, frame_count, exploration_frames):
        """Advance the linear epsilon schedule by one step.

        While ``frame_count <= exploration_frames`` epsilon is lowered by
        ``(epsilon_max - epsilon_min) / epsilon_decrease`` and never drops
        below ``epsilon_min``; afterwards it is set to ``epsilon_min``.
        """
        if frame_count <= exploration_frames:
            step = (self.epsilon_max - self.epsilon_min) / self.epsilon_decrease
            self.epsilon = max(self.epsilon_min, self.epsilon - step)
        else:
            self.epsilon = self.epsilon_min

    def epsilon_decrease(self):
        """Legacy name kept for compatibility; prefer ``decay_epsilon``.

        Hidden on instances by the ``epsilon_decrease`` attribute assigned in
        ``__init__``, so it is only reachable through the class. It reads the
        module-level ``frame_count`` and ``exploration_frames``.
        """
        self.decay_epsilon(frame_count, exploration_frames)

    def training(self, experience_replay: "ExperienceReplay"):
        """Run one gradient step on a mini-batch sampled from ``experience_replay``."""
        states, actions, rewards, next_states, dones = experience_replay.sample_experience()

        target_q_values = self.model_Q_target(next_states)
        max_target_q_values = tf.reduce_max(target_q_values, axis=1)

        # if state is terminal: y_target = r_j
        # otherwise: y_target = r_j + gamma * max_target_q_value
        updated_target_q_values = rewards + (self.gamma * max_target_q_values * (1 - dones))

        # encode the actions
        actions_one_hot = tf.one_hot(actions, self.n_actions)

        with tf.GradientTape() as tape:
            q_values = self.model_Q(states)

            # Select the Q-value of the action taken. No keepdims: the result
            # has one value per sample, the same shape as the targets, so the
            # loss does not depend on any implicit shape reconciliation.
            q_values = tf.reduce_sum(q_values * actions_one_hot, axis=1)

            # loss between the target Q-values and the Q-values of the taken actions
            loss = self.loss(updated_target_q_values, q_values)

        # compute the gradient and apply it to model_Q only
        grads = tape.gradient(loss, self.model_Q.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model_Q.trainable_variables))

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.model_Q_target.set_weights(self.model_Q.get_weights())

## Main

### Environment setup
1. Load the environment
2. Use wrapper for image preprocessing
3. Use wrapper for frame stack

In [None]:
def create_environment(name, frame_skip, frame_stack, screen_size, grayscale_obs, show_env_spec):
    """Create a gym environment wrapped with Atari preprocessing and frame stacking.

    Args:
        name: environment id passed to ``gym.make``.
        frame_skip, screen_size, grayscale_obs: forwarded to
            ``gym.wrappers.AtariPreprocessing``.
        frame_stack: number of frames given to ``gym.wrappers.FrameStack``.
        show_env_spec: when truthy, reset the environment once, print the
            observation shape and display the last frame of the stack.

    Returns:
        The wrapped environment.
    """
    env = gym.make(name)
    env = gym.wrappers.AtariPreprocessing(
        env,
        frame_skip=frame_skip,
        screen_size=screen_size,
        grayscale_obs=grayscale_obs,
    )
    env = gym.wrappers.FrameStack(env, frame_stack)

    if show_env_spec:
        # assumes env.reset() returns only the observation - TODO confirm for the installed gym version
        obs = np.array(env.reset())
        print(obs.shape)
        # Index from the end so the preview also works when frame_stack < 4
        # (for frame_stack == 4 this is the same frame as obs[3]).
        plt.imshow(obs[-1])

    return env

In [None]:
# Settings handed to create_environment.
ENVIRONMENT = "PongNoFrameskip-v4"  # environment id for gym.make

screen_size = 84       # forwarded to AtariPreprocessing
grayscale_obs = True   # forwarded to AtariPreprocessing
frame_skip = 4         # forwarded to AtariPreprocessing
frame_stack = 4        # number of frames given to FrameStack
show_env_spec = True   # print the observation shape and preview a frame

In [None]:
# Build the wrapped training environment from the settings defined above.
env = create_environment(
    name=ENVIRONMENT,
    frame_skip=frame_skip,
    frame_stack=frame_stack,
    screen_size=screen_size,
    grayscale_obs=grayscale_obs,
    show_env_spec=show_env_spec,
)

In [None]:
# Look up the action names and the size of the discrete action space.
env_actions = env.unwrapped.get_action_meanings()
n_actions = env.action_space.n
print(env_actions)

## Training

1. Set hyperparameters
2. Create the DQN agent
3. Create the experience replay buffer
4. Start the training loop

In [None]:
# --- episode bookkeeping ---
episode_count = 0
episode_reward = 0
episode_max_steps = 10000   # upper bound of the per-episode step range
episode_best_reward = 0

# --- exploration schedule ---
frame_count = 0
exploration_frames = 100000  # epsilon is lowered while frame_count <= this value

# --- reward statistics ---
episode_reward_buffer = []   # rewards of the current averaging window
history_reward_buffer = []   # rewards of every episode so far
best_avg_reward = -100
avg_reward = 0

# --- update frequencies ---
C = 1000                     # target network update frequency, in frames
training_frequency = 4

In [None]:
# Instantiate the agent: square screen_size x screen_size frames,
# learning rate 1e-4, batch size 32, fully random policy at the start.
DQN = DQNAgent(
    n_actions=n_actions,
    frame_stack=frame_stack,
    img_width=screen_size,
    img_height=screen_size,
    lrn_rate=1e-4,
    batch_size=32,
    epsilon=1.0,
)

In [None]:
# Replay buffer: keeps up to 10000 transitions, samples mini-batches of 32.
ExpR = ExperienceReplay(replay_memory_size=10000, sample_size=32)

In [None]:
import traceback

# One format string shared by the console message and the log file line.
_EPISODE_LOG_FORMAT = "Episode {}, h_rwrd {:.2f}, avg_r: {:.2f}, episode reward {}, frame_count {}, eps= {:.4f}, ({})"


def _save_checkpoint():
    """Save the online and target networks to their fixed checkpoint paths."""
    DQN.model_Q.save('./models/model_Q')
    DQN.model_Q_target.save('./models/model_Q_target')


# Training runs until it is interrupted or an error occurs; in both cases the
# current networks are saved before the cell finishes.
try:
    while True:
        state = env.reset()
        episode_reward = 0

        for step in range(1, episode_max_steps):

            frame_count += 1

            # select action following policy
            action = DQN.policy_action_selection(state)

            # decrease epsilon value
            if frame_count <= exploration_frames:
                DQN.epsilon = DQN.epsilon - ((DQN.epsilon_max - DQN.epsilon_min) / DQN.epsilon_decrease)
            else:
                DQN.epsilon = DQN.epsilon_min

            # step the policy action
            next_state, reward, done, _ = env.step(action)

            # store transition in experience replay
            ExpR.add_experience(state, action, reward, next_state, done)

            # update state
            state = next_state

            # add reward
            episode_reward += reward

            # Start training once the buffer holds more than one mini-batch;
            # the threshold follows the buffer's own sample size.
            # NOTE(review): training_frequency is defined with the other
            # hyperparameters but is not applied here - training runs on every
            # step. Confirm whether that is intended.
            if len(ExpR.replay_memory) > ExpR.sampleSize:
                DQN.training(ExpR)

            if frame_count % C == 0:
                DQN.update_target_network()

            if done:
                break

        ### END OF EPISODE ROUTINE ###

        episode_count += 1

        episode_reward_buffer.append(episode_reward)
        history_reward_buffer.append(episode_reward)

        avg_reward = np.mean(episode_reward_buffer)
        avg_history_reward = np.mean(history_reward_buffer)

        # print update of the training at the end of the episode
        time_string = datetime.now().strftime("%H:%M:%S")
        episode_message = _EPISODE_LOG_FORMAT.format(
            episode_count, avg_history_reward, avg_reward, episode_reward,
            frame_count, DQN.epsilon, time_string)
        print(episode_message)

        with open("training_output.txt", "a") as output_file:
            output_file.write(episode_message + "\n")

        if episode_count % 100 == 0:
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

                time_string = datetime.now().strftime("%H:%M:%S")
                print("+++++++++ New Best Average Reward Model: {:.2f} at episode {}, ({}) +++++++++".format(best_avg_reward, episode_count, time_string))
                DQN.model_Q.save('./models/model_Q_' + str(best_avg_reward))
                print("Saved the model that holds the record...")
                _save_checkpoint()

            # the averaging window restarts every 100 episodes, record or not
            episode_reward_buffer = []

        # we save the models every 30 episodes
        if episode_count % 30 == 0:
            _save_checkpoint()

except Exception as e:
    print("Exception raised, saving the models...")
    print(e)
    # Print the full traceback so the failing call is visible, not only the
    # line number inside this cell.
    traceback.print_exc()
    print()
    _save_checkpoint()

except KeyboardInterrupt as ki:
    print("Execution terminated manually, saving the models...")
    print(ki)
    _save_checkpoint()

## Evaluation

1. Set the environment
2. Evaluate the model's performance over 10 episodes

(The evaluation was run on Google Colab.)

In [None]:
# Google Colab setup

#!(apt update && apt install xvfb ffmpeg python-opengl -y) > /dev/null 2>&1
#!pip install gym-notebook-wrapper > /dev/null 2>&1
#!pip install opencv-python > /dev/null 2>&1
#!pip install gym[atari] > /dev/null 2>&1
#!pip install lz4 > /dev/null 2>&1
#!wget http://www.atarimania.com/roms/Roms.rar
#!unrar e Roms.rar
#!python -m atari_py.import_roms .

In [None]:
# Settings handed to create_environment for the evaluation run.
ENVIRONMENT = "PongNoFrameskip-v4"  # environment id for gym.make

screen_size = 84       # forwarded to AtariPreprocessing
grayscale_obs = True   # forwarded to AtariPreprocessing
frame_skip = 4         # forwarded to AtariPreprocessing
frame_stack = 4        # number of frames given to FrameStack
show_env_spec = True   # print the observation shape and preview a frame

In [None]:
# Build the wrapped evaluation environment from the settings defined above.
env = create_environment(
    name=ENVIRONMENT,
    frame_skip=frame_skip,
    frame_stack=frame_stack,
    screen_size=screen_size,
    grayscale_obs=grayscale_obs,
    show_env_spec=show_env_spec,
)

In [None]:
# Load the trained network used for evaluation.
# NOTE(review): this is an absolute, machine-specific path and differs from the
# './models/model_Q' location written by the training cell - adjust as needed.
seed = 42
model_Q = keras.models.load_model('/Users/federicozanini/Desktop/2022_aas/models/18.59_dqn_gym_agent/model')

# Seed the environment before it is wrapped by the video recorder.
env.seed(seed)


def _record_every_episode(episode_id):
    """Video callback: request a recording for every episode."""
    return True


env = gnwrapper.Monitor(env, './videos', video_callable=_record_every_episode, force=True)

In [None]:
# Play up to n_episodes greedy episodes with the loaded model. The run stops
# at the first episode whose return reaches 20 and displays its video; the
# recordings of every other episode are deleted.
n_episodes = 10
returns = []
video_dir = './videos'   # same directory the recorder wrapper writes to

for i in range(n_episodes):
    ret = 0
    state = np.array(env.reset())
    done = False

    while not done:

        action_probs = model_Q(tf.expand_dims(state, 0), training=False)

        # predict the best action
        action = tf.argmax(action_probs[0]).numpy()

        # step action into the environment
        state_next, reward, done, _ = env.step(action)

        # set the new state as state for the next prediction
        state = np.array(state_next)

        # accumulate the reward
        ret += reward

    # Report and store the return before the early exit below, so the winning
    # episode is recorded in `returns` as well.
    print("Episode {} terminated with reward {}".format(i, ret))
    returns.append(ret)

    if ret >= 20.0:
        env.display()
        break

    # not a winning episode: discard its recording
    for f in os.listdir(video_dir):
        os.remove(os.path.join(video_dir, f))