In [1]:
import gym
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
from tensorflow.keras.initializers import VarianceScaling
import numpy as np

import pandas as pd
import time
from datetime import datetime
from collections import deque

import sys
import os

import altair as alt

import atari_wrappers as atari

#import numba
#from numba import jit

In [2]:
# Seed NumPy's global RNG and TensorFlow's global RNG with the same value.
SEED = 10
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Enable on-demand memory growth for every visible GPU. With no GPU present
# the loop body never runs, so no explicit emptiness check is needed.
# A RuntimeError raised by TensorFlow here is printed instead of propagated.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)

In [4]:

def make_env(env_name, fire=True, frames_num=2, noop_num=30, skip_frames=True):
    """Create a gym environment and wrap it with the Atari preprocessing chain.

    Wrappers are applied in this order (innermost first):
    ``MaxAndSkipEnv`` (optional), ``FireResetEnv`` (optional),
    ``NoopResetEnv``, ``WarpFrame``, ``FrameStack``.
    The wrappers come from the local ``atari_wrappers`` module; the one-line
    descriptions below follow the original author's notes.

    Args:
        env_name: Environment id passed to ``gym.make``.
        fire: When true, add ``FireResetEnv`` (fire at the beginning).
        frames_num: Number of frames handed to ``FrameStack``.
        noop_num: Value passed as ``noop_max`` to ``NoopResetEnv``.
        skip_frames: When true, add ``MaxAndSkipEnv`` (return only every
            skip-th frame).

    Returns:
        The fully wrapped environment.
    """
    wrapped = gym.make(env_name)

    if skip_frames:
        wrapped = atari.MaxAndSkipEnv(wrapped)

    if fire:
        wrapped = atari.FireResetEnv(wrapped)

    wrapped = atari.NoopResetEnv(wrapped, noop_max=noop_num)
    warped = atari.WarpFrame(wrapped)  # image reshaping step

    return atari.FrameStack(warped, frames_num)

In [5]:

class QNet(Model):
    """Q-value network: four Conv2D layers, a Flatten, dense hidden layers and a dense output.

    All layers use the Keras default initializers.

    Args:
        h_layers: Number of dense hidden layers to create.
        h_size: Sequence of hidden-layer widths; entries ``0 .. h_layers-1``
            are used (an ``IndexError`` is raised if it is shorter).
        o_size: Width of the output layer (one value per action in this file).
        h_activation: Activation applied by every dense hidden layer.
        o_activation: Activation of the output layer (``None`` means linear).
    """

    # Kept so that ``QNet.my_strategy`` stays available to existing code.
    # NOTE(review): the previous class body wrapped the method *definitions* in
    # ``my_strategy.scope()``. No variables are created while a class body is
    # executed, so that scope never affected variable placement. To really
    # distribute training, instantiate and compile the model inside
    # ``with QNet.my_strategy.scope():`` at the call site.
    my_strategy = tf.distribute.MirroredStrategy()

    def __init__(self, h_layers, h_size, o_size, h_activation=tf.nn.relu, o_activation=None):
        # The constructor is plain Python: it must not be wrapped in
        # ``tf.function`` because it only builds layer objects and assigns
        # attributes, which is Python-side state rather than graph computation.
        super(QNet, self).__init__()

        self.conv_layer1 = Conv2D(filters=32, kernel_size=8, strides=4, padding='valid', activation='relu')
        self.conv_layer2 = Conv2D(filters=64, kernel_size=4, strides=2, padding='valid', activation='relu')
        self.conv_layer3 = Conv2D(filters=64, kernel_size=3, strides=1, padding='valid', activation='relu')
        self.conv_layer4 = Conv2D(filters=1024, kernel_size=7, strides=1, padding='valid', activation='relu')

        self.flatten_layer = Flatten()

        self.hidden_layers = [Dense(h_size[i], activation=h_activation) for i in range(h_layers)]
        self.output_layer = Dense(o_size, activation=o_activation)

    @tf.function
    def call(self, input_data):
        """Run the forward pass and return the output layer's activations."""
        x = input_data

        x = self.conv_layer1(x)
        x = self.conv_layer2(x)
        x = self.conv_layer3(x)
        x = self.conv_layer4(x)

        x = self.flatten_layer(x)

        for layer in self.hidden_layers:
            x = layer(x)

        return self.output_layer(x)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [6]:
def scale_frames(frames):
    """Convert ``frames`` to a float32 array and divide every value by 255.

    Args:
        frames: Anything ``np.array`` can convert (nested lists, arrays, ...).

    Returns:
        A new ``np.float32`` array with the same shape as the converted input.
    """
    as_float = np.array(frames, dtype=np.float32)
    return as_float / 255.0

In [7]:
class ExperienceBuffer():
    """Fixed-capacity replay memory made of five parallel deques.

    Each transition is split across ``obs_buf``, ``rew_buf``, ``act_buf``,
    ``next_obs_buf`` and ``done_buf``. Every deque has ``maxlen=buffer_size``,
    so once full, appending a transition drops the oldest one.
    """

    def __init__(self, buffer_size):
        """Create five empty deques, each bounded to ``buffer_size`` items."""
        (self.obs_buf,
         self.rew_buf,
         self.act_buf,
         self.next_obs_buf,
         self.done_buf) = (deque(maxlen=buffer_size) for _ in range(5))

    def add(self, obs, rew, act, next_obs, done):
        """Append one transition; each field goes to its own deque."""
        stores = (self.obs_buf, self.rew_buf, self.act_buf, self.next_obs_buf, self.done_buf)
        fields = (obs, rew, act, next_obs, done)
        for store, field in zip(stores, fields):
            store.append(field)

    def sample_minibatch(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            ``(obs, rew, act, next_obs, done)`` where ``obs`` and ``next_obs``
            have been passed through ``scale_frames`` and the other three
            entries are plain Python lists.
        """
        picks = np.random.randint(len(self.obs_buf), size=batch_size)

        def gather(store):
            # Same indices are used for every deque so the fields stay aligned.
            return [store[idx] for idx in picks]

        return (
            scale_frames(gather(self.obs_buf)),
            gather(self.rew_buf),
            gather(self.act_buf),
            scale_frames(gather(self.next_obs_buf)),
            gather(self.done_buf),
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.obs_buf)
        

In [8]:
def current_milli_time():
    """Return the current ``time.time()`` value in milliseconds, rounded to an int."""
    return int(round(time.time() * 1000))

In [9]:
def update_target(target_qv,online_qv):
    """Copy the weights of ``online_qv`` into ``target_qv``.

    Both arguments only need ``get_weights()`` / ``set_weights()`` methods.
    Returns ``None``.
    """
    online_weights = online_qv.get_weights()
    target_qv.set_weights(online_weights)

In [10]:
def e_greedy(action_values,epsilon=0.1):
    """Pick an action index with an epsilon-greedy rule.

    One uniform sample in [0, 1) is drawn from NumPy's global RNG. If it is
    below ``epsilon`` a uniformly random index is returned, otherwise the
    index of the largest entry of ``action_values``.
    """
    explore = np.random.uniform(0,1) < epsilon

    if not explore:
        return np.argmax(action_values)

    return np.random.randint(len(action_values))

In [29]:
# return the target value for each item in the mini_batch, that will be used in the loss function

def q_target_values(mini_batch_rewards, mini_batch_done, action_values, 
                    gamma, DQN_variation, mb_next_obs, act_dim,online_qv): 
    """Compute the regression target for every transition of a mini-batch.

    For a terminal transition the target is the reward alone. Otherwise it is
    ``reward + gamma * V(next_state)`` where ``V`` depends on the variant:

    * ``'DQN'``  - the maximum of the row of ``action_values``.
    * ``'DDQN'`` - the entry of ``action_values`` selected by the arg-max of
      ``online_qv(mb_next_obs)`` for the same row.

    Args:
        mini_batch_rewards: Sequence of rewards, one per transition.
        mini_batch_done: Sequence of terminal flags, one per transition.
        action_values: 2-D array of next-state action values, one row per
            transition (the caller in this file passes the target network's
            predictions).
        gamma: Discount factor.
        DQN_variation: ``'DQN'`` or ``'DDQN'``.
        mb_next_obs: Next observations; only used to query ``online_qv`` in
            the ``'DDQN'`` variant.
        act_dim: Unused; kept so existing callers keep working.
        online_qv: Callable returning action values for ``mb_next_obs``; only
            called in the ``'DDQN'`` variant.

    Returns:
        1-D ``np.ndarray`` of targets, one per transition.

    Raises:
        ValueError: For an unknown ``DQN_variation`` or mismatched batch sizes.
    """
    target_net_values = np.asarray(action_values)

    if DQN_variation == 'DQN':
        # Previously this path crashed with a NameError because the DDQN-only
        # variable was referenced unconditionally.
        next_values = np.max(target_net_values, axis=1)

    elif DQN_variation == 'DDQN':
        # Action selection by the online network, evaluation by the values
        # passed in ``action_values``.
        online_values = np.asarray(online_qv(mb_next_obs))
        best_actions = np.argmax(online_values, axis=1)
        next_values = target_net_values[np.arange(len(best_actions)), best_actions]

    else:
        raise ValueError("DQN_variation must be 'DQN' or 'DDQN', got %r" % (DQN_variation,))

    rewards = np.asarray(mini_batch_rewards, dtype=np.float64)
    done = np.asarray(mini_batch_done, dtype=bool)

    if not (rewards.shape == done.shape == next_values.shape):
        raise ValueError('rewards, done flags and action values disagree on batch size: %s, %s, %s'
                         % (rewards.shape, done.shape, next_values.shape))

    # np.where keeps terminal targets exactly equal to the reward, even if the
    # bootstrapped value for that row is not finite.
    return np.where(done, rewards, rewards + gamma * next_values)

In [30]:
def test_agent(env_test, online_qv, num_games=20):
    """Play ``num_games`` episodes with an epsilon-greedy policy (epsilon 0.05).

    Args:
        env_test: Environment exposing ``reset()`` and a 4-tuple ``step()``.
        online_qv: Model whose ``predict`` returns action values for a batch
            of scaled observations.
        num_games: Number of episodes to play.

    Returns:
        List with the summed reward of each episode, in play order.
    """
    episode_returns = []

    for _ in range(num_games):

        state = env_test.reset()
        episode_return = 0
        finished = False

        while not finished:

            # The model expects a batch, so wrap the single scaled observation.
            batch = np.array([scale_frames(state)])
            q_row = online_qv.predict(batch)[0]

            chosen = e_greedy(q_row, epsilon=0.05)

            state, step_reward, finished, _ = env_test.step(chosen)
            episode_return += step_reward

        episode_returns.append(episode_return)

    return episode_returns

In [31]:
def DQN(env_name, hidden_layers =1, hidden_size=[32], alpha=1e-2, num_epochs=2000, buffer_size=100000, gamma=0.99,
        update_target_net=1000, batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20,
        start_exp=1, end_exp=0.1, exp_steps=100000, render_cycle=100, DQN_variation = 'DQN'):
    """Train a DQN or Double-DQN agent and periodically evaluate it.

    One "epoch" is one full episode of the training environment. Transitions
    are stored in an ``ExperienceBuffer``. Once the buffer holds more than
    ``min_buffer_size`` transitions, the online network is trained every
    ``update_freq`` steps on a random mini-batch with a Huber loss, and the
    target network is synchronised every ``update_target_net`` steps.

    Args:
        env_name: Environment id passed to ``make_env``.
        hidden_layers: Number of dense hidden layers of each ``QNet``.
        hidden_size: Widths of those hidden layers (not modified here).
        alpha: Learning rate of the Adam optimizers.
        num_epochs: Number of training episodes.
        buffer_size: Capacity of the replay buffer.
        gamma: Discount factor passed to ``q_target_values``.
        update_target_net: Steps between target-network synchronisations.
        batch_size: Mini-batch size.
        update_freq: Steps between gradient updates.
        frames_num: Number of stacked frames requested from ``make_env``.
        min_buffer_size: Buffer size that must be exceeded before training.
        test_frequency: Every this many epochs, run ``test_agent`` for 10
            games, print a progress line and save the online weights.
        start_exp: Initial epsilon.
        end_exp: Epsilon is decremented each step while it is above this value.
        exp_steps: Number of steps over which epsilon is annealed linearly.
        render_cycle: Every this many epochs, rendering is switched on until
            the next episode end.
        DQN_variation: ``'DQN'`` or ``'DDQN'``.

    Returns:
        ``(env_test, online_qv, mean_reward_test, steps_test)``: the monitored
        test environment (left open for the caller), the trained online
        network, the mean test reward of every evaluation, and the step count
        at which each evaluation happened.

    Raises:
        ValueError: If ``DQN_variation`` is neither ``'DQN'`` nor ``'DDQN'``.
    """
    # Fail before any environment or video directory is created.
    if DQN_variation not in ('DQN', 'DDQN'):
        raise ValueError("DQN_variation must be 'DQN' or 'DDQN', got %r" % (DQN_variation,))

    env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
    env_test = make_env(env_name,frames_num=frames_num, skip_frames=True, noop_num=20)

    env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()), force=True,
                                    video_callable=lambda x: x%20==0)

    act_dim = env.action_space.n

    target_qv = QNet(h_layers=hidden_layers, h_size=hidden_size, o_size=act_dim)
    online_qv = QNet(h_layers=hidden_layers, h_size=hidden_size, o_size=act_dim)

    obs = env.reset()
    obs = scale_frames(obs)

    # One forward pass per network so the weights exist before they are copied.
    _ = target_qv.predict(np.array([obs]))
    _ = online_qv.predict(np.array([obs]))

    online_qv.compile(optimizer = tf.keras.optimizers.Adam(alpha),
                      loss = tf.keras.losses.MeanSquaredError())

    target_qv.compile(optimizer = tf.keras.optimizers.Adam(alpha),
                     loss = tf.keras.losses.MeanSquaredError())

    update_target(target_qv,online_qv)

    # The loss object is stateless, so it is created once instead of per step.
    huber_loss = tf.keras.losses.Huber()

    render_the_game = False
    step_count = 0
    last_update_loss = []
    mean_loss = []
    mean_reward_test = []
    steps_test = []
    ep_time = current_milli_time()
    batch_rew = []
    old_step_count = 0

    buffer = ExperienceBuffer(buffer_size)
    epsilon = start_exp
    eps_decay = (start_exp - end_exp)/exp_steps

    obs = env.reset()

    try:
        for epoch in range(num_epochs):

            game_reward = 0
            done = False

            while not done:

                obs_process = np.array([scale_frames(obs)])
                action_values = online_qv.predict(obs_process)[0]

                action = e_greedy(action_values, epsilon)
                next_obs, reward, done, _ = env.step(action)

                if render_the_game:
                    env.render()

                # Unscaled observations are stored; scaling happens at sampling time.
                buffer.add(obs, reward, action, next_obs, done)

                obs = next_obs
                game_reward += reward
                step_count += 1

                if epsilon > end_exp:
                    epsilon -= eps_decay

                if len(buffer) > min_buffer_size and (step_count % update_freq == 0):

                    mb_obs, mb_reward, mb_action, mb_next_obs, mb_done = buffer.sample_minibatch(batch_size)
                    mb_target_actions = target_qv.predict(mb_next_obs)

                    # Targets and the action mask are constants with respect to
                    # the online weights, so they are built outside the tape.
                    mini_batch_y = q_target_values(mb_reward, mb_done,
                                                   mb_target_actions,
                                                   gamma, DQN_variation, mb_next_obs, act_dim, online_qv)
                    one_hot_actions = tf.keras.utils.to_categorical(mb_action,act_dim,dtype=np.float32)

                    with tf.GradientTape() as tape:
                        q_values = online_qv(mb_obs)
                        # Q-value of the action that was actually taken.
                        Q = tf.reduce_sum(tf.multiply(q_values,one_hot_actions),axis=1)
                        loss = huber_loss(mini_batch_y, Q)

                    # Differentiate and step after leaving the tape context so
                    # these operations are not recorded themselves.
                    model_gradients = tape.gradient(loss, online_qv.trainable_variables)
                    online_qv.optimizer.apply_gradients(zip(model_gradients, online_qv.trainable_variables))

                    # Store a plain float rather than keeping tensors alive.
                    last_update_loss.append(float(loss))

                # Target update
                if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0):

                    # No update may have happened since the last sync.
                    mean_loss.append(np.mean(last_update_loss) if last_update_loss else np.nan)
                    last_update_loss = []

                    update_target(target_qv,online_qv)

                if done:

                    obs = env.reset()
                    batch_rew.append(game_reward)
                    game_reward = 0
                    render_the_game = False

            if epoch % test_frequency == 0:
                test_reward = test_agent(env_test, online_qv, num_games=10)

                ep_sec_time = int((current_milli_time()-ep_time) / 1000)
                print('Epoch:%4d Reward:%4.2f, Epsilon:%2.2f,  Step:%5d,   Test (mean),(std):(%4.2f), (%4.2f),   Time:%d,   Epoch_Steps:%d' %
                      (epoch,np.mean(batch_rew), epsilon, step_count, np.mean(test_reward), np.std(test_reward),
                       ep_sec_time, (step_count-old_step_count)/test_frequency))

                ep_time = current_milli_time()
                batch_rew = []
                old_step_count = step_count

                # NOTE(review): the checkpoint names are fixed to "pong"
                # regardless of env_name.
                if DQN_variation == 'DQN':
                    online_qv.save_weights('./saved_models/dqn_pong')

                elif DQN_variation == 'DDQN':
                    online_qv.save_weights('./saved_models/ddqn_pong')

                mean_reward_test.append(np.mean(test_reward))
                steps_test.append(step_count)

            if epoch % render_cycle == 0:
                render_the_game = True

    except BaseException:
        # The caller never receives env_test when training is aborted
        # (including KeyboardInterrupt), so release it here.
        env_test.close()
        raise

    finally:
        env.close()

    return env_test, online_qv, mean_reward_test, steps_test

In [32]:
# Train the Double-DQN variant on Pong; one keyword per line for easy tuning.
env_ddqn, online_qv_ddqn, mean_reward_test_ddqn, steps_test_ddqn = DQN(
    'PongNoFrameskip-v4',
    hidden_layers=1,
    hidden_size=[128],
    alpha=2e-4,
    num_epochs=200,
    buffer_size=100000,
    gamma=0.99,
    update_target_net=1000,
    batch_size=32,
    update_freq=2,
    frames_num=2,
    min_buffer_size=1000,
    test_frequency=20,
    start_exp=1,
    end_exp=0.1,
    exp_steps=100000,
    render_cycle=10000,
    DQN_variation='DDQN',
)

Epoch:   0 Reward:-21.00, Epsilon:0.99,  Step: 1016,   Test (mean),(std):(-21.00), (0.00),   Time:132,   Epoch_Steps:50
Epoch:  20 Reward:-20.25, Epsilon:0.82,  Step:20170,   Test (mean),(std):(-20.20), (0.40),   Time:758,   Epoch_Steps:957
Epoch:  40 Reward:-20.20, Epsilon:0.64,  Step:40150,   Test (mean),(std):(-20.10), (0.94),   Time:808,   Epoch_Steps:999
Epoch:  60 Reward:-18.50, Epsilon:0.40,  Step:67111,   Test (mean),(std):(-17.20), (2.04),   Time:1289,   Epoch_Steps:1348
Epoch:  80 Reward:-16.20, Epsilon:0.10,  Step:107693,   Test (mean),(std):(-14.40), (2.46),   Time:1799,   Epoch_Steps:2029
Epoch: 100 Reward:-9.85, Epsilon:0.10,  Step:168292,   Test (mean),(std):(-10.90), (2.77),   Time:2882,   Epoch_Steps:3029
Epoch: 120 Reward:-8.15, Epsilon:0.10,  Step:232755,   Test (mean),(std):(2.60), (7.50),   Time:2990,   Epoch_Steps:3223
Epoch: 140 Reward:-4.95, Epsilon:0.10,  Step:297838,   Test (mean),(std):(13.30), (5.68),   Time:2653,   Epoch_Steps:3254
Epoch: 160 Reward:2.35, E

In [38]:
# Train the plain DQN variant on Pong with the same hyper-parameters as the
# Double-DQN run.
env, online_qv, mean_reward_test, steps_test = DQN(
    'PongNoFrameskip-v4',
    hidden_layers=1,
    hidden_size=[128],
    alpha=2e-4,
    num_epochs=200,
    buffer_size=100000,
    gamma=0.99,
    update_target_net=1000,
    batch_size=32,
    update_freq=2,
    frames_num=2,
    min_buffer_size=1000,
    test_frequency=20,
    start_exp=1,
    end_exp=0.1,
    exp_steps=100000,
    render_cycle=10000,
    DQN_variation='DQN',
)

KeyboardInterrupt: 

In [42]:
# Play one rendered episode with the trained DDQN network, using the same
# epsilon-greedy policy (epsilon 0.05) as test_agent.
# NOTE(review): the environment is closed and then reset again; presumably
# this stops the Monitor wrapper from recording - confirm this is intended.
env_ddqn.close()
obs = env_ddqn.reset()
done = False

while not done:
            
    # The model expects a batch, so wrap the single scaled observation.
    obs_process = np.array([scale_frames(obs)])
    action_values = online_qv_ddqn.predict(obs_process)[0]
            
    action = e_greedy(action_values, 0.05)
    next_obs, reward, done, _ = env_ddqn.step(action) 
            
    env_ddqn.render()
            
    obs = next_obs
    

In [43]:
# Mean test reward recorded at each evaluation of the DDQN run.
mean_reward_test_ddqn

[-21.0, -20.2, -20.1, -17.2, -14.4, -10.9, 2.6, 13.3, 11.8, 14.7]

In [44]:
# Training step count at which each DDQN evaluation took place.
steps_test_ddqn

[1016, 20170, 40150, 67111, 107693, 168292, 232755, 297838, 367006, 428811]