In [1]:
import gym
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
import numpy as np

import pandas as pd
import time
from datetime import datetime
from collections import deque
import sys

import altair as alt

import atari_wrappers as atari

#import numba
#from numba import jit

In [54]:
def make_env(env_name, fire=True, frames_num=2, noop_num=30, skip_frames=True):
    
    env = gym.make(env_name)
    
    if skip_frames:
        env = atari.MaxAndSkipEnv(env) ## Return only every skip-th frame
        
    if fire:
        env = atari.FireResetEnv(env) ## Fire at the beggining
        
    env = atari.NoopResetEnv(env,noop_max=noop_num)
    env = atari.WarpFrame(env) ## Reshape image
    env = atari.FrameStack(env, frames_num) ## Stack last 2 frames
    
    return env

In [55]:

class QNet(Model):
    
    my_strategy = tf.distribute.MirroredStrategy()
    with my_strategy.scope():
        @tf.function
        def __init__(self, h_layers, h_size, o_size, h_activation=tf.nn.relu, o_activation=None):
        
            super(QNet,self).__init__()
            self.conv_layer1 = Conv2D(filters=16, kernel_size=8, strides=4, padding='valid', activation='relu')
            self.conv_layer2 = Conv2D(filters=32, kernel_size=4, strides=2, padding='valid', activation='relu')
            self.conv_layer3 = Conv2D(filters=32, kernel_size=3, strides=1, padding='valid', activation='relu')
        
            self.flatten_layer = Flatten()
        
            self.hidden_layers = [Dense(h_size[i], activation=h_activation) for i in range(h_layers)]
            self.output_layer = Dense(o_size, activation=o_activation)
    with my_strategy.scope():
        @tf.function
        def call(self,input_data):
        
            x = input_data
        
            x = self.conv_layer1(x)
            x = self.conv_layer2(x)
            x = self.conv_layer3(x)
        
            x = self.flatten_layer(x)
        
            for layer in self.hidden_layers:
            
                x = layer(x)
            
            return self.output_layer(x)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [56]:
def scale_frames(frames):
    
    return np.array(frames, dtype=np.float32)/255.0

In [57]:
class ExperienceBuffer():
    
    def __init__(self,buffer_size):
        
        self.obs_buf = deque(maxlen=buffer_size)
        self.rew_buf = deque(maxlen=buffer_size)
        self.act_buf = deque(maxlen=buffer_size)
        self.next_obs_buf = deque(maxlen=buffer_size)
        self.done_buf = deque(maxlen=buffer_size)
        
    def add(self, obs, rew, act, next_obs, done):
        
        self.obs_buf.append(obs)
        self.rew_buf.append(rew)
        self.act_buf.append(act)
        self.next_obs_buf.append(next_obs)
        self.done_buf.append(done)
        
    def sample_minibatch(self, batch_size):
        
        mb_indices = np.random.randint(len(self.obs_buf),size=batch_size)
        
        mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices])
        mb_rew = [self.rew_buf[i] for i in mb_indices] 
        mb_act = [self.act_buf[i] for i in mb_indices]
        mb_next_obs = scale_frames([self.next_obs_buf[i] for i in mb_indices])
        mb_done = [self.done_buf[i] for i in mb_indices]
    
        return mb_obs, mb_rew, mb_act, mb_next_obs, mb_done
    
    def __len__(self):
        return len(self.obs_buf)
        

In [58]:
current_milli_time = lambda: int(round(time.time() * 1000))

In [59]:
def update_target(target_qv,online_qv):
    
    target_qv.set_weights(online_qv.get_weights())

In [60]:
def e_greedy(action_values,epsilon=0.1):
    
    if np.random.uniform(0,1) < epsilon:
        
        return np.random.randint(len(action_values))
    
    else:
        
        return np.argmax(action_values)

In [61]:
# return the target value for each item in the mini_batch, that will be used in the loss function
def q_target_values(mini_batch_rewards, mini_batch_done, action_values, gamma): 
    
    max_action_value = np.max(action_values, axis=1)
    
    ys = []
    for reward, done ,action_value in zip(mini_batch_rewards, mini_batch_done, max_action_value):    
        
        if done:
            
            ys.append(reward)
        
        else:
            
            ys.append(reward + gamma * action_value)
            
    assert len(ys) ==  len(mini_batch_rewards)
    
    return np.array(ys)

In [62]:
def test_agent(env_test, online_qv, num_games=20):
    
    games_rewards = []
    
    for _ in range(num_games):
        
        done = False
        g_reward = 0
        obs = env_test.reset()
        obs_process = np.array([scale_frames(obs)])
        action_values = online_qv.predict(obs_process)[0]
        
        while not done:
            
            action = e_greedy(action_values, epsilon=0.05)
            
            next_obs, reward, done, _ = env_test.step(action)
            
            g_reward += reward
            
        games_rewards.append(g_reward)
        
    return games_rewards

In [68]:
def DQN(env_name, hidden_layers =1, hidden_size=[32], alpha=1e-2, num_epochs=2000, buffer_size=100000, gamma=0.99,
        update_target_net=1000, batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20,
        start_exp=1, end_exp=0.1, exp_steps=100000, render_cycle=100):
    
    env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
    env_test = make_env(env_name,frames_num=frames_num, skip_frames=True, noop_num=20)
    
    env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()), force=True,
                                    video_callable=lambda x: x%20==0)
    
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n
    
    target_qv = QNet(h_layers=hidden_layers, h_size=hidden_size, o_size=act_dim)
    online_qv = QNet(h_layers=hidden_layers, h_size=hidden_size, o_size=act_dim)
    
    obs = env.reset()
    obs = scale_frames(obs)
    
    _ = target_qv.predict(np.array([obs]))
    _ = online_qv.predict(np.array([obs]))
    
    online_qv.compile(optimizer = tf.keras.optimizers.Adam(alpha),
                     loss = tf.keras.losses.MeanSquaredError())
    
    update_target(target_qv,online_qv)
    
    #####################
    ### TENSORBOARD ##### --> Not implemented
    #####################
    
    render_the_game = False
    step_count = 0
    last_update_loss = []
    mean_loss = []
    ep_time = current_milli_time()
    batch_rew = []
    old_step_count = 0
    
    buffer = ExperienceBuffer(buffer_size)
    epsilon = start_exp
    eps_decay = (start_exp - end_exp)/exp_steps
    
    obs = env.reset()
    
    for epoch in range(num_epochs):
        
        game_reward = 0
        done = False
        
        while not done:
            
            obs_process = np.array([scale_frames(obs)])
            action_values = online_qv.predict(obs_process)[0]
            
            action = e_greedy(action_values, epsilon)
            next_obs, reward, done, _ = env.step(action) 
            
            if render_the_game:
                env.render()
            
            buffer.add(obs, reward, action, next_obs, done)
            
            obs = next_obs
            game_reward += reward
            step_count += 1
            
            if epsilon > end_exp:
                epsilon -= eps_decay
                
            if len(buffer) > min_buffer_size and (step_count % update_freq == 0):
                
                mb_obs, mb_reward, mb_action, mb_next_obs, mb_done = buffer.sample_minibatch(batch_size)
                
                
                mb_target_actions = target_qv.predict(mb_next_obs)
                
                #print('Reward: ',len(mb_reward),' Done: ',len(mb_done),' Actions: ',mb_target_actions.shape )
                mini_batch_y = q_target_values(mb_reward, mb_done, mb_target_actions, gamma)
                
                #compute SGD
                
                with tf.GradientTape() as tape:
                    
                    q_values = online_qv(mb_obs)
                    
                    one_hot_actions = tf.keras.utils.to_categorical(mb_action,act_dim,dtype=np.float32)
                    Q = tf.reduce_sum(tf.multiply(q_values,one_hot_actions),axis=1)
                    
                    error = Q - mini_batch_y
                    loss = tf.keras.losses.Huber()(mini_batch_y, Q)
                    
                    model_gradients = tape.gradient(loss, online_qv.trainable_variables)
                    online_qv.optimizer.apply_gradients(zip(model_gradients, online_qv.trainable_variables))
                    
                    last_update_loss.append(loss)
                
                
                #history = online_qv.fit(mb_obs, mini_batch_y,epochs=3,verbose=0)
                
                #last_update_loss.append(history.history['loss'][-1])
                
                # target update
                
            if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0):
                    
                mean_loss.append(np.mean(last_update_loss))
                last_update_loss = []
                    
                update_target(target_qv,online_qv)
                
            if done:
                
                obs = env.reset()
                batch_rew.append(game_reward)
                game_reward = 0
                render_the_game = False
            
        
        if epoch % test_frequency == 0:
            #start = time.time()
            test_reward = test_agent(env_test, online_qv, num_games=10)
            
            ep_sec_time = int((current_milli_time()-ep_time) / 1000)
            print('Epoch:%4d Reward:%4.2f, Epsilon:%2.2f,  Step:%5d,   Test (mean),(std):(%4.2f), (%4.2f),   Time:%d,   Epoch_Steps:%d' %
                  (epoch,np.mean(batch_rew), epsilon, step_count, np.mean(test_reward), np.std(test_reward), 
                   ep_sec_time, (step_count-old_step_count)/test_frequency))
            
            ep_time = current_milli_time()
            batch_rew = []
            old_step_count = step_count  
            #print('\nTest: ',time.time()-start,'\n')
        if epoch % render_cycle == 0:
            render_the_game = True
                
    env.close()
    
    return mean_loss

In [69]:
mean_loss = DQN('PongNoFrameskip-v4', hidden_layers =1, hidden_size=[128], alpha=2e-4, num_epochs=600, buffer_size=100000, gamma=0.99,
        update_target_net=1000, batch_size=32, update_freq=2, frames_num=2, min_buffer_size=10000, test_frequency=20,
        start_exp=1, end_exp=0.1, exp_steps=100000, render_cycle=10000)

Epoch:   0 Reward:-21.00, Epsilon:0.99,  Step: 1015,   Test (mean),(std):(-21.00), (0.00),   Time:21,   Epoch_Steps:50
Epoch:  20 Reward:-20.60, Epsilon:0.83,  Step:18401,   Test (mean),(std):(-21.00), (0.00),   Time:372,   Epoch_Steps:869
Epoch:  40 Reward:-20.25, Epsilon:0.67,  Step:36511,   Test (mean),(std):(-21.00), (0.00),   Time:506,   Epoch_Steps:905
Epoch:  60 Reward:-19.85, Epsilon:0.50,  Step:55871,   Test (mean),(std):(-20.70), (0.46),   Time:543,   Epoch_Steps:968
Epoch:  80 Reward:-19.05, Epsilon:0.29,  Step:78994,   Test (mean),(std):(-20.90), (0.30),   Time:652,   Epoch_Steps:1156
Epoch: 100 Reward:-18.30, Epsilon:0.10,  Step:105229,   Test (mean),(std):(-21.00), (0.00),   Time:747,   Epoch_Steps:1311
Epoch: 120 Reward:-17.30, Epsilon:0.10,  Step:135564,   Test (mean),(std):(-21.00), (0.00),   Time:869,   Epoch_Steps:1516


KeyboardInterrupt: 