In [None]:
import time
import random
import matplotlib.pyplot as plt
import numpy
import gym

import tensorflow
from tensorflow import keras 
from tensorflow.keras import layers
from baselines.common.atari_wrappers import wrap_deepmind

# Discount applied to the bootstrapped next-state value when building Q targets in train().
discount_factor = 0.99 #gamma
# Step size handed to the Adam optimizer created at the bottom of this cell.
learning_rate = 0.0002

#Epsilon greedy params
# Probability of picking a random action; decays linearly during training down to epsilon_min.
# NOTE(review): epsilon starts at 0.89 while epsilon_decay below is computed from a start of 1 -
# presumably left over from a resumed run; confirm the intended schedule.
epsilon = 0.89
epsilon_min = 0.1 
epsilon_decay = ( 1 - epsilon_min) / (1000000) #Epsilon will decrease by this amount at every step

#Exp replay
# Maximum number of transitions kept in the replay buffers; the training loop drops the oldest beyond this.
max_exp_replay_size = 100000
# Number of transitions sampled from the replay buffers for each gradient step.
batch_dim = 8 

#Steps between each update
# The target network receives a copy of the online network weights every this many environment steps.
steps_update_target_net = 5000

# Upper bound on the number of steps in a single episode.
max_steps_episode = 25000
max_steps = 1000000 #1 million maximum total training steps

# Adam with gradient-norm clipping (clipnorm=1.0); Huber loss between target and predicted Q-values.
opt = keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0)
loss_function = keras.losses.Huber()

In [None]:
from baselines.common.atari_wrappers import make_atari, wrap_deepmind

#Build the base environment through make_atari rather than a bare gym.make:
#make_atari expects a NoFrameskip id and is what adds the no-op resets and the
#4-frame max-and-skip wrapper. Calling gym.make directly left the agent
#without any frame skipping.
env = make_atari("SpaceInvadersNoFrameskip-v4")

#Reduce frame size and stack 4 of them
#    episode_life: losing a life is treated as the end of an episode. Used by deepmind as it helps value estimation
#    clip_rewards: every reward is replaced by its sign (-1, 0 or +1)
#    scale: frames are converted to floats (see baselines' ScaledFloatFrame wrapper)
env = wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=True, scale=True)

env.seed(42)
num_possible_actions = env.action_space.n

print("Environment input shape: {}".format(env.observation_space.shape))
print("Environment output shape: {}".format(env.action_space.n))

In [None]:
def create_model():
    """Build the Q-network: a convolutional trunk followed by value and advantage heads.

    The two heads are combined as Q = V + (A - mean(A)), giving one output per action.
    Reads the module-level ``num_possible_actions`` for the width of the action head.

    Returns:
        keras.Model taking inputs of shape (84, 84, 4) and producing
        ``num_possible_actions`` Q-values.
    """
    # Convolutional trunk defined by the Deepmind paper
    frames = layers.Input(shape=(84, 84, 4,))
    conv_a = layers.Conv2D(filters=32, kernel_size=8, strides=4, activation="relu")(frames)
    conv_b = layers.Conv2D(filters=64, kernel_size=4, strides=2, activation="relu")(conv_a)
    conv_c = layers.Conv2D(filters=64, kernel_size=3, strides=1, activation="relu")(conv_b)
    flat_features = layers.Flatten()(conv_c)
    shared = layers.Dense(units=512, activation="relu")(flat_features)

    # Value head: one scalar per sample, tiled so it lines up with the per-action advantages
    value_hidden = layers.Dense(units=512, activation='relu')(shared)
    value_scalar = layers.Dense(units=1, activation='linear', name="Value")(value_hidden)
    value_tiled = layers.RepeatVector(num_possible_actions)(value_scalar)
    value = layers.Flatten()(value_tiled)

    # Advantage head: one output per action
    advantage_hidden = layers.Dense(units=512, activation='relu')(shared)
    advantage = layers.Dense(num_possible_actions, activation='linear', name='Activation')(advantage_hidden)

    # Mean advantage over actions, tiled back to one entry per action
    tile_mean = layers.RepeatVector(num_possible_actions)
    advantage_mean = tile_mean(tensorflow.keras.backend.mean(advantage, axis=1, keepdims=True))
    advantage_mean = layers.Flatten(name='meanActivation')(advantage_mean)

    centered_advantage = layers.Subtract()([advantage, advantage_mean])

    q_values = layers.Add(name="Q-value")([value, centered_advantage])

    return keras.Model(inputs=frames, outputs=q_values)


# Online network (trained every step) and target network (used to build Q targets),
# created in that order as two independently initialised copies of the same architecture.
model, target_model = create_model(), create_model()

In [None]:
#Only if one wants to resume training from saved weights: set the two paths
#below to previously saved checkpoints. Empty paths (the default) skip loading,
#so running the notebook top to bottom starts from scratch instead of failing
#on load_weights("").
resume_model_weights = ""
resume_target_weights = ""

if resume_model_weights:
    model.load_weights(resume_model_weights)
if resume_target_weights:
    target_model.load_weights(resume_target_weights)

In [None]:
# Print the layer-by-layer summary of the online network built by create_model().
model.summary()

In [None]:
def get_action(epsilon, state):
    """Choose an action with an epsilon-greedy policy.

    Args:
        epsilon: exploration threshold; a random action is taken whenever it is
            greater than a fresh ``random.random()`` draw.
        state: a single observation; a leading batch axis is added before it is
            passed to the module-level ``model``.

    Returns:
        Either a random action index in ``[0, num_possible_actions)`` or the index
        of the largest value predicted by ``model`` for ``state``.
    """
    explore = epsilon > random.random()
    if explore:
        return random.randrange(0, num_possible_actions)

    # Greedy branch: score every action for this state and keep the best one
    batch_of_one = tensorflow.expand_dims(tensorflow.convert_to_tensor(state), 0)
    q_values = model(batch_of_one, training=False)
    return tensorflow.argmax(q_values[0]).numpy()
            
            
def memorize(action,state,new_state,done,reward):
    """Append one transition to the module-level replay buffers.

    Each component goes to the end of its own list, so the same index in every
    buffer refers to the same transition.
    """
    pairs = (
        (action_buffer, action),
        (state_buffer, state),
        (new_state_buffer, new_state),
        (done_buffer, done),
        (reward_buffer, reward),
    )
    for buffer, item in pairs:
        buffer.append(item)

def train():
    """Run one gradient step on the online network from a random replay minibatch.

    Samples ``batch_dim`` distinct transitions from the module-level replay
    buffers, builds the targets ``reward + discount_factor * max_a Q_target(s', a)``
    with ``target_model``, and minimises ``loss_function`` between those targets
    and the online network's Q-value for the action actually taken.

    Reads and updates module-level state (``model``, ``opt``); returns nothing.

    Raises:
        ValueError: from ``random.sample`` if fewer than ``batch_dim`` transitions
            are stored.
    """
    #Randomly extract values from buffers (distinct indices)
    index = random.sample(range(len(done_buffer)), batch_dim)

    state_batch = numpy.array([state_buffer[i] for i in index])
    new_state_batch = numpy.array([new_state_buffer[i] for i in index])
    reward_batch = numpy.array([reward_buffer[i] for i in index], dtype=numpy.float32)
    action_batch = [action_buffer[i] for i in index]
    done_batch = numpy.array([float(done_buffer[i]) for i in index], dtype=numpy.float32)

    #Calculate new q values
    next_q_batch = target_model.predict(new_state_batch)
    # axis=1 takes the best next-state value of EACH sample; without it a single
    # maximum over the whole batch was used as every sample's bootstrap value.
    new_q_vals = reward_batch + discount_factor * numpy.amax(next_q_batch, axis=1)

    #If it was the final step change value to -1
    new_q_vals = new_q_vals * (1 - done_batch) - done_batch

    # One-hot mask so only the Q-value of the action actually taken enters the loss
    mask = tensorflow.one_hot(action_batch, num_possible_actions)

    with tensorflow.GradientTape() as t:
        q_vals = model(state_batch)
        q_action = tensorflow.reduce_sum(tensorflow.multiply(q_vals, mask), axis=1)
        loss = loss_function(new_q_vals, q_action)

    gradients = t.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
# Per-episode statistics, one entry appended at the end of every episode
reward_history, running_reward_history, episode_duration_history = [], [], []

# Averages over the most recent 100 episodes, refreshed after each episode
mean_reward_100ep = mean_duration_100ep = 0

# Progress counters and the best running average seen so far
episode_counter = step_counter = 0
current_max_reward = 0

# Replay buffers: parallel lists where index i in each describes the same transition
action_buffer, state_buffer, new_state_buffer = [], [], []
reward_buffer, done_buffer = [], []

start_time = time.time()

print("starting...")
# Main training loop. Each pass of the outer loop plays one episode; each pass of
# the inner loop is one environment step (act, store the transition, train,
# periodically sync the target network). Training stops at the end of the first
# episode after step_counter exceeds max_steps.
while True:
    tot_ep_reward = 0

    epoch_start_time = time.time()

    cur_state = numpy.asarray(env.reset())

    for timestep in range(1, max_steps_episode+1):

        #env.render();

        step_counter += 1

        #Choose next action
        action = get_action(epsilon, cur_state)

        #Value of epsilon decreases, but never below epsilon_min
        epsilon = max(epsilon - epsilon_decay, epsilon_min)

        # do one step with selected action
        new_state, reward, is_done, _ = env.step(action)
        new_state = numpy.asarray(new_state)

        tot_ep_reward += reward

        # memorize info in rep buffer
        memorize(action,cur_state,new_state,is_done,reward)

        cur_state = new_state

        # when batch is large enough train the model
        if len(done_buffer) > batch_dim:
            train()

        #every n steps update target network
        if step_counter % steps_update_target_net == 0:
            target_model.set_weights(model.get_weights())
            print("{}/{} steps, {} episodes. avg reward for last 100 episodes: {:.2f}, Epsilon: {}, avg epoch duration: {:.2f}".format(step_counter, max_steps, episode_counter, mean_reward_100ep, epsilon, mean_duration_100ep))

        # keep buffer size below max by dropping the oldest transition
        if len(reward_buffer) > max_exp_replay_size:
            del reward_buffer[:1]
            del state_buffer[:1]
            del new_state_buffer[:1]
            del action_buffer[:1]
            del done_buffer[:1]

        if is_done:
            break

    episode_duration_history.append(time.time() - epoch_start_time)
    mean_duration_100ep = numpy.mean(episode_duration_history[-100:])

    # add rewards to history
    reward_history.append(tot_ep_reward)

    #determine avg reward of past n ep.
    mean_reward_100ep = numpy.mean(reward_history[-100:])

    #insert running reward in running reward history
    running_reward_history.append(mean_reward_100ep)

    episode_counter += 1

    #save weights if it reached new record running average
    if step_counter > 20000 and mean_reward_100ep > current_max_reward:
        current_max_reward = mean_reward_100ep
        # zero-padded step count and reward used to build the checkpoint directory name
        s = "0"*(15-len(str(step_counter)))+str(step_counter)
        r = "0"*(5-len(str(round(mean_reward_100ep, 2))))+str(round(mean_reward_100ep, 2))
        model.save_weights("./weights/DDQN/r_{}_s_{}/model/weights".format(r, s))
        target_model.save_weights("./weights/DDQN/r_{}_s_{}/target/weights".format(r,s))
        print("reached new record of {:.2f}, saved weights as DDQN_r_{}_s_{}".format(mean_reward_100ep,r,s))

    #end condition: reached step limit
    if step_counter > max_steps:
        # report the running average; the name previously used here was never defined
        print("max number of steps {} reached at episode {}, avg reward is: {}".format(max_steps, episode_counter, mean_reward_100ep))
        break

In [None]:
import matplotlib.pyplot as plt

# running_reward_history receives one value per finished episode, so the x axis
# is the episode index (not a step count).
plt.plot(running_reward_history)
plt.ylabel('running reward')
plt.xlabel('episode')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# reward_history receives one value per finished episode, so the x axis is the
# episode index (not a step count).
plt.plot(reward_history)
plt.ylabel('reward')
plt.xlabel('episode')
plt.show()

In [None]:
#Trained net testing: play greedy episodes with the saved weights and report the mean reward

test_model = create_model()

test_model.load_weights("./weights/DDQN/r_10.85_s_000000001119429/target/weights")

#Total reward of every finished test episode
history = []
num_test_episodes = 100

print("starting...")
for episode in range(num_test_episodes):

    cur_state = numpy.asarray(env.reset())
    tot_ep_reward = 0

    for timestep in range(1, 25000):

        #env.render();
        # Greedy action from test_model itself. get_action() queries the global
        # `model`, so going through it would never exercise the loaded weights.
        state_tensor = tensorflow.expand_dims(tensorflow.convert_to_tensor(cur_state), 0)
        q_values = test_model(state_tensor, training=False)
        action = tensorflow.argmax(q_values[0]).numpy()

        next_state, reward, done, _ = env.step(action)
        next_state = numpy.asarray(next_state)

        tot_ep_reward += reward

        cur_state = next_state

        if done:
            #print(tot_ep_reward)
            history.append(tot_ep_reward)
            break

#mean reward over the finished test episodes
# NOTE(review): env is the training environment, so rewards here go through the
# same wrappers (reward clipping, episode-per-life) - confirm that is intended for testing.
mean_reward_100ep = numpy.mean(history)

print("mean reward of 100 episodes is: {}".format(mean_reward_100ep))

In [None]:
# Best single-episode reward of the test run (the file imports `numpy`, not `np`)
print(numpy.amax(history))