In [None]:
import gym
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from collections import deque
from Stochastic_UPM_env import Factory

In [None]:
class PrioritizedReplayBuffer:
    """Fixed-capacity replay buffer with priority-proportional sampling.

    Experiences and their priorities live in two parallel deques created with
    the same ``maxlen``, so position ``i`` in ``buffer`` always corresponds to
    position ``i`` in ``priorities``.

    Note: once the buffer is full every ``add`` drops the oldest entry and
    shifts all positions by one, so the indices returned by ``sample`` are only
    valid for ``set_priorities`` until the next ``add``.
    """

    def __init__(self, maxlen):
        """Create an empty buffer holding at most ``maxlen`` experiences."""
        # exponent applied to the raw priorities before normalising them
        self.priority_scale = 0.8
        self.beta = 0.4 # initial beta
        # beta grows by this amount on every call to get_importance, capped at 1
        self.beta_increment_per_sampling = 1e-4
        self.buffer = deque(maxlen=maxlen)
        self.priorities = deque(maxlen=maxlen)

    def add(self, experience):
        """Append ``experience`` with the largest priority currently stored.

        An empty buffer uses priority 1, so new experiences are at least as
        likely to be drawn as anything already stored.
        """
        self.buffer.append(experience)
        self.priorities.append(max(self.priorities, default=1))

    def get_probabilities(self):
        """Return the sampling probability of every stored experience."""
        scaled_priorities = np.asarray(self.priorities, dtype=np.float64) ** self.priority_scale
        return scaled_priorities / scaled_priorities.sum()

    def get_importance(self, probabilities):
        """Return importance weights for ``probabilities``, scaled so the max is 1.

        Side effect: ``beta`` is increased by ``beta_increment_per_sampling``
        (never beyond 1) before the weights are computed.
        """
        self.beta = min(1.0, self.beta + self.beta_increment_per_sampling)  # max = 1
        probabilities = np.asarray(probabilities, dtype=np.float64)
        importance = (1 / len(self.buffer) * 1 / probabilities) ** self.beta
        return importance / importance.max()

    def sample(self, batch_size):
        """Draw ``batch_size`` experiences (with replacement) by priority.

        Returns:
            A 3-tuple ``(columns, importance, indices)``:
            * ``columns`` - an iterable yielding one ``np.array`` per field of
              the stored experience tuples (unpack it once);
            * ``importance`` - the importance weight of each drawn experience;
            * ``indices`` - the buffer positions that were drawn, in the same
              order, to be passed to ``set_priorities``.
        """
        sample_probs = self.get_probabilities()
        sample_indices = random.choices(
            range(len(self.buffer)), weights=sample_probs, k=batch_size
        )
        # Index the deque directly instead of copying the whole buffer into an
        # object array on every call.
        samples = [self.buffer[i] for i in sample_indices]
        importance = self.get_importance(sample_probs[sample_indices])

        # Return the positions that were actually drawn; returning every
        # position made set_priorities update the wrong experiences.
        return map(np.array, zip(*samples)), importance, np.array(sample_indices)

    def set_priorities(self, indices, errors, offset=0.1):
        """Set the priority at each position in ``indices`` to ``|error| + offset``.

        ``offset`` keeps every priority strictly positive. If a position occurs
        more than once, the last error given for it wins.
        """
        for i, e in zip(indices, errors):
            # store plain floats whatever scalar type the errors arrive as
            self.priorities[int(i)] = float(abs(e)) + offset

In [None]:
class DQN_agent:
    """Double-DQN agent: convolutional Q-network, target network, prioritized replay.

    The online network (``q_network``) picks the next action and the target
    network (``t_q_network``) evaluates it when building the TD target.
    Actions are restricted to those flagged non-zero in a legality mask.
    """

    def __init__(self, n_states, n_actions):
        """Build both networks, the replay buffer and the optimizer.

        Args:
            n_states: input shape of one observation, passed to ``keras.Input``.
            n_actions: number of discrete actions (width of the Q-value output).
        """
        self.n_states = n_states
        self.n_actions = n_actions
        self.q_network = self.build_q_network()
        self.t_q_network = self.build_q_network()
        # start the target network from the same weights as the online network
        self.t_q_network.set_weights(self.q_network.get_weights())
        self.buffer = PrioritizedReplayBuffer(100000)
        self.optimizer = keras.optimizers.Adam(learning_rate = 3e-4, clipnorm=1.0)
        self.batch_size = 32
        # step counter; this class never resets it (the caller increments it)
        self.frame_count = 0
        # prob for exploration
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        # number of decay_epsilon calls needed to go from 1.0 to epsilon_min
        self.epsilon_greedy_frames = 100000.0
        # discounted ratio
        self.gamma = 0.99

    def build_q_network(self):
        """Return a new Keras model mapping an observation to ``n_actions`` Q-values."""
        # Network architecture
        inputs = keras.Input(shape = self.n_states)
        x = layers.Conv2D(16, 3, strides = 1, activation = 'relu')(inputs)
        x = layers.Conv2D(16, 3, strides = 1, activation = 'relu')(x)

        x = layers.Conv2D(32, 3, strides = 1, activation = 'relu')(x)
        x = layers.Conv2D(32, 3, strides = 1, activation = 'relu')(x)
        x = layers.Flatten()(x)

        x = layers.Dense(units = 256, activation = 'relu')(x)
        q_value = layers.Dense(units = self.n_actions)(x)

        return keras.Model(inputs = inputs, outputs = q_value)

    def choose_action(self, state, legal_one_hot):
        """Pick an epsilon-greedy action among the legal ones.

        Args:
            state: one observation without batch/channel axes; both are added
                before it is fed to the Q-network.
            legal_one_hot: per-action mask; non-zero entries are legal.

        Returns:
            The chosen action index as an ``int``.
        """
        legal_mask = np.asarray(legal_one_hot).reshape(-1) != 0

        # exploration and exploitation
        if self.epsilon >= np.random.rand(1)[0]:
            # Explore uniformly over the same mask the greedy branch uses,
            # rather than inferring legality from the state rows.
            action = np.random.choice(np.flatnonzero(legal_mask))
        else:
            action_values = self.q_network(np.expand_dims(state, axis=(0,-1)))
            q_flat = np.asarray(action_values).reshape(-1)
            # Mask with the legality flags themselves: testing `legal * Q != 0`
            # would also discard a legal action whose Q-value is exactly 0.
            action = np.argmax(np.where(legal_mask, q_flat, -np.inf))

        return int(action)

    def decay_epsilon(self):
        """Linearly decrease epsilon by one step, never going below ``epsilon_min``."""
        # decay probability of taking random action
        self.epsilon -= (1.0 - self.epsilon_min)/self.epsilon_greedy_frames
        self.epsilon = max(self.epsilon, self.epsilon_min)

    def store(self, state, action, next_state, reward, done, next_legal):
        """Add one transition to the replay buffer.

        The tuple is stored as ``(state, action, reward, next_state, done,
        next_legal)``, which is the order ``train_q_network`` unpacks.
        """
        # store training data
        self.buffer.add((state, action, reward, next_state, done, next_legal))

    def train_q_network(self):
        """Run one gradient step on a prioritized batch and refresh its priorities."""
        # sample
        (states, actions, rewards, next_states, dones, next_legal), importance, \
            indices = self.buffer.sample(self.batch_size)

        # Direct calls instead of predict(): this is a single small batch.
        next_q_online = self.q_network(next_states, training=False)
        next_q_target = self.t_q_network(next_states, training=False)

        dtype = next_q_target.dtype
        rewards = tf.cast(rewards, dtype)
        not_done = 1.0 - tf.cast(dones, dtype)
        importance = tf.cast(importance, dtype)

        # Online network picks the best legal next action. Illegal actions are
        # masked via the legality flags so a legal Q-value of 0 is not dropped.
        legal_next = np.asarray(next_legal) != 0
        next_action = tf.math.argmax(tf.where(legal_next, next_q_online, -np.inf), 1)
        mask_next_action = tf.one_hot(next_action, self.n_actions, dtype=dtype)
        # Target network evaluates the action chosen above.
        future_rewards = tf.reduce_sum(tf.multiply(next_q_target, mask_next_action), axis=1)

        # Q value = reward + discount factor * expected future reward.
        # Only the bootstrap term is zeroed on terminal transitions; the
        # terminal reward itself must stay in the target.
        updated_q_values = rewards + self.gamma * not_done * future_rewards
        masks = tf.one_hot(actions, self.n_actions, dtype=dtype)

        with tf.GradientTape() as tape:
            # Train the model on the states and updated Q-values
            q_values = self.q_network(states)
            # only update q-value which is chosen
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            # importance-weighted squared TD error
            loss = tf.reduce_mean(importance * tf.math.square(q_action - updated_q_values))

        # set priorities from the TD errors of this batch
        errors = updated_q_values - q_action
        self.buffer.set_priorities(indices, errors.numpy())

        # Backpropagation
        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.t_q_network.set_weights(self.q_network.get_weights())


In [None]:
# --- Problem dimensions ---
# observation shape fed to the Q-network, and number of discrete actions
n_actions = 40
n_states = (40, 9, 1)

# --- Training schedule (all counted in agent.frame_count steps) ---
max_steps_per_episode = 1000
# run one gradient step every `update_per_actions` steps
update_per_actions = 4
# copy online weights into the target network every `update_target_network` steps
update_target_network = 1000

# --- Agent and environment ---
agent = DQN_agent(n_states, n_actions)
env = Factory()

In [49]:
episode = 0

# Trains indefinitely: there is no stopping criterion, so interrupt the kernel
# to stop.
while True:
    episode += 1
    state, legal = env.reset()
    episode_reward = 0

    # Allow exactly max_steps_per_episode steps per episode
    # (range(1, max_steps_per_episode) stopped one step short).
    for timestep in range(1, max_steps_per_episode + 1):
        agent.frame_count += 1
        # choose action
        action = agent.choose_action(state, legal)
        # decay prob of exploration
        agent.decay_epsilon()

        next_state, reward, done, next_legal, info = env.step(action)

        episode_reward += reward
        # store training data; a trailing channel axis is added to both states
        # before they go into the replay buffer
        agent.store(
            np.expand_dims(state, axis=-1), action, np.expand_dims(next_state, axis=-1),
            reward, done, next_legal
        )

        state = next_state
        legal = next_legal

        # train every `update_per_actions` steps once a full batch is available
        if agent.frame_count % update_per_actions == 0 and len(agent.buffer.buffer) >= agent.batch_size:
            agent.train_q_network()

        # periodically sync the target network with the online network
        if agent.frame_count % update_target_network == 0:
            agent.update_target_network()

        if done:
            print('episode:{}, tardy percentage:{}'.format(episode, info))
            break

episode:66, tardy percentage:0.115


KeyboardInterrupt: 