In [1]:
import gym
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from collections import deque
from Stochastic_UPM_env import Factory
from Stochastic_UPM_sim import Factory as sim_Factory
from TransformerModel import TransformerModel 
from TransformerModel import CustomSchedule
import pandas as pd
import scipy.stats

In [2]:
# tensorboardX
import time
from tensorboardX import SummaryWriter
writer = SummaryWriter(f'C:/Users/cimlab/UPM_stochastic/log/log-{time.time()}')

In [3]:
class PrioritizedReplayBuffer:
    def __init__(self, maxlen):
        self.priority_scale = 0.8
        self.beta = 0.4 # initial beta
        self.beta_increment_per_sampling = 1e-4
        self.buffer = deque(maxlen=maxlen)
        self.priorities = deque(maxlen=maxlen) 
    
    def add(self, experience):
        self.buffer.append(experience)
        self.priorities.append(max(self.priorities, default=1)) #new experience has higher prob
        
    def get_probabilities(self):
        scaled_priorities = np.array(self.priorities)**self.priority_scale
        probs = scaled_priorities/sum(scaled_priorities)
        return probs
    
    def get_importance(self, probabilities):
        self.beta = np.min([1, self.beta + self.beta_increment_per_sampling])  # max = 1
        importance = (1/len(self.buffer) * 1/probabilities)**self.beta
        importance_normalized = importance / max(importance)
        return importance_normalized
    
    def sample(self, batch_size):
        sample_probs = self.get_probabilities()
        indices = np.arange(len(self.buffer))
        sample_indices = random.choices(indices, k = batch_size, weights=sample_probs)
        samples = np.array(self.buffer, dtype = object)[sample_indices]
        importance = self.get_importance(sample_probs[sample_indices])
        
        return map(np.array, zip(*samples)), importance, indices
    
    def set_priorities(self, indices, errors, offset=0.1):
        for i,e in zip(indices, errors):
            self.priorities[i] = abs(e) + offset

In [4]:
class DQN_agent:
    def __init__(self, n_states, n_actions, d_model, n_encder_lyer, n_heads, dff, s_length, dmlp):
        self.n_states = n_states
        self.n_actions = n_actions
        #self.q_network = self.build_q_network()
        #self.t_q_network = self.build_q_network()
        self.q_network = TransformerModel(n_actions,n_encder_lyer,d_model,n_heads,dff,s_length,dmlp)
        self.t_q_network = TransformerModel(n_actions,n_encder_lyer,d_model,n_heads,dff,s_length,dmlp)
        self.buffer = PrioritizedReplayBuffer(200000)
        self.optimizer = keras.optimizers.Adam(learning_rate = CustomSchedule(d_model), clipnorm=1.0)
        self.batch_size = 32
        # timestep in an episode
        self.frame_count = 0
        # prob for exploration
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        # for epsilon decay
        self.epsilon_greedy_frames = 120000.0
        # discounted ratio
        self.gamma = 0.99
    """
    def build_q_network(self):
        # Network architecture
        inputs = keras.Input(shape = self.n_states)
        x = layers.Conv2D(16, 3, strides = 1, activation = 'relu')(inputs)
        x = layers.Conv2D(16, 3, strides = 1, activation = 'relu')(x)

        x = layers.Conv2D(32, 3, strides = 1, activation = 'relu')(x)
        x = layers.Conv2D(32, 3, strides = 1, activation = 'relu')(x)
        x = layers.Flatten()(x)

        x = layers.Dense(units = 256, activation = 'relu')(x)
        q_value = layers.Dense(units = self.n_actions)(x)

        return keras.Model(inputs = inputs, outputs = q_value)
    """
    def choose_action(self, state, legal_one_hot):
        # exploration and exploitation
        if  self.epsilon >= np.random.rand(1)[0]:
            legal = [row for row in state if row[0] != 0]
            action = np.random.choice(len(legal))
        else:
            action_values = self.q_network(np.expand_dims(state, axis=(0)))
            legal_values = legal_one_hot*action_values
            action = np.argmax(np.where(legal_values != 0,legal_values,-np.inf))

        return action

    def decay_epsilon(self):
        # decay probability of taking random action
        self.epsilon -= (1.0 - self.epsilon_min)/self.epsilon_greedy_frames
        self.epsilon = max(self.epsilon, self.epsilon_min)

    def store(self, state, action, next_state, reward, done, next_legal):
        # store training data
        self.buffer.add((state, action, reward, next_state, done, next_legal))
    

    def train_q_network(self):
        # sample
        (states, actions, rewards, next_states, dones, next_legal), importance, \
            indices = self.buffer.sample(self.batch_size)

        next_values = next_legal*self.q_network.predict(next_states)
        next_action = tf.math.argmax(tf.where(next_values != 0,next_values,-np.inf), 1)
        future_rewards = self.t_q_network.predict(next_states)
        mask_next_action = tf.one_hot(next_action, self.n_actions)
        # Q value = reward + discount factor * expected future reward
        updated_q_values = rewards + self.gamma * tf.reduce_sum(tf.multiply(future_rewards, mask_next_action), axis=1)
        
        # set last q value to 0
        updated_q_values = updated_q_values*(1 - dones)
        masks = tf.one_hot(actions, self.n_actions)

        with tf.GradientTape() as tape:
          # Train the model on the states and updated Q-values
          q_values = self.q_network(states)
          # only update q-value which is chosen
          q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
          # calculate loss between new Q-value and old Q-value
          loss = tf.reduce_mean(importance * tf.math.square(q_action - updated_q_values))
        
        # set priorities
        errors = updated_q_values - q_action
        self.buffer.set_priorities(indices, errors)
        
        # Backpropagation
        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))

    def update_target_network(self):
        # update per update_target_network steps
        self.t_q_network.set_weights(self.q_network.get_weights())


In [5]:
n_states = 9
n_actions = 20
n_encder_lyer = 2
d_model = 16
n_heads = 2
dff = 32
dmlp = 32
warm_up_time = 10080
update_per_actions = 4
max_steps_per_episode = 1000
update_target_network = 1000
agent = DQN_agent(n_states, n_actions, d_model, n_encder_lyer, n_heads, dff, n_actions, dmlp)
env = Factory(warm_up_time)

In [6]:
episode = 0
total_episode = 800
while True:
    episode += 1
    state, legal = env.reset()
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        agent.frame_count += 1
        # choose action
        action = agent.choose_action(state, legal)
        # decay prob of exploration
        agent.decay_epsilon()

        next_state, reward, done, next_legal, inf = env.step(action)

        episode_reward += reward
        # store training data
        agent.store(
            state, action, next_state, reward, done, next_legal
            )

        state = next_state
        legal = next_legal
        
        if agent.frame_count % update_per_actions == 0 and len(agent.buffer.buffer) >= agent.batch_size:
            agent.train_q_network()

        if agent.frame_count % update_target_network == 0:
            agent.update_target_network()

        if done:
            tardy_job_percentage = inf
            break
        
    writer.add_scalar('Reward',episode_reward, episode)
    writer.add_scalar('Tardy jobs', tardy_job_percentage,  episode)
    writer.add_scalar('Epsilon', agent.epsilon,  episode)

    if episode >= total_episode:
        break
            

In [15]:
agent.q_network.save_weights('transformer')

In [8]:
replication = 100
random_seeds = random.sample(range(1,10000),replication)
print(random_seeds)

[4155, 1126, 8704, 5692, 901, 9806, 261, 7851, 9011, 9638, 8684, 9074, 2945, 4670, 8522, 68, 1381, 7087, 6076, 1854, 1955, 5046, 2565, 8639, 9165, 2359, 2644, 7725, 5971, 435, 2549, 6884, 9634, 3575, 3775, 5175, 4986, 3054, 1484, 761, 8601, 8521, 7777, 8593, 3020, 2517, 4991, 9315, 8760, 5517, 7140, 8934, 1206, 1857, 4623, 2912, 779, 6524, 5637, 4515, 9949, 2702, 8901, 4041, 9380, 5320, 4046, 3247, 5847, 5381, 5898, 907, 2770, 4535, 7502, 2481, 245, 136, 9101, 5570, 3291, 4259, 721, 8461, 2840, 9882, 5064, 9102, 1565, 4801, 421, 3966, 960, 5149, 3840, 1905, 3234, 1923, 2585, 7370]


In [9]:
performance = np.zeros((replication,9))
agent.epsilon = 0
for i in range(replication):
    performance[i,0] = random_seeds[i]
    np.random.seed(random_seeds[i])
    state, legal = env.reset()
    while True:
        action = agent.choose_action(state, legal)

        next_state, reward, done, next_legal, inf = env.step(action)

        state = next_state
        legal = next_legal

        if done:
            performance[i,-1] = inf
            break

In [10]:
UPM = sim_Factory()
for i in range(replication):
    np.random.seed(random_seeds[i])
    for j in range(7):
        UPM.build(j)
        UPM.warm_up(warm_up_time)
        UPM.env.run(until = UPM.terminal)
        performance[i,j+1] = np.sum(UPM.sink.number_of_late)/UPM.sink.input

In [11]:
pd.DataFrame(performance,columns=['Random seed','SPT','EDD','MST','ST','CR','WSPT','FIFO','DQN']).to_csv('experiment.csv')

In [12]:
def mean_confidence_interval(a, confidence=0.95):
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    std = np.std(a, ddof = 1)
    return  m-h, m, m+h, std

In [13]:
performance_str = ['SPT','EDD','MST','ST','CR','WSPT','FIFO','DQN']
performance_trans = performance.transpose()
statistics = pd.DataFrame(columns = ['95%CI LOWER','MEAN','95%CI UPPER','STD'])
for i in range(len(performance_trans)-1):
    l, m, u, se = mean_confidence_interval(performance_trans[i+1], confidence=0.95)
    statistics_row=pd.DataFrame([[l, m, u, se]],columns=['95%CI LOWER','MEAN','95%CI UPPER','STD']
                                ,index = [performance_str[i]])
    statistics=statistics.append(statistics_row)
statistics

Unnamed: 0,95%CI LOWER,MEAN,95%CI UPPER,STD
SPT,0.288717,0.3204,0.352083,0.159677
EDD,0.302248,0.3389,0.375552,0.18472
MST,0.283407,0.31395,0.344493,0.15393
ST,0.268978,0.2974,0.325822,0.143243
CR,0.304809,0.33655,0.368291,0.159968
WSPT,0.286247,0.3148,0.343353,0.143903
FIFO,0.28381,0.3188,0.35379,0.176341
DQN,0.227857,0.24075,0.253643,0.06498
