In [None]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

In [None]:
# Number of training episodes the main loop iterates over.
EPISODES = 3000
STEPS = 1000  # Per-episode step cap; not needed when running inside an OpenAI Gym environment (not referenced in the code shown here).

In [None]:
class DQN:
    """Deep Q-Network agent with experience replay and a target network.

    The agent keeps two identically shaped Keras models: ``model`` is
    trained on every call to :meth:`train_model`, while ``target_model``
    supplies the bootstrapped next-state values and is only refreshed
    when :meth:`update_target_model` is called.
    """

    def __init__(self, state_size, action_size):
        """Create the agent, its replay memory and both networks.

        Args:
            state_size: Number of features in one state vector (network input width).
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Hyper-parameters for the DQN.
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0           # current exploration probability
        self.epsilon_decay = 0.999   # multiplicative decay applied per stored sample
        self.epsilon_min = 0.01      # floor for epsilon
        self.batch_size = 64
        self.train_start = 1000      # minimum number of stored samples before training
        self.memory = deque(maxlen=2000)  # replay memory; oldest samples are evicted first

        # Main (trained) model and target model.
        self.model = self.build_model()
        self.target_model = self.build_model()

        # Start with both networks holding identical weights.
        self.update_target_model()

    def build_model(self):
        """Build and compile the network that approximates the Q function.

        The input is a state vector and the output is one Q-value per action.

        Returns:
            A compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        model.summary()
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # confirm against the installed Keras version before changing it.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the weights of the trained model into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def epsilon_greedy(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: A batch containing a single state; only the first row of
                the model's prediction is used.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        q_value = self.model.predict(state, verbose=0)
        return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        """Store one <s, a, r, s', done> transition and decay epsilon.

        Epsilon is multiplied by ``epsilon_decay`` on every stored sample and
        is never allowed to fall below ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def train_model(self):
        """Run one training step on a random mini-batch from replay memory.

        Does nothing until at least ``train_start`` samples are stored. The
        batch is capped at the number of stored samples, so the method is
        safe even when ``train_start`` is smaller than ``batch_size``.
        """
        if len(self.memory) < self.train_start:
            return

        # Sample without replacement; never ask for more than is stored.
        batch_size = min(self.batch_size, len(self.memory))
        if batch_size == 0:
            return
        mini_batch = random.sample(self.memory, batch_size)

        states = np.zeros((batch_size, self.state_size))
        next_states = np.zeros((batch_size, self.state_size))
        actions = np.zeros(batch_size, dtype=int)
        rewards = np.zeros(batch_size)
        dones = np.zeros(batch_size, dtype=bool)

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            states[i] = state
            actions[i] = action
            rewards[i] = reward
            next_states[i] = next_state
            dones[i] = done

        # Current Q estimates (trained net) and next-state values (target net).
        target = self.model.predict(states, verbose=0)
        next_q = self.target_model.predict(next_states, verbose=0)

        # Q-learning target: r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise.
        # Only the entry of the action actually taken is overwritten.
        bootstrapped = rewards + self.discount_factor * np.amax(next_q, axis=1)
        target[np.arange(batch_size), actions] = np.where(dones, rewards, bootstrapped)

        self.model.fit(states, target, batch_size=batch_size,
                       epochs=1, verbose=0)


if __name__ == "__main__":
    # Training script: run the DQN agent on CartPole-v1, plotting the score
    # per episode and periodically saving the model weights and the plot.
    import os  # local import: only this script section touches the filesystem

    model_path = "./save_model/cartpole_dqn.h5"
    graph_path = "./save_graph/cartpole_dqn.jpg"
    # Make sure the output directories exist so the first save cannot fail.
    for out_path in (model_path, graph_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Get environment.
    env = gym.make('CartPole-v1')  # BeerGame()

    # Size of state and action from environment.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQN(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        # This code relies on reset() returning the observation only and
        # step() returning a 4-tuple (observation, reward, done, info).
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            env.render()

            # Select an action and take one step.
            action = agent.epsilon_greedy(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # An action that ends the episode is penalised with -100, unless
            # the running score is already 499 (this step brings it to 500).
            reward = reward if not done or score == 499 else -100

            agent.append_sample(state, action, reward, next_state, done)  # replay memory
            agent.train_model()  # train

            score += reward
            state = next_state

            if done:
                # Refresh the target model once per episode.
                agent.update_target_model()

                # Undo the -100 penalty so the logged score reflects the steps survived.
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # Stop criterion: mean score of the last (up to) 10 episodes above 490.
                if np.mean(scores[-10:]) > 490:
                    # Persist the final weights and plot before exiting; otherwise
                    # everything since the last periodic save would be lost.
                    agent.model.save_weights(model_path)
                    pylab.savefig(graph_path)
                    env.close()
                    sys.exit()

        # Save model files every 50 episodes.
        if e % 50 == 0:
            agent.model.save_weights(model_path)
            pylab.savefig(graph_path)

    env.close()