In [1]:
import copy
import pylab
import random
import numpy as np
from environment import Env
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential



Using TensorFlow backend.


In [2]:
# Number of episodes the driver loop iterates over.
EPISODES = 20


# DQN agent for the maze environment (`Env`).
# It uses a neural network to approximate the Q function,
# together with a replay memory and a separate target Q network.

class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    The online network (``self.model``) is trained on mini-batches sampled
    from the replay memory.  Bootstrap targets are computed with the
    separate ``self.target_model``; callers are expected to invoke
    ``update_target_model()`` periodically (e.g. at the end of an episode)
    so that the target network follows the online network.
    """

    def __init__(self):
        self.render = False

        # actions which the agent can take
        self.action_space = [0, 1, 2, 3]
        # sizes of the action set and of the state vector
        self.action_size = len(self.action_space)
        self.state_size = 22
        self.discount_factor = 0.99
        self.learning_rate = 0.001

        self.epsilon = 1.  # exploration rate, decayed on every stored sample
        self.epsilon_decay = .9999
        self.epsilon_min = 0.01
        self.batch_size = 32
        # minimum number of stored samples before training starts
        self.train_start = 100

        # create replay memory using deque
        self.memory = deque(maxlen=10000)
        self.model = self.build_model()
        self.target_model = self.build_model()
        # start with the target network identical to the online network
        self.update_target_model()

    def build_model(self):
        """Build and compile the Q-network.

        The network takes a state vector of length ``state_size`` and outputs
        one Q-value per action.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: state in the form accepted by ``self.model.predict``
                (the driver cell reshapes it to ``[1, state_size]``).

        Returns:
            int: index of the selected action.
        """
        if np.random.rand() <= self.epsilon:
            # explore: the agent acts randomly
            return random.randrange(self.action_size)
        # exploit: pick the action with the highest predicted Q-value
        q_values = self.model.predict(np.float32(state))
        # cast so both branches return a plain Python int
        return int(np.argmax(q_values[0]))

    def replay_memory(self, state, action, reward, next_state, done):
        """Store the sample <s, a, r, s', done> and decay epsilon.

        Epsilon is multiplied by ``epsilon_decay`` on every call while it is
        still above ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_replay(self):
        """Fit the online network on a random mini-batch from replay memory.

        Does nothing until at least ``train_start`` samples are stored.  For
        each sampled transition only the Q-value of the action actually taken
        is replaced by the Q-learning target; the other actions keep the
        network's current prediction so they contribute zero error.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        if batch_size == 0:
            # nothing to learn from (only reachable when train_start <= 0)
            return
        mini_batch = random.sample(self.memory, batch_size)

        # Flatten every stored state to (state_size,) so the whole batch can
        # be evaluated with a single predict() call per network.
        states = np.array(
            [np.reshape(sample[0], self.state_size) for sample in mini_batch],
            dtype=np.float32)
        next_states = np.array(
            [np.reshape(sample[3], self.state_size) for sample in mini_batch],
            dtype=np.float32)
        actions = np.array([sample[1] for sample in mini_batch], dtype=np.intp)
        rewards = np.array([sample[2] for sample in mini_batch], dtype=np.float32)
        dones = np.array([sample[4] for sample in mini_batch], dtype=bool)

        # Current predictions are the baseline; copy so they can be edited.
        targets = np.array(self.model.predict(states))

        # Like Q-learning, take the maximum Q-value at s', but from the
        # target network.  Terminal transitions get no bootstrap term.
        next_q = self.target_model.predict(next_states)
        bootstrap = self.discount_factor * np.amax(next_q, axis=1)
        rows = np.arange(batch_size)
        # Overwrite only the entry of the action that was taken.
        targets[rows, actions] = rewards + np.where(dones, 0.0, bootstrap)

        # fit the online network towards the updated targets
        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

    def load_model(self, name):
        """Load online-network weights from the file ``name``."""
        self.model.load_weights(name)

    def save_model(self, name):
        """Save the online network's weights to the file ``name``."""
        self.model.save_weights(name)


In [6]:
if __name__ == "__main__":
    # Driver cell for the maze game environment.
    env = Env()
    try:
        agent = DQNAgent()
        global_step = 0
        scores, episodes = [], []
        # NOTE: the loop currently only resets the environment and reshapes
        # the initial state; no actions are taken and no training happens.
        for e in range(EPISODES):
            done = False
            score = 0
            state = env.reset()
            print("state =", state)
            # Use the agent's own state size instead of a hard-coded 22 so
            # the reshape cannot drift from the network's input dimension.
            state = np.reshape(state, [1, agent.state_size])
    finally:
        # Always release the environment, even when the cell is interrupted.
        env.destroy()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 20)                460       
_________________________________________________________________
dense_20 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_21 (Dense)             (None, 4)                 84        
Total params: 964
Trainable params: 964
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 20)                460       
_________________________________________________________________
dense_23 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_24 (De

KeyboardInterrupt: 

In [11]:
# NOTE(review): `size` is a bound method here (the output below shows
# `Misc.grid_size`), so this cell displays the method object rather than a value.
env.size

<bound method Misc.grid_size of <environment.Env object .>>

In [13]:
# Inspect the environment's reward entries; per the output below each entry is a
# dict with 'coords', 'figure', 'reward' and 'state' keys.
env.rewards

[{'coords': [125.0, 375.0], 'figure': 198, 'reward': -1, 'state': [2, 7]},
 {'coords': [175.0, 125.0], 'figure': 199, 'reward': -1, 'state': [3, 2]},
 {'coords': [125.0, 275.0], 'figure': 200, 'reward': -1, 'state': [2, 5]},
 {'coords': [225.0, 475.0], 'figure': 201, 'reward': -1, 'state': [4, 9]},
 {'coords': [275.0, 375.0], 'figure': 202, 'reward': -1, 'state': [5, 7]},
 {'coords': [325.0, 225.0], 'figure': 203, 'reward': -1, 'state': [6, 4]},
 {'coords': [375.0, 425.0], 'figure': 204, 'reward': -1, 'state': [7, 8]},
 {'coords': [425.0, 175.0], 'figure': 205, 'reward': -1, 'state': [8, 3]},
 {'coords': [475.0, 75.0], 'figure': 206, 'reward': -1, 'state': [9, 1]},
 {'coords': [475.0, 475.0], 'figure': 207, 'reward': 5, 'state': [9, 9]}]