In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
# Upper bound on the number of training episodes run by the training loop.
EPISODES = 5000

Using TensorFlow backend.


In [2]:
class DQNAgent:
    """Q-learning agent backed by a small fully connected Keras network.

    The agent keeps two containers:

    * ``memory`` - the replay memory of ``(state, action, reward, next_state,
      done)`` transitions, filled through :meth:`append_sample`.
    * ``train_buffer`` - a staging buffer that the caller fills with
      transitions; :meth:`train_model` only fits the network once this buffer
      has reached its ``maxlen`` and empties it afterwards.

    A single network is used both for the current Q estimates and for the
    bootstrap values of the next states (there is no separate target network).
    """

    def __init__(self, state_size, action_size):
        """Store the problem dimensions, set hyperparameters and build the network.

        Args:
            state_size: number of input features of the network.
            action_size: number of discrete actions (network outputs).
        """
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Q-learning / optimizer hyperparameters
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        # epsilon-greedy exploration; this class only reads epsilon, it never
        # decays it itself (epsilon_decay / epsilon_min are left to the caller)
        self.epsilon = 1.0
        self.epsilon_decay = 0.997
        self.epsilon_min = 0.1005
        # sizes exposed to the caller that drives sampling from the memory
        self.sample_size = 32
        self.train_start = 20000
        # mini-batch size handed to model.fit() inside train_model
        self.fit_batch_size = 100
        # create replay memory using deque
        self.memory = deque(maxlen=200000)
        self.train_buffer = deque(maxlen=10240)
        # create main model
        self.model = self.build_model()

    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model with three hidden ReLU layers of
            32 units and a linear output of ``action_size`` units, trained
            with MSE loss and Adam at ``self.learning_rate``.
        """
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=optimizers.Adam(lr=self.learning_rate))
        return model

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        """Pick an action with the epsilon-greedy rule.

        Args:
            state: input passed unchanged to ``self.model.predict``; only the
                first row of the prediction is used.

        Returns:
            A uniformly random action index with probability ``epsilon``,
            otherwise the index of the largest predicted Q value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    # fit the network on the staged transitions once the buffer is full
    def train_model(self):
        """Run one fit over ``train_buffer`` when it is full, then clear it.

        Does nothing while the buffer holds fewer than ``maxlen`` transitions.
        For every staged transition the target of the taken action becomes
        ``reward`` if the transition is terminal, otherwise
        ``reward + discount_factor * max_a Q(next_state, a)``; the targets of
        the other actions stay at the network's current predictions.
        """
        batch_size = self.train_buffer.maxlen
        # do training only if buffer is full
        if len(self.train_buffer) != batch_size:
            return

        states = np.zeros((batch_size, self.state_size))
        next_states = np.zeros((batch_size, self.state_size))
        actions, rewards, dones = [], [], []

        # Walk the deque once: positional indexing into a deque costs O(n)
        # away from its ends, so indexing inside the loop would be quadratic.
        for row, (state, action, reward, next_state, done) in enumerate(self.train_buffer):
            states[row] = state
            next_states[row] = next_state
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

        target = self.model.predict(states)
        target_val = self.model.predict(next_states)

        # Q Learning: bootstrap from the maximum Q value at s', predicted by
        # the same network; terminal transitions get no bootstrap term.
        best_next = np.amax(target_val, axis=1).astype(np.float64)
        bootstrap = np.where(np.asarray(dones, dtype=bool),
                             0.0,
                             self.discount_factor * best_next)
        rows = np.arange(batch_size)
        taken = np.asarray(actions, dtype=np.intp)
        target[rows, taken] = np.asarray(rewards, dtype=np.float64) + bootstrap

        self.model.fit(states, target, batch_size=self.fit_batch_size,
                       epochs=1, verbose=0)
        self.train_buffer.clear()

In [3]:
# Create the environment and an agent sized to its observation/action spaces.
env = gym.make('LunarLander-v2')
state_size, action_size = env.observation_space.shape[0], env.action_space.n
agent = DQNAgent(state_size, action_size)
scores, episodes = [], []
# Steps counted since the last draw from the replay memory; it only advances
# once the memory holds more than agent.train_start transitions.
action_count = 0

for e in range(EPISODES):
    score = 0
    state = np.reshape(env.reset(), [1, state_size])
    done = False

    while not done:
        # act, observe, and remember the transition
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        agent.append_sample(state, action, reward, next_state, done)

        # every 4th step after warm-up, stage a random draw for training
        if len(agent.memory) > agent.train_start:
            action_count = (action_count + 1) % 4
            if action_count == 0:
                agent.train_buffer.extend(
                    random.sample(agent.memory, agent.sample_size))

        # no-op unless the staging buffer has just become full
        agent.train_model()
        score += reward
        state = next_state

    # The inner loop only exits on a finished episode: decay exploration
    # and record the episode result.
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay
    scores.append(score)
    episodes.append(e)
    print("episode:", e, " score:", score, " epsilon:", agent.epsilon," memory size:", len(agent.memory))


[2017-10-16 22:17:41,954] Making new env: LunarLander-v2


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 132       
Total params: 2,532
Trainable params: 2,532
Non-trainable params: 0
_________________________________________________________________
episode: 0  score: -326.163917972  epsilon: 0.997  memory size: 96
episode: 1  score: -104.202354129  epsilon: 0.994009  memory size: 156
episode: 2  score: -175.160883645  epsilon: 0.991026973  memory size: 244
episode: 3  score: -393.111129745  epsilon: 0.988053892081  

episode: 94  score: -442.582442649  epsilon: 0.7516921927868743  memory size: 8308
episode: 95  score: -265.129411166  epsilon: 0.7494371162085136  memory size: 8366
episode: 96  score: -342.827390084  epsilon: 0.747188804859888  memory size: 8425
episode: 97  score: -485.949476462  epsilon: 0.7449472384453084  memory size: 8515
episode: 98  score: -323.905149349  epsilon: 0.7427123967299725  memory size: 8575
episode: 99  score: -173.232334296  epsilon: 0.7404842595397826  memory size: 8636
episode: 100  score: -582.377643375  epsilon: 0.7382628067611632  memory size: 8724
episode: 101  score: -240.053477117  epsilon: 0.7360480183408797  memory size: 8801
episode: 102  score: -319.539199495  epsilon: 0.7338398742858571  memory size: 8859
episode: 103  score: -374.511695081  epsilon: 0.7316383546629995  memory size: 8958
episode: 104  score: -327.598033735  epsilon: 0.7294434395990105  memory size: 9026
episode: 105  score: -240.09345367  epsilon: 0.7272551092802134  memory size: 9092


episode: 196  score: -685.910578339  epsilon: 0.5532815489030894  memory size: 16092
episode: 197  score: -455.434212415  epsilon: 0.5516217042563801  memory size: 16150
episode: 198  score: -328.298717427  epsilon: 0.549966839143611  memory size: 16211
episode: 199  score: -531.945306051  epsilon: 0.5483169386261801  memory size: 16308
episode: 200  score: -581.302030887  epsilon: 0.5466719878103016  memory size: 16386
episode: 201  score: -172.032257072  epsilon: 0.5450319718468707  memory size: 16446
episode: 202  score: -559.542594785  epsilon: 0.5433968759313301  memory size: 16522
episode: 203  score: -409.475137071  epsilon: 0.5417666853035361  memory size: 16595
episode: 204  score: -640.289229448  epsilon: 0.5401413852476254  memory size: 16682
episode: 205  score: -359.593059965  epsilon: 0.5385209610918825  memory size: 16754
episode: 206  score: -526.803837655  epsilon: 0.5369053982086068  memory size: 16853
episode: 207  score: -528.704072546  epsilon: 0.535294682013981  m

episode: 295  score: -364.121154308  epsilon: 0.41092906525228523  memory size: 25856
episode: 296  score: -187.107636548  epsilon: 0.40969627805652836  memory size: 26017
episode: 297  score: -163.216339853  epsilon: 0.40846718922235875  memory size: 26104
episode: 298  score: -267.852959745  epsilon: 0.40724178765469166  memory size: 26402
episode: 299  score: -170.804104719  epsilon: 0.4060200622917276  memory size: 26576
episode: 300  score: -452.449099399  epsilon: 0.4048020021048524  memory size: 26875
episode: 301  score: -216.762513154  epsilon: 0.40358759609853784  memory size: 26965
episode: 302  score: -204.77058676  epsilon: 0.4023768333102422  memory size: 27322
episode: 303  score: -360.512394568  epsilon: 0.4011697028103115  memory size: 27523
episode: 304  score: -256.422993427  epsilon: 0.3999661937018806  memory size: 27684
episode: 305  score: -239.960496474  epsilon: 0.39876629512077494  memory size: 27905
episode: 306  score: -127.239544397  epsilon: 0.397569996235

episode: 392  score: -131.592241284  epsilon: 0.30704159714804574  memory size: 47001
episode: 393  score: -62.9067878048  epsilon: 0.3061204723566016  memory size: 47272
episode: 394  score: -95.4828691524  epsilon: 0.3052021109395318  memory size: 47506
episode: 395  score: -247.443749009  epsilon: 0.3042865046067132  memory size: 47687
episode: 396  score: -116.283341661  epsilon: 0.30337364509289305  memory size: 48272
episode: 397  score: -126.750085429  epsilon: 0.30246352415761435  memory size: 48492
episode: 398  score: -152.588199204  epsilon: 0.3015561335851415  memory size: 48952
episode: 399  score: -155.443244875  epsilon: 0.30065146518438607  memory size: 49270
episode: 400  score: -385.292460359  epsilon: 0.2997495107888329  memory size: 49457
episode: 401  score: -284.601266074  epsilon: 0.2988502622564664  memory size: 49645
episode: 402  score: -329.322826655  epsilon: 0.29795371146969696  memory size: 49952
episode: 403  score: -137.161546162  epsilon: 0.297059850335

episode: 488  score: -478.954799365  epsilon: 0.2301083691226874  memory size: 73061
episode: 489  score: -112.99686204  epsilon: 0.22941804401531934  memory size: 73309
episode: 490  score: -360.808689391  epsilon: 0.2287297898832734  memory size: 73972
episode: 491  score: -201.368234154  epsilon: 0.22804360051362357  memory size: 74665
episode: 492  score: -231.783658258  epsilon: 0.2273594697120827  memory size: 74891
episode: 493  score: -257.807367247  epsilon: 0.22667739130294645  memory size: 75124
episode: 494  score: 6.00927783787  epsilon: 0.22599735912903762  memory size: 76124
episode: 495  score: -107.9886623  epsilon: 0.2253193670516505  memory size: 77124
episode: 496  score: -289.982249437  epsilon: 0.22464340895049556  memory size: 77287
episode: 497  score: -27.4970064546  epsilon: 0.22396947872364406  memory size: 77413
episode: 498  score: -422.089488963  epsilon: 0.22329757028747313  memory size: 77599
episode: 499  score: -99.5732964489  epsilon: 0.22262767757661

KeyboardInterrupt: 

In [None]:
# Moving average of the episode scores over every full 100-episode window.
# The last full window starts at len(scores) - window, so the range must
# include that index (hence the + 1); with fewer than `window` scores the
# range is empty and res stays an empty list.
window = 100
res = [np.mean(scores[i:i + window]) for i in range(len(scores) - window + 1)]

In [None]:
# Show how many moving-average points were computed.
len(res)

In [None]:
import matplotlib.pyplot as plt

# Plot the 100-episode moving average on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(res)
ax.set_ylabel('Average Score of Consecutive 100 Episodes')
plt.show()

In [None]:
# Plot the raw per-episode scores on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_ylabel('Score per Episode')
plt.show()

In [None]:
# Copy the moving averages into a NumPy array (no cell visible here reads `tmp`).
tmp = np.array(res)

In [None]:
agent.model.save('DQN_model.h5')  # writes the agent's model to the HDF5 file 'DQN_model.h5'