In [9]:
#Imports
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.optimizers import Adam, Adamax, SGD, RMSprop

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory



In [10]:
ENV_NAME = 'LunarLander-v2'
WINDOW_LENGTH = 1 #we only look ahead by 1 state.
record_video_every = 100

env = gym.make(ENV_NAME)

nb_actions = env.action_space.n


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [11]:
#build a model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(32, kernel_initializer='lecun_uniform', activation='relu'))
model.add(Dense(32, kernel_initializer='lecun_uniform', activation='relu'))
model.add(Dense(32, kernel_initializer='lecun_uniform', activation='relu'))
#model.add(LeakyReLU(alpha=0.3))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

In [12]:
memory = SequentialMemory(
    limit=1000000,                 #Remember previous 1 million states
    window_length=WINDOW_LENGTH)   #Look only 1 state ahead


In [13]:
#train first with eploration, then expoitation.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),#train starting with low esp, to high esp.
                              attr='eps',        #ammeal this attr
                              value_max=0.1,       #starting_eps
                              value_min=.001,      #ending:eps
                              value_test=.0001,  #test_mode:eps
                              nb_steps=100000)   #take 1 million steps to slowly alter epsilon.

In [14]:
dqn = DQNAgent(model=model, 
               nb_actions=nb_actions, 
               memory=memory, 
               nb_steps_warmup=1000,     #Run this many before anealing (learn to fall)
               target_model_update=1000, #update model/adjust weights? every 1000 steps?
               policy=policy)

dqn.compile(Adam(lr=.1e-3), metrics=['mse'])

In [None]:
#first fit
dqn.fit(env, 
        nb_steps=100000, 
        verbose=0
       )

In [None]:
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

In [8]:
dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))

#Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=200, visualize=True)

Testing for 200 episodes ...
Episode 1: reward: -164.321, steps: 295
Episode 2: reward: -46.669, steps: 305
Episode 3: reward: -254.808, steps: 335
Episode 4: reward: -166.231, steps: 329
Episode 5: reward: -319.106, steps: 388
Episode 6: reward: -164.950, steps: 202
Episode 7: reward: -242.380, steps: 255
Episode 8: reward: -143.327, steps: 254
Episode 9: reward: -251.808, steps: 360
Episode 10: reward: -263.444, steps: 302
Episode 11: reward: -130.936, steps: 257
Episode 12: reward: -95.916, steps: 288
Episode 13: reward: -168.654, steps: 247
Episode 14: reward: -90.556, steps: 195
Episode 15: reward: -220.904, steps: 410
Episode 16: reward: -194.780, steps: 206
Episode 17: reward: -316.824, steps: 226
Episode 18: reward: -62.211, steps: 281
Episode 19: reward: -351.617, steps: 256
Episode 20: reward: -26.460, steps: 303
Episode 21: reward: -325.109, steps: 298
Episode 22: reward: -345.710, steps: 487
Episode 23: reward: 156.426, steps: 552
Episode 24: reward: 112.563, steps: 654
Epi

Episode 199: reward: -281.417, steps: 315
Episode 200: reward: -269.991, steps: 495


<keras.callbacks.History at 0x7f48989aee48>