In [15]:
# Reference tutorial:
# https://www.analyticsvidhya.com/blog/2017/01/introduction-to-reinforcement-learning-implementation/

# One-time setup (shell commands, run outside the notebook):
#   git clone https://github.com/matthiasplappert/keras-rl.git
#   cd keras-rl
#   python setup.py install
#
#   pip install gym
#   pip install tensorflow


In [5]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [6]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # single seed shared by every RNG seeded in this cell

# Create the environment, seed NumPy's global RNG and the environment with the
# same value, and record how many discrete actions the agent can choose from.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

In [7]:
# Show the size of the discrete action space and the observation space.
print(env.action_space.n)
print(env.observation_space)  # Box(4): cart position, cart velocity, pole angle, pole angular velocity (per the Gym CartPole docs)


2
Box(4,)


In [8]:
# Q-network: flatten the (1,) + observation-shape input, pass it through one
# 16-unit ReLU hidden layer, and emit one linear output per action.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the table.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# Exploration policy and replay memory handed to the agent below.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(window_length=1, limit=50000)

# Wire the Q-network, policy and memory into a DQN agent, then compile it
# with an Adam optimizer and mean-absolute-error as the tracked metric.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 steps. visualize=True is kept for show; it slows training
# down quite a lot.
dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

Training for 5000 steps ...
Instructions for updating:
Use tf.cast instead.




   79/5000: episode: 1, duration: 3.009s, episode steps: 79, steps per second: 26, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.429106, mean_absolute_error: 0.496745, mean_q: 0.052343
  113/5000: episode: 2, duration: 0.517s, episode steps: 34, steps per second: 66, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.355710, mean_absolute_error: 0.448637, mean_q: 0.190170
  163/5000: episode: 3, duration: 0.832s, episode steps: 50, steps per second: 60, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.295, 0.778], loss: 0.315021, mean_absolute_error: 0.467283, mean_q: 0.322757
  196/5000: episode: 4, duration: 0.549s, episode steps: 33, steps per second: 60, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action:

  667/5000: episode: 31, duration: 0.216s, episode steps: 13, steps per second: 60, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.091 [-1.016, 1.693], loss: 0.480848, mean_absolute_error: 2.233629, mean_q: 4.208690
  678/5000: episode: 32, duration: 0.183s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.095 [-1.030, 1.750], loss: 0.402849, mean_absolute_error: 2.266804, mean_q: 4.336007
  688/5000: episode: 33, duration: 0.165s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.132 [-1.348, 2.173], loss: 0.504009, mean_absolute_error: 2.319485, mean_q: 4.431838
  698/5000: episode: 34, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean act

  988/5000: episode: 61, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.100 [-0.994, 1.596], loss: 0.847301, mean_absolute_error: 3.457591, mean_q: 6.525866
 1000/5000: episode: 62, duration: 0.199s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.110 [-1.175, 2.006], loss: 0.914532, mean_absolute_error: 3.495514, mean_q: 6.673672
 1012/5000: episode: 63, duration: 0.199s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.115 [-1.325, 2.013], loss: 0.990485, mean_absolute_error: 3.560205, mean_q: 6.722445
 1024/5000: episode: 64, duration: 0.200s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean act

 1529/5000: episode: 90, duration: 1.100s, episode steps: 66, steps per second: 60, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.068 [-0.350, 0.781], loss: 1.153327, mean_absolute_error: 4.681675, mean_q: 8.767479
 1591/5000: episode: 91, duration: 1.033s, episode steps: 62, steps per second: 60, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.109 [-0.839, 0.513], loss: 1.338410, mean_absolute_error: 4.873975, mean_q: 9.127050
 1629/5000: episode: 92, duration: 0.633s, episode steps: 38, steps per second: 60, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.113 [-0.838, 0.310], loss: 1.438148, mean_absolute_error: 4.971040, mean_q: 9.299198
 1691/5000: episode: 93, duration: 1.032s, episode steps: 62, steps per second: 60, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean a

 2192/5000: episode: 119, duration: 0.132s, episode steps: 8, steps per second: 61, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.160 [-2.576, 1.578], loss: 4.131147, mean_absolute_error: 6.723415, mean_q: 12.541424
 2200/5000: episode: 120, duration: 0.133s, episode steps: 8, steps per second: 60, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.147 [-2.572, 1.612], loss: 1.688442, mean_absolute_error: 6.678586, mean_q: 12.726779
 2209/5000: episode: 121, duration: 0.149s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.173 [-2.848, 1.726], loss: 2.062706, mean_absolute_error: 6.766495, mean_q: 12.947590
 2219/5000: episode: 122, duration: 0.165s, episode steps: 10, steps per second: 61, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean

 3124/5000: episode: 148, duration: 0.717s, episode steps: 43, steps per second: 60, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.047 [-1.010, 0.588], loss: 3.060229, mean_absolute_error: 8.351409, mean_q: 15.931489
 3152/5000: episode: 149, duration: 0.466s, episode steps: 28, steps per second: 60, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.098 [-0.745, 0.276], loss: 3.326830, mean_absolute_error: 8.458238, mean_q: 16.168598
 3200/5000: episode: 150, duration: 0.799s, episode steps: 48, steps per second: 60, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: -0.139 [-0.775, 0.232], loss: 3.285151, mean_absolute_error: 8.535563, mean_q: 16.286034
 3266/5000: episode: 151, duration: 1.099s, episode steps: 66, steps per second: 60, episode reward: 66.000, mean reward: 1.000 [1.000, 1.000]

<keras.callbacks.History at 0x1319ad3c8>

In [11]:
# Report the epsilon value held by the epsilon-greedy policy object.
print("e-Greedy epsilon=", policy.eps, sep=" ")

e-Greedy epsilon= 0.1


In [12]:
# Run the trained agent for 25 test episodes with visualization enabled.
# The call's return value is the cell's displayed result.
dqn.test(env, nb_episodes=25, visualize=True)

Testing for 25 episodes ...
Episode 1: reward: 68.000, steps: 68
Episode 2: reward: 66.000, steps: 66
Episode 3: reward: 140.000, steps: 140
Episode 4: reward: 147.000, steps: 147
Episode 5: reward: 60.000, steps: 60
Episode 6: reward: 76.000, steps: 76
Episode 7: reward: 69.000, steps: 69
Episode 8: reward: 58.000, steps: 58
Episode 9: reward: 60.000, steps: 60
Episode 10: reward: 79.000, steps: 79
Episode 11: reward: 98.000, steps: 98
Episode 12: reward: 64.000, steps: 64
Episode 13: reward: 74.000, steps: 74
Episode 14: reward: 114.000, steps: 114
Episode 15: reward: 80.000, steps: 80
Episode 16: reward: 90.000, steps: 90
Episode 17: reward: 71.000, steps: 71
Episode 18: reward: 58.000, steps: 58
Episode 19: reward: 71.000, steps: 71
Episode 20: reward: 67.000, steps: 67
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 76.000, steps: 76
Episode 23: reward: 82.000, steps: 82
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 66.000, steps: 66


<keras.callbacks.History at 0x1319400b8>

In [35]:
# Exploratory: list every attribute and method name on the agent object.
# NOTE(review): introspection scratch cell; consider removing it from the final notebook.
dir(dqn)

['_DQNAgent__policy',
 '_DQNAgent__test_policy',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_on_test_begin',
 '_on_test_end',
 '_on_train_begin',
 '_on_train_end',
 'backward',
 'batch_size',
 'compile',
 'compiled',
 'compute_batch_q_values',
 'compute_q_values',
 'custom_model_objects',
 'delta_clip',
 'dueling_type',
 'enable_double_dqn',
 'enable_dueling_network',
 'fit',
 'forward',
 'gamma',
 'get_config',
 'layers',
 'load_weights',
 'memory',
 'memory_interval',
 'metrics_names',
 'model',
 'nb_actions',
 'nb_steps_warmup',
 'policy',
 'process_state_batch',
 'processor',
 'recent_action',
 'recent_observation',
 'reset_states',
 'save_weights',
 'step',


In [24]:
# Display the agent's batch size. It was not passed when the agent was
# constructed, so this is presumably the library default (TODO confirm).
dqn.batch_size

32

In [15]:
# Exploratory: list every attribute and method name on the agent's Keras model.
# NOTE(review): introspection scratch cell; consider removing it from the final notebook.
dir(dqn.model)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_inbound_node',
 '_base_init',
 '_build_input_shape',
 '_built',
 '_check_trainable_weights_consistency',
 '_collected_trainable_weights',
 '_compute_previous_mask',
 '_expects_training_arg',
 '_feed_input_names',
 '_feed_input_shapes',
 '_feed_inputs',
 '_feed_loss_fns',
 '_feed_output_names',
 '_feed_output_shapes',
 '_feed_outputs',
 '_feed_sample_weight_modes',
 '_feed_sample_weights',
 '_feed_targets',
 '_function_kwargs',
 '_get_node_attribute_at_index',
 '_inbound_nodes',
 '_init_graph_network',
 '_init_subclassed_network',
 '_initial_weights',
 '_input_coordinate

In [33]:
# Exploratory: list the attributes of the `summary` method object itself.
# NOTE(review): this inspects the method, not the summary it produces;
# call dqn.model.summary() to see the layer table.
dir(dqn.model.summary)

['__call__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__func__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__self__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [20]:
# Exploratory: list the attributes of the model's `outputs` container.
# NOTE(review): introspection scratch cell; consider removing it from the final notebook.
dir(dqn.model.outputs)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [31]:
# Evaluates to the `count` method of the outputs list without calling it, so
# the cell displays the method object rather than a number.
# NOTE(review): if the number of output tensors was intended, use
# len(dqn.model.outputs) instead (confirm intent).
dqn.model.outputs.count

<function list.count(value, /)>