# Introduction to Reinforcement Learning

This notebook walks through applying reinforcement learning to different problems, using a deep Q-learning (DQN) agent whose Q-function is approximated by a neural network built with Keras on the TensorFlow backend.

### Section 1: Libraries and Modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import tensorflow as tf
import keras.backend as K

import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings('ignore')

## Cartpole Problem

### Section 2: Defining the Environment

In [3]:
ENV_NAME = 'CartPole-v0'

# Create the CartPole environment, seed NumPy's global RNG and the environment
# with the same fixed value, then record how many discrete actions are available.
# NOTE(review): TensorFlow's own RNG is not seeded here, so network weight
# initialisation may still differ between runs -- confirm whether that matters.
env = gym.make(ENV_NAME)
np.random.seed(seed=123)
env.seed(seed=123)
nb_actions = env.action_space.n

### Section 3: Defining the ANN Architecture

In [4]:
# Q-network for CartPole, declared as one ordered list of layers:
# flattened observation -> Dense(8) + ReLU -> Dense(4) + ReLU -> one linear
# output per action.
# The leading 1 in input_shape adds an extra axis in front of the observation
# shape; presumably it corresponds to window_length=1 of the agent's memory --
# TODO confirm.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(8),
    Activation('relu'),
    Dense(4),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 40        
_________________________________________________________________
activation_1 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 36        
_________________________________________________________________
activation_2 (Activation)    (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 10        
_________________________________________________________________
activation_3 (Activation)    (None, 2)                 0         
Total para

### Section 4: Training the Model using the Q-Learning Algorithm

In [5]:
# Assemble the DQN agent from its parts: an epsilon-greedy action policy, a
# sequential memory holding up to 50000 entries, and the Q-network built above.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(window_length=1, limit=50000)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 environment steps. Rendering is switched off (visualize=False)
# because visualizing the training slows it down quite a lot; verbose=2 prints
# one log line per episode.
dqn.fit(env, nb_steps=5000, verbose=2, visualize=False)

Training for 5000 steps ...
   10/5000: episode: 1, duration: 0.044s, episode steps: 10, steps per second: 227, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.132 [-1.967, 3.014], loss: --, mean_absolute_error: --, mean_q: --
   18/5000: episode: 2, duration: 0.454s, episode steps: 8, steps per second: 18, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.125 [0.000, 1.000], mean observation: 0.172 [-1.323, 2.220], loss: 0.496810, mean_absolute_error: 0.524521, mean_q: 0.138401
   29/5000: episode: 3, duration: 0.016s, episode steps: 11, steps per second: 673, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.150 [-1.711, 2.817], loss: 0.472517, mean_absolute_error: 0.522254, mean_q: 0.188424
   39/5000: episode: 4, duration: 0.016s, episode steps: 10, steps per second: 635, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], m

  410/5000: episode: 43, duration: 0.013s, episode steps: 8, steps per second: 618, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.174 [-1.530, 2.589], loss: 0.583812, mean_absolute_error: 1.042709, mean_q: 2.516412
  420/5000: episode: 44, duration: 0.017s, episode steps: 10, steps per second: 605, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.159 [-1.915, 3.070], loss: 0.445071, mean_absolute_error: 0.972192, mean_q: 2.464103
  429/5000: episode: 45, duration: 0.015s, episode steps: 9, steps per second: 610, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.167 [-1.721, 2.843], loss: 0.497742, mean_absolute_error: 1.029385, mean_q: 2.709285
  438/5000: episode: 46, duration: 0.014s, episode steps: 9, steps per second: 643, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean actio

  808/5000: episode: 83, duration: 0.018s, episode steps: 11, steps per second: 618, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.122 [-1.784, 2.810], loss: 0.261569, mean_absolute_error: 1.908658, mean_q: 4.688330
  817/5000: episode: 84, duration: 0.014s, episode steps: 9, steps per second: 651, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.142 [-1.716, 2.761], loss: 0.290721, mean_absolute_error: 1.954617, mean_q: 4.740277
  826/5000: episode: 85, duration: 0.013s, episode steps: 9, steps per second: 677, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.153 [-1.772, 2.814], loss: 0.236059, mean_absolute_error: 1.965842, mean_q: 4.769562
  835/5000: episode: 86, duration: 0.013s, episode steps: 9, steps per second: 677, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean actio

 1223/5000: episode: 123, duration: 0.019s, episode steps: 12, steps per second: 634, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.079 [-1.209, 1.760], loss: 0.079413, mean_absolute_error: 2.830729, mean_q: 5.604668
 1234/5000: episode: 124, duration: 0.017s, episode steps: 11, steps per second: 666, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.106 [-0.979, 1.598], loss: 0.108560, mean_absolute_error: 2.896635, mean_q: 5.688229
 1245/5000: episode: 125, duration: 0.016s, episode steps: 11, steps per second: 679, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.112 [-1.176, 1.788], loss: 0.099689, mean_absolute_error: 2.894938, mean_q: 5.681004
 1260/5000: episode: 126, duration: 0.021s, episode steps: 15, steps per second: 704, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], 

 1633/5000: episode: 158, duration: 0.014s, episode steps: 9, steps per second: 623, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.146 [-2.813, 1.769], loss: 0.780561, mean_absolute_error: 4.024928, mean_q: 7.621020
 1643/5000: episode: 159, duration: 0.016s, episode steps: 10, steps per second: 631, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.116 [-2.646, 1.786], loss: 1.115473, mean_absolute_error: 4.029023, mean_q: 7.576444
 1651/5000: episode: 160, duration: 0.013s, episode steps: 8, steps per second: 617, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.129 [-2.510, 1.617], loss: 1.172853, mean_absolute_error: 4.180949, mean_q: 7.808504
 1663/5000: episode: 161, duration: 0.018s, episode steps: 12, steps per second: 672, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], m

 2024/5000: episode: 197, duration: 0.019s, episode steps: 11, steps per second: 567, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.909 [0.000, 1.000], mean observation: -0.124 [-2.829, 1.739], loss: 0.826796, mean_absolute_error: 4.616831, mean_q: 8.844901
 2034/5000: episode: 198, duration: 0.016s, episode steps: 10, steps per second: 637, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.141 [-3.065, 1.927], loss: 0.782135, mean_absolute_error: 4.509972, mean_q: 8.657809
 2045/5000: episode: 199, duration: 0.017s, episode steps: 11, steps per second: 667, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.909 [0.000, 1.000], mean observation: -0.104 [-2.733, 1.785], loss: 1.124796, mean_absolute_error: 4.540173, mean_q: 8.621413
 2056/5000: episode: 200, duration: 0.017s, episode steps: 11, steps per second: 645, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000

 2417/5000: episode: 236, duration: 0.017s, episode steps: 10, steps per second: 605, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.144 [-3.053, 1.905], loss: 0.536041, mean_absolute_error: 4.440173, mean_q: 8.516920
 2429/5000: episode: 237, duration: 0.022s, episode steps: 12, steps per second: 555, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.917 [0.000, 1.000], mean observation: -0.101 [-3.025, 1.980], loss: 0.698722, mean_absolute_error: 4.548768, mean_q: 8.700689
 2440/5000: episode: 238, duration: 0.016s, episode steps: 11, steps per second: 670, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.155 [-3.316, 2.104], loss: 0.399125, mean_absolute_error: 4.410175, mean_q: 8.529565
 2450/5000: episode: 239, duration: 0.015s, episode steps: 10, steps per second: 680, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000

 2816/5000: episode: 273, duration: 0.043s, episode steps: 30, steps per second: 703, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.014 [-3.229, 2.288], loss: 0.272215, mean_absolute_error: 3.966243, mean_q: 7.624074
 2834/5000: episode: 274, duration: 0.026s, episode steps: 18, steps per second: 681, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.722 [0.000, 1.000], mean observation: -0.084 [-2.550, 1.539], loss: 0.278259, mean_absolute_error: 3.882715, mean_q: 7.447523
 2845/5000: episode: 275, duration: 0.016s, episode steps: 11, steps per second: 674, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.119 [-3.286, 2.174], loss: 0.292358, mean_absolute_error: 3.821169, mean_q: 7.320729
 2855/5000: episode: 276, duration: 0.015s, episode steps: 10, steps per second: 667, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000

 3257/5000: episode: 304, duration: 0.016s, episode steps: 9, steps per second: 576, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.119 [-2.460, 1.605], loss: 0.344723, mean_absolute_error: 3.623901, mean_q: 6.852263
 3267/5000: episode: 305, duration: 0.016s, episode steps: 10, steps per second: 644, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.143 [-3.044, 1.934], loss: 0.333039, mean_absolute_error: 3.818430, mean_q: 7.227118
 3277/5000: episode: 306, duration: 0.015s, episode steps: 10, steps per second: 676, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.122 [-3.063, 1.990], loss: 0.232312, mean_absolute_error: 3.702419, mean_q: 7.051692
 3288/5000: episode: 307, duration: 0.016s, episode steps: 11, steps per second: 676, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000],

 3668/5000: episode: 338, duration: 0.019s, episode steps: 11, steps per second: 588, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.120 [-3.289, 2.144], loss: 0.462506, mean_absolute_error: 3.484932, mean_q: 6.378937
 3677/5000: episode: 339, duration: 0.014s, episode steps: 9, steps per second: 629, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.137 [-2.804, 1.763], loss: 0.332764, mean_absolute_error: 3.612020, mean_q: 6.698168
 3689/5000: episode: 340, duration: 0.018s, episode steps: 12, steps per second: 681, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.917 [0.000, 1.000], mean observation: -0.094 [-3.032, 1.992], loss: 0.483753, mean_absolute_error: 3.585841, mean_q: 6.604118
 3699/5000: episode: 341, duration: 0.016s, episode steps: 10, steps per second: 640, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000],

 4059/5000: episode: 373, duration: 0.020s, episode steps: 12, steps per second: 598, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.833 [0.000, 1.000], mean observation: -0.128 [-2.734, 1.780], loss: 0.540008, mean_absolute_error: 3.303685, mean_q: 5.998034
 4068/5000: episode: 374, duration: 0.014s, episode steps: 9, steps per second: 634, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.144 [-2.472, 1.580], loss: 0.389310, mean_absolute_error: 3.284341, mean_q: 6.018667
 4080/5000: episode: 375, duration: 0.018s, episode steps: 12, steps per second: 681, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.833 [0.000, 1.000], mean observation: -0.111 [-2.703, 1.790], loss: 0.391832, mean_absolute_error: 3.085925, mean_q: 5.624223
 4090/5000: episode: 376, duration: 0.015s, episode steps: 10, steps per second: 656, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000],

 4469/5000: episode: 410, duration: 0.020s, episode steps: 10, steps per second: 505, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.144 [-1.859, 1.147], loss: 0.377574, mean_absolute_error: 3.027922, mean_q: 5.591921
 4479/5000: episode: 411, duration: 0.018s, episode steps: 10, steps per second: 568, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.125 [-1.830, 1.216], loss: 0.453219, mean_absolute_error: 3.061948, mean_q: 5.620577
 4490/5000: episode: 412, duration: 0.019s, episode steps: 11, steps per second: 568, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.098 [-1.731, 1.196], loss: 0.396594, mean_absolute_error: 3.049589, mean_q: 5.580395
 4503/5000: episode: 413, duration: 0.025s, episode steps: 13, steps per second: 526, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000

 4950/5000: episode: 443, duration: 0.067s, episode steps: 42, steps per second: 626, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.452 [0.000, 1.000], mean observation: -0.155 [-0.728, 0.388], loss: 0.568587, mean_absolute_error: 3.276893, mean_q: 5.975963
done, took 8.300 seconds


<keras.callbacks.History at 0x1894502c940>

In [7]:
# Run the trained agent for 10 rendered test episodes.
try:
    dqn.test(env, nb_episodes=10, visualize=True)
finally:
    # Close the environment (and its render window) even if testing fails
    # part-way through, so the window is never left dangling.
    env.close()

Testing for 10 episodes ...
Episode 1: reward: 78.000, steps: 78
Episode 2: reward: 64.000, steps: 64
Episode 3: reward: 41.000, steps: 41
Episode 4: reward: 64.000, steps: 64
Episode 5: reward: 50.000, steps: 50
Episode 6: reward: 44.000, steps: 44
Episode 7: reward: 39.000, steps: 39
Episode 8: reward: 72.000, steps: 72
Episode 9: reward: 48.000, steps: 48
Episode 10: reward: 36.000, steps: 36


## Mountain Car Problem

### Section 2: Defining the Environment

In [8]:
ENV_NAME = 'MountainCar-v0'

# Create the MountainCar environment, seed NumPy's global RNG and the
# environment with the same fixed value, then record how many discrete actions
# are available.
# NOTE(review): TensorFlow's own RNG is not seeded here, so network weight
# initialisation may still differ between runs -- confirm whether that matters.
env = gym.make(ENV_NAME)
np.random.seed(seed=123)
env.seed(seed=123)
nb_actions = env.action_space.n

### Section 3: Defining the ANN Architecture

In [9]:
# Q-network for MountainCar, declared as one ordered list of layers:
# flattened observation -> Dense(64) + ReLU -> Dense(16) + ReLU ->
# Dense(16) + ReLU -> one linear output per action.
# The leading 1 in input_shape adds an extra axis in front of the observation
# shape; presumably it corresponds to window_length=1 of the agent's memory --
# TODO confirm.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(64),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                192       
_________________________________________________________________
activation_4 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                1040      
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_6 (Activation)    (None, 16)                0         
__________

### Section 4: Training the Model using the Q-Learning Algorithm

In [10]:
# Assemble the DQN agent from its parts: an epsilon-greedy action policy, a
# sequential memory holding up to 50000 entries, and the Q-network built above.
# This agent uses a 400-step warm-up.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(window_length=1, limit=50000)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    nb_steps_warmup=400,
    target_model_update=1e-2,
)
dqn.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

# Train for 10000 environment steps. Rendering is switched off (visualize=False)
# because visualizing the training slows it down quite a lot; verbose=1 prints
# interval-level progress instead of one line per episode.
dqn.fit(env, nb_steps=10000, verbose=1, visualize=False)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 16.024 seconds


<keras.callbacks.History at 0x1894fa7aba8>

In [11]:
# Run the trained agent for 5 rendered test episodes.
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    # Close the environment (and its render window) even if testing fails
    # part-way through, so the window is never left dangling.
    env.close()

Testing for 5 episodes ...
Episode 1: reward: -200.000, steps: 200
Episode 2: reward: -200.000, steps: 200
Episode 3: reward: -200.000, steps: 200
Episode 4: reward: -200.000, steps: 200
Episode 5: reward: -200.000, steps: 200


### Section 5: Random Action

In [12]:
# Drive the car with a fixed, hand-written action schedule instead of a learned
# policy: action 0 for the first 400 steps, then action 2 for the remaining 600.
# In Gym's MountainCar-v0, action 0 accelerates left and action 2 accelerates
# right. Nothing here is random; to sample a genuinely random action instead,
# pass env.action_space.sample() to env.step().
try:
    env.reset()
    for i in range(1000):
        env.render()
        action = 0 if i < 400 else 2
        # The values returned by step() (including the `done` flag) are ignored,
        # so the loop always runs for the full 1000 steps.
        env.step(action)
finally:
    # Release the render window even if rendering or stepping raises.
    env.close()