# Introduction to Reinforcement Learning

This notebook walks through applying reinforcement learning to several control problems. Each problem is solved with a deep Q-learning (DQN) agent whose Q-function is approximated by a Keras neural network running on the TensorFlow backend.

### Section 1: Libraries and Modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import tensorflow as tf
import keras.backend as K

import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [3]:
# Silence every Python warning for the rest of the session so that the training
# logs below stay readable.
# NOTE(review): this is a blanket filter -- it also hides warnings raised by the
# libraries used in this notebook. Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

## Cartpole Problem

### Section 2: Defining the Environment

In [2]:
# Build the CartPole environment and record how many discrete actions it exposes.
ENV_NAME = 'CartPole-v0'
SEED = 123  # one seed shared by NumPy and the environment for repeatable runs

env = gym.make(ENV_NAME)

np.random.seed(SEED)
env.seed(SEED)

# The Q-network's output layer is sized from this value.
nb_actions = env.action_space.n

### Section 3: Defining the ANN Architecture

In [13]:
# Q-network for CartPole: two hidden layers of 8 ReLU units, followed by one
# linear output per action (the estimated Q-value of that action).
model = Sequential([
    # Input carries a leading axis of length 1 in front of the observation;
    # Flatten collapses it so the Dense layers see a flat observation vector.
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(8),
    Activation('relu'),
    Dense(8),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 40        
_________________________________________________________________
activation_5 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 8)                 72        
_________________________________________________________________
activation_6 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 18        
_________________________________________________________________
activation_7 (Activation)    (None, 2)                 0         
Total para

### Section 4: Training the Model using the Q-Learning Algorithm

In [14]:
# Exploration policy, constructed with the library's default arguments.
policy = EpsGreedyQPolicy()

# Experience memory; window_length=1 matches the leading (1,) axis of the
# model's input shape.
memory = SequentialMemory(window_length=1, limit=50000)

# Assemble the DQN agent around the Q-network defined above.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 environment steps. Rendering is switched off (visualize=False)
# because drawing every frame slows training down considerably; verbose=2 logs
# one line per finished episode.
dqn.fit(env, nb_steps=5000, visualize=False, verbose=2)

Training for 5000 steps ...
   83/5000: episode: 1, duration: 2.098s, episode steps: 83, steps per second: 40, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.166 [-0.378, 1.102], loss: 0.416868, mean_absolute_error: 0.475442, mean_q: 0.140875
  164/5000: episode: 2, duration: 0.365s, episode steps: 81, steps per second: 222, episode reward: 81.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.126 [-0.398, 0.942], loss: 0.208963, mean_absolute_error: 0.410182, mean_q: 0.599427
  219/5000: episode: 3, duration: 0.242s, episode steps: 55, steps per second: 228, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.161 [-0.249, 0.924], loss: 0.098684, mean_absolute_error: 0.503921, mean_q: 1.177370
  282/5000: episode: 4, duration: 0.284s, episode steps: 63, steps per second: 222, episode reward: 63.000, mean reward: 1.0

 1106/5000: episode: 30, duration: 0.217s, episode steps: 48, steps per second: 221, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.688 [0.000, 1.000], mean observation: 0.112 [-3.830, 3.456], loss: 0.741582, mean_absolute_error: 4.310705, mean_q: 8.346580
 1157/5000: episode: 31, duration: 0.226s, episode steps: 51, steps per second: 226, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.107 [-0.256, 0.741], loss: 1.043115, mean_absolute_error: 4.520559, mean_q: 8.637922
 1191/5000: episode: 32, duration: 0.163s, episode steps: 34, steps per second: 208, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.096 [-0.252, 0.730], loss: 1.017994, mean_absolute_error: 4.655987, mean_q: 8.942511
 1207/5000: episode: 33, duration: 0.083s, episode steps: 16, steps per second: 193, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean

 2360/5000: episode: 59, duration: 0.246s, episode steps: 55, steps per second: 223, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.094 [-0.270, 0.620], loss: 2.408211, mean_absolute_error: 8.407770, mean_q: 16.235958
 2407/5000: episode: 60, duration: 0.219s, episode steps: 47, steps per second: 215, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.101 [-0.700, 0.370], loss: 3.094864, mean_absolute_error: 8.565195, mean_q: 16.427967
 2445/5000: episode: 61, duration: 0.172s, episode steps: 38, steps per second: 221, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.069 [-0.389, 0.840], loss: 1.985275, mean_absolute_error: 8.559685, mean_q: 16.649176
 2518/5000: episode: 62, duration: 0.357s, episode steps: 73, steps per second: 204, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], 

 4413/5000: episode: 89, duration: 0.357s, episode steps: 80, steps per second: 224, episode reward: 80.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.176 [-0.439, 0.912], loss: 3.664387, mean_absolute_error: 12.500826, mean_q: 24.330875
 4573/5000: episode: 90, duration: 0.703s, episode steps: 160, steps per second: 228, episode reward: 160.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.347 [-2.403, 0.633], loss: 3.919044, mean_absolute_error: 12.744471, mean_q: 24.814951
 4763/5000: episode: 91, duration: 0.848s, episode steps: 190, steps per second: 224, episode reward: 190.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.283 [-2.413, 0.600], loss: 3.757337, mean_absolute_error: 13.152532, mean_q: 25.699707
 4904/5000: episode: 92, duration: 0.620s, episode steps: 141, steps per second: 227, episode reward: 141.000, mean reward: 1.000 [1.000

<keras.callbacks.History at 0x1c995bb5da0>

In [16]:
# Evaluate the trained agent for 10 episodes, rendering each step on screen.
dqn.test(env, nb_episodes=10, visualize=True)

# Release the render window opened by visualize=True. `env` is rebound to a
# different environment later in the notebook, so this is the last point at
# which the CartPole instance can be closed.
env.close()

Testing for 10 episodes ...
Episode 1: reward: 199.000, steps: 199
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 194.000, steps: 194
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 150.000, steps: 150
Episode 10: reward: 200.000, steps: 200


<keras.callbacks.History at 0x1c995c39c50>

## Mountain Car Problem

### Section 2: Defining the Environment

In [12]:
# Build the MountainCar environment and record how many discrete actions it exposes.
ENV_NAME = 'MountainCar-v0'
SEED = 123  # one seed shared by NumPy and the environment for repeatable runs

env = gym.make(ENV_NAME)

np.random.seed(SEED)
env.seed(SEED)

# The Q-network's output layer is sized from this value.
nb_actions = env.action_space.n

### Section 3: Defining the ANN Architecture

In [76]:
# Q-network for MountainCar: hidden layers of 64, 16 and 16 ReLU units,
# followed by one linear output per action (the estimated Q-value of that action).
model = Sequential([
    # Input carries a leading axis of length 1 in front of the observation;
    # Flatten collapses it so the Dense layers see a flat observation vector.
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(64),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_4 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                192       
_________________________________________________________________
activation_10 (Activation)   (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                1040      
_________________________________________________________________
activation_11 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_12 (Activation)   (None, 16)                0         
__________

### Section 4: Training the Model using the Q-Learning Algorithm

In [84]:
# Exploration policy, constructed with the library's default arguments.
policy = EpsGreedyQPolicy()

# Experience memory; window_length=1 matches the leading (1,) axis of the
# model's input shape.
memory = SequentialMemory(window_length=1, limit=50000)

# Assemble the DQN agent around the Q-network defined above.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=400,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 10000 environment steps. Rendering is switched off (visualize=False)
# because drawing every frame slows training down considerably.
# NOTE(review): the recorded evaluation output in this notebook shows a reward of
# -200 in all 5 test episodes, so this training budget does not appear to yield
# a successful policy -- revisit nb_steps / exploration settings.
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 88.158 seconds


<keras.callbacks.History at 0x279e390acc0>

In [85]:
# Evaluate the trained agent for 5 episodes, rendering each step on screen.
dqn.test(
    env,
    nb_episodes=5,
    visualize=True,
)

# Release the render window opened by visualize=True.
env.close()

Testing for 5 episodes ...
Episode 1: reward: -200.000, steps: 200
Episode 2: reward: -200.000, steps: 200
Episode 3: reward: -200.000, steps: 200
Episode 4: reward: -200.000, steps: 200
Episode 5: reward: -200.000, steps: 200


### Section 5: Scripted Actions (Hand-Coded Baseline)

In [68]:
# Drive the environment with a fixed, hand-written action schedule instead of
# the learned policy: action 0 for the first 400 steps, then action 2 for the
# remaining 600. (Per the Gym MountainCar docs, 0 pushes left and 2 pushes right.)
# The values returned by env.step() -- including the `done` flag -- are
# deliberately ignored, so the loop always runs the full 1000 steps.
# To sample a genuinely random action instead, use:
#     env.step(env.action_space.sample())
env.reset()
try:
    for i in range(1000):
        env.render()
        action = 0 if i < 400 else 2
        env.step(action)
finally:
    # Always release the render window, even if the loop is interrupted.
    env.close()