In [219]:
import gym
import tensorflow as tf
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from IPython.display import clear_output

%matplotlib inline
# base code from udacity-deep-learning/reinforcement/Q-learning-cart.ipynb

In [220]:
# Create new cart pole environment
env = gym.make('CartPole-v0')
state = env.reset()
print(state)

[2017-05-22 21:05:06,414] Making new env: CartPole-v0


[-0.01959151 -0.01626964  0.04402413  0.0121828 ]


In [221]:
# Create class QNetwork
class QNetwork:
    def __init__(self, \
                 learning_rate=0.01, \
                 state_size=4, 
                 action_size=2, \
                 hidden_size=10, \
                 hidden_layers=2, \
                 alpha=0., \
                 name='QNetwork'):
        
        # create Q Network
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, \
                                          [None, state_size], \
                                          name='inputs')
            
            # placeholder for actions, to be one-hot encoded next
            self.actions_ = tf.placeholder(tf.int32, \
                                           [None], \
                                           name='actions')
            
            # one hot encode actions
            one_hot_actions = tf.one_hot(self.actions_, \
                                         action_size)
            
            # placeholder for target Qs
            self.targetQs_ = tf.placeholder(tf.float32, \
                                            [None], \
                                            name='target')
            
                
            # ReLU hidden layers
            self.fc1 = tf.layers.dense(self.inputs_, \
                                        hidden_size,\
                                        activation=None,\
                                        kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.fc1 = tf.maximum(alpha*self.fc1,self.fc1)
            
            if hidden_layers == 1:
                out_layer = self.fc1
            else:
                
                self.fc2 = tf.layers.dense(self.fc1, hidden_size,\
                                            activation=None,\
                                            kernel_initializer=tf.contrib.layers.xavier_initializer())
                self.fc2 = tf.maximum(alpha*self.fc2,self.fc2)
                
                if hidden_layers == 2:
                    out_layer = self.fc2
                else:
                    self.fc3 = tf.layers.dense(self.fc2, hidden_size,\
                                            activation=None,\
                                            kernel_initializer=tf.contrib.layers.xavier_initializer())
                    self.fc3 = tf.maximum(alpha*self.fc3,self.fc3)
                    out_layer = self.fc3

            # Linear output layer
            self.output = tf.layers.dense(out_layer, action_size, \
                                          activation=None,\
                                          kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            ### Train with loss (targetQ - Q)^2
            # output has length 2, for two actions. This next line chooses
            # one value from output (per row) according to the one-hot encoded actions.
            self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [222]:
# create memory class for storing previous experiences
class Memory():
    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

In [223]:
def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / N 

In [224]:
def initialize_memory_rand_states(memory_size=10000,pretrain_length=20):
    # Initialize the simulation
    state = env.reset()
    
    memory = Memory(max_size=memory_size)

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):

        # Make a random action
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            # The simulation fails so no next state
            next_state = np.zeros(state.shape)
            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            state = env.reset()

        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
            
    return memory

In [230]:
def train_q_network(train_episodes=500,\
                   gamma=0.99,\
                   explore_start=1.0,\
                   explore_stop=0.01,\
                   decay_rate=0.0001,\
                   hidden_size=64,\
                   hidden_layers=2,\
                   learning_rate=0.0001,\
                   memory_size=10000,\
                   batch_size=20,\
                   max_steps=195,\
                   alpha=0.,\
                   verbose=True):
    
    
    mainQN = QNetwork(name='main', hidden_size=hidden_size, hidden_layers=hidden_layers, learning_rate=learning_rate, alpha=alpha)
    
    memory = initialize_memory_rand_states(memory_size=memory_size,pretrain_length=batch_size)
    
    # Now train with experiences
    saver = tf.train.Saver()
    rewards_list = []
    with tf.Session() as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())

        step = 0
        state = env.reset()
        for ep in range(train_episodes):
            total_reward = 0
            t = 0
            
            while t < max_steps:
                step += 1
                # Uncomment this next line to watch the training
                # env.render() 

                # Explore or Exploit
                explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step) 
                if explore_p > np.random.rand():
                    # Make a random action
                    action = env.action_space.sample()
                else:
                    # Get action from Q-network
                    feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                    Qs = sess.run(mainQN.output, feed_dict=feed)
                    action = np.argmax(Qs)

                # Take action, get new state and reward
                next_state, reward, done, _ = env.step(action)

                total_reward += reward

                if done:
                    # the episode ends so no next state
                    next_state = np.zeros(state.shape)
                    t = max_steps

                    # Add experience to memory
                    memory.add((state, action, reward, next_state))

                else:
                    # Add experience to memory
                    memory.add((state, action, reward, next_state))
                    state = next_state
                    t += 1

                # Sample mini-batch from memory
                batch = memory.sample(batch_size)
                states = np.array([each[0] for each in batch])
                actions = np.array([each[1] for each in batch])
                rewards = np.array([each[2] for each in batch])
                next_states = np.array([each[3] for each in batch])

                # Train network
                target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})

                # Set target_Qs to 0 for states where episode ends
                episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
                target_Qs[episode_ends] = (0, 0)

                targets = rewards + gamma * np.max(target_Qs, axis=1)

                loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                    feed_dict={mainQN.inputs_: states,
                                               mainQN.targetQs_: targets,
                                               mainQN.actions_: actions})
                
            if verbose:
                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore P: {:.4f}'.format(explore_p))
            rewards_list.append((ep, total_reward))
            
            # Start new episode
            state = env.reset()
            
        saver.save(sess, "checkpoints/cartpole.ckpt")
        return rewards_list, mainQN, saver

In [226]:
def plot_rewards(rewards_list):
    eps, rews = np.array(rewards_list).T
    smoothed_rews = running_mean(rews, 10)
    plt.plot(eps[-len(smoothed_rews):], smoothed_rews)
    plt.plot(eps, rews, color='grey', alpha=0.3)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')

In [227]:
def test_q_network(mainQN, saver, test_episodes=20, test_max_steps=500, render=True):

    env.reset()
    avg_rewards = 0.
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
   
        state = env.reset()
        for ep in range(test_episodes):
            t = 0
            while t < test_max_steps:
                if render:
                    env.render() 

                # Get action from Q-network
                feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                action = np.argmax(Qs)

                # Take action, get new state and reward
                next_state, reward, done, _ = env.step(action)
                
                avg_rewards = avg_rewards + reward / test_episodes
                if done:
                    t = test_max_steps
                    state = env.reset()
                    # Take one random step to get the pole and cart moving
                    #state, reward, done, _ = env.step(env.action_space.sample())

                else:
                    state = next_state
                    t += 1
                    
    return avg_rewards

In [228]:
def test_and_train_qnetwork(train_episodes=500,\
                   gamma=0.99,\
                   explore_start=1.0,\
                   explore_stop=0.01,\
                   decay_rate=0.0001,\
                   hidden_size=64,\
                   hidden_layers=2,\
                   learning_rate=0.0001,\
                   memory_size=10000,\
                   batch_size=20,\
                   test_episodes=20,\
                   render=False,\
                   alpha=0.,\
                   verbose=True):
    
    # reset graph
    tf.reset_default_graph()

    # train q-network
    rewards_list, mainQN, saver = train_q_network(train_episodes = train_episodes, \
                                                  gamma=gamma,\
                                                  explore_start=explore_start,\
                                                  explore_stop=explore_stop,\
                                                  decay_rate=decay_rate,\
                                                  hidden_size=hidden_size,\
                                                  hidden_layers=hidden_layers,\
                                                  learning_rate=learning_rate,\
                                                  memory_size=memory_size,\
                                                  batch_size=batch_size,\
                                                  alpha=alpha,\
                                                  verbose=verbose)

    if verbose:
        # plot training
        plot_rewards(rewards_list)
    
    avg_train_rewards = np.sum([each[1] for each in rewards_list]) / len(rewards_list)
    if verbose:
        print('average training reward = ',avg_train_rewards)

    # test q-network
    avg_test_rewards = test_q_network(mainQN, saver, test_episodes=test_episodes, render=verbose)
    if verbose:
        print('average test reward = ', avg_test_rewards)
    
    return avg_test_rewards, avg_train_rewards


In [199]:
# test implementation
average_rewards = test_and_train_qnetwork(train_episodes=100, verbose=False)
print('average test reward = ', average_rewards)

INFO:tensorflow:Restoring parameters from checkpoints\cartpole.ckpt


[2017-05-22 15:54:17,537] Restoring parameters from checkpoints\cartpole.ckpt


average test reward =  9.249999999999996


In [217]:
train_eps = 500
verb = False
gamma = [0.99,0.98,0.96]
decay_rate = [0.0001,0.0002,0.0004]
exp_start=1.0
exp_stop=0.1
hidden_size=64
hidden_layers=1
learning_rate=0.001
batch_size=20
num_averages = 2
results = []
alpha_relu = [0., 0.1]


for gaIndex in range(len(gamma)):
    for drIndex in range(len(decay_rate)):
        ga = gamma[gaIndex]
        dr = decay_rate[drIndex]
        train_params_name = 'dr='+str(dr)+'_ga='+str(ga)
        average_test_rewards = 0.
        average_train_rewards = 0.
        for i in range(num_averages):
            test,train = test_and_train_qnetwork(train_episodes=train_eps,\
                                   gamma=ga,\
                                   explore_start=exp_start,\
                                   explore_stop=exp_stop,\
                                   decay_rate=dr,\
                                   hidden_layers=hidden_layers,\
                                   hidden_size=hidden_size,\
                                   learning_rate=learning_rate,\
                                   batch_size=batch_size,\
                                   alpha = alpha_relu,\
                                   verbose=verb)
            average_test_rewards += test
            average_train_rewards += train

        average_test_rewards = average_test_rewards / num_averages
        average_train_rewards = average_train_rewards / num_averages
        results.append([train_params_name+' test avg='+str(average_test_rewards)+'  train avg='+str(average_train_rewards)])
        clear_output()
        for each in results:
            print(each)

['dr=0.0001_ga=0.99 test avg=189.52500000000884  train avg=110.806']
['dr=0.0002_ga=0.99 test avg=200.00000000001123  train avg=139.79']
['dr=0.0004_ga=0.99 test avg=10.075000000000008  train avg=136.847']
['dr=0.0001_ga=0.98 test avg=194.92500000001007  train avg=130.697']
INFO:tensorflow:Restoring parameters from checkpoints\cartpole.ckpt


[2017-05-22 21:04:17,130] Restoring parameters from checkpoints\cartpole.ckpt


KeyboardInterrupt: 

In [None]:
test,train = test_and_train_qnetwork(train_episodes=500,\
                                           gamma=0.99,\
                                           explore_start=exp_start,\
                                           explore_stop=exp_stop,\
                                           decay_rate=0.0001,\
                                           hidden_layers=1,\
                                           hidden_size=64,\
                                           learning_rate=0.001,\
                                           batch_size=10,\
                                           verbose=True)

Episode: 0 Total reward: 14.0 Training loss: 1.2950 Explore P: 0.9987
Episode: 1 Total reward: 43.0 Training loss: 1.2038 Explore P: 0.9949
Episode: 2 Total reward: 23.0 Training loss: 1.3081 Explore P: 0.9928
Episode: 3 Total reward: 9.0 Training loss: 1.2267 Explore P: 0.9920
Episode: 4 Total reward: 21.0 Training loss: 1.6801 Explore P: 0.9902
Episode: 5 Total reward: 41.0 Training loss: 1.7272 Explore P: 0.9865
Episode: 6 Total reward: 10.0 Training loss: 1.5540 Explore P: 0.9856
Episode: 7 Total reward: 12.0 Training loss: 1.6622 Explore P: 0.9846
Episode: 8 Total reward: 23.0 Training loss: 2.3423 Explore P: 0.9825
Episode: 9 Total reward: 19.0 Training loss: 2.1084 Explore P: 0.9809
Episode: 10 Total reward: 10.0 Training loss: 2.5088 Explore P: 0.9800
Episode: 11 Total reward: 15.0 Training loss: 2.9160 Explore P: 0.9787
Episode: 12 Total reward: 37.0 Training loss: 2.2928 Explore P: 0.9754
Episode: 13 Total reward: 22.0 Training loss: 3.6535 Explore P: 0.9735
Episode: 14 Total

In [207]:
print(test)
print(train)

11.050000000000022
100.14


In [None]:
env.close()