# OpenAI GYM Pendulum-v0 # 

Swing-up and balancing of a pendulum using actor/critic reinforcement learning.

In [3]:
import tensorflow as tf
import numpy as np
import random
import gym

from collections import deque

Create gym environment:

In [4]:
# Build the Pendulum environment that the rest of the notebook acts on.
env = gym.make('Pendulum-v0')

# Flatten the observation/action space shapes into plain element counts.
num_states = np.array(env.observation_space.shape).prod()
num_actions = np.array(env.action_space.shape).prod()

## Hyperparameters ##

In [5]:
# Optimizer step sizes for the two networks.
# NOTE(review): not consumed in the visible part of the notebook -- presumably
# read by the training code; confirm there.
actor_learn_rate = 1e-3
critic_learn_rate = 1e-3

# Number of experiences requested per replay batch (presumably passed to
# ReplayBuffer.batch -- confirm against the training loop).
batch_size = 1024

# Training length settings (presumably steps per epoch and number of epochs --
# confirm against the training loop).
num_steps = 10000
epochs = 10000

#### ReplayBuffer Class ####
Class to create and handle the buffer used for the replay of previous experiences. This is needed to remove temporal correlation from the state data.

In [6]:
class ReplayBuffer:
    """Bounded first-in/first-out store of experiences with random sampling.

    Each experience is the tuple ``(s, a, r, t, s2)``. Once ``buffer_size``
    experiences are stored, adding a new one evicts the oldest.

    Attributes:
        buffer: ``collections.deque`` holding the stored experience tuples.
        buffer_size: Maximum number of experiences kept.
        count: Number of experiences currently stored (kept equal to
            ``len(buffer)``).
    """

    def __init__(self, buffer_size):
        """Create an empty buffer that keeps at most ``buffer_size`` items."""
        self.buffer = deque()
        self.buffer_size = buffer_size
        self.count = 0

    def add(self, s, a, r, t, s2):
        """Store one experience, evicting the oldest one when the buffer is full.

        Inputs needed to create the new experience:
            s      - State
            a      - Action
            r      - Reward
            t      - Time
            s2     - Resulting State

        A buffer whose ``buffer_size`` is zero or negative stores nothing; the
        experience is dropped instead of failing on an empty ``popleft``.
        """
        if self.buffer_size <= 0:
            return

        # Make room first so the buffer never exceeds its capacity.
        if len(self.buffer) >= self.buffer_size:
            self.buffer.popleft()

        self.buffer.append((s, a, r, t, s2))

        # Derive the count from the deque so the two can never drift apart.
        self.count = len(self.buffer)

    def size(self):
        """Return the number of stored experiences."""
        return self.count

    def batch(self, batch_size):
        """Return up to ``batch_size`` randomly chosen experiences.

        Sampling is without replacement. If fewer than ``batch_size``
        experiences are stored, all of them are returned (in random order).

        Returns:
            A 5-tuple of numpy arrays
            ``(states, actions, rewards, times, next_states)``; entry ``i`` of
            every array belongs to the same sampled experience.
        """
        sample_size = min(self.count, batch_size)
        batch = random.sample(self.buffer, sample_size)

        # One array per field of the (s, a, r, t, s2) tuple, in tuple order.
        return tuple(
            np.array([experience[field] for experience in batch])
            for field in range(5)
        )

    def clear(self):
        """Remove all entries from the ReplayBuffer."""
        self.buffer.clear()
        self.count = 0

## Neural Networks ##
Create the neural network classes for the actor and critic part of the reinforcement learning controller. 

Reset TensorFlow graph, just to be sure:

In [12]:
# Clear the default TensorFlow graph so the networks defined below are built
# into a fresh, empty graph.
tf.reset_default_graph()

#### Actor Neural Network ####

In [7]:
class ActorNetwork:
    """Actor network mapping a batch of states to bounded actions.

    The network has two ReLU hidden layers and a sigmoid output layer whose
    result is rescaled into the environment's action bounds
    ``[output_min, output_max]``.

    Attributes:
        input: float32 placeholder of shape ``[None, num_inputs]``.
        network: Output tensor of shape ``[None, num_outputs]``.
    """

    # Actor Network Parameters
    num_inputs = num_states
    num_outputs = num_actions
    num_hidden_1 = 8
    num_hidden_2 = 8

    # Action bounds taken from the environment; the output is scaled into them.
    output_min = env.action_space.low
    output_max = env.action_space.high

    def __init__(self):
        """Build the placeholder, variables and forward pass of the actor."""

        # Define input placeholder
        self.input = tf.placeholder(tf.float32, [None, self.num_inputs])

        # Weights
        weight = {
            'w1': tf.Variable(tf.random_normal([self.num_inputs, self.num_hidden_1]), name='weights_layer_1'),
            'w2': tf.Variable(tf.random_normal([self.num_hidden_1, self.num_hidden_2]), name='weights_layer_2'),
            'out': tf.Variable(tf.random_normal([self.num_hidden_2, self.num_outputs]), name='weights_layer_out'),
        }

        # Biases
        bias = {
            'b1': tf.Variable(tf.zeros([self.num_hidden_1]), name='biases_layer_1'),
            'b2': tf.Variable(tf.zeros([self.num_hidden_2]), name='biases_layer_2'),
            'out': tf.Variable(tf.zeros([self.num_outputs]), name='biases_layer_out'),
        }

        layer_1 = tf.add(tf.matmul(self.input, weight['w1']), bias['b1'])
        layer_1 = tf.nn.relu(layer_1)

        layer_2 = tf.add(tf.matmul(layer_1, weight['w2']), bias['b2'])
        layer_2 = tf.nn.relu(layer_2)

        raw_output = tf.add(tf.matmul(layer_2, weight['out']), bias['out'])

        # The sigmoid lies in (0, 1), so it has to be stretched by the full
        # span (max - min) before shifting by the lower bound. Scaling by
        # output_max alone would only cover (min, min + max), i.e. half of a
        # symmetric action range.
        low = tf.constant(self.output_min, tf.float32)
        high = tf.constant(self.output_max, tf.float32)
        span = tf.subtract(high, low)

        self.network = tf.add(tf.multiply(tf.nn.sigmoid(raw_output), span), low)


#### Critic Neural Network ####
Network layout experiments seem to suggest that better results are obtained if the action input skips the first layer before being added. 

In [8]:
class CriticNetwork:
    """Critic network producing one value per (state, action) pair.

    The state passes through the first ReLU hidden layer; the action skips
    that layer and is added, through its own weight matrix, at the input of
    the second hidden layer. The output layer is linear.

    Attributes:
        input: float32 state placeholder of shape ``[None, num_inputs]``.
        action: float32 action placeholder of shape ``[None, num_actions]``.
        network: Output tensor of shape ``[None, num_outputs]``.
    """

    # Critic Network Parameters
    num_inputs = num_states
    # Mirror the module-level action count as a class attribute so that
    # ``self.num_actions`` resolves inside __init__.
    num_actions = num_actions
    num_outputs = 1
    num_hidden_1 = 8
    num_hidden_2 = 8

    def __init__(self):
        """Build the placeholders, variables and forward pass of the critic."""

        # Define input placeholders. Class attributes must be reached through
        # ``self``; a bare ``num_inputs`` is not visible in method scope.
        self.input = tf.placeholder(tf.float32, [None, self.num_inputs])
        self.action = tf.placeholder(tf.float32, [None, self.num_actions])

        # Weights
        weight = {
            'w1': tf.Variable(tf.random_normal([self.num_inputs, self.num_hidden_1]), name='weights_layer_1'),
            'w2': tf.Variable(tf.random_normal([self.num_hidden_1, self.num_hidden_2]), name='weights_layer_2'),
            'w2a': tf.Variable(tf.random_normal([self.num_actions, self.num_hidden_2]), name='weights_layer_2_actions'),
            'out': tf.Variable(tf.random_normal([self.num_hidden_2, self.num_outputs]), name='weights_layer_out'),
        }

        # Biases
        bias = {
            'b1': tf.Variable(tf.zeros([self.num_hidden_1]), name='biases_layer_1'),
            'b2': tf.Variable(tf.zeros([self.num_hidden_2]), name='biases_layer_2'),
            'out': tf.Variable(tf.zeros([self.num_outputs]), name='biases_layer_out'),
        }

        layer_1 = tf.add(tf.matmul(self.input, weight['w1']), bias['b1'])
        layer_1 = tf.nn.relu(layer_1)

        # The action bypasses layer 1 and joins the state features here.
        state_term = tf.matmul(layer_1, weight['w2'])
        action_term = tf.matmul(self.action, weight['w2a'])
        layer_2 = tf.add(tf.add(state_term, action_term), bias['b2'])
        layer_2 = tf.nn.relu(layer_2)

        self.network = tf.add(tf.matmul(layer_2, weight['out']), bias['out'])
        

In [None]:
# Reset the environment and keep the value it returns in `s`.
s = env.reset()
# Draw the environment in its freshly reset state.
env.render()

In [None]:
# Drive the environment with 100 randomly sampled actions, rendering after
# every step. The values returned by env.step() are discarded.
for i in range(100):
    env.step(env.action_space.sample())
    env.render()