In [742]:
import numpy as np
import theano
import theano.tensor as T

In [339]:
'''
Simulates cartpole starting at x0 with action u -- ported from CS287 Matlab code
'''

def sim_cartpole(x0, u, dt):
    """Integrate the cartpole forward by ``dt`` seconds under a constant action.

    The state is a length-4 vector in which components 2 and 3 are the time
    derivatives of components 0 and 1; component 1 is the angle that enters
    the sine/cosine terms. Integration is explicit (forward) Euler with
    sub-steps of at most 0.1 s.

    Parameters
    ----------
    x0 : array-like of length 4
        Initial state. It is not modified; a new array is returned.
    u : scalar (or length-1 array)
        Control input, held constant over the whole interval.
    dt : float
        Total simulated time. If ``dt <= 0`` no step is taken and ``x0`` is
        returned as-is.

    Returns
    -------
    numpy.ndarray
        State after ``dt`` seconds.
    """
    # Physical constants of the model (cart mass, pole mass, pole length,
    # gravity), kept local so the function stays self-contained.
    mc = 10
    mp = 1
    l = 0.5
    g = 9.81

    def dynamics(x, u):
        # Time derivative of the state at (x, u).
        s = np.sin(x[1])
        c = np.cos(x[1])
        denom = mc + mp * s ** 2

        xddot = (u + mp * s * (l * x[3] ** 2 + g * c)) / denom
        tddot = (-u * c - mp * l * x[3] ** 2 * c * s - (mc + mp) * g * s) / (l * denom)

        # np.append flattens, so this also works when u is a length-1 array.
        return np.append(np.append(x[2:4], xddot), tddot)

    DT = 0.1  # largest Euler sub-step
    t = 0
    while t < dt:
        # The last sub-step is shortened so that we land exactly on dt.
        current_dt = min(DT, dt - t)
        x0 = x0 + current_dt * dynamics(x0, u)
        t = t + current_dt

    return x0
    
 

In [2]:
'''
Linearizes the dynamics of cartpole around a reference point for use in an LQR controller
'''

def linearize_cartpole(x_ref, u_ref, dt, eps):
    """Finite-difference linearization of ``sim_cartpole`` about (x_ref, u_ref).

    Parameters
    ----------
    x_ref : array-like of length 4
        State around which to linearize.
    u_ref : scalar
        Control around which to linearize.
    dt : float
        Simulation interval passed to ``sim_cartpole``.
    eps : float
        Forward-difference step applied to each state component and to u.

    Returns
    -------
    A : numpy.ndarray, shape (4, 4)
        Column i is the forward difference of the next state with respect to
        state component i.
    B : numpy.ndarray, shape (4,)
        Forward difference of the next state with respect to u.
    c : same object as ``x_ref``
        NOTE(review): returned unchanged; if an affine offset is wanted this
        would presumably be the simulated next state at the reference point --
        confirm against the caller before relying on it.
    """
    # The unperturbed rollout is identical for every difference quotient, so
    # simulate it once instead of once per column.
    nominal = sim_cartpole(x_ref, u_ref, dt)

    A = np.zeros([4, 4])
    for i in range(4):
        increment = np.zeros([4, ])
        increment[i] = eps
        A[:, i] = (sim_cartpole(x_ref + increment, u_ref, dt) - nominal) / eps

    B = (sim_cartpole(x_ref, u_ref + eps, dt) - nominal) / eps

    c = x_ref

    return A, B, c

In [4]:
'''
Computes the LQR infinite horizon controller associated with linear dynamics A, B and quadratic cost Q, R
'''

def lqr_infinite_horizon(A, B, Q, R, tol=1E-4, max_iter=None):
    """Iterate the discrete-time Riccati recursion until the gain settles.

    Starting from P = I, each pass computes

        K = -(R + B' P B)^{-1} B' P A
        P = Q + K' R K + (A + B K)' P (A + B K)

    and stops once the spectral norm of the change in K is no longer
    greater than ``tol``.

    Parameters
    ----------
    A : array-like, shape (n, n)
        State transition matrix.
    B : array-like, shape (n,) or (n, m)
        Input matrix. A 1-D vector is treated as a single input column.
    Q : array-like, shape (n, n)
        State cost.
    R : array-like, shape (m, m)
        Input cost (a scalar is promoted to 1x1).
    tol : float, optional
        Convergence threshold on ``norm(K_new - K_previous, 2)``.
    max_iter : int or None, optional
        Upper bound on the number of passes; ``None`` (default) means no
        bound, as before. At least one pass is always performed.

    Returns
    -------
    K_new : numpy.ndarray, shape (m, n)
        Feedback gain from the last pass (u = K x).
    P_new : numpy.ndarray, shape (n, n)
        Cost-to-go matrix from the last pass.
    """
    A = np.asarray(A, dtype=float)
    nA = A.shape[0]
    # Accept both a flat vector (single input) and a full (n, m) matrix.
    B = np.asarray(B, dtype=float).reshape(nA, -1)
    Q = np.asarray(Q, dtype=float)
    R = np.atleast_2d(np.asarray(R, dtype=float))

    P_new = np.eye(nA)
    K_current = None
    iteration = 0

    while True:
        P_current = P_new

        BtP = np.dot(B.T, P_current)
        # Solve the linear system rather than forming an explicit inverse.
        K_new = -np.linalg.solve(R + np.dot(BtP, B), np.dot(BtP, A))

        A_closed = A + np.dot(B, K_new)
        P_new = (Q
                 + np.dot(np.dot(K_new.T, R), K_new)
                 + np.dot(np.dot(A_closed.T, P_current), A_closed))

        iteration += 1

        # Written as "not (> tol)" so that a NaN gain ends the loop instead of
        # spinning forever.
        if K_current is not None and not (np.linalg.norm(K_new - K_current, 2) > tol):
            break
        if max_iter is not None and iteration >= max_iter:
            break

        K_current = K_new

    return K_new, P_new

In [5]:
# Reference state and zero control about which the dynamics are linearized;
# both the simulation interval and the finite-difference step are 0.1.
x_ref = np.array([0.0, np.pi, 0.0, 0.0])
A, B, c = linearize_cartpole(x_ref, u_ref=0, dt=0.1, eps=0.1)

# Identity weights on the four state components and on the single input.
Q, R = np.eye(4), np.eye(1)

K_inf, P_inf = lqr_infinite_horizon(A, B, Q, R)

In [6]:
'''
Basic implementation of a feed forward neural network with a single hidden layer. 

Takes input, a 1-d parameter vector, an activation function, and numbers of neurons at each layer. The parameter 
vector should be encoded as [W1.flatten(); B1; W2.flatten(); B2] where the Ws and Bs are the matrix weights and 
offset vectors. Activation function for the output layer is assumed to be linear.

'''

def simpleFeedForward(input, params, n_in, n_hidden, n_out, activation=np.tanh):
    """Evaluate a one-hidden-layer network stored as a flat parameter vector.

    ``params`` is laid out as ``[w1 (n_hidden*n_in), b1 (n_hidden),
    w2 (n_out*n_hidden), b2 (rest)]`` where ``w1`` is reshaped row-major to
    ``(n_hidden, n_in)`` and ``w2`` to ``(n_out, n_hidden)``.

    Parameters
    ----------
    input : array-like, shape (n_in,)
        Network input.
    params : array-like, 1-D
        Flat parameter vector in the layout above.
    n_in, n_hidden, n_out : int
        Layer sizes.
    activation : callable or None
        Applied element-wise to the hidden pre-activation; ``None`` keeps the
        hidden layer linear. The output layer is always linear.

    Returns
    -------
    numpy.ndarray
        ``w2 . activation(w1 . input + b1) + b2``.
    """
    params = np.asarray(params)

    # Offsets of the four consecutive segments inside the flat vector.
    w1_end = n_in * n_hidden
    b1_end = w1_end + n_hidden
    w2_end = b1_end + n_hidden * n_out

    w1 = params[:w1_end].reshape(n_hidden, n_in)
    b1 = params[w1_end:b1_end]
    w2 = params[b1_end:w2_end].reshape(n_out, n_hidden)
    b2 = params[w2_end:]

    hidden = np.dot(w1, input) + b1
    if activation is not None:
        hidden = activation(hidden)

    return np.dot(w2, hidden) + b2

In [11]:
'''
Function to evaluate the total penalty associated with a parameter vector theta for our simple feed forward network.

Simulates cartpole with the controller given by theta and computes the sum of costs, which are assumed to be a 
quadratic form of the distance from the current position x to the target position.

'''

def evaluate_theta(start, target, T, Q, params, n_in, n_hidden, n_out):
    """Roll out the network policy on the cartpole and total its quadratic cost.

    Parameters
    ----------
    start : array-like, shape (n_in,)
        Initial state.
    target : array-like, shape (n_in,)
        State the cost is measured against.
    T : int
        Number of control steps; each lasts 0.1 s.
    Q : array-like, shape (n_in, n_in)
        Weight matrix of the quadratic cost.
    params : 1-D array
        Flat parameter vector understood by ``simpleFeedForward``.
    n_in, n_hidden, n_out : int
        Network sizes. The control is stored as one scalar per step, so the
        network must produce a single output value.

    Returns
    -------
    penalty : float
        Sum over all T+1 visited states of ``(x - target)' Q (x - target)``.
    x : numpy.ndarray, shape (n_in, T+1)
        Visited states, one per column.
    u : numpy.ndarray, shape (T,)
        Applied controls.
    """
    dt = 0.1

    u = np.zeros(T)
    x = np.zeros([n_in, T+1])
    x[:, 0] = start
    for t in range(T):
        action = simpleFeedForward(x[:, t], params, n_in, n_hidden, n_out)
        # Unwrap the size-1 network output explicitly; assigning a 1-element
        # array into a scalar slot is deprecated in recent NumPy.
        u[t] = np.asarray(action).item()
        x[:, t+1] = sim_cartpole(x[:, t], u[t], dt)

    # Column-wise deviation from the target.
    x_diff = x - np.asarray(target).reshape(-1, 1)
    # Sum of per-column quadratic forms, without building the full
    # (T+1) x (T+1) product just to read its diagonal.
    penalty = np.sum(x_diff * np.dot(Q, x_diff))

    return penalty, x, u
    

In [14]:
'''
Some code to try to run policy gradient. Does not work yet - blows up to NaNs.

'''

##
## Initialize our variables
##
rng = np.random.RandomState(1234)
input = np.array([0, np.pi - np.pi/10, 0, 0])
n_in, n_hidden, n_out = 4, 20, 1

# Weights are drawn uniformly from +/- sqrt(6 / (fan_in + fan_out)).
# W1 is drawn before W2 so the seeded stream is consumed in the same order.
_w1_limit = np.sqrt(6. / (n_in + n_hidden))
W1 = rng.uniform(low=-_w1_limit, high=_w1_limit, size=(n_in, n_hidden))

_w2_limit = np.sqrt(6. / (n_out + n_hidden))
W2 = rng.uniform(low=-_w2_limit, high=_w2_limit, size=(n_hidden, n_out))

# Biases start at zero (kept as column vectors).
B1 = np.zeros((n_hidden, 1))
B2 = np.zeros((n_out, 1))

## Cost function penalizes equal deviation from target coordinates
target = np.array([0, np.pi, 0, 0])
Q = np.eye(n_in)

def penalty(x):
    """Quadratic cost of a state, or of a whole trajectory, w.r.t. ``target``.

    Uses the module-level ``target`` and ``Q``.

    Parameters
    ----------
    x : numpy.ndarray
        Either a single state (1-D) or a trajectory with one state per
        column (2-D).

    Returns
    -------
    float
        ``(x - target)' Q (x - target)`` for a single state, or the sum of
        that quantity over all columns for a trajectory.
    """
    if len(x.shape) == 1:
        diff = x - target
        return np.dot(diff, np.dot(Q, diff))

    diff = x - np.asarray(target).reshape(-1, 1)
    # Sum the per-column quadratic forms directly instead of forming the full
    # columns-by-columns product and taking its diagonal.
    return np.sum(diff * np.dot(Q, diff))
    


# Flat parameter vector in the order [W1, B1, W2, B2].
# NOTE(review): W1/W2 were created as (n_in, n_hidden) / (n_hidden, n_out) but
# simpleFeedForward reshapes the segments to (n_hidden, n_in) / (n_out, n_hidden);
# harmless for a random initialization, but confirm the intended layout.
params = np.concatenate([W1.ravel(), B1.ravel(), W2.ravel(), B2.ravel()])

max_iter = 100

learning_rate = 0.0001

epsilon = 0.1

horizon = 500  # control steps simulated per rollout

penalties = np.zeros(max_iter)

# Every rollout starts from the same state.
start = np.array([0, np.pi - np.pi/10, 0, 0])

for i in range(max_iter):

    ## Generate trajectory to evaluate our policy
    penalties[i], x, u = evaluate_theta(start, target, horizon, Q, params, n_in, n_hidden, n_out)

    ## Perform stochastic gradient descent

    # Choose random parameter to update. Drawn from the seeded generator so
    # that repeated runs pick the same sequence of coordinates.
    direction = rng.randint(0, len(params))

    unitv = np.zeros(len(params))
    unitv[direction] = epsilon

    # Calculate the penalty in that direction (x and u now hold the perturbed
    # rollout, as before).
    new_penalty, x, u = evaluate_theta(start, target, horizon, Q, params + unitv, n_in, n_hidden, n_out)

    # Forward-difference estimate of the partial derivative along the chosen
    # coordinate; a fresh array so the perturbation vector is left intact.
    partial_x = np.zeros(len(params))
    partial_x[direction] = (new_penalty - penalties[i])/epsilon

    ## Update the parameter vector
    params = params - learning_rate * partial_x

        



In [744]:
"""
Implementation of hidden layer class. Modified from here: http://deeplearning.net/tutorial/mlp.html#mlp.

Note: I changed feeding forward to a function because I couldn't figure out how to update the internal state 'input'
    when iterating to generate paths. May be better to switch back to that in the long run to make more layers
    easier to link together?

"""

class HiddenLayer(object):
    """Fully connected layer holding Theano shared weights ``W`` and bias ``b``.

    The layer does not bind an input at construction time; call
    ``feed_forward`` with the symbolic (or numeric) input to build the output
    expression.
    """

    def __init__(self, rng, n_in, n_out, W=None, b=None, activation=T.tanh):
        """
        Parameters
        ----------
        rng : numpy.random.RandomState
            Source of randomness for the weight initialization.
        n_in, n_out : int
            Input and output dimensionality; ``W`` has shape (n_in, n_out).
        W, b : theano shared variables or None
            Existing parameters to reuse. When ``None``, ``W`` is drawn
            uniformly from +/- sqrt(6 / (n_in + n_out)) (scaled by 4 for a
            sigmoid activation) and ``b`` is initialized to zeros.
        activation : callable or None
            Non-linearity applied to the affine output; ``None`` keeps the
            layer linear.
        """
        # No input is bound at construction time. The attribute is kept for
        # backward compatibility; previously it captured whatever the global
        # name `input` happened to refer to, since `input` is not a parameter.
        self.input = None

        if W is None:
            W_values = np.asarray(
                rng.uniform(
                    low = -np.sqrt(6. / (n_in + n_out)),
                    high = np.sqrt(6. / (n_in + n_out)),
                    size = (n_in, n_out)
                ),
                dtype = theano.config.floatX)
            if activation is theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = np.zeros((n_out,), dtype = theano.config.floatX)
            b = theano.shared(value = b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        self.params = [self.W, self.b]

        self.activation = activation

    def feed_forward(self, input):
        """Return ``activation(input . W + b)`` (no activation if it is None)."""
        lin_output = T.dot(input, self.W) + self.b
        if self.activation is None:
            return lin_output
        return self.activation(lin_output)
        

In [745]:
'''
Class wrapper for our basic feed forward network in Theano. 

Modified from here: http://deeplearning.net/tutorial/mlp.html#mlp.

Note: similar to above, modified to make feed-forward a function in the main version
'''

class SingleLayerNet(object):
    """Feed-forward network: one tanh hidden layer followed by a linear output layer."""

    def __init__(self, rng, n_in, n_hidden, n_out):
        """
        Parameters
        ----------
        rng : numpy.random.RandomState
            Passed to both layers for weight initialization.
        n_in, n_hidden, n_out : int
            Sizes of the input, hidden and output layers.
        """
        self.hiddenLayer = HiddenLayer(
            rng = rng,
            n_in = n_in,
            n_out = n_hidden,
            activation = T.tanh
        )

        self.outputLayer = HiddenLayer(
            rng = rng,
            n_in = n_hidden,
            n_out = n_out,
            activation = None
        )

        ## L1 and L2 regularization. Not used for now.
        self.L1 = ( abs(self.hiddenLayer.W).sum() + abs(self.outputLayer.W).sum() )

        self.L2_sqr = ( (self.hiddenLayer.W ** 2).sum() + (self.outputLayer.W ** 2).sum())

        # All trainable parameters, hidden layer first: [W_h, b_h, W_o, b_o].
        self.params = self.hiddenLayer.params + self.outputLayer.params

    def feed_forward(self, input):
        """Output of the network for ``input`` (hidden layer, then output layer)."""
        return self.outputLayer.feed_forward(self.hiddenLayer.feed_forward(input))
        

In [801]:
'''
Define a function symbolically that computes a trajectory for cartpole from x0 according to a given policy
'''

## Symbolic inputs of the trajectory function. They are created here so that
## this cell can run on a fresh kernel instead of relying on variables left
## over from another cell.
x = T.vector('x')                   # state the trajectory starts from
dt = T.scalar('dt')                 # Euler step size
Q = T.matrix('Q')                   # weight of the quadratic cost
x_t = T.vector('x_t')               # target state
num_steps = T.iscalar('num_steps')  # number of scan iterations

# Only used through T.ones_like(C) as the initial value of the cost output.
C = T.scalar()

rng = np.random.RandomState(1234)
policy = SingleLayerNet(rng, 4, 10, 1)

## Define variables
mc = 10
mp = 1
l = 0.5
g = 9.81

## Compute the cartpole dynamics updates
def calc_step(x, u, dt):
    """Symbolic forward-Euler step: returns ``x + dt * xdot(x, u)``.

    Uses the module-level constants ``mc``, ``mp``, ``l`` and ``g``. ``u`` is
    concatenated alongside ``x[2:4]``, so it has to be a vector-valued
    expression (e.g. the length-1 output of the policy network).
    """
    angle, angle_rate = x[1], x[3]
    sin_a = T.sin(angle)
    cos_a = T.cos(angle)

    x_acc = (u + mp*sin_a*(l*angle_rate**2 + g*cos_a)) / (mc + mp*sin_a**2)
    angle_acc = (
        -u*cos_a - (mp*l*angle_rate**2) * cos_a * sin_a - (mc+mp)*g*sin_a
    ) / (l * (mc + mp * sin_a ** 2))

    xdot = T.concatenate([x[2:4], x_acc, angle_acc])

    return x + dt * xdot

## Symbolically compute the trajectory associated with the policy encoded in our network
def _policy_step(x, u, cost, dt, Q, x_t):
    # One scan iteration. `cost` is the previous cost output; scan passes it
    # in because the cost is a recurrent output, but it is not used.
    x_next = calc_step(x, u, dt)
    deviation = x - x_t
    return [
        x_next,
        policy.feed_forward(x_next),
        # quadratic cost of the state *before* this step
        T.dot(T.dot(deviation, Q), deviation),
    ]

gen_traj, traj_update = theano.scan(
    _policy_step,
    outputs_info = [x, policy.feed_forward(x), T.ones_like(C)],
    non_sequences = [dt, Q, x_t],
    n_steps = num_steps
)

## Function we will use to simulate; returns the state, control and cost
## sequences produced by the scan.
sim_cartpole_T = theano.function(
    [x, dt, C, Q, x_t, num_steps],
    gen_traj,
    updates = traj_update
) ## Can we remove C??

In [806]:
'''
Example of simulating the cartpole
'''

# Arguments follow the input list of sim_cartpole_T: x, dt, C, Q, x_t, num_steps.
# NOTE(review): x0 is not defined in any cell shown here -- it has to be set
# before this cell is run.
x_traj, u_traj, c_traj = sim_cartpole_T(x0, 0.1, 1, np.eye(4), np.array([0,np.pi, 0, 0]), 100)

# Call form of print works on both Python 2 and Python 3.
print(x_traj[-10:])

# Prints the symbolic cost output of the scan, not numeric values.
print(gen_traj[2])

[[  -2.47799548 -113.49662655   -0.35523164  -18.69905353]
 [  -2.51351865 -115.3665319    -1.067611    -16.62361342]
 [  -2.62027975 -117.02889324   -2.0287525   -16.3569875 ]
 [  -2.823155   -118.66459199   -1.1774809   -16.55228305]
 [  -2.94090309 -120.3198203    -0.27601218  -19.20026729]
 [  -2.96850431 -122.23984703   -1.72444398  -15.90647545]
 [  -3.1409487  -123.83049457   -2.0547737   -15.99450836]
 [  -3.34642607 -125.42994541   -0.95323955  -17.31783526]
 [  -3.44175003 -127.16172893   -0.59309686  -18.47300343]
 [  -3.50105972 -129.00902928   -2.15443399  -16.28914898]]
Subtensor{int64::}.0


In [816]:
'''
Next steps: this will allow us to compute the cost of a trajectory for our starting vals, target vals, and parameters.

Then it will symbolically calculate the gradient, allowing us to do gradient descent.
'''

learning_rate = 0.01

# Symbolic placeholders for the cost weight, the target state and the number
# of steps.
# NOTE(review): each call below creates a new symbolic variable. gen_traj must
# already exist when this cell runs, so it was built from whatever Q / x_t /
# num_steps existed at that time, not from the objects created here --
# confirm the intended cell order.
Q = T.matrix('Q')
x_t = T.vector('x_t')
num_steps = T.iscalar('num_steps')

## Compute the cost associated with a given trajectory

# gen_traj[2] is the per-step cost output of the scan; add it up.
total_cost = gen_traj[2].sum()
