In [742]:
import numpy as np
import theano
import theano.tensor as T
theano.config.exception_verbosity='high'



# Section 1: pure python

In [955]:
'''
Simulates cartpole starting at x0 with action u -- ported from CS287 Matlab code
'''

def sim_cartpole(x0, u, dt):
    
    def dynamics(x, u):
        mc = 10
        mp = 1
        l = 0.5
        g = 9.81
        T = 0.25
        s = np.sin(x[1])
        c = np.cos(x[1])
        
        xddot = (u + np.multiply(mp*s, l*np.power(x[3],2) + g*c))/(mc + mp*np.power(s,2))
        tddot = (-u*c - np.multiply(np.multiply(mp*l*np.power(x[3],2), c),s) - 
                 np.multiply((mc+mp)*g,s)) / (l * (mc + np.multiply(mp, np.power(s,2))))
        xdot = x[2:4]
        xdot = np.append(xdot, xddot)
        xdot = np.append(xdot, tddot)
        
        return xdot
    
    DT = 0.1
    t = 0
    while t < dt:
        current_dt = min(DT, dt-t)
        x0 = x0 + current_dt * dynamics(x0, u)
        t = t + current_dt
    
    return x0
    
 

In [956]:
'''
Linearizes the dynamics of cartpole around a reference point for use in an LQR controler
'''

def linearize_cartpole(x_ref, u_ref, dt, eps):
    A = np.zeros([4,4])

    for i in range(4):
        increment = np.zeros([4,])
        increment[i] = eps
        A[:,i] = (sim_cartpole(x_ref + increment, u_ref, dt) - 
                  sim_cartpole(x_ref, u_ref, dt)) / (eps)
    
    B = (sim_cartpole(x_ref, u_ref + eps, dt) - sim_cartpole(x_ref, u_ref, dt)) / (eps)
    
    c = x_ref
    
    return A, B, c

In [957]:
'''
Computes the LQR infinte horizon controller associated with linear dyamics A, B and quadratic cost Q, R

NOTE: Current version only works for cartpole because I hardcoded a couple of numbers for now
'''

def lqr_infinite_horizon(A, B, Q, R):
    nA = A.shape[0]

    if len(B.shape) == 1:
        nB = 1
    else:
        nB = B.shape[1]

    P_current = np.zeros([nA, nA])

    P_new = np.eye(nA)

    K_current = np.zeros([nB, nA])

    K_new= np.triu(np.tril(np.ones([nB,nA]),0),0)

    while np.linalg.norm(K_new - K_current, 2) > 1E-4:
        P_current = P_new
      
        K_current = K_new
        
        K_new = -np.linalg.inv(R + np.dot(np.dot( np.transpose(B), 
                                                  P_current), 
                                                  B)) * np.dot(np.dot( np.transpose(B), 
                                                                       P_current), 
                                                                       A)

        P_new = Q + np.dot(np.dot( np.transpose(K_new), 
                                   R), 
                                   K_new) + np.dot(np.dot( np.transpose(A + np.dot(B.reshape(4,1), K_new)),
                                                           P_current),
                                                           (A + np.dot(B.reshape(4,1), K_new.reshape(1,4)))
                          )
        
    return K_new, P_new

In [958]:
x_ref = np.array([0, np.pi, 0, 0])
A, B, c = linearize_cartpole(x_ref, 0, 0.1, 0.1)
Q = np.eye(4)
R = np.eye(1)

K_inf, P_inf = lqr_infinite_horizon(A, B, Q, R)

In [959]:
'''
Basic implementation of a feed forward neural network with a single hidden layer. 

Takes input, a 1-d parameter vector, an activation function, and numbers of neurons at each layer. The parameter 
vector should be encoded as [W1.flatten(); B1; W2.flatten(); B2] where the Ws and Bs are the matrix weights and 
offset vectors. Activation function for the output layer is assumed to be linear.

'''

def simpleFeedForward(input, params, n_in, n_hidden, n_out, activation=np.tanh):
    
    ## Reshape our parameters to be used to calculate the output
    
    w1 = params[0 : n_in*n_hidden].reshape(n_hidden, n_in)
    b1 = params[n_in*n_hidden : n_in*n_hidden + n_hidden]
    w2 = params[n_in*n_hidden + n_hidden : n_in*n_hidden + n_hidden + n_hidden * n_out].reshape(n_out, n_hidden)
    b2 = params[n_in*n_hidden + n_hidden + n_hidden * n_out:]
    
    lin_midstep = np.dot(w1, input) + b1
    if activation == None:
        midstep = lin_midstep
    else: 
        midstep = activation(lin_midstep)
    
    output = np.dot(w2, midstep) + b2
    
    return output

In [1042]:
'''
Function to evaluate the total penalty associated with a parameter vector theta for our simple feed forward network.

Simulates cartpole with the controller given by theta and computes the sum of costs, which are assumed to be a 
quadratic form of the distance from the current position x to the target position.

'''

def evaluate_theta(start, target, T, Q, params, n_in, n_hidden, n_out):

    dt = 0.1
    
    u = np.zeros(T)
    x = np.zeros([n_in, T+1])
    x[:, 0] = start
    for t in range(T):
        u[t] = simpleFeedForward(x[:, t], params, n_in, n_hidden, n_out)
        x[:, t+1] = sim_cartpole(x[:,t], u[t], dt)
    
    x_diff = x - np.transpose(np.tile(target, (x.shape[1],1)))
    penalty =  np.sum(np.diag(np.dot(np.transpose(x_diff), np.dot(Q, x_diff))))
    
    return penalty, x, u
    
    


In [14]:
'''
Some code to try to run policy gradient. Does not work yet - blows up to NaNs.

'''

##
## Initialize our variables
##
rng = np.random.RandomState(1234)
input = np.array([0, np.pi - np.pi/10, 0, 0])
n_in = 4
n_hidden = 20
n_out = 1

W1 = np.asarray(
                rng.uniform(
                    low = -np.sqrt(6. / (n_in + n_hidden)), 
                    high = np.sqrt(6. / (n_in + n_hidden)),
                    size = (n_in, n_hidden)
                ))

W2 = np.asarray(
                rng.uniform(
                    low = -np.sqrt(6. / (n_out + n_hidden)), 
                    high = np.sqrt(6. / (n_out + n_hidden)),
                    size = (n_hidden, n_out)
                ))

B1 = np.zeros([n_hidden,1])
B2 = np.zeros([n_out, 1])

## Cost function penalizes equal deviation from target coordinates
target = np.array([0, np.pi, 0, 0])
Q = np.eye(n_in)

def penalty(x):
    if len(x.shape) == 1:
        return np.dot(x-target,np.dot(Q, x-target))
    else:
        x_diff = x - np.transpose(np.tile(target, (x.shape[1],1)))
        return np.sum(np.diag(np.dot(np.transpose(x_diff), np.dot(Q, x_diff))))
    


params = np.append(np.append(np.append(W1.flatten(), B1), W2.flatten()), B2)

max_iter = 100

learning_rate = 0.0001

epsilon = 0.1

penalties = np.zeros(max_iter)

for i in range(max_iter):
    
    ## Generate trajectory to evaluate our policy

    start = np.array([0, np.pi - np.pi/10, 0, 0])
    
    penalties[i], x, u = evaluate_theta(start, target, 500, Q, params, n_in, n_hidden, n_out)
        
    ## Perform stochastic gradient descent
    
    # Choose random parameter to update
    direction = np.random.randint(0, len(params))
    
    unitv = np.zeros(len(params))
    unitv[direction] = epsilon
    
    # Calculate the penalty in that direction
    new_penalty, x, u = evaluate_theta(start, target, 500, Q, params + unitv, n_in, n_hidden, n_out)
    
    partial_x = unitv
    partial_x[direction] = (new_penalty - penalties[i])/epsilon
    
    ## Update the parameter vector
    params = params - learning_rate * partial_x

        



# Section 2 - Theano code

## A. Classes

In [997]:
"""
Implementation of hidden layer class. Modified from here: http://deeplearning.net/tutorial/mlp.html#mlp.

Note: I changed feeding forward to a function because I couldn't figure out how to update the internal state 'input'
    when iterating to generate paths. May be better to switch back to that in the long run to make more layers
    easier to link together?

"""

class HiddenLayer(object):
    def __init__(self, rng, n_in, n_out, W=None, b=None, activation=T.tanh):
        self.input = input
        
        if W is None:
            W_values = np.asarray(
                rng.uniform(
                    low = -np.sqrt(6. / (n_in + n_out)), 
                    high = np.sqrt(6. / (n_in + n_out)),
                    size = (n_in, n_out)
                ),
                dtype = theano.config.floatX)
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
                
            W = theano.shared(value=W_values, name='W', borrow=True)
            
        if b is None:
            b_values = np.zeros((n_out,), dtype = theano.config.floatX)
            b = theano.shared(value = b_values, name='b', borrow=True)
        
        self.W = W
        self.b = b
        
        self.params = [self.W, self.b]
        
        self.activation = activation
        
    def feed_forward(self, input):
        lin_output = T.dot(input, self.W) + self.b
        output = (
            lin_output if self.activation is None
            else self.activation(lin_output)
        )
        
        return output
        

In [999]:
'''
Class wrapper for our basic feed forward network in Theano. 

Modified from here: http://deeplearning.net/tutorial/mlp.html#mlp.

Note: similar to above, modified to make feed-forward a function in the main version
'''

class SingleLayerNet(object):
    
    def __init__(self, rng, n_in, n_hidden, n_out):
        
        dt = 0.1  ## Delete?
        
        num_steps = 10  ## Delete?
        
        self.hiddenLayer = HiddenLayer(
            rng = rng,
            n_in = n_in, 
            n_out = n_hidden, 
            activation = T.tanh
        )
        
        self.outputLayer = HiddenLayer(
            rng = rng,
            n_in = n_hidden,
            n_out = n_out,
            activation = None
        )
             
        
        ## L1 and L2 regularization. Not used for now.
        self.L1 = ( abs(self.hiddenLayer.W).sum() + abs(self.outputLayer.W).sum() )
        
        self.L2_sqr = ( (self.hiddenLayer.W ** 2).sum() + (self.outputLayer.W ** 2).sum())
        
        ## Don't think notion of errors makes sense
        #self.errors = self.logRegressionLayer.errors
        
        self.params = self.hiddenLayer.params + self.outputLayer.params
        
        
    def feed_forward(self, input):
        return self.outputLayer.feed_forward(self.hiddenLayer.feed_forward(input))
        

## B. Variable definitions

In [1000]:
## Theano variables for input to our functions below
C = T.scalar('C')
Q = T.matrix('Q')
num_steps = T.iscalar('num_steps')
x_t = T.vector('x_t')
learning_rate = theano.shared(1E-7)

rng = np.random.RandomState(1234)
policy = SingleLayerNet(rng, 4, 10, 1)

## Global variables for cartpole dynamics updates
mc = 10
mp = 1
l = 0.5
g = 9.81

## C. Functions

In [1001]:
'''
COMPUTE TRAJECTORY
'''
## Compute the cartpole dyanmics updates
def calc_step(x, u, dt):
    xdot = T.concatenate( [
                x[2:4], 
                (u + mp*T.sin(x[1])*(l*x[3]**2 + g*T.cos(x[1])))/(mc + mp*T.sin(x[1])**2),
                (-u*T.cos(x[1]) -(mp*l*x[3]**2) * T.cos(x[1]) * T.sin(x[1]) - 
                     (mc+mp)*g* T.sin(x[1])) / (l * (mc + mp * T.sin(x[1]) ** 2))
            ])
    
    return x + dt * xdot

## Symbolically compute the trajectory associated with the policy encoded in our network
gen_traj, traj_update = theano.scan(
    lambda x, u, cost, dt, Q, x_t: [
        calc_step(x,u,dt), 
        policy.feed_forward(calc_step(x,u,dt)), 
        T.dot(T.dot(x - x_t, Q), x-x_t)
    ],
    outputs_info = [x, policy.feed_forward(x), T.ones_like(C)],
    non_sequences = [dt, Q, x_t],
    n_steps = num_steps
)

## Compiled Theano function for simulating the cartpole trajectory
sim_cartpole_T = theano.function([x, dt, C, Q, x_t, num_steps], gen_traj, updates=traj_update) ## Can we remove C??

In [928]:
'''
SET UP GRADIENT DESCENT UPDATES
'''

# Compute the cost associated with a given trajectoy
total_cost = T.sum(gen_traj[2])

gradient = [T.grad(total_cost, param) for param in policy.params]

#output = [total_cost, policy.params[0], gradient]

#outputs = [total_cost, gradient]
#outputs += [param for param in policy.params]


updates = [(param, param - learning_rate * gparam) for param, gparam in zip(policy.params, gradient)]

gradient_step = theano.function(
    inputs= [x, dt, C, Q, x_t, num_steps], 
    outputs= total_cost, 
    updates=updates
                                )



In [1010]:
'''
CALC COST, NEXT STEP, AND COMPUTE GRADIENT: DEBUGGING FUNCTIONS
'''

#compute_gradient = theano.function(
#    inputs = [x, dt, C, Q, x_t, num_steps],
#    outputs = gradient
#)

#compute_cost = theano.function(
#    inputs = [x, dt, C, Q, x_t, num_steps],
#    outputs = total_cost
#)

compute_next_step = theano.function(
    inputs = [x, u, dt],
    outputs = calc_step(x, u, dt)
)

compute_control_input = theano.function(
    inputs = [x],
    outputs = policy.feed_forward(x))

## D. Examples

In [None]:
'''
SIMULATE CARTPOLE
'''

x_traj, u_traj, c_traj = sim_cartpole_T(x0, 0.1, 1, np.eye(4), np.array([0,np.pi, 0, 0]), 100)

print x_traj[-10:]

In [910]:
'''
TRAIN THE MODEL
'''

x0 = np.array([0, np.pi - np.pi/10, 0, 0])
Q0 = np.eye(4)
x_t0 = np.array([0, np.pi, 0, 0])
path_length = 500

max_iter = 10
tolerance = 0.1
loss_iter = []

for i in range(max_iter):
    if (i % 10 == 0):
        print i
    
    loss_iter.append(gradient_step(x = x0, 
                                   dt = 0.1,
                                   C = 1,
                                   Q = Q0,
                                   x_t = x_t0,
                                   num_steps = path_length
                                  )
                    )
    
    current_weights = policy.params
    ## Break when the difference between successive steps is small
    #if i > 0:
    #    if np.abs(loss_iter[i] - loss_iter[i-1]):
    #        break   


0


In [930]:
[param.get_value() for param in policy.params]

[array([[-0.40389585,  0.15987791, -0.08153353,  0.37362209,  0.36657438,
         -0.29774617, -0.29267699,  0.39524346,  0.59984522,  0.49221136],
        [-0.18616089,  0.00130293,  0.24020937,  0.27849233, -0.16988164,
          0.08012462,  0.00403681, -0.63662654,  0.3572139 ,  0.50099492],
        [-0.17690577,  0.15108906, -0.55595646, -0.17174969,  0.56711352,
          0.19820051, -0.13459342,  0.3780365 , -0.23981781,  0.08916207],
        [ 0.4833012 , -0.0835686 ,  0.39560413, -0.46641871,  0.26744039,
          0.26785981, -0.36818756,  0.55628231, -0.07575553,  0.53592039]]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([[-0.65020487],
        [-0.46633888],
        [-0.66860056],
        [ 0.25831627],
        [ 0.13977006],
        [ 0.04920237],
        [-0.67455506],
        [ 0.09074267],
        [-0.25159638],
        [ 0.0043823 ]]),
 array([ 0.])]

In [931]:
compute_gradient(
    x = x0,
    dt = 0.1, 
    C = 1,
    Q = Q0,
    x_t = x_t0,
    num_steps = path_length
)

[array([[ -3.38628206e+03,  -2.71029260e+06,  -2.23740378e+02,
           3.24032906e+02,   7.84169639e+02,   6.14002030e+04,
          -2.16855556e+02,   1.43394391e+02,  -1.39972025e+03,
           1.07858852e+00],
        [ -3.90804517e+06,  -7.81069913e+07,   8.70493071e+04,
           9.38288561e+05,   6.94381360e+05,   1.98308349e+05,
           8.43941699e+04,   4.19861765e+04,  -2.89411837e+04,
          -3.61929299e+02],
        [  1.00414462e+05,  -9.70872628e+05,  -1.73246346e+03,
          -3.71718270e+04,  -3.28545188e+04,   3.48526740e+03,
          -6.53258312e+02,  -2.77350879e+03,   1.44247451e+04,
           9.55770370e+00],
        [ -1.51722105e+06,  -1.13066753e+07,  -4.95092424e+04,
           5.91058845e+05,   5.44002070e+05,   1.08913205e+04,
          -3.22916210e+04,   4.81845346e+04,  -3.95528720e+05,
           2.42316391e+02]]),
 array([  1.01709805e+05,   7.45554712e+05,   3.24692216e+04,
         -4.24526108e+04,  -4.18788374e+04,  -3.25662573e+03,
      

In [932]:
compute_cost(
     x = x0,
    dt = 0.1, 
    C = 1,
    Q = Q0,
    x_t = x_t0,
    num_steps = path_length
    )

array(118406247.35960904)

In [934]:
theano_path = sim_cartpole_T(
    x = x0,
    dt = 0.1, 
    C = 1,
    Q = Q0,
    x_t = x_t0,
    num_steps = path_length
    )

In [952]:
theano_path[0]

array([[  0.00000000e+00,   2.82743339e+00,  -3.10067555e-02,
         -6.65269697e-01],
       [ -3.10067555e-03,   2.76090642e+00,  -6.06702043e-02,
         -1.32798427e+00],
       [ -9.16769598e-03,   2.62810799e+00,  -9.19321233e-02,
         -2.11502816e+00],
       ..., 
       [ -2.79128291e+02,  -8.33340287e+02,  -1.32337141e+01,
         -1.71993952e+01],
       [ -2.80451662e+02,  -8.35060226e+02,  -1.22543988e+01,
         -1.72927900e+01],
       [ -2.81677102e+02,  -8.36789505e+02,  -1.13872279e+01,
         -1.98342062e+01]])

In [937]:
initial_weights = [param.get_value() for param in policy.params]

In [1020]:
reshaped_params = np.array([])
for weight in initial_weights:
    weight = np.transpose(weight).flatten()
    reshaped_params = np.concatenate((reshaped_params, weight))

In [1021]:
reshaped_params

array([-0.40389585, -0.18616089, -0.17690577,  0.4833012 ,  0.15987791,
        0.00130293,  0.15108906, -0.0835686 , -0.08153353,  0.24020937,
       -0.55595646,  0.39560413,  0.37362209,  0.27849233, -0.17174969,
       -0.46641871,  0.36657438, -0.16988164,  0.56711352,  0.26744039,
       -0.29774617,  0.08012462,  0.19820051,  0.26785981, -0.29267699,
        0.00403681, -0.13459342, -0.36818756,  0.39524346, -0.63662654,
        0.3780365 ,  0.55628231,  0.59984522,  0.3572139 , -0.23981781,
       -0.07575553,  0.49221136,  0.50099492,  0.08916207,  0.53592039,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.65020487, -0.46633888, -0.66860056,  0.25831627,  0.13977006,
        0.04920237, -0.67455506,  0.09074267, -0.25159638,  0.0043823 ,  0.        ])

In [1022]:
np_path = np.zeros([4, 501])
np_path[:, 0] = x0
u_np = np.zeros([500,])
for t in range(500):
    u_np[t] = simpleFeedForward(np_path[:, t], reshaped_params, 4, 10, 1)
    np_path[:, t+1] = sim_cartpole(np_path[:,t], u_np[t], 0.1)

In [1023]:
np_path[:, 0:5]

array([[  0.00000000e+00,   0.00000000e+00,  -3.10067555e-03,
         -9.16769598e-03,  -1.83609083e-02],
       [  2.82743339e+00,   2.82743339e+00,   2.76090642e+00,
          2.62810799e+00,   2.41660518e+00],
       [  0.00000000e+00,  -3.10067555e-02,  -6.06702043e-02,
         -9.19321233e-02,  -1.22322672e-01],
       [  0.00000000e+00,  -6.65269697e-01,  -1.32798427e+00,
         -2.11502816e+00,  -3.13173561e+00]])

In [1034]:
np.transpose(theano_path[0])[:,0], np.transpose(theano_path[0])[:,1]

(array([ 0.        ,  2.82743339, -0.03100676, -0.6652697 ]),
 array([-0.00310068,  2.76090642, -0.0606702 , -1.32798427]))

In [1029]:
x_1 = np.transpose(theano_path[0])[:,0]

u_1 = simpleFeedForward(x_1, reshaped_params, 4, 10, 1)

x_2 = sim_cartpole(x_1, u_1, 0.1)

x_1, x_2

(array([ 0.        ,  2.82743339, -0.03100676, -0.6652697 ]),
 array([-0.00310068,  2.76090642, -0.0606702 , -1.32798427]))

In [1031]:
x_2_T = compute_next_step(x_1, compute_control_input(x_1), 0.1)

x_1, x_2_T

(array([ 0.        ,  2.82743339, -0.03100676, -0.6652697 ]),
 array([-0.00310068,  2.76090642, -0.0606702 , -1.32798427]))

In [1045]:
evaluate_theta(x0, np.array([0, np.pi, 0, 0]))

ValueError: setting an array element with a sequence.

In [1019]:
 - np.transpose(theano_path[0]))

ValueError: shapes (4,10) and (4,) not aligned: 10 (dim 1) != 4 (dim 0)