In [1]:
%matplotlib inline
import numpy as np
import theano
import theano.tensor as T
theano.config.exception_verbosity='high'
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab


In [2]:
'''
Simulates cartpole starting at x0 with action u -- ported from CS287 Matlab code
'''

def sim_cartpole(x0, u, dt):
    
    def dynamics(x, u):
        mc = 10
        mp = 1
        l = 0.5
        g = 9.81
        T = 0.25
        s = np.sin(x[1])
        c = np.cos(x[1])
        
        xddot = (u + np.multiply(mp*s, l*np.power(x[3],2) + g*c))/(mc + mp*np.power(s,2))
        tddot = (-u*c - np.multiply(np.multiply(mp*l*np.power(x[3],2), c),s) - 
                 np.multiply((mc+mp)*g,s)) / (l * (mc + np.multiply(mp, np.power(s,2))))
        xdot = x[2:4]
        xdot = np.append(xdot, xddot)
        xdot = np.append(xdot, tddot)
        
        return xdot
    
    DT = 0.1
    t = 0
    while t < dt:
        current_dt = min(DT, dt-t)
        x0 = x0 + current_dt * dynamics(x0, u)
        t = t + current_dt
    
    return x0
    
 

In [3]:
'''
Linearizes the dynamics of cartpole around a reference point for use in an LQR controler
'''

def linearize_cartpole(x_ref, u_ref, dt, eps):
    A = np.zeros([4,4])

    for i in range(4):
        increment = np.zeros([4,])
        increment[i] = eps
        A[:,i] = (sim_cartpole(x_ref + increment, u_ref, dt) - 
                  sim_cartpole(x_ref, u_ref, dt)) / (eps)
    
    B = (sim_cartpole(x_ref, u_ref + eps, dt) - sim_cartpole(x_ref, u_ref, dt)) / (eps)
    
    c = x_ref
    
    return A, B, c

In [4]:
'''
Computes the LQR infinte horizon controller associated with linear dyamics A, B and quadratic cost Q, R

NOTE: Current version only works for cartpole because I hardcoded a couple of numbers for now
'''

def lqr_infinite_horizon(A, B, Q, R):
    nA = A.shape[0]

    if len(B.shape) == 1:
        nB = 1
    else:
        nB = B.shape[1]

    P_current = np.zeros([nA, nA])

    P_new = np.eye(nA)

    K_current = np.zeros([nB, nA])

    K_new= np.triu(np.tril(np.ones([nB,nA]),0),0)

    while np.linalg.norm(K_new - K_current, 2) > 1E-4:
        P_current = P_new
      
        K_current = K_new
        
        Quu = R + np.dot(np.dot( np.transpose(B), P_current), B)
        
        K_new = -np.linalg.inv(Quu) * np.dot(np.dot( np.transpose(B), P_current), A)
    
        P_new = Q + np.dot(np.dot( np.transpose(K_new), 
                                   R), 
                                   K_new) + np.dot(np.dot( np.transpose(A + np.dot(B.reshape(4,1), K_new)),
                                                           P_current),
                                                           (A + np.dot(B.reshape(4,1), K_new.reshape(1,4)))
                          )
        
    return K_new, P_new, Quu

In [10]:
'''
Generate LQR trajectory for solving our simple cartpole problem
'''

x_ref = np.array([0, np.pi, 0, 0])
u_ref = 0.
A, B, c = linearize_cartpole(x_ref, u_ref, 0.1, 0.1)
Q = np.eye(4)
R = np.eye(1)
dt = 0.1
x_init = np.array([0, np.pi - np.pi/10, 0, 0])

K_inf, P_inf, Quu = lqr_infinite_horizon(A, B, Q, R)

x_lqr = np.zeros([4,500])
u_lqr = np.zeros([1,500])

x_lqr[:,0] = np.array([0, np.pi - np.pi/10, 0, 0])
u_lqr[:,0] = np.dot(K_inf, (x_lqr[:,0] - x_ref)) + u_ref

for i in range(499):
    x_lqr[:,i+1] = sim_cartpole(x_lqr[:,i], u_lqr[:,i], dt)
    u_lqr[:,i+1] = np.dot(K_inf, (x_lqr[:,i] - x_ref) ) + u_ref
    
Quu

array([[ 2.23925216]])

In [6]:
'''
Generate samples from the guidance trajectory
'''

def gen_traj_guidance(x_init, x_ref, u_ref, K, variance, traj_size, dt):
    xs = len(x_ref)
    
    if type(u_ref) == float:
        us = 1
    else:
        us = len(u_ref)
    
    x_traj = np.zeros([xs, traj_size])
    u_traj = np.zeros([us, traj_size])
    
    x_traj[:,0] = x_init
    u_traj[:,0] = np.random.multivariate_normal(np.dot(K, (x_traj[:,0] - x_ref) ) + u_ref, variance)
    
    for t in range(traj_size-1):
        x_traj[:,t+1] = sim_cartpole(x_traj[:,t], u_traj[:,t], dt)
        u_mean = np.dot(K, (x_traj[:,t] - x_ref) ) + u_ref
        u_traj[:,t+1] = np.random.multivariate_normal(u_mean, variance)
    
    return x_traj, u_traj

x_traj1, u_traj1 = gen_traj_guidance(x_init, x_ref, u_ref, K_inf, variance, 500, dt)
x_traj1[:,-1]

array([-0.29505298,  3.120957  ,  0.14289469,  0.13392869])

In [8]:
"""
Implementation of hidden layer class. Modified from here: http://deeplearning.net/tutorial/mlp.html#mlp.

Note: I changed feeding forward to a function because I couldn't figure out how to update the internal state 'input'
    when iterating to generate paths. May be better to switch back to that in the long run to make more layers
    easier to link together?

"""

class HiddenLayer(object):
    def __init__(self, rng, n_in, n_out, W=None, b=None, activation=T.tanh):
        self.input = input
        
        if W is None:
            W_values = np.asarray(
                rng.uniform(
                    low = -np.sqrt(6. / (n_in + n_out)), #ADDED 0.001
                    high = np.sqrt(6. / (n_in + n_out)), #ADDED 0.001
                    size = (n_in, n_out)
                ),
                dtype = theano.config.floatX)
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
                
            W = theano.shared(value=W_values, name='W', borrow=True)
            
        if b is None:
            b_values = np.zeros((n_out,), dtype = theano.config.floatX)
            b = theano.shared(value = b_values, name='b', borrow=True)
        
        self.W = W
        self.b = b
        
        self.params = [self.W, self.b]
        
        self.activation = activation
        
    def feed_forward(self, input):
        lin_output = T.dot(input, self.W) + self.b
        output = (
            lin_output if self.activation is None
            else self.activation(lin_output)
        )
        
        return output
    
'''
Class wrapper for our basic feed forward network in Theano. 

Modified from here: http://deeplearning.net/tutorial/mlp.html#mlp.

Note: similar to above, modified to make feed-forward a function in the main version
'''

class SingleLayerNet(object):
    
    def __init__(self, rng, n_in, n_hidden, n_out):
                
        self.hiddenLayer = HiddenLayer(
            rng = rng,
            n_in = n_in, 
            n_out = n_hidden, 
            activation = T.tanh
        )
        
        self.outputLayer = HiddenLayer(
            rng = rng,
            n_in = n_hidden,
            n_out = n_out,
            activation = None
        )
             
        
        ## L1 and L2 regularization. Not used for now.
        self.L1 = ( abs(self.hiddenLayer.W).sum() + abs(self.outputLayer.W).sum() )
        
        self.L2_sqr = ( (self.hiddenLayer.W ** 2).sum() + (self.outputLayer.W ** 2).sum())
                
        self.params = self.hiddenLayer.params + self.outputLayer.params
        
        
    def feed_forward(self, input):
        output = self.outputLayer.feed_forward(self.hiddenLayer.feed_forward(input))
        return output
        

In [23]:
'''
Define probability of a trajectory and initialize parameter weights
'''
rng = np.random.RandomState(1234)

policy = SingleLayerNet(rng, 4, 10, 1)

Tpi = theano.shared(np.pi)


def normal_pdf(u, u0, var):
    return (1/(var * T.sqrt(2 * Tpi)))*T.exp( -(u-u0)**2 / (2*var**2) )

def log_traj_prob_q(x_traj, u_traj, x_ref, u_ref, K, var):
    T = x_traj.shape[1]
    
    prob = 0
    for t in range(T):
        prob += np.log(sp.stats.norm.pdf(u_traj[:,t], np.dot(K, (x_traj[:,t] - x_ref) ) + u_ref, var))
    return prob

input = T.vector()
T_feed_forward = theano.function([input], policy.feed_forward(input))

def log_traj_prob_pi(x_traj, u_traj, x_ref, u_ref, var):
    T = x_traj.shape[1]
    
    prob = 0
    for t in range(T):
        prob += np.log(sp.stats.norm.pdf(u_traj[:,t], T_feed_forward(x_traj[:,t]), var))
    return prob

var2 = 10
a = log_traj_prob_pi(x_traj1, u_traj1, x_ref, u_ref, var2)
b = log_traj_prob_q(x_traj1, u_traj1, x_ref, u_ref, K_inf, variance)
a, b

xtraj_T = T.dmatrix('xtraj_T')
utraj_T = T.dvector('utraj_T')
x_paths = T.tensor3('x_paths')
u_paths = T.dmatrix('u_paths')
sig = T.dscalar('sig')
prob = T.dvector('prob')
learning_rate = theano.shared(1)


results, updates = theano.scan(lambda u_i, x_i, var: T.log(normal_pdf(u_i, policy.feed_forward(x_i), var)), 
                               sequences=[utraj_T,xtraj_T.T],
                               non_sequences=[sig])

total = T.sum(results)

calculate_prob = theano.function(inputs=[utraj_T, xtraj_T, sig], outputs=[total]) #total

#print calculate_prob(u_lqr.reshape(500,), x_lqr, 1)

grad_logsum = [T.grad(total, param) for param in policy.params]

calculate_grad = theano.function(inputs=[utraj_T, xtraj_T, sig], outputs=grad_logsum)

calculate_grad(u_lqr.reshape(500,), x_lqr, 2)

updates = [(param, param + learning_rate * gparam) for (param, gparam) in zip(policy.params, grad_logsum)]

initialize_model = theano.function(
    inputs = [utraj_T, xtraj_T, sig],
    outputs = total,
    updates = updates)


xtrajs = np.zeros([4,500,10])
utrajs = np.zeros([1,500,10])

for j in range(10):
    xtrajs[:,:,j], utrajs[:,:,j] = gen_traj_guidance(x_init, x_ref, u_ref, K_inf, variance, 500, dt)
for i in range(1000):
    for j in range(10):
        outi = 0.1 * initialize_model(utrajs[:,:,j].reshape(500,), xtrajs[:,:,j], 100) 
    if (i % 100 == 0):
        print "%d : %.3f" % (i, outi)
        #print "Parameters"
        #print [param.get_value() for param in policy.params]
        #print "Gradient"
        #print calculate_grad(u_lqr.reshape(500,), x_lqr, 2)


0 : -276.352
100 : -276.343
200 : -276.328
300 : -276.315
400 : -276.304
500 : -276.296
600 : -276.291
700 : -276.289
800 : -276.290
900 : -276.306


In [94]:
x_i = T.dvector('x_i')

pt_grad = [T.grad(policy.feed_forward(x_i).sum(), param) for param in policy.params]

T_feed_forward = theano.function([x_i], policy.feed_forward(x_i))

calc_pt_grad = theano.function([x_i], pt_grad)

sigma = 1
Tt = 500
m = 10
var_pi = 0.1
var_q = variance
w_r = 1



def calc_cost_gradient(xtrajs, utrajs, x_ref, u_ref, var_pi, var_q, w_r, Tt, m):
    
    cost_grad = [param.get_value() for param in policy.params]
    cost_grad = [param*0 for param in cost_grad]

    Z_num = np.zeros([m,Tt])
    Z_den = np.zeros([m,Tt])
    Zti = np.zeros([m,Tt])

    # Precompute z factors
    for t in range(Tt):
        for i in range(m):
            if t == 0:
                Z_den[i, t] = np.log(sp.stats.norm.pdf(utrajs[:,t,i], np.dot(K_inf, (xtrajs[:,t,i] - x_ref) ) + u_ref, var_q)) 
                Z_num[i, t] = np.log(sp.stats.norm.pdf(utrajs[:,t,i], T_feed_forward(xtrajs[:,t,i]), var_pi))
            else:
                Z_den[i, t] = Z_den[i, t-1] + np.log(sp.stats.norm.pdf(utrajs[:,t,i], np.dot(K_inf, (xtrajs[:,t,i] - x_ref) ) + u_ref, var_q)) 
                Z_num[i, t] = Z_num[i, t-1] + np.log(sp.stats.norm.pdf(utrajs[:,t,i], T_feed_forward(xtrajs[:,t,i]), var_pi))

    Zti = Z_num - Z_den

    C = np.zeros([m, Tt])
    # Precompute costs
    for t in range(Tt):
        for i in range(m):
            C[i, t] = np.linalg.norm(xtrajs[:,t,i] - x_ref)

    for t in range(Tt):
        #if np.mod(t, 10) == 0: 
        #    print t
        for i in range(m):
            pt_grad = calc_pt_grad(xtrajs[:,t,i])
            pt_FF = T_feed_forward(xtrajs[:,t,i])[0]
            current_u = utrajs[:,t,i][0]

            RHS = 0
            for t2 in range(t,Tt):
                Jt = np.dot(np.exp(Zti[:,t2]), C[:,t2])/sum(np.exp(Zti[:,t2]))
                RHS += ( np.exp(Zti[i,t2])/sum(np.exp(Zti[:,t2])) ) * (C[i, t2] - Jt + w_r)

            r_of_grad = ((current_u - pt_FF )/ (sigma^2))*RHS
            for j in range(len(cost_grad)):
                cost_grad[j] += pt_grad[j] * r_of_grad
            
    return cost_grad, C, pt_grad, Zti, RHS, r_of_grad, Jt, w_r, Z_num, Z_den

In [22]:
learning_rate = 0.001


for step in range(10):
    print step
    grads = calc_cost_gradient(xtrajs, utrajs, x_ref, u_ref, var_pi, var_q, w_r, Tt, m)
    print [param.get_value() for param in policy.params]
    print grads
    updates = [(param, param - learning_rate*grad) for param, grad in zip(policy.params, grads)]
    update_params = theano.function([], updates=updates)
    update_params()

0
[array([[ -1.04903373,  -1.26415246,  -1.0909558 ,  -1.41983731,
          5.93730939,  -0.98843879,   0.08488201,   1.44680123,
          3.8935516 ,   1.99884592],
       [ -3.81884282,  -0.79756337,   2.23817274,   4.21614781,
         -1.75709749,  -7.96353856,  -5.60692262,  -2.56036478,
          3.65207593,   2.31769973],
       [ -1.27379169,  15.03169879, -12.33492665,  -6.96538793,
         13.70969153,  -3.17156936,  -0.39207922,   1.3610368 ,
         -5.9037539 ,   0.35035419],
       [  4.21645399, -10.91878693,   4.90761395,  -6.37882656,
         -3.14820412,   7.20881811,  -8.81388986,   2.24063287,
         -8.93513783,   2.13806456]]), array([  9.37093812,  -7.51210679,   5.19599132,  -7.2658325 ,
        -7.8051358 ,  21.53308866,  15.35700982,   0.0418934 ,
        -4.65429995,   0.12510132]), array([[  9.15309127],
       [-23.6719077 ],
       [ 18.59283578],
       [-15.01531865],
       [-20.83424973],
       [ 24.80251039],
       [ 10.15950916],
       [  0

KeyboardInterrupt: 

In [50]:
[param.get_value() for param in policy.params]

[array([[ -1.39323903,   2.45241547,  -0.73233087,  -1.63707152,
           6.8357398 ,  -0.54114185,   0.06917809,   1.40240891,
           3.06266455,   1.98819506],
        [ -3.38575849,  -1.47705329,   2.17019443,   4.22590421,
          -2.30089308,  -9.26046494,  -5.91324043,  -2.45487764,
           3.87918905,   2.38355789],
        [ -0.8053268 ,  15.30104787, -12.30725895,  -8.95738688,
          15.65580159,  -2.85171037,   0.60164361,   1.34517153,
          -5.89465432,   0.3434334 ],
        [  5.08535856, -10.57072355,   5.04315033,  -6.25583491,
          -4.19093011,   6.08708606,  -9.52748037,   2.27824037,
          -9.36692107,   2.13535656]]),
 array([  7.20770837,  -8.20972613,   5.06421715,  -6.97756834,
         -8.52216376,  24.93880891,  15.87473965,   0.09923698,
         -5.00230538,   0.14823921]),
 array([[  8.43135625],
        [-24.1936242 ],
        [ 18.66978265],
        [-16.1111121 ],
        [-22.80493074],
        [ 24.35942861],
        [ 11.120

In [95]:
grads, C, pt_grad, Zti, rhs, r_of_grad, Jt, w_r, Z_num, Z_den = calc_cost_gradient(xtrajs, utrajs, x_ref, u_ref, var_pi, var_q, w_r, Tt, m)

In [77]:
np.dot(np.exp(Zti[:,499]), C[:,499])/sum(np.exp(Zti[:,499]))

nan

In [78]:
np.dot(np.exp(Zti[:,499]), C[:,499])

0.0

In [79]:
sum(np.exp(Zti[:,499]))

0.0

In [85]:
Zti[:,499]

array([-1018.17143816, -1123.2954605 , -1167.12530625, -1126.62695577,
        -925.93954332,  -917.60005614,  -933.57758987, -1292.94995172,
       -1137.40406385,  -948.6364854 ])

In [96]:
Z_num

array([[-inf, -inf, -inf, ..., -inf, -inf, -inf],
       [-inf, -inf, -inf, ..., -inf, -inf, -inf],
       [-inf, -inf, -inf, ..., -inf, -inf, -inf],
       ..., 
       [-inf, -inf, -inf, ..., -inf, -inf, -inf],
       [-inf, -inf, -inf, ..., -inf, -inf, -inf],
       [-inf, -inf, -inf, ..., -inf, -inf, -inf]])

In [98]:
Z_den[:,499]

array([-1744.6911319 , -1639.46472606, -1595.55561968, -1636.19642222,
       -1836.92784735, -1845.33929327, -1829.2542926 , -1469.70325036,
       -1625.31181025, -1814.25730112])