In [111]:
import numpy as np
import theano
import theano.tensor as T
from theano import function
from scipy.stats import zscore

### Test translating from numpy --> Theano

In [333]:
class MyClass:
    def __init__(self, distr='poisson'):
        self.distr = distr
    
    #--------------
    # nonlinearity
    #--------------
    def qu(self, z):
        if(self.distr=='poisson'):
            eps = T.dscalar('eps')
            eps = 0.1
            q = T.log(1+eps+T.exp(z))
        elif(self.distr=='normal'):
            q = z
        elif(self.distr=='binomial'):
            q = T.exp(z)/(1+T.exp(z))
        return q
    
    #-----------------------
    # conditional intensity
    #-----------------------
    def lmb(self, beta0, beta, x):
        z = beta0 + T.dot(x,beta)
        l = self.qu(z)
        return l
    
    #-----------------
    # log likelihoods
    #-----------------
    def logL(self, beta0, beta, x, y):
        l = self.lmb(beta0, beta, x)
        if(self.distr=='poisson'):
            logL = T.sum(y*T.log(l) - l)
        elif(self.distr=='normal'):
            logL = -0.5*T.sum((y-l)**2)
        elif(self.distr=='binomial'):
            z = beta0 + T.dot(x,beta)
            logL = T.sum(y*z - T.log(1+T.exp(z)))
        return logL

    #---------
    # penalty
    #---------
    def penalty(self, alpha, beta):
        P = 0.5*(1-alpha)*T.sum(beta**2) + alpha*T.sum(T.abs_(beta))
        return P

    #---------------
    # loss function
    #---------------
    def loss(self, beta0, beta, alpha, reg_lambda, x, y):    
        L = self.logL(beta0, beta, x, y)
        P = self.penalty(alpha, beta)
        J = -L + reg_lambda*P
        return J

    #--------------------------------------
    # differentiable part of loss function
    #--------------------------------------
    def L2loss(self, beta0, beta, alpha, reg_lambda, x, y):
        L = self.logL(beta0, beta, x, y)
        P = 0.5*(1-alpha)*T.sum(beta**2)
        J = -L + reg_lambda*P
        return J

    #-------------------
    # proximal operator
    #-------------------
    def prox(self,x,l):
        sx = T.sgn(x) * (T.abs_(x) - l) * (T.abs_(x) > l)
        return sx

    #-----------
    # fit model
    #-----------
    def fit(self, x, y, reg_params, opt_params):
    # Implements batch gradient descent (i.e. vanilla gradient descent by computing gradient over entire training set)
        
        # Dataset shape
        n = x.shape[0]
        p = x.shape[1]
        
        # Initialize shared variable
        beta0 = theano.shared(np.random.randn(), name='beta0')
        beta = theano.shared(np.random.randn(p), name='beta')
        
        # Regularization parameters
        reg_lambda = reg_params['reg_lambda']
        alpha = reg_params['alpha']

        # Optimization parameters
        max_iter = opt_params['max_iter']
        e = opt_params['learning_rate']

        # Initialize parameters
        beta0_hat = np.random.randn()
        beta_hat = np.random.randn(p)
        fit = []

        # Outer loop with descending lambda
        for l,rl in enumerate(reg_lambda):
            fit.append({'beta0': 0., 'beta': np.zeros(p), 'L': 10., 'DL': 10.})
            len(fit)
            print('Lambda: {}\n').format(rl)

            # Warm initialize parameters
            if(l == 0):
                fit[-1]['beta0'] = beta0_hat
                fit[-1]['beta'] = beta_hat
            else:
                fit[-1]['beta0'] = fit[-2]['beta0']
                fit[-1]['beta'] = fit[-2]['beta']

            #---------------------------
            # Iterate until convergence
            #---------------------------
            no_convergence = 1
            convergence_threshold = 1e-3
            t = 0

            # Initialize parameters
            beta0.set_value(fit[-1]['beta0'])
            beta.set_value(fit[-1]['beta'])

            # Initialize loss
            L = []
            DL = []

            #Give formula for gradient
            L2loss = self.L2loss(beta0, beta, alpha, rl, x, y)
            grad_beta0, grad_beta = T.grad(L2loss, [beta0, beta])
                
            while(no_convergence and t < max_iter):
                # Update time step
                t = t+1
                
                # Update parameters
                beta0 = beta0 -e*grad_beta0
                beta = self.prox(beta -e*grad_beta, rl*alpha)

                # Calculate loss
                L.append(self.loss(beta0, beta, alpha, rl, x, y).eval())
                print('    iter:{}, loss:{}'.format(t, L[-1]))
                # Delta loss and convergence criterion
                if t > 1:
                    DL.append(L[-1] - L[-2])
                    if(np.abs(DL[-1]/L[-1]) < convergence_threshold):
                        no_convergence = 0
                        print('Converged')
                        print('    Loss function: {}').format(L[-1])
                        print('    dL/L: {}\n').format(DL[-1]/L[-1])

                #if t==99:
                #        no_convergence = 0
                #        print('Converged')
                        
            #Store the parameters after convergence
            print beta0.eval()
            fit[-1]['beta0'] = beta0.eval()
            fit[-1]['beta'] = beta.eval()

        return fit

    #-----------------------------
    # Define the predict function
    #-----------------------------
    def predict(self, x, fitparams):
        yhat = self.lmb(fitparams['beta0'], fitparams['beta'], zscore(x))
        return yhat


In [334]:
mm = MyClass('poisson')

In [335]:
N = 1000
p = 100

x = np.random.randn(N,p)
beta = np.random.randn(p)
beta0 = np.random.randn()
y = np.random.poisson(mm.lmb(beta0, beta, x).eval())

INFO (theano.gof.compilelock): Refreshing lock /home/pavan/.theano/compiledir_Linux-3.16--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.11-64/lock_dir/lock


In [None]:
# Set regularization parameters
reg_lambda = np.logspace(np.log(0.5), np.log(0.01), 10, base=np.exp(1))
alpha = 0.1

fit_params = dict()
fit_params['reg_lambda'] = reg_lambda[-2:]
fit_params['alpha'] = alpha

# Set optimization parameters
opt_params = dict()
opt_params['learning_rate'] = 1e-4
opt_params['max_iter'] = 1000

# Fit model to training data
fit = mm.fit(zscore(x),y,fit_params,opt_params)


Lambda: 0.0154445210495

<TensorType(float64, scalar)>
    iter:1
    iter:1, loss:3697.5048762
<TensorType(float64, scalar)>
    iter:2
    iter:2, loss:3591.85011166
<TensorType(float64, scalar)>
    iter:3
    iter:3, loss:3486.48924325
<TensorType(float64, scalar)>
    iter:4
    iter:4, loss:3381.74311572
<TensorType(float64, scalar)>
    iter:5
    iter:5, loss:3277.64168672
<TensorType(float64, scalar)>
    iter:6
    iter:6, loss:3174.28916017
<TensorType(float64, scalar)>
    iter:7
    iter:7, loss:3071.72845449
<TensorType(float64, scalar)>
    iter:8
    iter:8, loss:2969.96637885
<TensorType(float64, scalar)>
    iter:9
    iter:9, loss:2869.00312926
<TensorType(float64, scalar)>
    iter:10
    iter:10, loss:2768.85645198
<TensorType(float64, scalar)>
    iter:11
    iter:11, loss:2669.57506652
<TensorType(float64, scalar)>
    iter:12
    iter:12, loss:2571.2408361
<TensorType(float64, scalar)>
    iter:13
    iter:13, loss:2474.97523886
<TensorType(float64, scalar)>
   

### Time numpy vs. theano fixed point operations

In [108]:
z = T.dvector('z')
q = T.log(1+0.1+T.exp(z))
qu = function([z], q)


In [109]:
z0 = np.random.randn(200000)
type(z0)

numpy.ndarray

In [110]:
%timeit qu(z0)

100 loops, best of 3: 13.5 ms per loop


In [111]:
def qu_np(z):
    eps = 0.1
    q = np.log(1+eps+np.exp(z))
    return q

In [112]:
%timeit qu_np(z0)

100 loops, best of 3: 12.4 ms per loop


In [99]:
class MyClass:
    # qu function in theano
    z = T.dvector('z')
    q = T.log(1+0.1+T.exp(z))
    qu = function([z], q)
    
    # lmb function in theano
    x = T.dmatrix('x')
    beta = T.dvector('beta')
    beta0 = T.dscalar('beta0')
    z = beta0 + T.dot(x,beta)
    l = qu(z)
    #l = T.log(1+0.1+T.exp(z))
    lmb = function([beta0, beta, x], l)