In [154]:
import numpy as np
import datasets
from scipy.optimize import minimize
from softmax import softmax, log_softmax

In [155]:
# Load the working data set from the project-local `datasets` module.
# X is used below as a 2-D array (its shape is unpacked as (N, D)); y holds the
# labels later passed to datasets.one_hot.
# NOTE(review): "htwt" presumably means height/weight measurements -- confirm in datasets.py.
X, y = datasets.htwt()

In [71]:
# Encode the labels as a 2-D target matrix and record the problem sizes used by
# every later cell: N rows, D columns of X, C columns of Y.
# NOTE(review): one_hot presumably returns one indicator column per class -- confirm in datasets.py.
Y = datasets.one_hot(y)
N, D = X.shape
# N is assigned a second time here; after this line it holds Y.shape[0].
N, C = Y.shape

In [152]:
def f(W, b):
    """Summed cross-entropy of the softmax model on the notebook globals X and Y.

    Parameters
    ----------
    W : array of shape (D, C), weight matrix.
    b : array of shape (C,), bias added to every row of X.dot(W).

    Returns
    -------
    Scalar -sum_{i,c} Y[i, c] * L[i, c], where L = log_softmax(X.dot(W) + b).

    This equals the trace of Y.dot(L.T), but is computed as an elementwise
    sum so the N x N product matrix is never materialised.
    NOTE(review): log_softmax comes from the project-local `softmax` module and
    is assumed to normalise each row of the score matrix -- confirm there.
    """
    return -np.sum(Y * log_softmax(X.dot(W) + b))

def g(W, b):
    """Gradient of the summed cross-entropy `f` with respect to (W, b), flattened.

    Parameters
    ----------
    W : array of shape (D, C), weight matrix.
    b : array of shape (C,), bias vector.

    Returns
    -------
    Python list of length D*C + C: the entries of dL/dW (a (D, C) matrix read in
    row-major order) followed by the entries of dL/db.

    Reads the notebook globals X and Y.
    """
    # Residuals: predicted class probabilities minus targets, one row per sample.
    Z = softmax(X.dot(W) + b) - Y
    # dL/dW[d, c] = sum_i X[i, d] * Z[i, c].  This is the same quantity as summing
    # kron(Z[i], X[i]) and reshaping to (C, D).T, but it is valid for any D and C
    # (a (C, C) reshape only works when D == C).
    dW = X.T.dot(Z)
    db = np.sum(Z, axis=0)
    return list(dW.ravel()) + list(db.ravel())

def hess(W, b):
    """Second derivative of the summed cross-entropy with respect to the weights only.

    For every sample i the term kron(diag(mu_i) - mu_i mu_i^T, x_i x_i^T) is
    accumulated, where mu_i is row i of softmax(X.dot(W) + b) and x_i is row i
    of the global X.  The result is a (C*D, C*D) array; the bias is not included.
    """
    probs = softmax(X.dot(W) + b)
    total = 0
    for i in range(len(probs)):
        mu = probs[i]
        class_curvature = np.diag(mu) - np.outer(mu, mu)
        feature_outer = np.outer(X[i], X[i])
        total = total + np.kron(class_curvature, feature_outer)
    return total

def H(W, b):
    """Flattened second-derivative blocks of the summed cross-entropy.

    Returns a Python list holding, in order:
      * the weight block sum_i kron(diag(p_i) - p_i p_i^T, x_i x_i^T), raveled;
      * the bias block   sum_i (diag(p_i) - p_i p_i^T), raveled;
    where p_i is row i of softmax(X.dot(W) + b) and x_i is row i of the global X.
    Only these two diagonal blocks are computed; the mixed weight/bias terms
    are not part of the result.
    """
    probs = softmax(X.dot(W) + b)
    weight_block = 0
    bias_block = 0
    for i, p in enumerate(probs):
        curvature = np.diag(p) - np.outer(p, p)
        weight_block = weight_block + np.kron(curvature, np.outer(X[i], X[i]))
        bias_block = bias_block + curvature
    return list(weight_block.ravel()) + list(bias_block.ravel())

# Evaluate the hand-written second derivatives at the all-zero parameters.
b = np.zeros(C)
W = np.zeros((D, C))

# print(f(W, b))
# print(g(W, b))
print(hess(W, b))
print(H(W, b))

[[  239264.    520577.5  -239264.   -520577.5]
 [  520577.5  1169693.   -520577.5 -1169693. ]
 [ -239264.   -520577.5   239264.    520577.5]
 [ -520577.5 -1169693.    520577.5  1169693. ]]
[239264.0, 520577.5, -239264.0, -520577.5, 520577.5, 1169693.0, -520577.5, -1169693.0, -239264.0, -520577.5, 239264.0, 520577.5, -520577.5, -1169693.0, 520577.5, 1169693.0, 52.5, -52.5, -52.5, 52.5]


In [214]:
import theano
from theano import tensor as T

# Reference implementation via Theano autodiff.  All parameters live in one flat
# vector z: the first D*C entries are W (reshaped to (D, C)), the last C are b.
z = T.dvector()
b = z[-C:]
W = z[:-C].reshape((D, C))

probs = T.nnet.softmax(T.dot(X, W) + b)
loss = T.sum(T.nnet.categorical_crossentropy(probs, Y))

# Compile the objective, its gradient and its Hessian as functions of z.
H = theano.function([z], T.hessian(loss, z))
g = theano.function([z], T.grad(loss, z))
f = theano.function([z], loss)

# From here on z is the numeric evaluation point (all zeros), not the symbol.
z = np.zeros((D + 1) * C)
print(f(z))
print(g(z))
print(H(z))

145.560907918
[-1842.  1842. -2474.  2474.   -32.    32.]
[[  2.39264000e+05  -2.39264000e+05   5.20577500e+05  -5.20577500e+05
    3.53650000e+03  -3.53650000e+03]
 [ -2.39264000e+05   2.39264000e+05  -5.20577500e+05   5.20577500e+05
   -3.53650000e+03   3.53650000e+03]
 [  5.20577500e+05  -5.20577500e+05   1.16969300e+06  -1.16969300e+06
    7.64700000e+03  -7.64700000e+03]
 [ -5.20577500e+05   5.20577500e+05  -1.16969300e+06   1.16969300e+06
   -7.64700000e+03   7.64700000e+03]
 [  3.53650000e+03  -3.53650000e+03   7.64700000e+03  -7.64700000e+03
    5.25000000e+01  -5.25000000e+01]
 [ -3.53650000e+03   3.53650000e+03  -7.64700000e+03   7.64700000e+03
   -5.25000000e+01   5.25000000e+01]]


In [195]:
# Scratch check of negative-index slicing on symbolic tensors: build a 2x2
# constant, flatten it, reshape it back to 2x2 and keep the last C columns.
x = T.constant([[1, 2], [3, 4]]).ravel().reshape((2, 2))[:, -C:]
# Compile a zero-argument function and evaluate the sliced expression; the
# numeric result is discarded because this is not the cell's last expression.
theano.function([], x)()
# Last expression of the cell: slicing a symbolic double vector gives another
# symbolic variable, which is what the cell displays.
T.dvector()[:2]

Subtensor{:int64:}.0

In [156]:
def predict(model, X):
    """Return the most likely class index for every row of X.

    Parameters
    ----------
    model : pair (W, b) with W of shape (D, C) and b of shape (C,).
    X : array of shape (N, D).

    Returns
    -------
    Integer array of length N holding the column index of the largest score
    X.dot(W) + b in each row.

    Softmax is strictly increasing in each score, so the arg-max of the
    probabilities is the arg-max of the raw scores.  Working on the scores
    directly avoids exponentiating large values and avoids spurious ties
    produced by rounding of the probabilities.
    """
    W, b = model
    return np.argmax(X.dot(W) + b, axis=1)

In [172]:
def crossentropy_loss(X, Y, decode):
    """Build the softmax cross-entropy objective and its derivatives for (X, Y).

    Parameters
    ----------
    X : array of shape (N, D), one sample per row.
    Y : array of shape (N, C), target matrix.  The gradient and Hessian
        formulas assume every row sums to 1 (true for one-hot targets).
    decode : callable mapping a flat parameter vector to the pair (W, b), with
        W of shape (D, C) and b of shape (C,).

    Returns
    -------
    (loss, grad, hess) : three callables of the flat parameter vector.
        loss -> scalar summed cross-entropy.
        grad -> 1-D array of length (D + 1) * C.
        hess -> 2-D array of shape ((D + 1) * C, (D + 1) * C).

    grad and hess are laid out for params == concatenate([W.ravel(), b]), i.e.
    the row-major flattening of the (D + 1, C) matrix obtained by stacking b
    under W.  A `decode` using another layout needs matching changes here.
    """
    N, D = X.shape
    N, C = Y.shape
    n_params = (D + 1) * C
    # Treat the bias as the weight of a constant feature equal to 1, so the
    # Hessian over [W; b] can be written with a single augmented design matrix.
    X1 = np.hstack([X, np.ones((N, 1))])

    def loss(params):
        W, b = decode(params)
        # Same value as trace(Y . L^T), without forming the N x N product.
        return -np.sum(Y * log_softmax(X.dot(W) + b))

    def grad(params):
        W, b = decode(params)
        Z = softmax(X.dot(W) + b) - Y
        # dL/dW[d, c] = sum_i X[i, d] * Z[i, c]; valid for any D and C.
        dW = X.T.dot(Z)
        db = Z.sum(axis=0)
        return np.concatenate([dW.ravel(), db.ravel()])

    def hess(params):
        W, b = decode(params)
        P = softmax(X.dot(W) + b)
        # Per-sample class curvature S[i] = diag(P[i]) - P[i] P[i]^T.
        S = P[:, :, None] * np.eye(C) - P[:, :, None] * P[:, None, :]
        # H[(a, c), (b, e)] = sum_i X1[i, a] * X1[i, b] * S[i, c, e]
        H4 = np.einsum('ia,ib,ice->acbe', X1, X1, S)
        return H4.reshape((n_params, n_params))

    return loss, grad, hess

def fit(X, y):
    """Fit softmax regression with scipy.optimize.minimize.

    Parameters
    ----------
    X : array of shape (N, D).
    y : labels, converted to a target matrix with datasets.one_hot.

    Returns
    -------
    (W, b) : weight matrix of shape (D, C) and bias vector of shape (C,),
        decoded from the optimiser's solution vector.
    """
    Y = datasets.one_hot(y)
    N, D = X.shape
    N, C = Y.shape

    def decode(params):
        # Flat layout: first D*C entries are W in row-major order, last C are b.
        return params[:-C].reshape(D, C), params[-C:]

    loss, grad, hess = crossentropy_loss(X, Y, decode)

    # Start from a float ndarray so the initial guess has the same type that
    # decode() and the objective receive on every later call.
    params0 = np.zeros((D + 1) * C)

    # Alternative using the analytic derivatives:
    #    minimize(loss, params0, method = 'Newton-CG', jac = grad, hess = hess)
    solution = minimize(loss, params0)
    return decode(solution.x)

# Training-set error rate of the scipy-fitted model.
model = fit(X, y)
misclassified = predict(model, X) != y
print('%0.3f' % np.mean(misclassified))

0.119


In [159]:
import tensorflow as tf

def fit(X, y, n_steps=1):
    """Fit softmax regression with TensorFlow's Adam optimiser.

    Parameters
    ----------
    X : array of shape (N, D).
    y : labels, converted to a target matrix with datasets.one_hot.
    n_steps : number of Adam updates to run.  The default of 1 reproduces the
        single update this cell originally performed; starting from all-zero
        parameters, one default-sized Adam step moves them very little, so
        pass a larger value to actually train the model.

    Returns
    -------
    (W, b) : numpy arrays of shape (D, C) and (C,).
    """
    # Variables created with tf.zeros are float32; feed data of the same dtype.
    X = np.asarray(X, dtype=np.float32)
    Y = np.asarray(datasets.one_hot(y), dtype=np.float32)
    N, D = X.shape
    N, C = Y.shape

    # Build in a private graph so repeated calls (e.g. re-running the cell) do
    # not keep adding variables to the global default graph.
    with tf.Graph().as_default():
        W = tf.Variable(tf.zeros((D, C)))
        b = tf.Variable(tf.zeros(C))

        logits = tf.matmul(X, W) + b
        # Keyword arguments: the positional order of logits/labels is not
        # accepted by newer TensorFlow releases.
        per_example = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
        # Explicit reduction; minimising the unreduced vector sums it implicitly.
        loss = tf.reduce_sum(per_example)
        train_step = tf.train.AdamOptimizer().minimize(loss)

        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            for _ in range(n_steps):
                sess.run(train_step)
            return W.eval(), b.eval()
                    
# Training-set error rate of the TensorFlow-fitted model.
model = fit(X, y)
misclassified = predict(model, X) != y
print('%0.3f' % np.mean(misclassified))

0.348


In [63]:
import theano
from theano import tensor as T

def fit(X, y, learning_rate=0.1, n_steps=1):
    """Fit softmax regression by plain gradient descent in Theano.

    Parameters
    ----------
    X : array of shape (N, D).
    y : labels, converted to a target matrix with datasets.one_hot.
    learning_rate : step size multiplying the gradient of the summed loss.
    n_steps : number of full-batch gradient steps.  The defaults (0.1, 1)
        reproduce the single update this cell originally performed.

    Returns
    -------
    (W, b) : numpy arrays of shape (D, C) and (C,).
    """
    # The symbolic inputs below are float64 (dmatrix) and the shared parameters
    # are float64 zeros, so keep the data in float64 as well.
    X = np.asarray(X, dtype=np.float64)
    Y = datasets.one_hot(y)
    N, D = X.shape
    N, C = Y.shape

    W = theano.shared(np.zeros((D, C)))
    b = theano.shared(np.zeros(C))
    # Distinct names so the symbolic placeholders do not shadow the arguments.
    inputs = T.dmatrix()
    targets = T.dmatrix()

    probs = T.nnet.softmax(T.dot(inputs, W) + b)
    loss = T.sum(T.nnet.categorical_crossentropy(probs, targets))
    # One call applies p <- p - learning_rate * dloss/dp to both parameters.
    step = theano.function(
        [inputs, targets],
        updates=[(p, p - T.grad(loss, p) * learning_rate) for p in (W, b)]
    )

    for _ in range(n_steps):
        step(X, Y)
    return W.get_value(), b.get_value()
    
# Training-set error rate of the Theano-fitted model.
model = fit(X, y)
misclassified = predict(model, X) != y
print('%0.3f' % np.mean(misclassified))

0.348
