In [5]:
import numpy as np
import datasets
from scipy.optimize import minimize

from softmax import softmax, log_softmax

In [6]:
X, y = datasets.htwt()

In [7]:
Y = datasets.one_hot(y)
N, D = X.shape
N, C = Y.shape

In [32]:
f = lambda W, b: -np.trace(Y.dot(log_softmax(X.dot(W) + b).T))

def g(W, b):
    ws = []
    for c in xrange(C):
        ws.append(sum([(W[:, c].dot(x) + b[c] - Y[i, c]) * x for i, x in enumerate(X)]))
    return np.array(ws).T

W = np.zeros((D, C))
b = np.zeros(C)

print f(W, b)
print g(W, b)

145.560907918
[[ -8915.  -5231.]
 [-17768. -12820.]]


In [9]:
import theano
from theano import tensor as T

W = T.dmatrix()
b = T.dvector()
loss = T.sum(T.nnet.categorical_crossentropy(T.nnet.softmax(T.dot(X, W) + b), Y))
f = theano.function([W, b], loss)
g = theano.function([W, b], T.grad(loss, W))

W = np.zeros((D, C))
b = np.zeros(C)

print f(W, b)
print g(W, b)

145.560907918
[[-1842.  1842.]
 [-2474.  2474.]]


In [25]:
def predict(model, X):
    W, b = model
    return np.argmax(softmax(X.dot(W) + b), axis = 1)

In [17]:
def crossentropy_loss(X, Y, decode):
    N, D = X.shape
    N, C = Y.shape
    
    def loss(params):
        W, b = decode(params)
        return -np.trace(Y.dot(log_softmax(X.dot(W) + b).T))
#        return -sum([Y[i].dot(ll) for i, ll in enumerate(log_softmax(X.dot(W) + b))])

    def grad(params):
        W, b = decode(params)
        ws = []
        for c in xrange(C):
            ws.append(sum([(W[:, c].dot(x) + b[i] - Y[i, c]) * x for i, x in enumerate(X)])
        # return sum([np.kron(mu - Y[i], X[i]) for i, mu in enumerate(softmax(X.dot(W) + b))])

    def hess(params):
        W, b = decode(params)
        o = lambda x: np.outer(x, x)
        return sum([np.kron(np.diag(mu) - o(mu), o(X[i])) for i, mu in enumerate(softmax(X.dot(W) + b))])
    
    return loss, grad, hess

def fit(X, y):
    Y = datasets.one_hot(y)
    N, D = X.shape
    N, C = Y.shape
    loss = crossentropy_loss

    params = [0] * (D + 1) * C
    decode = lambda params: (params[:-C].reshape(D, C), params[-C:])
    
    loss, grad, hess = loss(X, Y, decode)

    # params = minimize(loss, params, method = 'Newton-CG', jac = grad, hess = hess).x
    params = minimize(loss, params).x
    return decode(params)

model = fit(X, y)
print '%0.3f' % np.mean(predict(model, X) != y)

0.119


In [68]:
import tensorflow as tf

def fit(X, y):
    Y = datasets.one_hot(y)
    N, D = X.shape
    N, C = Y.shape
    
    W = tf.Variable(tf.zeros((D, C)))
    b = tf.Variable(tf.zeros(C))

    loss = tf.nn.softmax_cross_entropy_with_logits(tf.matmul(X, W) + b, Y)
    optimizer = tf.train.AdamOptimizer().minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        sess.run(optimizer)
        return W.eval(), b.eval()
                    
model = fit(X, y)
print '%0.3f' % np.mean(predict(model, X) != y)

0.348


In [63]:
import theano
from theano import tensor as T

def fit(X, y):
    X = X.astype(np.float32)
    Y = datasets.one_hot(y)
    N, D = X.shape
    N, C = Y.shape

    learning_rate = 0.1
    
    W = theano.shared(np.zeros((D, C)))
    b = theano.shared(np.zeros(C))
    x = T.dmatrix()
    y = T.dmatrix()
    
    loss = T.sum(T.nnet.categorical_crossentropy(T.nnet.softmax(T.dot(x, W) + b), y))
    optimizer = theano.function([x, y], updates =
        [[p, p - T.grad(loss, p) * learning_rate] for p in [W, b]]
    )
    
    optimizer(X, Y)
    return W.get_value(), b.get_value()
    
model = fit(X, y)
print '%0.3f' % np.mean(predict(model, X) != y)

0.348
