In [334]:
import numpy as np
import datasets
from scipy.optimize import minimize
from scipy.special import expit
from sklearn.metrics import log_loss
from softmax import softmax, log_softmax
import theano
from theano import tensor as T
import tensorflow as tf
from scipy import stats
from numpy.linalg import inv, eigh
from random import sample
from scipy.linalg import sqrtm

In [335]:
# X, y = datasets.htwt()
X, y = datasets.iris()
Y = datasets.one_hot(y)
N, D = X.shape
N, C = Y.shape
Xaug = np.hstack([np.ones((N, 1)), X])

In [213]:
z = T.dvector()
decode = lambda params: (params[1:].reshape((D, 1)), params[0])
mu = lambda w, b: T.nnet.sigmoid(T.dot(X, w) + b)
loss = T.sum(T.nnet.binary_crossentropy(mu(*decode(z)), y.reshape((N, 1))))
grad = theano.function([z], T.grad(loss, z))
hess = theano.function([z], T.hessian(loss, z))
loss = theano.function([z], loss)

In [215]:
mu = lambda w: expit(Xaug.dot(w))
loss = lambda w: log_loss(y, mu(w), normalize = False)
grad = lambda w: Xaug.T.dot(mu(w) - y)
S = lambda mu: np.diag(mu * (1 - mu))
hess = lambda w: Xaug.T.dot(S(mu(w))).dot(Xaug)

In [216]:
w = np.array([0.0] * (D + 1))
w = minimize(loss, w).x
w = minimize(loss, w, method = 'Newton-CG', jac = grad, hess = hess, tol = 1e-6).x
yhat = expit(Xaug.dot(w)) > 0.5
print '%0.3f' % np.mean(yhat != y)

0.119


In [6]:
decode = lambda params: params.reshape((D + 1, C))
z = T.dvector()
mu = lambda W: T.nnet.softmax(T.dot(Xaug, W))
loss = T.sum(T.nnet.categorical_crossentropy(mu(decode(z)), Y))
grad = theano.function([z], T.grad(loss, z))
hess = theano.function([z], T.hessian(loss, z))    
loss = theano.function([z], loss)

In [7]:
decode = lambda params: np.array(params).reshape((D + 1, C))

mu = lambda W: softmax(Xaug.dot(W))
loss = lambda params: log_loss(y, mu(decode(params)), normalize = False)

def grad(params):
    Z = mu(decode(params)) - Y
    return sum([np.kron(Xaug[i], z) for i, z in enumerate(Z)])

def hess(params):
    Mu = mu(decode(params))
    o = lambda x: np.outer(x, x)
    return sum([np.kron(o(Xaug[i]), np.diag(z) - o(z)) for i, z in enumerate(Mu)])

In [8]:
params = [0] * (D + 1) * C
params = minimize(loss, params, method = 'Newton-CG', jac = grad, hess = hess, tol = 1e-6).x
W = params.reshape((D + 1, C))
yhat = np.argmax(softmax(Xaug.dot(W)), axis = 1)
print '%0.3f' % np.mean(yhat != y)

0.114


In [108]:
# bayesian logistic regression
mvn = stats.multivariate_normal
log_prior = lambda w: mvn.logpdf(w[1:], np.zeros(D), 100 * np.eye(D))
mu = lambda w: expit(Xaug.dot(w))
nll = lambda w: log_loss(y, mu(w), normalize = False)
E = lambda w: nll(w) - log_prior(w)
mN = minimize(E, np.zeros(D + 1)).x
S = lambda mu: np.diag(mu * (1 - mu))
H = Xaug.T.dot(S(mu(mN))).dot(Xaug) + 1.0 / 200
VN = inv(H)

In [119]:
# monte carlo approximation
posterior = mvn(mN, VN)
yhat = np.mean([mu(w) for w in posterior.rvs(1000)], axis = 0) > 0.5
print '%0.3f' % np.mean(yhat != y)

0.119


In [110]:
# probit approximation
yhat = (1 + np.pi * Xaug.dot(VN).dot(Xaug.T) / 8) ** -0.5 * Xaug.dot(mN) > 0.5
print '%0.3f' % np.mean(yhat != y)

0.110


In [299]:
# stochastic gradient descent
w = np.array([0.0] * (D + 1))
s = w.copy()
tau0 = 1e-6
eta = 1e-2
batch_size = 100
epochs = 1000

for _ in xrange(epochs):
    i = sample(range(N), batch_size)
    mu = expit(Xaug[i].dot(w))
    g = Xaug[i].T.dot(mu - y[i])
    s += g ** 2
    w -= eta * g / (tau0 + np.sqrt(s)) # adagrad

yhat = expit(Xaug.dot(w)) > 0.5
print '%0.3f' % np.mean(yhat != y)

0.219


In [338]:
# fisher's linear discriminant algorithm
mus = []

L = 2
aaT = lambda a: np.outer(a, a)
aTa = lambda a: np.array(a).T.dot(a)
mu = np.mean(X)
SW = np.zeros((D, D))
SB = np.zeros((D, D))

for c in xrange(C):
    Xc = X[y == c]
    muc = np.mean(Xc, axis = 0) * 1.0
    mus.append(muc)
    SW += sum([aaT(xc - muc) for xc in Xc])
    SB += len(Xc) * aaT(muc - mu)

if C == 2:
    W = inv(SW).dot(mus[1] - mus[0])
else:
    SW_inv_root = sqrtm(inv(SW))
    _, v = eigh(SW_inv_root.dot(SB).dot(SW_inv_root))
    U = v[:, : -L - 1 : -1]
    W = SW_inv_root.dot(U)
    
Z = X.dot(W)
d = np.array([[aTa(z - mc) for z in Z] for mc in np.dot(mus, W)]).T
yhat = np.argmin(d, axis = 1) 
print '%0.3f' % np.mean(yhat != y)

0.020
