In [1]:
import numpy as np
import math
import cancer
import sklearn.preprocessing

In [2]:
X_train, y_train = cancer.get_train()
X_train = sklearn.preprocessing.scale(X_train)
X_test, y_test = cancer.get_test()
X_test = sklearn.preprocessing.scale(X_test)

In [6]:
def soft_threshold(a, delta):
    return np.array([np.sign(x) * max(np.abs(x) - delta, 0) for x in a])

In [7]:
def fit(X, y, delta):
    N, D = X.shape
    cs = np.unique(y)
    C = len(cs)
    sse = np.zeros(D)
    mus = []
    prior = []
    xbar = np.mean(X, axis = 0)

    for c in cs:
        i = y == c
        X_c = X[i]
        prior.append(np.mean(i))
        mu_c = np.mean(X_c, axis = 0)
        mus.append(mu_c)
        sse += np.sum((X_c - mu_c) ** 2, axis = 0)
        
    sigma2 = sse / (1.0 * N - C)
    sigma = np.sqrt(sigma2)
    s0 = np.median(sigma)
    
    m = np.zeros(C)
    d = np.zeros((C, D))
    
    for i, c in enumerate(cs):
        m[i] = math.sqrt(1.0 / sum(y == c) - 1.0 / N)
        d[i] = (mus[i] - xbar) / (m[i] * (sigma + s0))
        d[i] = soft_threshold(d[i], delta)
        mus[i] = xbar + m[i] * (sigma + s0) * d[i]
        
    return cs, np.log(prior), mus, sigma2
        
def predict(model, X):
    cs, log_prior, mus, sigma2 = model
    N, D = X.shape
    C = len(cs)
    loglik = np.empty((N, C))
    
    for i in range(C):
        Z = 0.5 * (X - mus[i]) ** 2 / sigma2
        loglik[:, i] = log_prior[i] - np.sum([Z[:, j] for j in range(D)], axis = 0)
    
    return cs[np.argmax(loglik, axis = 1)]

In [9]:
model = fit(X_train, y_train, 4.3)
np.mean(predict(model, X_test) == y_test)

0.59259259259259256