In [1]:
import numpy as np
np.random.seed(seed=1)
import h5py
# Load the training features and labels. np.copy turns each HDF5 dataset into
# an in-memory NumPy array, so the data stays usable after the `with` block
# has closed the file.
with h5py.File('../data/Assignment-1-Dataset/train_128.h5','r') as H:
    data = np.copy(H['data'])
with h5py.File('../data/Assignment-1-Dataset/train_label.h5','r') as H:
    label = np.copy(H['label'])
    
# Shared numeric constants. 'epsilon' is added to the variance under the
# square root in bn_forward / bn_backward.
global_variables = {
    'epsilon':1e-5
}

In [2]:
def get_shuffle_index(array):
    """Return the positions ``0 .. len(array) - 1`` in random order.

    The draw uses NumPy's global random state, so it is repeatable after
    ``np.random.seed``.
    """
    n_items = int(len(array))
    # Taking n_items samples out of n_items without replacement is a permutation.
    order = np.random.choice(range(len(array)), size=n_items, replace=False)
    return order
    
def get_indices(array, index):
    """Return ``array[index]`` (plain subscripting with ``index``)."""
    selected = array[index]
    return selected

def shuffle(arrays):
    """Reorder every array in ``arrays`` with one shared random permutation.

    The permutation is drawn from the length of ``arrays[0]`` and applied to
    each array in turn, so corresponding rows stay aligned. Returns a list of
    the reordered arrays, in the same order as the input.
    """
    order = get_shuffle_index(arrays[0])
    reordered = []
    for arr in arrays:
        reordered.append(get_indices(arr, order))
    return reordered

In [3]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``."""
    matches = y_pred == y_true
    return matches.mean()

def get_minibatch(X, y, minibatch_size, shuffled=True):
    """Split ``X`` and ``y`` into consecutive minibatches.

    Args:
        X: array whose first axis indexes samples.
        y: targets, sliced with the same row ranges as ``X``.
        minibatch_size: number of rows per batch; the last batch may be smaller.
        shuffled: when true, ``X`` and ``y`` are first reordered together
            via ``shuffle``.

    Returns:
        A list of ``(X_mini, y_mini)`` tuples.
    """
    if shuffled:
        X, y = shuffle([X, y])

    starts = range(0, X.shape[0], minibatch_size)
    return [
        (X[start:start + minibatch_size], y[start:start + minibatch_size])
        for start in starts
    ]

In [4]:
def fc_forward(X, W, b):
    """Affine layer: return ``X @ W + b`` and the cache ``(W, X)`` for backprop."""
    projected = np.matmul(X, W)
    return projected + b, (W, X)


def fc_backward(dout, for_backprop):
    """Backward pass of the affine layer.

    Args:
        dout: gradient with respect to the layer output.
        for_backprop: the ``(W, input)`` tuple produced by ``fc_forward``.

    Returns:
        ``(dX, dW, db)``: gradients with respect to the layer input, the
        weights, and the bias (``dout`` summed over axis 0).
    """
    weights, layer_input = for_backprop

    grad_input = np.matmul(dout, weights.T)
    grad_weights = np.matmul(layer_input.T, dout)
    grad_bias = np.sum(dout, axis=0)

    return grad_input, grad_weights, grad_bias

def softmax(X):
    """Row-wise softmax of a 2-D array.

    The maximum of each row is subtracted before exponentiating so that
    large scores do not overflow ``np.exp``.
    """
    shifted = X - np.max(X, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)

def relu_forward(X):
    """ReLU activation: return ``max(X, 0)`` element-wise, plus ``X`` as the cache."""
    rectified = np.maximum(X, 0)
    return rectified, X


def relu_backward(dout, for_backprop):
    """Backward pass of ReLU.

    ``for_backprop`` is the input that was given to ``relu_forward``. The
    gradient is passed through where that input is positive and zeroed where
    it is ``<= 0``. ``dout`` itself is left untouched.
    """
    blocked = for_backprop <= 0
    return np.where(blocked, 0, dout)


def lrelu_forward(X, a=1e-3):
    """Leaky ReLU: return ``max(a * X, X)`` element-wise and the cache ``(X, a)``."""
    scaled = a * X
    activated = np.maximum(scaled, X)
    return activated, (X, a)


def lrelu_backward(dout, for_backprop):
    """Backward pass of leaky ReLU.

    ``for_backprop`` is the ``(X, a)`` tuple from ``lrelu_forward``. The
    gradient is multiplied by ``a`` where ``X < 0`` and passed through
    unchanged elsewhere. ``dout`` is not modified.
    """
    layer_input, slope = for_backprop
    return np.where(layer_input < 0, dout * slope, dout)


def sigmoid_forward(X):
    """Logistic sigmoid activation.

    Args:
        X: array-like of pre-activations.

    Returns:
        ``(out, for_backprop)`` where ``out = 1 / (1 + exp(-X))`` and the
        cache is that same output (``sigmoid_backward`` only needs ``out``).
    """
    X = np.asarray(X)
    # NumPy has no ``np.sigmoid``. Use sigmoid(x) = exp(-log(1 + exp(-x))):
    # ``np.logaddexp(0, -x)`` evaluates the inner log without overflowing for
    # large |x|, so no warnings or inf/nan are produced.
    out = np.exp(-np.logaddexp(0, -X))
    for_backprop = out
    return out, for_backprop


def sigmoid_backward(dout, for_backprop):
    """Backward pass of the sigmoid.

    ``for_backprop`` is the sigmoid output ``s``; the result is
    ``s * (1 - s) * dout``.
    """
    activation = for_backprop
    local_slope = activation * (1. - activation)
    return local_slope * dout


def tanh_forward(X):
    """Tanh activation: return ``tanh(X)`` twice, as the output and as the cache."""
    activation = np.tanh(X)
    return activation, activation


def tanh_backward(dout, for_backprop):
    """Backward pass of tanh.

    ``for_backprop`` is the tanh output ``t``; the result is
    ``(1 - t**2) * dout``.
    """
    local_slope = 1 - for_backprop**2
    return local_slope * dout


def dropout_forward(X, p_dropout):
    """Apply a random, rescaled 0/1 mask to ``X``.

    Each mask entry is a Bernoulli draw that equals 1 with probability
    ``p_dropout``, so despite its name ``p_dropout`` is the probability of
    KEEPING a unit. The draws are divided by ``p_dropout``, which rescales
    the kept units by ``1 / p_dropout``.

    Returns:
        ``(out, mask)`` where ``out = X * mask``; the mask is the cache for
        ``dropout_backward``.
    """
    keep_draws = np.random.binomial(1, p_dropout, size=X.shape)
    mask = keep_draws / p_dropout
    return X * mask, mask


def dropout_backward(dout, for_backprop):
    """Backward pass of dropout: multiply ``dout`` by the mask saved in the forward pass."""
    mask = for_backprop
    return dout * mask


def bn_forward(X, gamma, beta, for_backprop, momentum=.9, train=True):
    """Batch normalisation over axis 0.

    Args:
        X: layer input; statistics are computed along axis 0.
        gamma, beta: scale and shift applied after normalising.
        for_backprop: ``(running_mean, running_var)`` carried in from
            previous calls.
        momentum: decay passed to ``exp_running_avg`` for the running stats.
        train: when true, normalise with the batch statistics and update the
            running ones; otherwise normalise with the running statistics.

    Returns:
        ``(out, cache, running_mean, running_var)``. In training mode the
        cache is ``(X, X_norm, mu, var, gamma, beta)``; in inference mode it
        is ``None`` and the running statistics are returned unchanged.
    """
    eps = global_variables['epsilon']
    running_mean, running_var = for_backprop

    if not train:
        X_hat = (X - running_mean) / np.sqrt(running_var + eps)
        return gamma * X_hat + beta, None, running_mean, running_var

    batch_mean = np.mean(X, axis=0)
    batch_var = np.var(X, axis=0)

    X_hat = (X - batch_mean) / np.sqrt(batch_var + eps)
    out = gamma * X_hat + beta
    cache = (X, X_hat, batch_mean, batch_var, gamma, beta)

    running_mean = exp_running_avg(running_mean, batch_mean, momentum)
    running_var = exp_running_avg(running_var, batch_var, momentum)

    return out, cache, running_mean, running_var

def exp_running_avg(running, new, gamma=.9):
    """Exponential moving average: ``gamma * running + (1 - gamma) * new``."""
    new_weight = 1. - gamma
    return gamma * running + new_weight * new

def bn_backward(dout, for_backprop):
    """Backward pass of training-mode batch normalisation.

    Args:
        dout: gradient with respect to the batch-norm output.
        for_backprop: the ``(X, X_norm, mu, var, gamma, beta)`` tuple saved by
            ``bn_forward`` in training mode.

    Returns:
        ``(dX, dgamma, dbeta)``: gradients with respect to the input, scale
        and shift.
    """
    X, X_norm, mu, var, gamma, beta = for_backprop

    # Unpacking also enforces that X is two-dimensional.
    N, _ = X.shape

    centered = X - mu
    inv_std = 1. / np.sqrt(var + global_variables['epsilon'])

    # Gradient flowing into the normalised activations.
    dX_hat = dout * gamma
    dvar = np.sum(dX_hat * centered, axis=0) * -.5 * inv_std**3
    dmu = np.sum(dX_hat * -inv_std, axis=0) + dvar * np.mean(-2. * centered, axis=0)

    dgamma = np.sum(dout * X_norm, axis=0)
    dbeta = np.sum(dout, axis=0)
    # Three paths reach X: directly, through the variance, and through the mean.
    dX = (dX_hat * inv_std) + (dvar * 2 * centered / N) + (dmu / N)

    return dX, dgamma, dbeta

In [5]:
def cel(y_pred, y_train):
    """Mean softmax cross-entropy loss.

    Args:
        y_pred: 2-D array of raw scores, one row per sample.
        y_train: integer class index for each row of ``y_pred``.

    Returns:
        The negative log-probability of the true class, averaged over rows.

    The log-probability is computed as ``score - logsumexp(scores)`` instead
    of ``log(softmax(scores))``. The latter returns ``inf`` as soon as the
    true-class probability underflows to 0; this form stays finite.
    """
    m = y_pred.shape[0]
    # Subtract the row maximum so the exponentials cannot overflow.
    shifted = y_pred - np.max(y_pred, axis=1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=1))
    # -log p(true class) = logsumexp(shifted) - shifted[true class]
    log_like = log_norm - shifted[np.arange(m), y_train]
    return np.sum(log_like) / m


def d_cel(y_pred, y_train):
    """Gradient of the mean softmax cross-entropy with respect to the scores.

    Computes ``softmax(y_pred)``, subtracts 1 at each row's true-class
    column (``y_train``), and divides by the number of rows.
    """
    batch = y_pred.shape[0]
    probs = softmax(y_pred)
    probs[np.arange(batch), y_train] -= 1.
    return probs / batch


In [17]:
class FeedForwardNet:
    """Fully connected classifier with two hidden blocks and a linear output.

    Each hidden block is: affine -> batch norm -> nonlinearity -> dropout
    (dropout is applied in training mode only). ``momentum`` trains the model
    with minibatch SGD plus momentum on the softmax cross-entropy loss.
    """

    # Activation names accepted by the ``nonlin`` constructor argument.
    _NONLINS = ('relu', 'lrelu', 'sigmoid', 'tanh')

    def __init__(self, D, C, H, lam=1e-3, p_dropout=.8, nonlin='relu'):
        """
        Args:
            D: number of input features (rows of W1).
            C: number of output classes (columns of W3).
            H: width of both hidden layers.
            lam: stored on the instance; nothing in this class applies it.
            p_dropout: forwarded to ``dropout_forward``, which uses it as the
                probability of keeping a unit.
            nonlin: hidden activation, one of 'relu', 'lrelu', 'sigmoid',
                'tanh'.

        Raises:
            ValueError: if ``nonlin`` is not a supported activation name.
        """
        # Validate first so a bad name fails before any random state is used.
        if nonlin not in self._NONLINS:
            raise ValueError(
                "nonlin must be one of {}, got {!r}".format(self._NONLINS, nonlin)
            )

        self._init_model(D, C, H)

        self.lam = lam
        self.p_dropout = p_dropout
        self.nonlin = nonlin
        self.mode = 'classification'

    def _nonlin_fns(self):
        """Return the ``(forward, backward)`` activation pair selected by ``nonlin``."""
        table = {
            'relu': (relu_forward, relu_backward),
            'lrelu': (lrelu_forward, lrelu_backward),
            'sigmoid': (sigmoid_forward, sigmoid_backward),
            'tanh': (tanh_forward, tanh_backward),
        }
        # Fall back to ReLU for instances created without going through __init__.
        return table[getattr(self, 'nonlin', 'relu')]

    def train_step(self, X_train, y_train):
        """
        Single training step over minibatch: forward, loss, backprop
        """
        y_pred, cache = self.forward(X_train, train=True)
        loss = cel(y_pred, y_train)
        grad = self.backpropagation(y_pred, y_train, cache)

        return grad, loss

    def predict_proba(self, X):
        """Return the class probabilities for ``X`` (inference-mode forward + softmax)."""
        score, _ = self.forward(X, False)
        return softmax(score)

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        return np.argmax(self.predict_proba(X), axis=1)

    def forward(self, X, train=False):
        """Run the network on ``X``.

        In training mode, batch norm uses batch statistics and dropout is
        applied. In both modes the running batch-norm statistics returned by
        ``bn_forward`` are written back to ``self.bn_caches``.

        Returns:
            ``(score, cache)``: the raw class scores and the tuple of
            intermediate values that ``backpropagation`` consumes.
        """
        nonlin_forward, _ = self._nonlin_fns()

        gamma1, gamma2 = self.model['gamma1'], self.model['gamma2']
        beta1, beta2 = self.model['beta1'], self.model['beta2']

        u1, u2 = None, None
        bn1_cache, bn2_cache = None, None

        # First layer
        h1, h1_cache = fc_forward(X, self.model['W1'], self.model['b1'])
        bn1_cache = (self.bn_caches['bn1_mean'], self.bn_caches['bn1_var'])
        h1, bn1_cache, run_mean, run_var = bn_forward(h1, gamma1, beta1, bn1_cache, train=train)
        h1, nl_cache1 = nonlin_forward(h1)

        self.bn_caches['bn1_mean'], self.bn_caches['bn1_var'] = run_mean, run_var

        if train:
            h1, u1 = dropout_forward(h1, self.p_dropout)

        # Second layer
        h2, h2_cache = fc_forward(h1, self.model['W2'], self.model['b2'])
        bn2_cache = (self.bn_caches['bn2_mean'], self.bn_caches['bn2_var'])
        h2, bn2_cache, run_mean, run_var = bn_forward(h2, gamma2, beta2, bn2_cache, train=train)
        h2, nl_cache2 = nonlin_forward(h2)

        self.bn_caches['bn2_mean'], self.bn_caches['bn2_var'] = run_mean, run_var

        if train:
            h2, u2 = dropout_forward(h2, self.p_dropout)

        # Third layer
        score, score_cache = fc_forward(h2, self.model['W3'], self.model['b3'])

        cache = (X, h1_cache, h2_cache, score_cache, nl_cache1, nl_cache2, u1, u2, bn1_cache, bn2_cache)

        return score, cache

    def backpropagation(self, y_pred, y_train, cache):
        """Backpropagate the cross-entropy loss through the network.

        ``cache`` must come from ``forward(..., train=True)`` so that the
        dropout masks and batch-norm caches are populated.

        Returns:
            A dict of gradients keyed like ``self.model``.
        """
        _, nonlin_backward = self._nonlin_fns()
        X, h1_cache, h2_cache, score_cache, nl_cache1, nl_cache2, u1, u2, bn1_cache, bn2_cache = cache

        # Output layer
        to_pass = d_cel(y_pred, y_train)
        to_pass, d_weights_3, d_bias_3 = fc_backward(to_pass, score_cache)

        # Hidden layer 2: undo the forward steps in reverse order
        # (dropout, nonlinearity, batch norm, affine).
        to_pass = dropout_backward(to_pass, u2)
        to_pass = nonlin_backward(to_pass, nl_cache2)
        to_pass, dgamma2, dbeta2 = bn_backward(to_pass, bn2_cache)
        to_pass, d_weights_2, d_bias_2 = fc_backward(to_pass, h2_cache)

        # Hidden layer 1
        to_pass = dropout_backward(to_pass, u1)
        to_pass = nonlin_backward(to_pass, nl_cache1)
        to_pass, dgamma1, dbeta1 = bn_backward(to_pass, bn1_cache)
        _, d_weights_1, d_bias_1 = fc_backward(to_pass, h1_cache)

        gradients = dict(
            W1=d_weights_1, W2=d_weights_2, W3=d_weights_3,
            b1=d_bias_1, b2=d_bias_2, b3=d_bias_3,
            gamma1=dgamma1, gamma2=dgamma2, beta1=dbeta1, beta2=dbeta2
        )

        return gradients

    def _init_model(self, D, C, H):
        """Create the trainable parameters and the batch-norm running statistics."""
        # Weights are standard-normal draws scaled by 1 / sqrt(fan_in / 2).
        self.model = dict(
            W1=np.random.randn(D, H) / np.sqrt(D / 2.),
            W2=np.random.randn(H, H) / np.sqrt(H / 2.),
            W3=np.random.randn(H, C) / np.sqrt(H / 2.),
            b1=np.zeros((1, H)),
            b2=np.zeros((1, H)),
            b3=np.zeros((1, C)),
            gamma1=np.ones((1, H)),
            gamma2=np.ones((1, H)),
            beta1=np.zeros((1, H)),
            beta2=np.zeros((1, H))
        )

        # Running mean / variance per batch-norm layer, updated in forward().
        self.bn_caches = dict(
            bn1_mean=np.zeros((1, H)),
            bn2_mean=np.zeros((1, H)),
            bn1_var=np.zeros((1, H)),
            bn2_var=np.zeros((1, H))
        )

    def momentum(self, X_train, y_train, test=None, train_rate=1e-3, mb_size=256, iterations=2000, p_iter=100):
        """Train with minibatch SGD plus momentum (decay 0.9).

        The training set is shuffled and split into minibatches once; every
        iteration then picks one of those minibatches at random.

        Args:
            X_train, y_train: training inputs and integer labels.
            test: optional ``(X_val, y_val)`` used for the progress report;
                the training data is used when omitted.
            train_rate: learning rate.
            mb_size: minibatch size.
            iterations: number of parameter updates.
            p_iter: report loss and accuracy every ``p_iter`` iterations.

        Returns:
            ``self``, so calls can be chained.
        """
        velocity = {k: np.zeros_like(v) for k, v in self.model.items()}
        gamma = .9

        minibatches = get_minibatch(X_train, y_train, mb_size)

        if test:
            X_val, y_val = test
        else:
            X_val, y_val = X_train, y_train

        for iteration in range(1, iterations + 1):
            idx = np.random.randint(0, len(minibatches))
            X_mini, y_mini = minibatches[idx]

            gradients, loss = self.train_step(X_mini, y_mini)

            if iteration % p_iter == 0:
                val_acc = accuracy(y_val, self.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iteration, loss, val_acc))

            for layer in gradients:
                velocity[layer] = gamma * velocity[layer] + train_rate * gradients[layer]
                self.model[layer] -= velocity[layer]

        return self

In [18]:
np.random.seed(10)
# This instance is replaced when the training cell below rebuilds `t`.
t = FeedForwardNet(128, 10, 200)
# The first 2000 samples are used for training, the following 500 for validation.
X_train, Y_train = data[:2000], label[:2000]
X_val, Y_val = data[2000:2500], label[2000:2500]

In [19]:
%%time
# Hyperparameters for the training run at the bottom of this cell.
D = 128
C = 10
# NOTE(review): H is set to 200 here, but the constructor call below
# hard-codes H=500, so this value is not the width actually trained.
# Confirm which one is intended.
H = 200
# NOTE(review): FeedForwardNet.__init__ only stores lam; nothing in the class
# applies it. Confirm whether regularisation was meant to be active.
lam = 1e-3
# dropout_forward draws its mask with np.random.binomial(1, p_dropout), so
# p_dropout is the probability of KEEPING a unit.
# NOTE(review): 0.05 therefore keeps only about 5% of activations during
# training. Confirm this was intended rather than "drop 5%".
p_dropout=0.05
# `loss` is not referenced by the calls in this cell.
loss='cross_ent'
nonlin='relu'

# Re-seed so weight initialisation and minibatch sampling are repeatable.
np.random.seed(10)

# Build a fresh network and train it; momentum() returns the same instance.
t = FeedForwardNet(D, C, H=500, lam=lam, p_dropout=p_dropout, nonlin=nonlin)
t = t.momentum(X_train, Y_train, test=(X_val, Y_val), mb_size=100, iterations=1000, p_iter=200)

Iter-200 loss: 3.1180 validation: 0.588000
Iter-400 loss: 2.4682 validation: 0.592000
Iter-600 loss: 2.3061 validation: 0.588000
Iter-800 loss: 2.3345 validation: 0.606000
1000
Iter-1000 loss: 2.3921 validation: 0.564000
CPU times: user 19 s, sys: 1.92 s, total: 20.9 s
Wall time: 14.9 s
