Ernesto Ezequiel Perez  
SID 23181869  
HW6

# Problem 1 - Mean Squared Error

$J = \frac{1}{2}\sum_{k=1}^{n_{out}}(y_{k} - h_{k}(x))^{2}$  

$v_{j} = \sum_{i}w^{(1)}_{ij}x_{i}$  
$\alpha_{j} = f(v_{j}) = tanh(v_{j})$  
$\lambda_{k} = \sum_{j}w^{(2)}_{jk}\alpha_{j}$  
$h_{k} = sigmoid(\lambda_{k})$  
$h_{k}(x) = sigmoid(\sum_{j}w^{(2)}_{jk}tanh(\sum_{i}w^{(1)}_{ij}x_{i}))$  

Here $x$ is a single data point and $x_{i}$ denotes its $i$-th feature.  

$\frac{\partial J}{\partial w^{(1)}_{ij}} = \sum_{k=1}^{n_{out}}(y_{k} - h_{k})h_{k}(h_{k} - 1)w^{(2)}_{jk}(1 - tanh^{2}(v_{j}))x_{i}$  

$\frac{\partial J}{\partial w^{(2)}_{jk}} = (y_{k} - h_{k})h_{k}(h_{k} - 1)\alpha_{j} = (y_{k} - h_{k})h_{k}(h_{k} - 1)tanh(v_{j})$

# Problem 1 - Cross Entropy Error

$J = - \sum_{k=1}^{n_{out}}[y_{k}ln(h_{k}(x)) + (1 - y_{k})ln(1 - h_{k}(x))]$  

$\frac{\partial J}{\partial w^{(1)}_{ij}} = - \sum_{k=1}^{n_{out}}(y_{k} - h_{k})w^{(2)}_{jk}(1 - tanh^{2}(v_{j}))x_{i}$  

$\frac{\partial J}{\partial w^{(2)}_{jk}} = - (y_{k} - h_{k})\alpha_{j} = - (y_{k} - h_{k})tanh(v_{j})$  

Where $h_{k}$ is the same as in the mean square error derivation. 

## Problem 2

In [1]:
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
%matplotlib inline

# Load the digit training set from a MATLAB .mat file; later cells read the
# 'train_images' and 'train_labels' entries of the returned dict.
data = loadmat('../data/digit-dataset/train.mat')

In [4]:
# Flatten every image into a float feature vector (one row per sample).
raw_images = [img.flatten().astype('float') for img in data['train_images'].T]
# sklearn's normalize rescales each row (by default to unit L2 norm).
raw_images = normalize(raw_images)
# Append a constant-1 bias feature. The row count is taken from the data
# instead of being hard-coded to 60000, so other dataset sizes also work.
trainImg = np.hstack((raw_images, np.ones((raw_images.shape[0], 1))))
# Each entry of 'train_labels' is a length-1 row; keep just the scalar label.
labels = np.asarray([lbl[0] for lbl in data['train_labels']])

In [3]:
# Shuffle images and labels with one shared random permutation so that every
# image stays paired with its label. np.random.permutation works on both
# Python 2 and 3 (shuffling a bare range() in place fails on Python 3).
indxs = np.random.permutation(trainImg.shape[0])
trainImg = np.asarray(trainImg)[indxs]
labels = np.asarray(labels)[indxs]

# One-hot encode the digit labels: row r has a 1.0 in column labels[r].
num_classes = 10
trainLbls = np.zeros((labels.shape[0], num_classes))
trainLbls[np.arange(labels.shape[0]), labels.astype(int)] = 1.0

In [4]:
# Sanity check: display the dimensions of the training matrix.
trainImg.shape

(60000L, 785L)

In [5]:
def sigfunc(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), evaluated element-wise.

    The previous sign test (``x.all() >= 0``) compared a boolean with zero and
    was therefore always true, so large negative inputs overflowed in
    ``exp(-x)``. This version picks the numerically stable form per element.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x``: an array of the same shape for array input, or
        a scalar for scalar input.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) always lies in (0, 1], so neither form below can overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same function.
    result = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return result[()]

def sigPrime(z):
    """Return the derivative of the sigmoid at z, i.e. s(z) * (1 - s(z))."""
    activation = sigfunc(z)
    return activation * (1.0 - activation)

In [6]:
def forward(x, w1, w2):
    """Run one forward pass through the two-layer network.

    The hidden layer applies tanh to ``x . w1``; the output layer applies the
    sigmoid (``sigfunc``) to ``hidden . w2``.

    Returns:
        A tuple ``(hidden, output)`` of the hidden activations and the
        network outputs.
    """
    hidden = np.tanh(np.dot(x, w1))
    output = sigfunc(np.dot(hidden, w2))
    return hidden, output

In [7]:
def mseGradients(x, y, a, hk, w2):
    """Gradients of the mean-squared-error cost for a single sample.

    Args:
        x: Input feature vector (1-D).
        y: Target output vector (1-D).
        a: Hidden-layer tanh activations from the forward pass (1-D).
        hk: Sigmoid outputs from the forward pass (1-D).
        w2: Hidden-to-output weight matrix.

    Returns:
        A tuple ``(djdw1, djdw2)`` with the gradients for the input-to-hidden
        and hidden-to-output weights respectively.
    """
    # Error signal at the output pre-activations: (y - h) * (h - 1) * h.
    out_err = np.multiply(np.multiply(y - hk, hk - 1.0), hk)
    grad_w2 = np.outer(a, out_err)

    # Back-propagate through w2 and the tanh derivative (1 - a^2).
    hidden_err = np.multiply(np.dot(out_err, w2.T), 1.0 - np.square(a))
    grad_w1 = np.outer(x, hidden_err)
    return grad_w1, grad_w2

def xeGradients(x, y, a, hk, w2):
    """Gradients of the cross-entropy cost for a single sample.

    Args:
        x: Input feature vector (1-D).
        y: Target output vector (1-D).
        a: Hidden-layer tanh activations from the forward pass (1-D).
        hk: Sigmoid outputs from the forward pass (1-D).
        w2: Hidden-to-output weight matrix.

    Returns:
        A tuple ``(djdw1, djdw2)`` with the gradients for the input-to-hidden
        and hidden-to-output weights respectively.
    """
    # With sigmoid outputs the output error signal reduces to (h - y).
    out_err = hk - y
    grad_w2 = np.outer(a, out_err)

    # Back-propagate through w2 and the tanh derivative (1 - a^2).
    hidden_err = np.multiply(np.dot(out_err, w2.T), 1.0 - np.square(a))
    grad_w1 = np.outer(x, hidden_err)
    return grad_w1, grad_w2
    

In [8]:
def meanSquareError(y, yHat):
    """Return half the summed squared error between y and yHat, divided by len(y)."""
    residual = y - yHat
    half_sse = 0.5 * np.sum(np.square(residual))
    return half_sse / float(y.shape[0])

def crossEntropyError(y, yHat):
    """Return the binary cross-entropy between targets and predictions.

    Computes ``-sum(y*ln(yHat) + (1-y)*ln(1-yHat))`` divided by ``len(y)``.
    Predictions are floored at 1e-15, and the complement is offset by 1e-15,
    so neither logarithm ever receives zero.

    Args:
        y: Target vector (array-like).
        yHat: Predicted probabilities, same length as ``y`` (array-like).

    Returns:
        The cross-entropy averaged over the entries of ``y``.
    """
    eps = 1e-15
    y = np.asarray(y, dtype=float)
    # Vectorised floor instead of a per-element Python max() loop.
    yHat = np.maximum(np.asarray(yHat, dtype=float), eps)
    positive_term = np.multiply(-y, np.log(yHat))
    negative_term = np.multiply(y - 1.0, np.log((1.0 + eps) - yHat))
    return np.sum(positive_term + negative_term) / float(y.shape[0])

In [9]:
def trainNN(images, labels, costfunc, gradients, eta=0.2, err=0.00001,
            maxiters=500000, n_hidden=51):
    """Train the two-layer network with single-sample stochastic gradient descent.

    Each iteration draws one random sample, runs ``forward``, and subtracts
    ``eta`` times the gradients from both weight matrices. The cost is
    averaged over windows of 1000 iterations; training stops early once a
    window average is within ``err`` of the lowest window average seen so far.
    Every 250000 iterations the learning rate is reduced by 0.005, but never
    to zero or below (which would stall or reverse the descent).

    Args:
        images: 2-D array of training samples, one row per sample.
        labels: 2-D array of target vectors, one row per sample.
        costfunc: Callable ``(label, yHat) -> float`` used to monitor progress.
        gradients: Callable ``(x, y, a, hk, w2) -> (djdw1, djdw2)``.
        eta: Initial learning rate.
        err: Early-stopping tolerance on the windowed average cost.
        maxiters: Maximum number of single-sample updates.
        n_hidden: Number of hidden units (default 51, the previous fixed size).

    Returns:
        A tuple ``(w1, w2)`` of the trained weight matrices.
    """
    window = 1000          # iterations per averaged-cost window
    decay_every = 250000   # iterations between learning-rate reductions
    decay_step = 0.005     # amount removed from eta at each reduction

    # Layer sizes follow the data instead of being fixed to 785 inputs / 10 outputs.
    n_samples, n_in = images.shape[0], images.shape[1]
    n_out = labels.shape[1]
    w1 = np.random.uniform(-0.1, 0.1, (n_in, n_hidden))
    w2 = np.random.uniform(-0.1, 0.1, (n_hidden, n_out))

    curr_error = 0.0
    lowest_error = 1e+15
    iters = 0

    while iters < maxiters:
        idx = np.random.randint(0, n_samples)
        image = images[idx]
        label = labels[idx]
        a, yHat = forward(image, w1, w2)
        djdw1, djdw2 = gradients(image, label, a, yHat, w2)
        w1 -= eta * djdw1
        w2 -= eta * djdw2
        iters += 1
        # The cost is measured on the prediction made before this update.
        curr_error += costfunc(label, yHat)
        if iters % window == 0:
            curr_error /= float(window)
            if np.absolute(lowest_error - curr_error) < err:
                return w1, w2
            if curr_error < lowest_error:
                lowest_error = curr_error
            curr_error = 0.0
        # Only decay while the learning rate stays strictly positive.
        if iters % decay_every == 0 and eta - decay_step > 0.0:
            eta -= decay_step

    return w1, w2

In [10]:
def predictNN(images, w1, w2):
    """Return, for each image, the index of the largest network output.

    Runs ``forward`` on every row of ``images`` and collects the argmax of the
    output layer into a list.
    """
    return [np.argmax(forward(sample, w1, w2)[1]) for sample in images]

In [11]:
# Train on the first 50000 samples using the mean-squared-error cost and its
# gradients, allowing up to 10 million single-sample updates.
w1, w2 = trainNN(trainImg[:50000], trainLbls[:50000], costfunc=meanSquareError, gradients=mseGradients, maxiters=10000000)

In [12]:
# Predict classes for the samples from index 50000 onward, which were not
# passed to trainNN and so serve as a validation set.
pred = predictNN(trainImg[50000:], w1, w2)

In [13]:
# Validation accuracy: the fraction of held-out samples whose predicted class
# equals the true label.
valLabels = labels[50000:]
hits = sum(1.0 for truth, guess in zip(valLabels, pred) if truth == guess)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(str(hits / float(len(pred))))

0.952
