In [1]:
import numpy as np

In [2]:
def process_data(data,mean=None,std=None):
    # normalize the data to have zero mean and unit variance (add 1e-15 to std to avoid numerical issue)
    if mean is not None:
        # directly use the mean and std precomputed from the training data
        data = (data - mean) / (std + 1e-15)
        return data
    else:
        # compute the mean and std based on the training data
        mean, std = np.mean(data, axis=0), np.std(data, axis=0) # placeholder
        data = (data - mean) / (std + 1e-15)
        return data, mean, std

In [3]:
def process_label(label):
    # convert the labels into one-hot vector for training
    one_hot = np.zeros([len(label),10])

    for i in range(len(label)):
        one_hot[i][label[i]] = 1

    return one_hot

In [4]:
def tanh(x):
    # implement the hyperbolic tangent activation function for hidden layer
    # You may receive some warning messages from Numpy. No worries, they should not affect your final results
    ''' f(x) = (e^x - e^-x) / (e^x + e^-x); derivative of tanh: f'(x) = (1-g(x^2)'''
    # (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))
    # 1 - tanh(x) * tanh(x) ; 1- np.power(tanh(x), 2)
    f_x = 2 / (1 + np.exp(-2 * x)) - 1
    return f_x

In [5]:
def softmax(x):
    # implement the softmax activation function for output layer
    f_x = np.exp(x) / np.sum(np.exp(x)) 
    return f_x

In [6]:
def cross_entropy_loss(y, yhat):
    '''
        yHat  = predicted value
        y =  actual label
    '''
    if y == 1:
        return -np.log(yhat)
    else:
        return -np.log(1 - yhat)

# Tanh Activation Function

In [7]:
class MLP:
    def __init__(self,num_hid):
        # initialize the weights
        self.weight_1 = np.random.random([64,num_hid])
        self.bias_1 = np.random.random([1,num_hid])
        self.weight_2 = np.random.random([num_hid,10])
        self.bias_2 = np.random.random([1,10])

    def fit(self,train_x, train_y, valid_x, valid_y):
        # learning rate
        lr = 5e-3
        # counter for recording the number of epochs without improvement
        count = 0
        best_valid_acc = 0

        """
        Stop the training if there is no improvment over the best validation accuracy for more than 50 iterations
        """
        while count<=50:
            # training with all samples (full-batch gradient descents)
            # implement the forward pass (from inputs to predictions)
            # hidden Layer
            hidden_output = self.get_hidden(train_x)
            y_pred = self.predict(train_x)
            y_label = process_label(y_pred)

            # implement the backward pass (backpropagation)
            # compute the gradients w.r.t. different parameters
            label_diff = train_y - y_label

            update_weight_2 = lr * np.dot(hidden_output.T, label_diff)
            update_bias_2 = lr * np.sum(label_diff, axis=0)

            inner = (np.dot(label_diff, self.weight_2.T))
            dz = (hidden_output * (1 - hidden_output)) 
            coeffs = (inner * dz)

            update_weight_1 = lr * np.dot(train_x.T, coeffs)
            update_bias_1 = lr * np.sum(coeffs, axis=0)

            #update the parameters based on sum of gradients for all training samples
            self.weight_1 += update_weight_1
            self.bias_1 += update_bias_1
            self.weight_2 += update_weight_2
            self.bias_2 += update_bias_2

            # evaluate on validation data
            predictions = self.predict(valid_x)
            valid_acc = np.count_nonzero(predictions.reshape(-1)==valid_y.reshape(-1))/len(valid_x)

            # compare the current validation accuracy with the best one
            if valid_acc>best_valid_acc:
                best_valid_acc = valid_acc
                count = 0
            else:
                count += 1

        return best_valid_acc

    def predict(self,x):
        # generate the predicted probability of different classes
        hidden_op = self.get_hidden(x)

        # convert class probability to predicted labels
        y = softmax(np.dot(hidden_op, self.weight_2) + self.bias_2)
        y_max = np.argmax(y, axis=1)
        # y = np.zeros([len(x),]).astype('int') # placeholder

        return y_max

    def get_hidden(self,x):
        # extract the intermediate features computed at the hidden layers (after applying activation function)
        '''
            hidden_input = x.dot(self.weight_1) + self.bias_1
            hidden_ouput = tanh(hidden_input)
        '''
        z = tanh(np.dot(x, self.weight_1) + self.bias_1)
        return z

    def params(self):
        return self.weight_1, self.bias_1, self.weight_2, self.bias_2

In [8]:
# read in data.
# training data
train_data = np.genfromtxt("optdigits_train.txt",delimiter=",")
train_x = train_data[:,:-1]
train_y = train_data[:,-1].astype('int')

# validation data
valid_data = np.genfromtxt("optdigits_valid.txt",delimiter=",")
valid_x = valid_data[:,:-1]
valid_y = valid_data[:,-1].astype('int')

# test data
test_data = np.genfromtxt("optdigits_test.txt",delimiter=",")
test_x = test_data[:,:-1]
test_y = test_data[:,-1].astype('int')

In [9]:
# normalize the data
train_x, mean, std = process_data(train_x)
valid_x = process_data(valid_x, mean, std)
test_x = process_data(test_x, mean, std)

In [10]:
# process training labels into one-hot vectors
train_y = process_label(train_y)

In [11]:
# experiment with different numbers of hidden units
candidate_num_hid = [4,8,16,20,24]
valid_accuracy = []
for i, num_hid in enumerate(candidate_num_hid):
    # initialize the model
    clf = MLP(num_hid=num_hid)
    # update the model based on training data, and record the best validation accuracy
    cur_valid_accuracy = clf.fit(train_x,train_y,valid_x,valid_y)
    valid_accuracy.append(cur_valid_accuracy)
    print('Validation accuracy for %d hidden units is %.3f' %(candidate_num_hid[i],cur_valid_accuracy))


  f_x = 2 / (1 + np.exp(-2 * x)) - 1


Validation accuracy for 4 hidden units is 0.223
Validation accuracy for 8 hidden units is 0.216
Validation accuracy for 16 hidden units is 0.227
Validation accuracy for 20 hidden units is 0.259
Validation accuracy for 24 hidden units is 0.219


In [12]:
# select the best number of hidden unit and use it to train the model
best_num_hid = candidate_num_hid[np.argmax(valid_accuracy)]
clf = MLP(num_hid=best_num_hid)
_ = clf.fit(train_x,train_y,valid_x,valid_y)

# evaluate on test data
predictions = clf.predict(test_x)
accuracy = np.count_nonzero(predictions.reshape(-1)==test_y.reshape(-1))/len(test_x)

print('Test accuracy with %d hidden units is %.3f' %(best_num_hid,accuracy))


  f_x = 2 / (1 + np.exp(-2 * x)) - 1


Test accuracy with 20 hidden units is 0.184


# USING Sigmoid activation function

In [13]:
def sigmoid(x):
    # implement the sigmoid activation function for hidden layer
    f_x = 1.0/(1.0 + np.exp(-x)) # placeholder  
    return f_x

In [None]:
class MLP:
    def __init__(self,num_hid):
        # initialize the weights
        self.weight_1 = np.random.random([64,num_hid])
        self.bias_1 = np.random.random([1,num_hid])
        self.weight_2 = np.random.random([num_hid,10])
        self.bias_2 = np.random.random([1,10])

    def fit(self,train_x, train_y, valid_x, valid_y):
        # learning rate
        lr = 5e-3
        # counter for recording the number of epochs without improvement
        count = 0
        best_valid_acc = 0

        """
        Stop the training if there is no improvment over the best validation accuracy for more than 50 iterations
        """
        while count<=50:
            # training with all samples (full-batch gradient descents)
            # implement the forward pass (from inputs to predictions)
            # hidden Layer
            hidden_output = self.get_hidden(train_x)
            y_pred = self.predict(train_x)
            y_label = process_label(y_pred)

            # implement the backward pass (backpropagation)
            # compute the gradients w.r.t. different parameters
            label_diff = train_y - y_label

            update_weight_2 = lr * np.dot(hidden_output.T, label_diff)
            update_bias_2 = lr * np.sum(label_diff, axis=0)

            inner = (np.dot(label_diff, self.weight_2.T))
            dz = (hidden_output * (1 - hidden_output)) 
            coeffs = (inner * dz)

            update_weight_1 = lr * np.dot(train_x.T, coeffs)
            update_bias_1 = lr * np.sum(coeffs, axis=0)

            #update the parameters based on sum of gradients for all training samples
            self.weight_1 += update_weight_1
            self.bias_1 += update_bias_1
            self.weight_2 += update_weight_2
            self.bias_2 += update_bias_2

            # evaluate on validation data
            predictions = self.predict(valid_x)
            valid_acc = np.count_nonzero(predictions.reshape(-1)==valid_y.reshape(-1))/len(valid_x)

            # compare the current validation accuracy with the best one
            if valid_acc>best_valid_acc:
                best_valid_acc = valid_acc
                count = 0
            else:
                count += 1

        return best_valid_acc

    def predict(self,x):
        # generate the predicted probability of different classes
        hidden_op = self.get_hidden(x)

        # convert class probability to predicted labels
        y = softmax(np.dot(hidden_op, self.weight_2) + self.bias_2)
        y_max = np.argmax(y, axis=1)
        # y = np.zeros([len(x),]).astype('int') # placeholder

        return y_max

    def get_hidden(self,x):
        # extract the intermediate features computed at the hidden layers (after applying activation function)
        z = sigmoid(np.dot(x, self.weight_1) + self.bias_1)
        return z

    def params(self):
        return self.weight_1, self.bias_1, self.weight_2, self.bias_2

In [72]:
class MLP:
    def __init__(self,num_hid):
        # initialize the weights
        self.weight_1 = np.random.random([64,num_hid])
        self.bias_1 = np.random.random([1,num_hid])
        self.weight_2 = np.random.random([num_hid,10])
        self.bias_2 = np.random.random([1,10])


    def fit(self,train_x, train_y, valid_x, valid_y):
        # learning rate
        lr = 5e-3
        # counter for recording the number of epochs without improvement
        count = 0
        best_valid_acc = 0

        """
        Stop the training if there is no improvment over the best validation accuracy for more than 50 iterations
        """
        while count<=50:
            # training with all samples (full-batch gradient descents)
            # implement the forward pass (from inputs to predictions)
            # hidden Layer
            hidden_input = train_x.dot(self.weight_1) + self.bias_1
            hidden_ouput = tanh(hidden_input)
            # Output layer
            output_layer_input = hidden_ouput.dot(self.weight_2) + self.bias_2
            y_pred = softmax(output_layer_input)

            # implement the backward pass (backpropagation)
            # compute the gradients w.r.t. different parameters
            grad_wrt_out_1_input = cross_entropy_loss(train_y, y_pred) * softmax(output_layer_input)
            grad_v = hidden_input.T.dot(grad_wrt_out_1_input)
            grad_v0 = np.sum(grad_wrt_out_1_input, axis=0, keepdims=True)
            
            grad_wrt_hidden_1_input = grad_wrt_out_1_input.dot(self.weight_2.T)* dtanh(hidden_input)
            grad_w = train_x.T.dot(grad_wrt_hidden_1_input)
            grad_w0 = np.sum(grad_wrt_hidden_1_input, axis=0, keepdims=True)

            #update the parameters based on sum of gradients for all training samples
            self.weight_1 -= lr * grad_v
            self.bias_1 -= lr * grad_v0
            self.weight_2 -= lr * grad_w
            self.bias_2 -= lr * grad_w0
        '''
            # evaluate on validation data
            predictions = self.predict(valid_x)
            valid_acc = np.count_nonzero(predictions.reshape(-1)==valid_y.reshape(-1))/len(valid_x)

            # compare the current validation accuracy with the best one
            if valid_acc>best_valid_acc:
                best_valid_acc = valid_acc
                count = 0
            else:
                count += 1
        '''

    def predict(self,x):
        # generate the predicted probability of different classes
        hidden_input = x.dot(self.weight_2) + self.bias_1
        hideen_output = tanh(hidden_input)
        output_layer_input = hideen_output.dot(self.weight_2) + self.bias_2

        y_pred = softmax(output_layer_input)
        # convert class probability to predicted labels

        # y = np.zeros([len(x),]).astype('int') # placeholder

        return y_pred

    def get_hidden(self,x):
        # extract the intermediate features computed at the hidden layers (after applying activation function)
        z = x # placeholder

        return z

    def params(self):
        return self.weight_1, self.bias_1, self.weight_2, self.bias_2


In [61]:
train_data = np.genfromtxt("optdigits_train.txt",delimiter=",")
train_x = train_data[:,:-1]
train_y = train_data[:,-1].astype('int')

test_data = np.genfromtxt("optdigits_test.txt",delimiter=",")
test_x = test_data[:,:-1]
test_y = test_data[:,-1].astype('int')

valid_data = np.genfromtxt("optdigits_valid.txt",delimiter=",")
valid_x = valid_data[:,:-1]
valid_y = valid_data[:,-1].astype('int')


In [62]:
train_x, mean, std = process_data(train_x)

In [63]:
test_x = process_data(test_x, mean, std)
valid_x = process_data(valid_x, mean, std)

In [64]:
train_y = process_label(train_y)

In [65]:
clf = MLP(num_hid=4)

In [73]:
clf.fit(train_x,train_y,valid_x,valid_y)

AttributeError: 'NoneType' object has no attribute 'gradient'

In [None]:
clf.fit(train_x, train_y)
y_pred = np.argmax(clf.predict(test_x), axis=1)
y_test = np.argmax(test_y, axis=1)
accuracy = accuracy_score(test_y, y_pred)
print ("Accuracy:", accuracy)