In [37]:
import numpy as np
import math
from sklearn import datasets

In [38]:
class Softmax():
    def __call__(self, x):
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def gradient(self, x):
        p = self.__call__(x)
        return p * (1 - p)

class TanH():
    def __call__(self, x):
        return 2 / (1 + np.exp(-2*x)) - 1

    def gradient(self, x):
        return 1 - np.power(self.__call__(x), 2)

In [17]:
def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    # Split the training data from test data in the ratio specified in
    # test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

In [18]:
def to_categorical(x, n_col=None):
    """ One-hot encoding of nominal values """
    if not n_col:
        n_col = np.amax(x) + 1
    one_hot = np.zeros((x.shape[0], n_col))
    one_hot[np.arange(x.shape[0]), x] = 1
    return one_hot

In [19]:
def normalize(X, axis=-1, order=2):
    """ Normalize the dataset X """
    l2 = np.atleast_1d(np.linalg.norm(X, order, axis))
    l2[l2 == 0] = 1
    return X / np.expand_dims(l2, axis)

In [20]:
def accuracy_score(y_true, y_pred):
    """ Compare y_true to y_pred and return the accuracy """
    accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
    return accuracy

In [26]:
def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

In [71]:
class CrossEntropy():
    def __init__(self): pass

    def loss(self, y, p):
        # Avoid division by zero
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return - y * np.log(p) - (1 - y) * np.log(1 - p)

    def acc(self, y, p):
        return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1))

    def gradient(self, y, p):
        # Avoid division by zero
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return - (y / p) + (1 - y) / (1 - p)

In [35]:
class MLP():
    def __init__(self, n_hidden, n_iterations=3000, learning_rate=0.01):
        self.n_hidden = n_hidden
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.hidden_activation = TanH()
        self.output_activation = Softmax()
        self.loss = CrossEntropy()

    def _initialize_weights(self, X, y):
        n_samples, n_features = X.shape
        _, n_outputs = y.shape
        # Hidden layer
        limit   = 1 / math.sqrt(n_features)
        self.W  = np.random.uniform(-limit, limit, (n_features, self.n_hidden))
        self.w0 = np.zeros((1, self.n_hidden))
        # Output layer
        limit   = 1 / math.sqrt(self.n_hidden)
        self.V  = np.random.uniform(-limit, limit, (self.n_hidden, n_outputs))
        self.v0 = np.zeros((1, n_outputs))

    def fit(self, X, y):

        self._initialize_weights(X, y)

        for i in range(self.n_iterations):

            # ..............
            #  Forward Pass
            # ..............

            # HIDDEN LAYER
            hidden_input = X.dot(self.W) + self.w0
            hidden_output = self.hidden_activation(hidden_input)
            # OUTPUT LAYER
            output_layer_input = hidden_output.dot(self.V) + self.v0
            y_pred = self.output_activation(output_layer_input)

            # ...............
            #  Backward Pass
            # ...............

            # OUTPUT LAYER
            # Grad. w.r.t input of output layer
            grad_wrt_out_l_input = self.loss.gradient(y, y_pred) * self.output_activation.gradient(output_layer_input)
            grad_v = hidden_output.T.dot(grad_wrt_out_l_input)
            grad_v0 = np.sum(grad_wrt_out_l_input, axis=0, keepdims=True)
            # HIDDEN LAYER
            # Grad. w.r.t input of hidden layer
            grad_wrt_hidden_l_input = grad_wrt_out_l_input.dot(self.V.T) * self.hidden_activation.gradient(hidden_input)
            grad_w = X.T.dot(grad_wrt_hidden_l_input)
            grad_w0 = np.sum(grad_wrt_hidden_l_input, axis=0, keepdims=True)

            # Update weights (by gradient descent)
            # Move against the gradient to minimize loss
            self.V  -= self.learning_rate * grad_v
            self.v0 -= self.learning_rate * grad_v0
            self.W  -= self.learning_rate * grad_w
            self.w0 -= self.learning_rate * grad_w0

    # Use the trained model to predict labels of X
    def predict(self, X):
        # Forward pass:
        hidden_input = X.dot(self.W) + self.w0
        hidden_output = self.hidden_activation(hidden_input)
        output_layer_input = hidden_output.dot(self.V) + self.v0
        y_pred = self.output_activation(output_layer_input)
        return y_pred
    

In [44]:
train_data = np.genfromtxt("optdigits_train.txt",delimiter=",")
train_x = train_data[:,:-1]
train_y = train_data[:,-1].astype('int')

# test data
test_data = np.genfromtxt("optdigits_test.txt",delimiter=",")
test_x = test_data[:,:-1]
test_y = test_data[:,-1].astype('int')

In [46]:
train_x = normalize(train_x)

In [48]:
test_x = normalize(test_x)

In [45]:
train_y = to_categorical(train_y)

In [39]:
data = datasets.load_digits()
X = normalize(data.data)
y = data.target

# Convert the nominal y values to binary
y = to_categorical(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=1)

In [49]:
 # MLP
clf = MLP(n_hidden=16,
        n_iterations=1000,
        learning_rate=0.01)

In [51]:
test_y = to_categorical(test_y)

In [52]:
clf.fit(train_x, train_y)
y_pred = np.argmax(clf.predict(test_x), axis=1)
y_test = np.argmax(test_y, axis=1)
accuracy = accuracy_score(test_y, y_pred)
print ("Accuracy:", accuracy)

Accuracy: 0.0


  accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)


In [72]:
import numpy as np

def process_data(data,mean=None,std=None):
    # normalize the data to have zero mean and unit variance (add 1e-15 to std to avoid numerical issue)
    if mean is not None:
        # directly use the mean and std precomputed from the training data
        data = (data - mean) / (std + 1e-15)
        return data
    else:
        # compute the mean and std based on the training data
        mean, std = np.mean(data, axis=0), np.std(data, axis=0) # placeholder
        data = (data - mean) / (std + 1e-15)
        return data, mean, std

def process_label(label):
    # convert the labels into one-hot vector for training
    one_hot = np.zeros([len(label),10])

    for i in range(len(label)):
        one_hot[i][label[i]] = 1

    return one_hot

def tanh(x):
    # implement the hyperbolic tangent activation function for hidden layer
    # You may receive some warning messages from Numpy. No worries, they should not affect your final results
    ''' f(x) = (e^x - e^-x) / (e^x + e^-x); derivative of tanh: f'(x) = (1-g(x^2)'''
    # (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))
    # 1 - tanh(x) * tanh(x) ; 1- np.power(tanh(x), 2)
    f_x = 2 / (1 + np.exp(-2*x)) - 1
    return f_x

def dtanh(x):
    return 1- np.power(tanh(x), 2)


def softmax(x):
    # implement the softmax activation function for output layer
    f_x = np.exp(x)/np.sum(np.exp(x)) 
    return f_x

class CrossEntropy():
    def __init__(self): pass

    def loss(self, y, p):
        # Avoiding divison by zero
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return - y * np.log(p) - (1 - y) * np.log(1 - p)
    
    def gradient(self, y, p):
        # Avoid division by zero
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return - (y / p) + (1 - y) / (1 - p)


class MLP:
    def __init__(self,num_hid):
        # initialize the weights
        self.weight_1 = np.random.random([64,num_hid])
        self.bias_1 = np.random.random([1,num_hid])
        self.weight_2 = np.random.random([num_hid,10])
        self.bias_2 = np.random.random([1,10])
        self.hum_hid = num_hid
        self.loss = CrossEntropy()


    def fit(self,train_x, train_y, valid_x, valid_y):
        # learning rate
        lr = 5e-3
        # counter for recording the number of epochs without improvement
        count = 0
        best_valid_acc = 0

        """
        Stop the training if there is no improvment over the best validation accuracy for more than 50 iterations
        """
        while count<=50:
            # training with all samples (full-batch gradient descents)
            # implement the forward pass (from inputs to predictions)
            # hidden Layer
            hidden_input = train_x.dot(self.weight_1) + self.bias_1
            hidden_ouput = tanh(hidden_input)
            # Output layer
            output_layer_input = hidden_ouput.dot(self.weight_2) + self.bias_2
            y_pred = softmax(output_layer_input)

            # implement the backward pass (backpropagation)
            # compute the gradients w.r.t. different parameters
            grad_wrt_out_1_input = self.loss.gradient(train_y, y_pred) * softmax(output_layer_input)
            grad_v = hidden_input.T.dot(grad_wrt_out_1_input)
            grad_v0 = np.sum(grad_wrt_out_1_input, axis=0, keepdims=True)
            
            grad_wrt_hidden_1_input = grad_wrt_out_1_input.dot(self.weight_2.T)* dtanh(hidden_input)
            grad_w = train_x.T.dot(grad_wrt_hidden_1_input)
            grad_w0 = np.sum(grad_wrt_hidden_1_input, axis=0, keepdims=True)

            #update the parameters based on sum of gradients for all training samples
            self.weight_1 -= lr * grad_v
            self.bias_1 -= lr * grad_v0
            self.weight_2 -= lr * grad_w
            self.bias_2 -= lr * grad_w0
        '''
            # evaluate on validation data
            predictions = self.predict(valid_x)
            valid_acc = np.count_nonzero(predictions.reshape(-1)==valid_y.reshape(-1))/len(valid_x)

            # compare the current validation accuracy with the best one
            if valid_acc>best_valid_acc:
                best_valid_acc = valid_acc
                count = 0
            else:
                count += 1
        '''

    def predict(self,x):
        # generate the predicted probability of different classes
        hidden_input = x.dot(self.weight_2) + self.bias_1
        hideen_output = tanh(hidden_input)
        output_layer_input = hideen_output.dot(self.weight_2) + self.bias_2

        y_pred = softmax(output_layer_input)
        # convert class probability to predicted labels

        # y = np.zeros([len(x),]).astype('int') # placeholder

        return y_pred

    def get_hidden(self,x):
        # extract the intermediate features computed at the hidden layers (after applying activation function)
        z = x # placeholder

        return z

    def params(self):
        return self.weight_1, self.bias_1, self.weight_2, self.bias_2


In [61]:
train_data = np.genfromtxt("optdigits_train.txt",delimiter=",")
train_x = train_data[:,:-1]
train_y = train_data[:,-1].astype('int')

test_data = np.genfromtxt("optdigits_test.txt",delimiter=",")
test_x = test_data[:,:-1]
test_y = test_data[:,-1].astype('int')

valid_data = np.genfromtxt("optdigits_valid.txt",delimiter=",")
valid_x = valid_data[:,:-1]
valid_y = valid_data[:,-1].astype('int')


In [62]:
train_x, mean, std = process_data(train_x)

In [63]:
test_x = process_data(test_x, mean, std)
valid_x = process_data(valid_x, mean, std)

In [64]:
train_y = process_label(train_y)

In [65]:
clf = MLP(num_hid=4)

In [73]:
clf.fit(train_x,train_y,valid_x,valid_y)

AttributeError: 'NoneType' object has no attribute 'gradient'

In [None]:
clf.fit(train_x, train_y)
y_pred = np.argmax(clf.predict(test_x), axis=1)
y_test = np.argmax(test_y, axis=1)
accuracy = accuracy_score(test_y, y_pred)
print ("Accuracy:", accuracy)

In [78]:
weight_1 = np.random.random([64,4])
bias_1 = np.random.random([1,4])

In [79]:
t = train_x.dot(weight_1) + bias_1

In [80]:
t2 = np.dot(train_x, weight_1) + bias_1

In [81]:
t == t2

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       ...,
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])

In [82]:
t2

array([[ 4.57714071,  1.85671739,  6.48423009,  0.9145729 ],
       [ 5.10921049,  3.07194676,  7.44671229,  3.95453715],
       [ 2.69568816,  1.55653054,  4.0450871 ,  3.77621473],
       ...,
       [-0.58281044, -0.33740145,  4.09014969, -0.95568017],
       [-4.19672313, -6.32370116, -3.3862002 , -6.54878695],
       [ 1.084485  ,  3.06805542,  0.37764725,  1.46038028]])

In [84]:
t

array([[ 4.57714071,  1.85671739,  6.48423009,  0.9145729 ],
       [ 5.10921049,  3.07194676,  7.44671229,  3.95453715],
       [ 2.69568816,  1.55653054,  4.0450871 ,  3.77621473],
       ...,
       [-0.58281044, -0.33740145,  4.09014969, -0.95568017],
       [-4.19672313, -6.32370116, -3.3862002 , -6.54878695],
       [ 1.084485  ,  3.06805542,  0.37764725,  1.46038028]])