Block for linking files:

#MLP CLASS

In [None]:
import collections
from multiprocessing import reduction
import numpy as np

# Seed NumPy's global RNG so runs that draw from np.random (e.g. the dropout
# masks sampled in MLP.forward_pass) are repeatable.
np.random.seed(1)

class MLP():
    """Multi-layer perceptron assembled from ``l.Edge``, ``l.HiddenLayer`` and ``l.OutputLayer`` objects.

    ``self.layers`` alternates edges (weight matrices) and hidden activation
    layers and ends with a single output layer:
    ``[Edge, Hidden, Edge, Hidden, ..., Edge, Output]``. With no hidden layers
    it is just ``[Edge, Output]``.

    Bias handling: ``fit``/``predict`` append a column of ones to ``X``, and
    every hidden layer is created with ``M + 1`` units.

    ``parameter_init_type`` values recognised by ``create_layers``:
      * ``"ACTIVATION_SPECIFIC"`` - each edge feeding a hidden layer is
        re-initialised through that layer's
        ``activation_function.param_init_by_activ_type``.
      * ``"AROUND_ZERO"`` - each edge feeding a hidden layer has its initial
        parameters scaled by 0.1.
      * anything else (including the default ``"RANDOM"``) - edges keep
        whatever ``l.Edge`` initialised them with.
    """

    def __init__(self, M, D, C, hidden_activation_func_list, output_activation_func, cost_function = u.cat_cross_entropy, parameter_init_type = "RANDOM"):
        """Build the layer stack.

        Args:
            M: number of hidden units per hidden layer (0 means no hidden layers).
            D: number of input features.
            C: number of outputs / classes.
            hidden_activation_func_list: one activation object per hidden
                layer; ``None`` or an empty list means no hidden layers.
            output_activation_func: activation object for the output layer.
            cost_function: cost function handed to ``l.OutputLayer``.
            parameter_init_type: see the class docstring.
        """
        # Normalise None here: the len() below would otherwise raise before
        # create_layers gets a chance to handle the "no hidden layers" case.
        if hidden_activation_func_list is None:
            hidden_activation_func_list = []

        self.M = M     # number of hidden units in hidden layers (width)
        self.C = C     # number of outputs (classes)
        self.D = D     # number of inputs (features of X)

        # kept for reporting in print_model_summary
        self.parameter_init_type = parameter_init_type
        self.num_hid_layers = len(hidden_activation_func_list)
        self.activation_functions = hidden_activation_func_list

        # Training-time state, filled in by fit(). Initialised here so that
        # predict()/print_model_summary() do not raise AttributeError when
        # called on a model that has not been fitted yet.
        self.N = None
        self.learn_rate = None
        self.gd_iterations = None
        self.dropout_p = None
        self.activations = []

        # list representing all layers in the MLP
        self.layers = self.create_layers(hidden_activation_func_list, output_activation_func, cost_function, parameter_init_type)

    @staticmethod
    def _add_bias_column(X):
        """Return ``X`` with a trailing column of ones (the bias input)."""
        bias = np.ones((X.shape[0], 1), dtype=float)
        return np.append(X, bias, axis=1)

    def create_layers(self, hidden_activation_func_list, output_activation_func, cost_function, init_type):
        """Create and return the list of all layers; called from the constructor.

        Also stores the initial parameters of every edge, in layer order, in
        ``self.init_params``. All hidden layers share the same width
        (``self.M + 1``).
        """
        layers_list = []    # all layers, in forward order
        init_params = []    # initial parameters of each edge, in forward order
        # dimensions of the parameter matrices including the bias additions
        Dplusbias = self.D + 1       # V dim = (D+1, M)
        Mplusbias = self.M + 1       # W dim = (M+1, C)

        # no hidden layers: a single edge from X straight to the output
        if hidden_activation_func_list is None or len(hidden_activation_func_list) == 0 or self.M == 0:
            spec_final_edge = l.Edge(Dplusbias, self.C)
            layers_list.append(spec_final_edge)
            init_params.append(spec_final_edge.get_params())
        else:
            # the number of activation functions determines the number of
            # hidden layers; the first edge has special (input) dimensions
            edge = l.Edge(Dplusbias, Mplusbias)
            layers_list.append(edge)
            final_index = len(hidden_activation_func_list) - 1
            for i, activation_function in enumerate(hidden_activation_func_list):
                hid_layer = l.HiddenLayer(activation_function)
                layers_list.append(hid_layer)

                # optionally re-initialise the edge feeding this hidden layer
                if init_type == "ACTIVATION_SPECIFIC":
                    params = edge.get_params()
                    size_last_layer = Mplusbias if i != 0 else Dplusbias
                    params_custom = activation_function.param_init_by_activ_type(params, size_last_layer)
                    edge.set_params(params_custom)
                elif init_type == "AROUND_ZERO":
                    params = edge.get_params()
                    params_custom = params * 0.1
                    edge.set_params(params_custom)
                init_params.append(edge.get_params())

                # edge leaving this hidden layer: to the next hidden layer,
                # or to the C outputs after the last hidden layer
                edge = l.Edge(Mplusbias, Mplusbias) if i != final_index else l.Edge(Mplusbias, self.C)
                layers_list.append(edge)

            # the final edge is never specially parameterised
            init_params.append(edge.get_params())

        layers_list.append(l.OutputLayer(output_activation_func, cost_function))
        self.init_params = init_params

        return layers_list

    def print_model_summary(self):
        """Print the model's hyper-parameters; training fields are None before fit()."""
        print("-----Model summary:------------------")
        print(f'Number of Instances Trained On:  N = {self.N}')
        print(f'Number of Inputs Trained On:  D = {self.D}')
        print(f'Number of Hidden Units:  M = {self.M}')
        print(f'Number of Classes:  C = {self.C}')
        print(f'Parameter Initialization Type:  {self.parameter_init_type}')
        print(f'Gradient Descent Learning Rate: {self.learn_rate}')
        print(f'Gradient Descent Iterations: {self.gd_iterations}')
        print(f'Layer Dropout Keep Unit Percentages: {self.dropout_p}')
        print(f'Number of Hidden Units Layers: {self.num_hid_layers}')
        print("Activation Functions: ")
        for af in self.activation_functions:
            print(type(af).__name__)

    def forward_pass(self, X, training=True):
        """Run ``X`` (bias column already appended) through every layer.

        Stores the output of each hidden layer in ``self.activations`` for use
        by ``backward_pass`` and returns the output of the last layer.

        Args:
            X: input matrix, one row per instance.
            training: when True (default, the historical behaviour) and
                ``self.dropout_p`` is set, inverted dropout is applied to each
                hidden layer's output. ``predict``/``predict_probs`` pass
                False so inference is deterministic and unmasked.

        Raises:
            IndexError: if ``self.dropout_p`` has fewer entries than there are
                hidden layers.
        """
        self.activations = []
        dropout_p = self.dropout_p if training else None
        hidden_index = 0
        z = X
        for layer in self.layers:
            z = layer.get_output(z)
            if not isinstance(layer, l.HiddenLayer):
                continue
            if dropout_p is not None:
                keep_prob_p = dropout_p[hidden_index]
                if keep_prob_p != 0:    # a keep probability of 0 means "no dropout on this layer"
                    # inverted dropout: dividing by the keep probability at
                    # training time makes rescaling at predict time unnecessary
                    drop_mask = (np.random.rand(*z.shape) < keep_prob_p) / keep_prob_p
                    z *= drop_mask
            hidden_index += 1
            self.activations.append(z)
        return z

    def backward_pass(self, X, Y, Yh):
        """Back-propagate the error and return one gradient per edge, in forward order.

        Must be called after ``forward_pass`` on the same ``X``; it reads
        ``self.activations`` and ``self.N``. All dimensions here include the
        bias unit (i.e. M means M+1 from model creation).

        The output-layer error is taken to be ``Yh - Y``.
        NOTE(review): that is the gradient for softmax + categorical
        cross-entropy; confirm if another output/cost pair is ever used.
        NOTE(review): the dropout mask itself is not stored; the activation
        derivative is evaluated on the already-masked activations. Confirm
        this is acceptable for activations whose derivative at 0 is non-zero.
        """
        layers = self.layers.copy()             # edges and activation layers
        activations = self.activations.copy()   # outputs of each hidden layer
        activations.insert(0, X)                # X is the input to the first edge
        grads = collections.deque()             # deque: gradients are prepended while walking backwards

        # last layer and last edge are a special case
        layers.pop()                                 # discard the output layer
        dy = Yh - Y                                  # N x C, dLoss/d(output pre-activation)
        z = activations.pop()                        # N x M, input to the final edge
        final_edge = layers.pop()                    # last edge, holding W
        params_from_above = final_edge.get_params()  # W, needed to push the error below
        grads.append(np.dot(z.T, dy) / self.N)       # M x C
        err_from_above = dy                          # error w.r.t. the pre-activation of the layer above

        # walk back to front: hidden layer, then its incoming edge, then the
        # next hidden layer down, and so on
        for layer in reversed(layers):
            if isinstance(layer, l.HiddenLayer):
                dzq = layer.get_af_deriv(z)      # activation derivative, same shape as this layer's output
                z = activations.pop()            # input to the edge feeding this hidden layer
                dz = np.dot(err_from_above, params_from_above.T)   # error w.r.t. this layer's output
                # Chain through the activation before propagating further.
                # Previously only dz (without dzq) was passed on, which gave
                # wrong gradients for every edge below the top hidden layer.
                err_from_above = dz * dzq
            else:
                dv = np.dot(z.T, err_from_above) / self.N
                params_from_above = layer.get_params()   # this edge's weights, used by the next layer down
                grads.appendleft(dv)

        return list(grads)

    def fit(self, X, Y, learn_rate=0.1, gd_iterations=50, dropout_p=None):
        """Train the network with ``gd.GradientDescent`` and return ``self``.

        Args:
            X: training inputs, one row per instance (without bias column).
            Y: training targets, same number of rows as ``X``.
            learn_rate: learning rate passed to the optimizer.
            gd_iterations: maximum number of optimizer iterations.
            dropout_p: optional list with one keep probability per hidden
                layer (0 disables dropout for that layer); None disables
                dropout entirely.
        """
        self.N = X.shape[0]

        # kept for print_model_summary
        self.learn_rate = learn_rate
        self.gd_iterations = gd_iterations
        self.dropout_p = dropout_p

        X = self._add_bias_column(X)

        # NOTE(review): `params` is not used here; the forward pass reads the
        # parameters held by the Edge objects. Presumably the optimizer
        # updates those arrays in place - confirm against gd.GradientDescent.
        def gradient(X, Y, params):
            Yh = self.forward_pass(X, training=True)
            return self.backward_pass(X, Y, Yh)

        optimizer = gd.GradientDescent(learning_rate=learn_rate, max_iters=gd_iterations)
        learned_params = optimizer.run(gradient, X, Y, self.init_params)

        # hand the learned parameters back to the edges, last edge first
        for layer in reversed(self.layers):
            if isinstance(layer, l.Edge):
                layer.set_params(learned_params.pop())
        return self

    def predict_probs(self, X):
        """Return the raw output-layer values (class probabilities) for ``X``.

        Dropout is never applied here.
        """
        return self.forward_pass(self._add_bias_column(X), training=False)

    def predict(self, X):
        """Return one-hot predictions for ``X`` (one row per instance).

        The predicted class of each row is the argmax of its probabilities;
        ties resolve to the lowest index.
        """
        yh_probs = self.predict_probs(X)
        # build the one-hot matrix in a fresh array so the forward-pass
        # output is not overwritten
        yh = np.zeros_like(yh_probs)
        yh[np.arange(yh_probs.shape[0]), np.argmax(yh_probs, axis=1)] = 1
        return yh

#TASKS

In [None]:
import numpy as np

#TASK 3_1: Varying the number of hidden layers (depth)
def task3_1(Xtrain, Xtest, Ytrain, Ytest):
    """Train and evaluate three softmax MLPs with 0, 1 and 2 ReLU hidden layers.

    For each configuration the model is fitted on the training data, used to
    predict the test data, summarised with ``print_model_summary`` and scored
    with ``u.evaluate_acc``.
    """
    print("++++++++++++++++++TASK 3_1: Varying number of depth  layers ++++++++++++++++++")
    n_classes = Ytrain.shape[1]   # one column per class; Ytrain should be one-hot encoded
    _, n_inputs = Xtrain.shape

    # (hidden depth, hidden width, extra MLP arguments, learning rate, GD iterations)
    experiments = (
        (0, 0, {}, 0.1, 550),
        (1, 128, {"parameter_init_type": "ACTIVATION_SPECIFIC"}, 0.2, 500),
        (2, 128, {"parameter_init_type": "ACTIVATION_SPECIFIC"}, 0.1, 750),
    )

    for depth, width, mlp_options, rate, iterations in experiments:
        hidden_activations = [af.ReLU() for _ in range(depth)]
        softmax = af.SoftMax()
        net = MLP(
            M=width,
            D=n_inputs,
            C=n_classes,
            hidden_activation_func_list=hidden_activations,
            output_activation_func=softmax,
            **mlp_options,
        )
        net.fit(Xtrain, Ytrain, learn_rate=rate, gd_iterations=iterations, dropout_p=None)
        predictions = net.predict(Xtest)
        # report
        net.print_model_summary()
        u.evaluate_acc(Ytest, predictions)

#TASK 3_2: Different activations
def task3_2(Xtrain, Xtest, Ytrain, Ytest):
    """Compare hidden activations: two-hidden-layer MLPs using tanh, then LeakyReLU.

    Both models have 128 hidden units, a softmax output, activation-specific
    initialisation, learning rate 0.1 and 750 gradient-descent iterations.
    """
    print("++++++++++++++++++TASK 3_2: Different activation functions ++++++++++++++++++")

    n_classes = Ytrain.shape[1]   # one column per class
    _, n_inputs = Xtrain.shape

    for activation_cls in (af.tanh, af.LeakyReLU):
        hidden_activations = [activation_cls(), activation_cls()]
        softmax = af.SoftMax()
        net = MLP(
            M=128,
            D=n_inputs,
            C=n_classes,
            hidden_activation_func_list=hidden_activations,
            output_activation_func=softmax,
            parameter_init_type="ACTIVATION_SPECIFIC",
        )
        net.fit(Xtrain, Ytrain, learn_rate=0.1, gd_iterations=750, dropout_p=None)
        predictions = net.predict(Xtest)
        # report
        net.print_model_summary()
        u.evaluate_acc(Ytest, predictions)

#TASK 3_3: 2 Hidden Layers, Relu, with DROPOUT
def task3_3(Xtrain, Xtest, Ytrain, Ytest, layer_dropout_percents = [0.8, 0.8]):
    """Train and evaluate a two-hidden-layer ReLU MLP with dropout.

    ``layer_dropout_percents`` is passed to ``MLP.fit`` as ``dropout_p``: one
    keep probability per hidden layer. The default list is only read here,
    never modified by this function.
    """
    print("++++++++++++++++++TASK 3_3: DROPOUT: 2 Hidden Layers with Relu ++++++++++++++++++")

    n_classes = Ytrain.shape[1]   # one column per class
    _, n_inputs = Xtrain.shape

    hidden_activations = [af.ReLU(), af.ReLU()]
    softmax = af.SoftMax()
    net = MLP(
        M=128,
        D=n_inputs,
        C=n_classes,
        hidden_activation_func_list=hidden_activations,
        output_activation_func=softmax,
        parameter_init_type="ACTIVATION_SPECIFIC",
    )
    net.fit(Xtrain, Ytrain, learn_rate=0.2, gd_iterations=180, dropout_p=layer_dropout_percents)
    predictions = net.predict(Xtest)
    # report
    net.print_model_summary()
    u.evaluate_acc(Ytest, predictions)


#TASK 3_4: 2 Hidden Layers, Relu, with UNNORMALIZED IMAGES
def task3_4(Xtrain, Xtest, Ytrain, Ytest):
    """Train and evaluate a two-hidden-layer ReLU MLP for the unnormalized-data task.

    The function itself does no scaling; the caller is presumably expected to
    pass the raw (unnormalized) images. The model uses the default parameter
    initialisation, learning rate 0.3 and 150 gradient-descent iterations.
    """
    print("++++++++++++++++++TASK 3_4: UNNORMALIZED DATA: 2 Hidden Layers with Relu ++++++++++++++++++")

    n_classes = Ytrain.shape[1]   # one column per class
    _, n_inputs = Xtrain.shape

    hidden_activations = [af.ReLU(), af.ReLU()]
    softmax = af.SoftMax()
    net = MLP(
        M=128,
        D=n_inputs,
        C=n_classes,
        hidden_activation_func_list=hidden_activations,
        output_activation_func=softmax,
    )
    net.fit(Xtrain, Ytrain, learn_rate=0.3, gd_iterations=150, dropout_p=None)
    predictions = net.predict(Xtest)
    # report
    print("UNNORMALIZED DATA")
    net.print_model_summary()
    u.evaluate_acc(Ytest, predictions)

#TASK 3_6: Optimize
def task3_6(Xtrain, Xtest, Ytrain, Ytest, hid_units, epoques, learning_rate, dropout_list):
    """Train the tuned ReLU MLP and report accuracy on both the test and train sets.

    Args:
        hid_units: number of hidden units per hidden layer.
        epoques: number of gradient-descent iterations.
        learning_rate: gradient-descent learning rate.
        dropout_list: per-hidden-layer keep probabilities passed to
            ``MLP.fit`` as ``dropout_p``.
    """
    # NOTE(review): the banner says "2 Hidden Layers" but three ReLU layers
    # are built below - confirm which one is intended.
    print("++++++++++++++++++TASK 3_6: BEST: 2 Hidden Layers with Relu ++++++++++++++++++")
    n_classes = Ytrain.shape[1]   # one column per class
    _, n_inputs = Xtrain.shape

    hidden_activations = [af.ReLU() for _ in range(3)]
    softmax = af.SoftMax()
    net = MLP(
        M=hid_units,
        D=n_inputs,
        C=n_classes,
        hidden_activation_func_list=hidden_activations,
        output_activation_func=softmax,
        parameter_init_type="ACTIVATION_SPECIFIC",
    )
    net.fit(Xtrain, Ytrain, learning_rate, gd_iterations=epoques, dropout_p=dropout_list)

    # unseen data
    test_predictions = net.predict(Xtest)
    net.print_model_summary()
    u.evaluate_acc(Ytest, test_predictions)

    # seen data
    print("Accuracy on TRAIN set (seen data):")
    train_predictions = net.predict(Xtrain)
    net.print_model_summary()
    u.evaluate_acc(Ytrain, train_predictions)

# Load the dataset (flattened images, one-hot labels) via load_dataset().
Xtrain, Ytrain, Xtest, Ytest = load_dataset() # load dataset
# Scale the pixel values to the 0-1 range (see prep_pixels).
Xtrain, Xtest = prep_pixels(Xtrain, Xtest)
# Report the shapes of the train / test splits as a sanity check.
print('Train: X=%s, y=%s' % (Xtrain.shape, Ytrain.shape))
print('Test: X=%s, y=%s' % (Xtest.shape, Ytest.shape))
print(Xtrain.shape)
print(Xtest.shape)

Train: X=(60000, 784), y=(60000, 10)
Test: X=(10000, 784), y=(10000, 10)
(60000, 784)
(10000, 784)
++++++++++++++++++TASK 3_1: Varying number of depth  layers ++++++++++++++++++
-----Model summary:------------------
Number of Instances Trained On:  N = 60000
Number of Inputs Trained On:  D = 784
Number of Hidden Units:  M = 128
Number of Classes:  C = 10
Parameter Initialization Type:  ACTIVATION_SPECIFIC
Gradient Descent Learning Rate: 0.1
Gradient Descent Iterations: 750
Layer Dropout Keep Unit Percentages: None
Number of Hidden Units Layers: 2
Activation Functions: 
ReLU
ReLU
-------------------------------------
test accuracy: 0.822
-------------------------------------
++++++++++++++++++TASK 3_2: Different activation functions ++++++++++++++++++
-----Model summary:------------------
Number of Instances Trained On:  N = 60000
Number of Inputs Trained On:  D = 784
Number of Hidden Units:  M = 128
Number of Classes:  C = 10
Parameter Initialization Type:  ACTIVATION_SPECIFIC
Gradient

# DATA LOADING
(See Dataloading.py for more)

In [None]:
from matplotlib import pyplot
from keras.datasets import fashion_mnist
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

def load_dataset():
    """Load Fashion-MNIST with flattened images and one-hot encoded labels.

    Each image is reshaped to a row of 784 values; labels are converted with
    ``to_categorical``.

    Returns:
        (trainX, trainY, testX, testY)
    """
    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

    # flatten every image into a single row of 784 pixel values
    flat_train = train_images.reshape((train_images.shape[0], 784))
    flat_test = test_images.reshape((test_images.shape[0], 784))

    # integer class labels -> one-hot rows
    onehot_train = to_categorical(train_labels)
    onehot_test = to_categorical(test_labels)

    return flat_train, onehot_train, flat_test, onehot_test

In [None]:
# scale pixels
def prep_pixels(train, test):
    """Return float32 copies of ``train`` and ``test`` scaled from 0-255 to 0-1.

    The inputs are not modified.
    """
    max_pixel_value = 255.0
    # cast to float32 first, then divide so the values land in [0, 1]
    train_scaled, test_scaled = (
        images.astype('float32') / max_pixel_value for images in (train, test)
    )
    return train_scaled, test_scaled