In [4]:
from keras.datasets import fashion_mnist
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [5]:
np.random.seed(0)

In [6]:
(x_train,y_train),(x_test,y_test)=fashion_mnist.load_data()

In [7]:
class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle Boot"]

In [8]:
x_train = x_train / 255.0
x_test = x_test / 255.0

In [9]:
x_train = x_train.reshape(len(x_train),784)
x_test = x_test.reshape(len(x_test),784)

In [10]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=1)

In [11]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(54000, 784)
(54000,)
(6000, 784)
(6000,)


In [12]:
enc = OneHotEncoder()
y_OHE_train = enc.fit_transform(np.expand_dims(y_train,1)).toarray()
y_OHE_val = enc.fit_transform(np.expand_dims(y_val,1)).toarray()
y_OHE_test = enc.fit_transform(np.expand_dims(y_test,1)).toarray()
print(y_OHE_train.shape, y_OHE_val.shape, y_OHE_test.shape)

(54000, 10) (6000, 10) (10000, 10)


In [15]:
class FFNNClass:   
    def __init__(self, n_inputs, n_outputs, hidden_sizes=[3], algo='GD'):
        self.nx = n_inputs
        self.ny = n_outputs
        self.nh = len(hidden_sizes)
        self.sizes = [self.nx] + hidden_sizes + [self.ny] 
        self.algo = algo
        self.params={}
        #self.gradients={}
        self.dW = {}
        self.dB = {}
        self.dH = {}
        self.dA = {}
        self.update_params={}
        self.prev_update_params={}
        for i in range(self.nh+1):
            self.update_params["v_w"+str(i+1)]=0
            self.update_params["v_b"+str(i+1)]=0
            self.update_params["m_b"+str(i+1)]=0
            self.update_params["m_w"+str(i+1)]=0
            self.prev_update_params["v_w"+str(i+1)]=0
            self.prev_update_params["v_b"+str(i+1)]=0
        for i in range(self.nh+1):
            self.params["W"+str(i+1)]= np.random.randn(self.sizes[i], self.sizes[i+1]) * np.sqrt(1/(self.sizes[i]+self.sizes[i+1]))
            self.params["B"+str(i+1)]= np.zeros((1, self.sizes[i+1]))
            
    def sigmoid(self, x):
        #return np.where(X >= 0, 1 / (1 + np.exp(-X)), np.exp(X) / (1 + np.exp(X)))
        return 1.0/(1.0 + np.exp(-x))
    
    def grad_sigmoid(self, x):
        return x*(1-x)

    def softmax(self, x):
        exps = np.exp(x)
        return exps / np.sum(exps,axis=1).reshape(-1,1)

    def forward_pass(self, x, params=None):
        if params is None:
            params=self.params
        self.A = {}
        self.H = {}
        #self.H[0] = x.reshape(1, -1)
        self.H[0] = x  #dimension is 16 * 784
        for i in range(self.nh):
            self.A[i+1] = np.matmul(self.H[i], params["W"+str(i+1)]) + params["B"+str(i+1)]
            self.H[i+1] = self.sigmoid(self.A[i+1])
        self.A[self.nh+1] = np.matmul(self.H[self.nh], params["W"+str(self.nh+1)]) + params["B"+str(self.nh+1)]
        self.H[self.nh+1] = self.softmax(self.A[self.nh+1])
        return self.H[self.nh+1]

    def predict(self, X):
        Y_pred = self.forward_pass(X)
        return np.array(Y_pred).squeeze()

#     def cross_entropy(self,label,pred):
#         yl = np.multiply(pred,label)
#         yl = yl[yl!=0]
#         yl = -np.log(yl)
#         yl = np.mean(yl)
#         return yl

    def grad(self, x, y, params=None):
        if params is None:
            params=self.params
        self.forward_pass(x,params)
        m = x.shape[0]
        L = self.nh + 1
        self.dA[L] = (self.H[L] - y)
        for k in range(L, 0, -1):
            self.dW[k] = np.matmul(self.H[k-1].T, self.dA[k])
            self.dB[k] = np.sum(self.dA[k],axis=0).reshape(1,-1)
            self.dH[k-1] = np.matmul(self.dA[k], params["W"+str(k)].T)
            self.dA[k-1] = np.multiply(self.dH[k-1], self.grad_sigmoid(self.H[k-1]))

    def fit(self, X, Y, epochs = 5, eta=0.01, display_loss=False, gamma=0.9, eps=1e-8, beta=0.9, beta1=0.9, beta2=0.9, mini_batch_size=16):

        if display_loss:
            loss = {}
                
        if self.algo == "GD":
            for epoch in range(epochs):
                m = X.shape[0]
                self.grad(X,Y)
                for i in range(self.nh+1):
                    self.params["W"+str(i+1)] -= eta * (self.dW[i+1]/m)
                    self.params["B"+str(i+1)] -= eta * (self.dB[i+1]/m)
        elif self.algo == "MiniBatch":
            for epoch in range(epochs):
                m = X.shape[0]
                for k in range(0,m,mini_batch_size):
                    self.grad(X[k:k+mini_batch_size], Y[k:k+mini_batch_size])
                    for i in range(self.nh+1):
                        self.params["W"+str(i+1)] -= eta * (self.dW[i+1]/mini_batch_size)
                        self.params["B"+str(i+1)] -= eta * (self.dB[i+1]/mini_batch_size)
                        
        elif self.algo == "Momentum":
            for epoch in range(epochs):
                m = X.shape[0]
                for k in range(0,m,mini_batch_size):
                    self.grad(X[k:k+mini_batch_size], Y[k:k+mini_batch_size])
                #self.grad(X, Y)
                    for i in range(self.nh+1):
                        self.update_params["v_w"+str(i+1)] = gamma *self.update_params["v_w"+str(i+1)] + eta * (self.dW[i+1]/mini_batch_size)
                        self.update_params["v_b"+str(i+1)] = gamma *self.update_params["v_b"+str(i+1)] + eta * (self.dB[i+1]/mini_batch_size)
                        self.params["W"+str(i+1)] -= self.update_params["v_w"+str(i+1)]
                        self.params["B"+str(i+1)] -= self.update_params["v_b"+str(i+1)]
                        
        
        elif self.algo == "RMS_Prop":
            for epoch in range(epochs):
                m = X.shape[0]
                for k in range(0,m,mini_batch_size):
                    self.grad(X[k:k+mini_batch_size], Y[k:k+mini_batch_size])
                #self.grad(X, Y)
                    for i in range(self.nh+1):
                        self.update_params["v_w"+str(i+1)] = beta*self.update_params["v_w"+str(i+1)] +(1-beta)*((self.dW[i+1]/mini_batch_size)**2)
                        self.update_params["v_b"+str(i+1)] = beta*self.update_params["v_b"+str(i+1)] +(1-beta)*((self.dB[i+1]/mini_batch_size)**2)
                        self.params["W"+str(i+1)] -= (eta/(np.sqrt(self.update_params["v_w"+str(i+1)]+eps)))*(self.dW[i+1]/mini_batch_size)
                        self.params["B"+str(i+1)] -= (eta/(np.sqrt(self.update_params["v_b"+str(i+1)]+eps)))*(self.dB[i+1]/mini_batch_size)
                        
        elif self.algo == "NAG":
            for epoch in range(epochs):
                m = X.shape[0]
                temp_params = {}
                
                for k in range(0,m,mini_batch_size):
                    for i in range(self.nh+1):
                        self.update_params["v_w"+str(i+1)]=gamma*self.prev_update_params["v_w"+str(i+1)]
                        self.update_params["v_b"+str(i+1)]=gamma*self.prev_update_params["v_b"+str(i+1)]
                        temp_params["W"+str(i+1)]=self.params["W"+str(i+1)]-self.update_params["v_w"+str(i+1)]
                        temp_params["B"+str(i+1)]=self.params["B"+str(i+1)]-self.update_params["v_b"+str(i+1)]
                    self.grad(X[k:k+mini_batch_size], Y[k:k+mini_batch_size],temp_params)
                    for i in range(self.nh+1):
                        self.update_params["v_w"+str(i+1)] = gamma *self.update_params["v_w"+str(i+1)] + eta * (self.dW[i+1]/mini_batch_size)
                        self.update_params["v_b"+str(i+1)] = gamma *self.update_params["v_b"+str(i+1)] + eta * (self.dB[i+1]/mini_batch_size)
                        self.params["W"+str(i+1)] -= eta * (self.update_params["v_w"+str(i+1)])
                        self.params["B"+str(i+1)] -= eta * (self.update_params["v_b"+str(i+1)])
                    self.prev_update_params=self.update_params 
                    
        elif self.algo == "Adam":
            for epoch in range(epochs):
                m = X.shape[0]
                for k in range(0,m,mini_batch_size):
                    self.grad(X[k:k+mini_batch_size], Y[k:k+mini_batch_size])
                    num_updates=0
                    for i in range(self.nh+1):
                        num_updates+=1
                        self.update_params["m_w"+str(i+1)]=beta1*self.update_params["m_w"+str(i+1)]+(1-beta1)*(self.dW[i+1]/mini_batch_size)
                        self.update_params["v_w"+str(i+1)]=beta2*self.update_params["v_w"+str(i+1)]+(1-beta2)*((self.dW[i+1]/mini_batch_size)**2)
                        m_w_hat=self.update_params["m_w"+str(i+1)]/(1-np.power(beta1,num_updates))
                        v_w_hat=self.update_params["v_w"+str(i+1)]/(1-np.power(beta2,num_updates))
                        self.params["W"+str(i+1)] -=(eta/np.sqrt(v_w_hat+eps))*m_w_hat

                        self.update_params["m_b"+str(i+1)]=beta1*self.update_params["m_b"+str(i+1)]+(1-beta1)*(self.dB[i+1]/mini_batch_size)
                        self.update_params["v_b"+str(i+1)]=beta2*self.update_params["v_b"+str(i+1)]+(1-beta2)*((self.dB[i+1]/mini_batch_size)**2)
                        m_b_hat=self.update_params["m_b"+str(i+1)]/(1-np.power(beta1,num_updates))
                        v_b_hat=self.update_params["v_b"+str(i+1)]/(1-np.power(beta2,num_updates))
                        self.params["B"+str(i+1)] -=(eta/np.sqrt(v_b_hat+eps))*m_b_hat
                        
        elif self.algo == "NAdam":
            for epoch in range(epochs):
                m = X.shape[0]
                for k in range(0,m,mini_batch_size):
                    self.grad(X[k:k+mini_batch_size], Y[k:k+mini_batch_size])
                    num_updates=0
                    for i in range(self.nh+1):
                        num_updates+=1
                        self.update_params["m_w"+str(i+1)]=beta1*self.update_params["m_w"+str(i+1)]+(1-beta1)*(self.dW[i+1]/mini_batch_size)
                        self.update_params["v_w"+str(i+1)]=beta2*self.update_params["v_w"+str(i+1)]+(1-beta2)*((self.dW[i+1]/mini_batch_size)**2)
                        m_w_hat=self.update_params["m_w"+str(i+1)]/(1-np.power(beta1,num_updates))
                        v_w_hat=self.update_params["v_w"+str(i+1)]/(1-np.power(beta2,num_updates))
                        self.params["W"+str(i+1)] -=(eta/np.sqrt(v_w_hat+eps))*((beta1*m_w_hat + (1-beta1)*v_w_hat)/(1-beta1))

                        self.update_params["m_b"+str(i+1)]=beta1*self.update_params["m_b"+str(i+1)]+(1-beta1)*(self.dB[i+1]/mini_batch_size)
                        self.update_params["v_b"+str(i+1)]=beta2*self.update_params["v_b"+str(i+1)]+(1-beta2)*((self.dB[i+1]/mini_batch_size)**2)
                        m_b_hat=self.update_params["m_b"+str(i+1)]/(1-np.power(beta1,num_updates))
                        v_b_hat=self.update_params["v_b"+str(i+1)]/(1-np.power(beta2,num_updates))
                        self.params["B"+str(i+1)] -=(eta/np.sqrt(v_b_hat+eps))*((beta1*m_b_hat + (1-beta1)*v_b_hat)/(1-beta1))

                    
        if display_loss:
            plt.plot(loss.values())
            plt.xlabel('Epochs')
            plt.ylabel('CE')
            plt.show()

In [16]:
ffnn = FFNNClass(784,10,[32,32,32],algo = 'NAdam')
ffnn.fit(x_train,y_OHE_train, epochs=10, eta=0.0001,display_loss = False,mini_batch_size=32)

In [17]:
y_pred_train = ffnn.predict(x_train)
y_pred_train = np.argmax(y_pred_train,1)

accuracy_train = accuracy_score(y_train, y_pred_train)
print("Training accuracy", round(accuracy_train, 3))
#print(y_pred_train.shape)
#print(y_train.shape)

Training accuracy 0.881


In [18]:
y_pred_val = ffnn.predict(x_val)
y_pred_val = np.argmax(y_pred_val,1)
accuracy_test = accuracy_score(y_val, y_pred_val)
print("Testing accuracy", round(accuracy_test, 3))

Testing accuracy 0.862
