In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class DNN:
    def __init__(self,sizes,beta=0.9,gamma=0.99,alpha=0.08):
        self.Di = sizes[0]
        self.Do = sizes[-1]
        self.Dh = sizes[1:-1]
        self.K = len(sizes)-1
        self.beta = []
        self.Omega = []
        self.m = []
        self.v = []
        self.m_decay = beta
        self.v_decay = gamma
        self.alpha = alpha
        self.t = 0
        for index in range(self.K):
            sigma = np.sqrt(4/(sizes[index]+sizes[index+1]))
            self.beta.append(np.random.normal(size=(sizes[index+1],1)) * sigma)
            self.Omega.append(np.random.normal(size=(sizes[index+1],sizes[index])) * sigma)
            self.m.append([])
            self.v.append([])

            self.m[index].append(np.zeros((sizes[index+1],1)))
            self.v[index].append(np.zeros((sizes[index+1],1)))
            self.m[index].append(np.zeros((sizes[index+1],sizes[index])))
            self.v[index].append(np.zeros((sizes[index+1],sizes[index])))
    
    def run_all(self,x):
        values = [x]
        for layer in range(self.K):
            pre_act = np.matmul(self.Omega[layer],values[-1]) + self.beta[layer]
            values.append(self.ReLU(pre_act))
        return values
    
    def compute_gradient(self,x,y):
        values = self.run_all(x)
        grad_beta = [None] * self.K
        grad_Omega = [None] * self.K
        grad_beta[-1] = values[-1]-y

        for layer in range(self.K-1,-1,-1):
            if layer != self.K-1:
                grad_activation = values[layer+1].astype(bool)*1.1 - 0.1
                OmegaT = np.transpose(self.Omega[layer+1])
                grad_beta[layer] = (np.matmul(OmegaT,grad_beta[layer+1])) * grad_activation
            hT = np.transpose(values[layer],axes=[0,2,1])
            grad_Omega[layer] = np.matmul(grad_beta[layer],hT)
        
        for layer in range(self.K):
            grad_beta[layer] = sum(grad_beta[layer])/x.shape[0]
            grad_Omega[layer] = sum(grad_Omega[layer])/x.shape[0]

        return grad_beta,grad_Omega
    
    def update_weights(self,grad_beta,grad_Omega):
        self.t += 1
        for layer in range(self.K):
            self.m[layer][0] *= self.m_decay
            self.m[layer][0] += grad_beta[layer] * (1-self.m_decay)
            self.m[layer][1] *= self.m_decay
            self.m[layer][1] += grad_Omega[layer] * (1-self.m_decay)

            self.v[layer][0] *= self.v_decay
            self.v[layer][0] += grad_beta[layer]**2 * (1-self.v_decay)
            self.v[layer][1] *= self.v_decay
            self.v[layer][1] += grad_Omega[layer]**2 * (1-self.v_decay)

            factor = (1-self.m_decay**self.t)/(1-self.v_decay**self.t)

            delta_beta = -1 * self.alpha * self.m[layer][0] / (np.sqrt(self.v[layer][0]) + 1e-12)
            delta_beta /= factor
            self.beta[layer] += delta_beta 

            delta_Omega = -1 * self.alpha * self.m[layer][1] / (np.sqrt(self.v[layer][1]) + 1e-12)
            delta_Omega /= factor
            self.Omega[layer] += delta_Omega 
    
    def run(self,x):
        return self.run_all(x)[-1]
    
    def train_epoch(self,data,batch_num):
        batch_indices = np.linspace(0,len(data[0]),batch_num+1).astype(int)
        order = np.random.permutation(len(data[0]))
        for num in range(batch_num):
            xs = data[0][order[batch_indices[num]:batch_indices[num+1]]]
            ys = data[1][order[batch_indices[num]:batch_indices[num+1]]]

            grad_beta,grad_Omega = self.compute_gradient(xs,ys)

            self.update_weights(grad_beta,grad_Omega)
    
    def train(self,data,epochs=100,batch_num=1):
        for epoch in range(epochs):
            self.train_epoch(data,batch_num)
            print(f"Loss: {self.loss(data)}")
        print(f"Final loss: {self.loss(data)}")

    def ReLU(self,arr):
        pos = arr.clip(min=0.0)
        neg = arr.clip(max=0.0)*0.1
        return pos+neg
    
    def loss(self,data):
        y_pred = self.run(data[0])
        return np.sum(data[1]-y_pred)**2

In [3]:
myNN = DNN(sizes=[784,300,20,10])
xs = np.random.normal(size=(10,myNN.Di,1))
ys = np.random.normal(size=(10,myNN.Do,1))
data = [xs,ys]
myNN.train(data)

Loss: 6.632690853372921
Loss: 2.1432222426634726
Loss: 1.7866356015478255
Loss: 2.935068051267336
Loss: 8.564830774301646
Loss: 1.4193572727196666
Loss: 4.5055553263188886
Loss: 0.6435312056377346
Loss: 17.03876516189159
Loss: 4.186206081300191
Loss: 3.076721184490269
Loss: 5.91293590808331
Loss: 0.6792814577410462
Loss: 7.87645638153392
Loss: 1.911209945936933
Loss: 3.5853523372224947
Loss: 3.3819698043496205
Loss: 0.5516474369752723
Loss: 1.5334551543365311
Loss: 0.25612938318770995
Loss: 3.2871034320467443
Loss: 0.5811118898445617
Loss: 0.8853890295585304
Loss: 2.0085461402828937
Loss: 0.22562381519607708
Loss: 0.33276970896137414
Loss: 0.2254448076083678
Loss: 0.19522109397052237
Loss: 0.7572368130967829
Loss: 0.07086497696861475
Loss: 0.8004882007393531
Loss: 0.9945011009247355
Loss: 0.03373105018445637
Loss: 0.8354500612164436
Loss: 0.19560653798735106
Loss: 0.12523205946142524
Loss: 0.03581257295804273
Loss: 0.34863796988207224
Loss: 0.20009317953323702
Loss: 0.3895953081163221
