In [1]:
import numpy as np
import matplotlib.pyplot as plt
from random import randint

In [2]:
class DNN:
    def __init__(self,sizes,beta=0.9,gamma=0.99,alpha=0.08):
        self.Di = sizes[0]
        self.Do = sizes[-1]
        self.Dh = sizes[1:-1]
        self.K = len(sizes)-1
        self.beta = []
        self.Omega = []
        self.m = []
        self.v = []
        self.m_decay = beta
        self.v_decay = gamma
        self.alpha = alpha
        self.t = 0
        for index in range(self.K):
            sigma = np.sqrt(4/(sizes[index]+sizes[index+1]))
            self.beta.append(np.random.normal(size=(sizes[index+1],1)) * sigma)
            self.Omega.append(np.random.normal(size=(sizes[index+1],sizes[index])) * sigma)
            self.m.append([])
            self.v.append([])

            self.m[index].append(np.zeros((sizes[index+1],1)))
            self.v[index].append(np.zeros((sizes[index+1],1)))
            self.m[index].append(np.zeros((sizes[index+1],sizes[index])))
            self.v[index].append(np.zeros((sizes[index+1],sizes[index])))
    
    def run_all(self,x):
        values = [x]
        for layer in range(self.K):
            pre_act = np.matmul(self.Omega[layer],values[-1]) + self.beta[layer]
            values.append(self.ReLU(pre_act))
        return values
    
    def Ps(self,values):
        exp_outs = np.exp(values)
        exp_sums = np.sum(exp_outs,axis=1)
        exp_sums=exp_sums.reshape(exp_sums.shape+(1,))
        Ps = exp_outs / exp_sums
        return Ps
    
    def compute_gradient(self,x,y):
        values = self.run_all(x)
        grad_beta = [None] * self.K
        grad_Omega = [None] * self.K
        grad_beta[-1] = self.Ps(values[-1]) - y

        for layer in range(self.K-1,-1,-1):
            if layer != self.K-1:
                grad_activation = values[layer+1].astype(bool)*1.1 - 0.1
                OmegaT = np.transpose(self.Omega[layer+1])
                grad_beta[layer] = (np.matmul(OmegaT,grad_beta[layer+1])) * grad_activation
            hT = np.transpose(values[layer],axes=[0,2,1])
            grad_Omega[layer] = np.matmul(grad_beta[layer],hT)
        
        for layer in range(self.K):
            grad_beta[layer] = sum(grad_beta[layer])/x.shape[0]
            grad_Omega[layer] = sum(grad_Omega[layer])/x.shape[0]

        return grad_beta,grad_Omega
    
    def update_weights(self,grad_beta,grad_Omega):
        self.t += 1
        for layer in range(self.K):
            self.m[layer][0] *= self.m_decay
            self.m[layer][0] += grad_beta[layer] * (1-self.m_decay)
            self.m[layer][1] *= self.m_decay
            self.m[layer][1] += grad_Omega[layer] * (1-self.m_decay)

            self.v[layer][0] *= self.v_decay
            self.v[layer][0] += grad_beta[layer]**2 * (1-self.v_decay)
            self.v[layer][1] *= self.v_decay
            self.v[layer][1] += grad_Omega[layer]**2 * (1-self.v_decay)

            factor = (1-self.m_decay**self.t)/(1-self.v_decay**self.t)

            delta_beta = -1 * self.alpha * self.m[layer][0] / (np.sqrt(self.v[layer][0]) + 1e-12)
            delta_beta /= factor
            self.beta[layer] += delta_beta 

            delta_Omega = -1 * self.alpha * self.m[layer][1] / (np.sqrt(self.v[layer][1]) + 1e-12)
            delta_Omega /= factor
            self.Omega[layer] += delta_Omega 
    
    def run(self,x):
        return self.run_all(x)[-1]
    
    def train_epoch(self,data,batch_size):
        batch_num = int(data[0].shape[0]/batch_size)
        batch_indices = np.linspace(0,len(data[0]),batch_num+1).astype(int)
        order = np.random.permutation(len(data[0]))
        for num in range(batch_num):
            xs = data[0][order[batch_indices[num]:batch_indices[num+1]]]
            ys = data[1][order[batch_indices[num]:batch_indices[num+1]]]

            grad_beta,grad_Omega = self.compute_gradient(xs,ys)

            self.update_weights(grad_beta,grad_Omega)
            print(f"Batch Loss: {self.loss([xs,ys])}")
    
    def train(self,data,epochs=100,batch_size=1000):
        ys = np.zeros((data[0].shape[0],self.Do,1))
        for ind, ans in enumerate(data[1]):
            ys[ind][int(ans[0])] += 1
        real_data = [data[0],ys]
        for epoch in range(epochs):
            self.train_epoch(real_data,batch_size)
            print(f"Epoch Loss: {self.loss(real_data)}")
        print(f"Final loss: {self.loss(real_data)}")

    def ReLU(self,arr):
        pos = arr.clip(min=0.0)
        neg = arr.clip(max=0.0)*0.1
        return pos+neg
    
    def loss(self,data):
        values = self.run(data[0])
        derivs = self.Ps(values)
        derivs *= data[1]
        Ps = np.sum(derivs,axis=1)
        return -1 * np.sum(np.log(Ps))
    
    def solve(self,x):
        values = self.run(x)
        derivs = self.Ps(values)
        locs = np.argmax(derivs,axis=1)
        return locs
    
    def rate(self,data):
        diffs = (self.solve(data[0])-data[1]).astype(bool)
        incorrect = int(np.sum(diffs))
        return 1-incorrect/data[0].shape[0]

In [3]:
myNN = DNN(sizes=[10,100,70,40,10])
size = 10
xs = np.random.normal(size=(size,myNN.Di,1))
ys = np.zeros((size,myNN.Do,1))
for ind in range(size):
    ys[ind][randint(0,size-1)] += 1
data = [xs,ys]

In [4]:
myNN.train(data)

Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Loss: 30.682426732083414
Epoch Lo

In [5]:
np.load("MNIST/train_images.npy").shape

(60000, 784)

In [6]:
np.load("MNIST/train_labels.npy").shape

(60000, 1)

In [7]:
xs = np.load("MNIST/train_images.npy")
xs = xs.reshape(xs.shape+(1,))/256
ys = np.load("MNIST/train_labels.npy")
cutoff = int(0.9 * xs.shape[0])
train_xs = xs[:cutoff]
test_xs = xs[cutoff:]
train_ys = ys[:cutoff]
test_ys = ys[cutoff:]

In [8]:
myNN = DNN(sizes=[784,300,100,30,10],alpha=0.05)

In [9]:
myNN.train([train_xs,train_ys],epochs=1,batch_size=1000)
myNN.rate([test_xs,test_ys])

Batch Loss: 2441.997399884678
Batch Loss: 2163.1896181961065
Batch Loss: 2015.7149861333098
Batch Loss: 1696.8917697224374
Batch Loss: 1341.0624366026095
Batch Loss: 1136.5765817953115
Batch Loss: 1043.7375931370198
Batch Loss: 1070.8948017907728
Batch Loss: 840.6059058681442
Batch Loss: 712.4969300067733
Batch Loss: 845.3011360693779
Batch Loss: 801.9219619821274
Batch Loss: 759.9263026102903
Batch Loss: 700.1423541311367
Batch Loss: 722.6635718027953
Batch Loss: 718.6291318426927
Batch Loss: 730.6803233464307
Batch Loss: 530.5902894270587
Batch Loss: 640.3772276463962
Batch Loss: 567.030863499988
Batch Loss: 618.7203340193626
Batch Loss: 667.1295003717858
Batch Loss: 536.7115780880262
Batch Loss: 475.4411847505417
Batch Loss: 495.06167873563015
Batch Loss: 599.6903214963697
Batch Loss: 455.2520602609834
Batch Loss: 586.8535405314512
Batch Loss: 526.9107600972164
Batch Loss: 450.13181049080083
Batch Loss: 450.23083506486444
Batch Loss: 566.9771588354249
Batch Loss: 451.87477687679535


0.9241666666666667

In [10]:
myNN.train([train_xs,train_ys],epochs=1,batch_size=1000)
myNN.rate([test_xs,test_ys])

Batch Loss: 296.55815958815646
Batch Loss: 272.8665822111809
Batch Loss: 270.9523535924417
Batch Loss: 346.89144819184406
Batch Loss: 369.59329991936346
Batch Loss: 322.9594643845679
Batch Loss: 364.0716983077774
Batch Loss: 274.85030755839506
Batch Loss: 356.9355273650043
Batch Loss: 295.6208558662754
Batch Loss: 275.16339093579677
Batch Loss: 278.64109426166317
Batch Loss: 368.7000353118458
Batch Loss: 372.55725156759814
Batch Loss: 359.4952763750249
Batch Loss: 418.1689448007305
Batch Loss: 455.2445959351807
Batch Loss: 398.48329200063324
Batch Loss: 330.3535155610849
Batch Loss: 397.5001990921752
Batch Loss: 369.34644571490935
Batch Loss: 289.9711383660492
Batch Loss: 402.4823289517569
Batch Loss: 345.3819618168701
Batch Loss: 355.90228595569135
Batch Loss: 324.1352082550153
Batch Loss: 385.654515723749
Batch Loss: 309.39385638115914
Batch Loss: 285.93566820588785
Batch Loss: 331.1602944314314
Batch Loss: 250.11193479456938
Batch Loss: 322.5295852432842
Batch Loss: 264.535749910862

0.925

In [11]:
myNN.train([train_xs,train_ys],epochs=1,batch_size=3000)
myNN.rate([test_xs,test_ys])

Batch Loss: 892.6762079839318
Batch Loss: 924.962325140333
Batch Loss: 819.3594318079709
Batch Loss: 891.2771453552114
Batch Loss: 1005.2267528532267
Batch Loss: 874.100208948423
Batch Loss: 870.300870657104
Batch Loss: 793.4438916272078
Batch Loss: 938.8960349408045
Batch Loss: 839.1918762626967
Batch Loss: 830.2338820028468
Batch Loss: 939.825723281219
Batch Loss: 930.2313411493872
Batch Loss: 752.7885522767273
Batch Loss: 888.2766201271098
Batch Loss: 832.899818483216
Batch Loss: 741.2845273166188
Batch Loss: 838.15847371281
Epoch Loss: 14874.36923977339
Final loss: 14874.36923977339


0.9355

In [12]:
myNN.train([train_xs,train_ys],epochs=5,batch_size=3000)
myNN.rate([test_xs,test_ys])

Batch Loss: 729.7229384450721
Batch Loss: 797.253528540409
Batch Loss: 718.0089705489268
Batch Loss: 771.7028263111329
Batch Loss: 683.4828912547499
Batch Loss: 778.2078762825831
Batch Loss: 766.0696978923578
Batch Loss: 803.4939906366685
Batch Loss: 827.1552022077354
Batch Loss: 660.5489696931106
Batch Loss: 854.048846787631
Batch Loss: 657.1508016504215
Batch Loss: 850.3908203793004
Batch Loss: 861.664724604472
Batch Loss: 773.9153335126291
Batch Loss: 765.8513938189956
Batch Loss: 886.9095631897976
Batch Loss: 707.0959811895598
Epoch Loss: 12290.473802190538
Batch Loss: 627.0486513195572
Batch Loss: 701.0821367869951
Batch Loss: 761.4707311820923
Batch Loss: 604.2704342716214
Batch Loss: 713.2553650168381
Batch Loss: 702.0061221025148
Batch Loss: 600.5082275995246
Batch Loss: 680.9170613739848
Batch Loss: 648.9715711776535
Batch Loss: 665.4081316381216
Batch Loss: 631.3112648688672
Batch Loss: 611.0372495976967
Batch Loss: 666.63614470435
Batch Loss: 639.2362969978609
Batch Loss: 63

0.9145

In [13]:
myNN.rate([train_xs,train_ys])

0.9066481481481481