In [1]:
import os
import sys
import random
import numpy as np


Load MNIST data

In [2]:
import pickle 
import gzip
f = gzip.open('mnist.pkl.gz', 'rb')
training, validation, test = pickle.load(f, encoding='latin1')
f.close()

In [3]:
images = training[0][:]
labels = training[1][:]

In [4]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def dsigmoid(x):
    return x * (1.0 - x)
def softplus(x):
    return np.log(1.0 + np.exp(x))
def dsoftplus(x):
    return 1.0 - np.exp(-x)
def softmax(x):
    ex = np.exp(x)
    return ex / np.sum(ex)
def dsoftmax(x):
    return x*(1.0-x)

In [5]:
nneurons = 500
noutput  = 10
nintlayers = 3
nlayers      = nintlayers + 2
initialscale = 0.01
learningrate = 0.01
dropout      = 0.5
momentum     = 0.75
nbatch       = 100
nimages      = len(images)
nupdates     = round(nimages / nbatch) * 100

class Layer():
    def __init__(self, nneurons, afunc, pn = -1):
        self.nneurons=nneurons
        self.afunc = afunc
        if afunc == sigmoid:
            self.dfunc = dsigmoid
        if afunc == softplus:
            self.dfunc = dsoftplus
        if afunc == softmax:
            self.dfunc = dsoftmax
        self.b  = np.random.normal(size=nneurons) * initialscale
        self.db = np.zeros(nneurons)
        if pn > 0:
            self.w  = np.random.normal(size=[pn, nneurons]) * initialscale
            self.dw = np.zeros([pn, nneurons])
        else:
            self.w  = None
            self.dw = None

In [6]:
def remove(n):
    return np.where(np.random.random(n) > dropout, 1, 0)

In [7]:
nn= [Layer(len(images[1]), sigmoid)]
for i in range(1, nintlayers+1):
    nn.append(Layer(nneurons, softplus, pn = nn[i-1].nneurons))
    print(nneurons, nn[i-1].nneurons)
nn.append(Layer(noutput, softmax, pn = nn[nlayers-2].nneurons))

500 784
500 500
500 500


In [8]:
nupdates = 1
for k in range(nupdates):
    
    r = [1.0] + [ remove(nneurons) for l in nn[1:-1] ] + [1.0]

    batchindexes = np.random.randint(nimages, size=nbatch)
    x = images[batchindexes]     # 6.0 and 3.0 are empirical and could be added to w and b
    expected = np.zeros([nbatch, noutput])
    for i, j in enumerate(batchindexes):
        expected[i, labels[j]] = 1.0
        
    
    # "propagate" y = f(w*x + b)
    #
    y = [nn[0].afunc(x + nn[0].b)]
    print(np.argmax(y[0],axis=1))
    #print(y[0])
    #print(x + nn[0].b)
    for i in range(1, nlayers):
        #y.append(  (np.matmul(y[i-1],nn[i].w) + nn[i].b ) * r[i] )
        #y.append( nn[i].afunc( np.matmul(y[i-1],nn[i].w) + nn[i].b ) * r[i] )
        y.append( nn[i].afunc( np.matmul(y[i-1],nn[i].w) + nn[i].b ) * r[i] ) 
        #print(np.argmax(y[i],axis=1))
        #print(nn[i].w)
        #print(y[i-1])
        print(np.argmax(y[i],axis = 1))
    # estimate errors (e) backprogapagation to evaluate derivatives with respect to results
    #
    e = [expected - y[nintlayers +1]]
    for i in range(nintlayers, -1, -1):
        e = [np.matmul(e[0], nn[i+1].w.transpose()) * nn[i].dfunc(y[i]) * r[i]] + e
    for i in range(nlayers):
            nn[i].db += np.sum(e[i])
            nn[i].b  += learningrate * nn[i].db
            nn[i].db *= momentum
            if type(nn[i].w) is np.ndarray:
                nn[i].dw += np.sum(np.matmul(y[i-1].transpose(), e[i]))
                nn[i].w  += nn[i].dw * learningrate
                nn[i].dw *= momentum

    ncorrect = np.sum(np.argmax(y[4],axis=1) == labels[batchindexes]) 
    learningrate *= (nupdates - k) / (nupdates - k + 1)   

    if k != 0 and k%100 == 0:
        print('-------------')
        print('Itereation ' + str(k))
        print(str(100*ncorrect/nbatch)+ '% correct')
        print(np.argmax(y[4],axis=1))
    
    
    

[573 439 497 408 240 626 263 458 439 261 406 408 573 573 406 292 263 517
 439 517 547 688 437 439 440 263 573 517 573 573 573 517 263 573 406 263
 573 517 565 573 573 573 240 265 408 293 497 458 517 263 263 573 458 573
 536 632 517 497 437 497 497 439 292 439 573 263 573 573 540 573 573 573
 573 573 654 122 497 239  93 688 263 292 263 292 517 458 159 688 497 573
 292 632 540 573 497 431 240 159 497 242]
[321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321
 321 188 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321
 321 321 321 321 321 321 321 321 321 321 321 321 321 321 188 321 321 321
 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321
 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321 321
 321 321 321 321 321 321 321 321 321 321]
[241 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241
 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241 241
 241 241 241 241 241 241 241 241 241 241

In [7]:
learningrate = 0.001
dropout      = 0.5
momentum     = 0.75
nbatch       = 100
nimages      = len(images)
nupdates     = round(nimages / nbatch) * 100

def remove(n):
    return np.where(np.random.random(n) > dropout, 1, 0)

In [52]:
nupdates = 1
for i in range(nupdates):
    
    r = [ remove(l.nneurons) for l in nnetwork[1:-1] ] 

    batchindexes = np.random.randint(nimages, size=nbatch)
    x = 6.0 * images[batchindexes]+ 3
    expected = np.zeros([nbatch, noutput])
    for i, j in enumerate(batchindexes):
        expected[i, labels[j]] = 1

        
    y = []
    y.append(nnetwork[0].afunc(x+ nnetwork[0].b))
    for i in range(1, nintlayers+2):
        y.append(nnetwork[i].afunc( np.matmul(y[i-1], weigth[i-1]) + nnetwork[i].b ) )

    print([t.size for t in y])
    e = [[] for i in range(nintlayers + 2)]
    e[nintlayers + 1] = expected - y[nintlayers +1]
    for i in range(nintlayers, 0, -1):
        print(i)
        e[i] = np.dot( nnetwork[i].dfunc(y[i]).transpose(),
            np.matmul(e[i+1], weigth[i].transpose()) )     
       # e[i] = e[i].transpose()
    
    

    

[78400, 50000, 50000, 50000, 1000]
3
2


ValueError: shapes (500,100) and (500,500) not aligned: 100 (dim 1) != 500 (dim 0)

In [None]:
    #e = [[] for i in range(nintlayers + 2)]
    #e[nintlayers + 1] = expected - y[nintlayers +1]
    #for i in range(nintlayers, -1, -1):
    #    e[i] = np.matmul(e[i+1], nn[i+1].w.transpose()) * nn[i].dfunc(y[i]) * r[i]
