In [1]:
def cross_entropy_error(y, t):
    
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]

    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

In [2]:
def softmax(x):
    
    if x.ndim == 2:
        
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)

    return y.T 

    x = x - np.max(x) 

    return np.exp(x) / np.sum(np.exp(x))

In [3]:
class Relu:
    
    def _init__(self):
        self.mask = None

    def forward(self,x):
        
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

  
    def backward(self,dout):
        
        dout[self.mask] = 0
        dx = dout
        return dx

In [4]:
class sigmoid:
    
    def __init__(self):
        self.out = None

    def forward(self, x):

        out = 1 / (1 + np.exp(-x))
        self.out = out

        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out

        return dx

In [5]:
class softmax_with_loss:
    
    def __init__(self):
        
        self.loss = None
        self.y = None
        self.t = None
        
    def forward(self, x, t):

        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

  

    def backward(self, dout=1):
        batch_size = self.t.shape[0]

        if self.t.size == self.y.size:
            dx = (self.y - self.t) / batch_size

        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx

In [6]:
class Affine:
    
    def __init__(self, W, b):

        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):

        self.x = x
        out = np.dot(self.x, self.W) + self.b

        return out

  

    def backward(self, dout):

        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis = 0)
        
        return dx

In [7]:
class LayerNet:
    
    def __init__(self, input_size, hidden1, hidden2, output_size, weight_init_std = 0.01):
        
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden1)
        self.params['b1'] = np.zeros(hidden1)

        self.params['W2'] = weight_init_std * np.random.randn(hidden1, hidden2) 
        self.params['b2'] = np.zeros(hidden2)

        self.params['W3'] = weight_init_std * np.random.randn(hidden2, output_size) 
        self.params['b3'] = np.zeros(output_size)

        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()

        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.layers['Relu2'] = Relu()
        
        self.layers['Affine3'] = Affine(self.params['W3'], self.params['b3'])

        self.lastLayer = softmax_with_loss()

 
    def predict(self, x):

        for layer in self.layers.values():
            x = layer.forward(x)
            
        return x

    

    def loss(self, x, t):
        
        y = self.predict(x)

        return self.lastLayer.forward(y, t)

    

    def gradient(self, x, t):
        
        self.loss(x, t)

        dout = 1
        dout = self.lastLayer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()

        for layer in layers:
            dout = layer.backward(dout)

        grads = {}

        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db

        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db

        grads['W3'], grads['b3'] = self.layers['Affine3'].dW, self.layers['Affine3'].db

        return grads

In [12]:
import sys, os
sys.path.append(os.pardir)

import numpy as np
from collections import OrderedDict
from dataset.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = LayerNet(input_size=784, hidden1 = 100, hidden2 = 50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    grad = network.gradient(x_batch, t_batch) 
    
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        print(loss)

2.3026718352422124
2.298291799143924
2.2095351388623676
1.744599412775764
1.138828154392475
0.8651324667657397
0.716157126474082


KeyboardInterrupt: 