In [35]:
def cross_entropy_error(y, t):
    delta = 1e-7
    return -np.sum(t * np.log(y + delta))

def softmax(a):
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    return exp_a / sum_exp_a

## 乘法层

In [36]:
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = x #保存输入的x, y
        self.y = y                
        out = x * y

        return out

    def backward(self, dout):
        dx = dout * self.y #翻转x, y
        dy = dout * self.x

        return dx, dy

### forward

In [37]:
apple_num = 2
apple = 100
tax = 1.1

mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)
price

220.00000000000003

### backward

In [38]:
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print('dapple_price:', dapple_price, ',dtax:', dtax)
print('dapple:', dapple, ', dapple_num:', dapple_num)

dapple_price: 1.1 ,dtax: 200
dapple: 2.2 , dapple_num: 110.00000000000001


## 加法层

In [39]:
class AddLayer:
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy

### 两个苹果 + 三个橘子
### 苹果价格: 100, 橘子价格: 150

In [40]:
apple_num = 2
apple = 100

orange_num = 3
orange = 150

tax = 1.1

apple_mul_layer = MulLayer()
orange_mul_layer = MulLayer()
fruit_add_layer = AddLayer()
tax_layer = MulLayer()

# forward
apple_price = apple_mul_layer.forward(apple, apple_num)
orange_price = orange_mul_layer.forward(orange, orange_num)
fruit_price = fruit_add_layer.forward(apple_price, orange_price)
price = tax_layer.forward(fruit_price, tax)
print(price)

715.0000000000001


In [41]:
# bardward
dprice = 1
dfruit_price, dtax = tax_layer.backward(dprice)
dapple_price, dorange_price = fruit_add_layer.backward(dfruit_price)
dapple, dapple_num = apple_mul_layer.backward(dapple_price)
dorange, dorange_num = orange_mul_layer.backward(dfruit_price)
print(dorange, dorange_num)

3.3000000000000003 165.0


## ReLU 层(Rectified Linear Unit)
y = x (x >= 0)

y = 0 (x < 0)

In [42]:
import numpy as np

class Relu:
    def __init__(self):
        self.mask = None
    
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

relu = Relu()
x = np.array([-1, 1, 2, -2])
out = relu.forward(x)
print(out)

dout = np.array([0, 1, 2, 3])
dx = relu.backward(dout)
print(dx)

[0 1 2 0]
[0 1 2 0]


## Sigmoid层

In [43]:
class Sigmoid:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out
    
    def backward(self, dout):
        dx = dout * [1 - self.out] * self.out
        return dx

s = Sigmoid()
out = s.forward(np.array([-1, 1, 2]))
print(out)

dx = s.backward(1)
print(dx)



[0.26894142 0.73105858 0.88079708]
[[0.19661193 0.19661193 0.10499359]]


## Affine层

In [68]:
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        print(W.shape, b.shape)
    
    def forward(self, x):
        self.x = x
        
        
        out = np.dot(x, W) + b
        return out
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        dW = np.dot(self.x.T, dout)
        db = np.sum(dout, axis=0)
        return dx

W = np.array([[1, 2, 3],
            [4, 5, 6]])  # 2 x 3
b = np.array([0.3, 0.6, 0.8])  # 1 x 3

x = np.array([[3, 4],
             [5, 6]]
            ) # 1 x 2
affine = Affine(W, b)
out = affine.forward(x)
print('out=', out)

dout = np.array([[1, 1, 1], [2,2,2]]) # 1 x 3

affine.backward(dout)


(2, 3) (3,)
out= [[19.3 26.6 33.8]
 [29.3 40.6 51.8]]


array([[ 6, 15],
       [12, 30]])

## softmax-with-loss

In [69]:
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
    
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

x = np.array([0.2, 0.5, 0.3])
t = np.array([0, 1, 0])

sl = SoftmaxWithLoss()
loss = sl.forward(x, t)
print(loss)
print(sl.y)

dloss = sl.backward()
print(dloss)

0.9398308048895956
[0.28943311 0.39069383 0.31987306]
[ 0.0964777  -0.20310206  0.10662435]


# 反向传播算法实现

In [101]:
# coding: utf-8
import sys, os
import numpy as np
sys.path.append('SourceCode')
from collections import OrderedDict


class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        print(W.shape, b.shape)
    
    def forward(self, x):
        self.x = x
               
        out = np.dot(self.x, self.W) + self.b
        return out
    
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

    
def _numerical_gradient_no_batch(f, x):
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x)
    for idx in range(x.size):
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x) # f(x+h)        
        x[idx] = tmp_val - h 
        fxh2 = f(x) # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        
        x[idx] = tmp_val # 还原值
        
    return grad


def numerical_gradient(f, X):
    if X.ndim == 1:
        return _numerical_gradient_no_batch(f, X)
    else:
        grad = np.zeros_like(X)
        
        for idx, x in enumerate(X):
            grad[idx] = _numerical_gradient_no_batch(f, x)
        
        return grad

class TwoLayerNet:

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        # 初始化权重
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size) 
        self.params['b2'] = np.zeros(output_size)

        # 生成层
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        
        return x
        
    # x:输入数据, t:监督数据
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)
        
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
        
    # x:输入数据, t:监督数据
    def numerical_gradient(self, x, t):        
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads
        
    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # 设定
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db

        return grads

In [105]:
# 梯度确认
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size = 784, hidden_size = 50, output_size=10)

x_batch = x_train[:3]
t_batch = t_train[:3]
print('x_batch.shape:', x_batch.shape)
print('t_batch.shape:', t_batch.shape)
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)
for key in grad_numerical:
    diff = np.average(np.abs(grad_numerical[key] - grad_backprop[key]))
    print('key %s diff %f' % (key, diff))


(784, 50) (50,)
(50, 10) (10,)
x_batch.shape: (3, 784)
t_batch.shape: (3, 10)
key W1 diff 0.000896
key b1 diff 0.005414
key W2 diff 0.012180
key b2 diff 0.306399


In [106]:
# 读入数据
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # 梯度
    #grad = network.numerical_gradient(x_batch, t_batch)
    grad = network.gradient(x_batch, t_batch)
    
    # 更新
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)


(784, 50) (50,)
(50, 10) (10,)
0.10003333333333334 0.098


  
  
  


0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
0.09871666666666666 0.098
