In [2]:
import matplotlib.pylab as plt
import numpy as np
import numpy.typing as npt

# 5.4 単純なレイヤの実装

## 5.4.1 乗算レイヤの実装


In [1]:
class MulLayer:
    """Multiplication node of a computational graph.

    The forward pass remembers both operands because the backward pass
    needs them: the derivative with respect to one input is the upstream
    derivative multiplied by the *other* input.
    """

    def __init__(self):
        # Operands of the most recent forward call (None until forward runs).
        self.x = None
        self.y = None

    def forward(self, x, y):
        """Store ``x`` and ``y`` on the layer and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` given ``dout``, the derivative flowing in
        from the upstream (output) side."""
        return dout * self.y, dout * self.x

p138 の図 5-16 りんご 2 個の買い物


In [2]:
# Shopping example: two apples at 100 each, then a 1.1 tax multiplier.
apple, apple_num, tax = 100, 2, 1.1

# One multiplication layer per product node of the graph.
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# Forward pass: (apple * apple_num) * tax.
apple_price = mul_apple_layer.forward(x=apple, y=apple_num)
price = mul_tax_layer.forward(x=apple_price, y=tax)

print(price)

220.00000000000003


各変数に関する微分


In [3]:
# Backward pass: start from d(price)/d(price) = 1 and visit the tax layer
# first, then the apple layer.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dout=dprice)
dapple, dapple_num = mul_apple_layer.backward(dout=dapple_price)

print(dapple, dapple_num, dtax)

2.2 110.00000000000001 200


## 5.4.2 加算レイヤの実装


In [4]:
class AddLayer:
    """Addition node of a computational graph.

    Nothing is cached: the backward pass does not use the forward inputs,
    it only passes the upstream derivative on to both operands.
    """

    def __init__(self):
        # Stateless layer; there is nothing to initialise.
        pass

    def forward(self, x, y):
        """Return ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``, each equal to ``dout * 1``."""
        dx, dy = dout * 1, dout * 1
        return dx, dy

p140 りんご 2 個とみかん 3 個の買い物


In [6]:
# Shopping example: 2 apples at 100 and 3 oranges at 150, then a 1.1 tax
# multiplier on the combined price.
apple, apple_num = 100, 2
orange, orange_num = 150, 3
tax = 1.1

# Layers: one node per operation in the graph.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass.
apple_price = mul_apple_layer.forward(x=apple, y=apple_num)
orange_price = mul_orange_layer.forward(x=orange, y=orange_num)
all_price = add_apple_orange_layer.forward(x=apple_price, y=orange_price)
price = mul_tax_layer.forward(x=all_price, y=tax)

# Backward pass: start from d(price)/d(price) = 1 and visit the layers in
# the opposite order to the forward pass.
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dout=dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dout=dall_price)
dorange, dorange_num = mul_orange_layer.backward(dout=dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dout=dapple_price)

print(price)
print(dapple_num, dapple, dorange, dorange_num, dtax)

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


# 各レイヤの実装


In [13]:
from common.functions import sigmoid


class Relu:
    """ReLU activation layer.

    ``forward`` zeroes every element of the input that is ``<= 0`` and
    remembers where those elements were; ``backward`` zeroes the upstream
    gradient at the same positions.
    """

    def __init__(self):
        # Boolean mask of the positions where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of array ``x`` with non-positive entries set to 0.

        ``x`` itself is left untouched.
        """
        self.mask = x <= 0
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with entries zeroed where the forward input was <= 0.

        The masking is applied to a copy, so the caller's ``dout`` array is
        not modified (mirroring how ``forward`` treats ``x``).
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx


class Sigmoid:
    """Sigmoid activation layer.

    The forward output is kept on the layer because the backward pass is
    computed from that output alone.
    """

    def __init__(self):
        # Result of the most recent forward call (None until forward runs).
        self.out = None

    def forward(self, x):
        """Apply ``sigmoid`` to ``x``, cache the result and return it."""
        self.out = sigmoid(x)
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - out) * out`` using the cached forward output."""
        return dout * (1.0 - self.out) * self.out


class Affine:
    """Fully connected layer computing ``x . W + b``.

    Inputs with more than two dimensions are flattened to
    ``(x.shape[0], -1)`` for the matrix product, and the gradient returned
    by ``backward`` is reshaped back to the original input shape.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b

        # Flattened input and its original shape, recorded by forward.
        self.x = None
        self.original_x_shape = None
        # Gradients of the weight and bias parameters, filled in by backward.
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache the (flattened) input and return ``x . W + b``."""
        # Tensor support: remember the shape, then collapse all trailing axes.
        self.original_x_shape = x.shape
        self.x = x.reshape(x.shape[0], -1)

        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` on the layer and return the input gradient."""
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        grad_input = np.dot(dout, self.W.T)
        # Restore the shape the input had before it was flattened.
        return grad_input.reshape(*self.original_x_shape)

array([1, 2, 3, 4, 5, 6, 7])

In [1]:
from common.functions import cross_entropy_error, softmax


class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, treated as a single layer.

    ``forward`` stores the softmax output and the targets so that
    ``backward`` can compute the gradient from their difference.
    """

    def __init__(self):
        self.loss = None
        self.y = None  # output of softmax
        self.t = None  # target (teacher) data

    def forward(self, x, t):
        """Return ``cross_entropy_error(softmax(x), t)`` and cache ``y``/``t``."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient with respect to the forward input ``x``.

        dout: upstream derivative of the loss. It scales the returned
            gradient; with the default of 1 the result is the plain
            ``(y - t) / batch_size``.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # targets are one-hot vectors
            dx = (self.y - self.t) / batch_size
        else:
            # Targets are class indices: subtract 1 at each sample's label.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        # Apply the upstream derivative instead of silently discarding it.
        return dx * dout

# 誤差逆伝播法に対応した TwoLayerNet の実装


In [21]:
from collections import OrderedDict
import sys, os

sys.path.append(os.pardir)
from common.functions import *
from common.gradient import numerical_gradient


class TwoLayerNet:
    """Two-layer network: Affine1 -> Relu1 -> Affine2, with a
    SoftmaxWithLoss layer applied on top when computing the loss."""

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """
        input_size: int
        Number of neurons in the input layer.

        hidden_size: int
        Number of neurons in the hidden layer.

        output_size: int
        Number of neurons in the output layer.

        weight_init_std: scale applied to the ``np.random.randn`` samples
        used to initialise the weights; the biases start at zero.
        """
        # The dict literal is evaluated top to bottom, so W1 is sampled
        # before W2.
        self.params = {
            "W1": weight_init_std * np.random.randn(input_size, hidden_size),
            "b1": np.zeros(hidden_size),
            "W2": weight_init_std * np.random.randn(hidden_size, output_size),
            "b2": np.zeros(output_size),
        }

        # The layers are handed the very same arrays that live in params.
        self.layers = OrderedDict(
            [
                ("Affine1", Affine(self.params["W1"], self.params["b1"])),
                ("Relu1", Relu()),
                ("Affine2", Affine(self.params["W2"], self.params["b2"])),
            ]
        )
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """
        Run inference.

        x: image data; it is passed through every layer in insertion order
        (the final SoftmaxWithLoss layer is not applied).
        """
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """
        Compute the value of the loss function.

        x: image data

        t: correct labels (one-hot representation)
        """
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """
        Compute the recognition accuracy: the fraction of rows of ``x``
        whose predicted class equals the label.
        """
        # Along axis=1 each row holds the scores for one input; the index of
        # the largest score is the predicted class.
        predicted = np.argmax(self.predict(x), axis=1)
        # Labels that are not 1-D (e.g. one-hot) are reduced to class indices.
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)

        return np.sum(predicted == labels) / float(x.shape[0])

    # Gradient of the loss with respect to the parameters, by numerical
    # differentiation.
    def numerical_gradient(self, x, t):
        """
        Compute the gradient for each parameter with the module-level
        ``numerical_gradient`` helper.
        """

        def loss_W(W):
            # The argument is ignored; the loss is re-evaluated from the
            # network's current parameters.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ("W1", "b1", "W2", "b2")
        }

    # Fast counterpart of numerical_gradient.
    def gradient(self, x, t):
        """Compute the gradient for each parameter by backpropagation."""
        # Forward: the return value is not needed here, but the pass has to
        # run before the layers' backward methods are called.
        self.loss(x, t)

        # Backward: start at the loss layer, then walk the layers in
        # reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients the two Affine layers stored on themselves.
        affine1 = self.layers["Affine1"]
        affine2 = self.layers["Affine2"]
        return {
            "W1": affine1.dW,
            "b1": affine1.db,
            "W2": affine2.dW,
            "b2": affine2.db,
        }

# 誤差逆伝播法の勾配確認

**数値微分のメリット**

- 実装が簡単

一方、誤差逆伝播法は勾配を高速に求められるが、実装が複雑でバグが起こりやすい。

**→ 数値微分の実装を使って誤差逆伝播法の実装がうまく動いているかテストする**


In [27]:
# coding: utf-8
import sys, os

sys.path.append(os.pardir)  # 親ディレクトリのファイルをインポートするための設定
from dataset.mnist import load_mnist

# Load the data with one-hot labels.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three samples are used for the comparison.
x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Per parameter, print the mean absolute difference between the two
# gradients; values close to zero mean the two implementations agree.
for key in grad_numerical:
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:4.444582052481905e-10
b1:2.4134605931819136e-09
W2:7.732877201774421e-09
b2:1.403376352807073e-07


# 誤差逆伝播法を使った実装

これまでの実装と違う点は、数値微分ではなく誤差逆伝播法で勾配を求めることである。


In [31]:
# coding: utf-8
import sys, os

sys.path.append(os.pardir)


from dataset.mnist import load_mnist

# Load the data with one-hot labels.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Whole number of iterations per epoch. Floor division keeps this an int,
# so the `i % iter_per_epoch == 0` check below fires once per epoch even
# when train_size is not a multiple of batch_size; true division would give
# a fractional period that the integer counter `i` hits only irregularly.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of indices (sampling with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient via backpropagation; network.numerical_gradient(x_batch, t_batch)
    # is the numerical-differentiation alternative.
    grad = network.gradient(x_batch, t_batch)

    # Parameter update. The arrays are modified in place, so the layers that
    # were constructed from these same arrays see the new values.
    for key in ("W1", "b1", "W2", "b2"):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on the current mini-batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.11083333333333334 0.1111
0.9054333333333333 0.9094
0.9223666666666667 0.9246
0.93395 0.9343
0.9446666666666667 0.9435
0.9506666666666667 0.9494
0.9547166666666667 0.9524
0.95915 0.957
0.9614 0.958
0.9660666666666666 0.9621
0.9680166666666666 0.963
0.9708 0.9639
0.9721333333333333 0.9645
0.9740333333333333 0.9659
0.9748833333333333 0.9672
0.9760333333333333 0.9664
0.9774333333333334 0.9676
