In [131]:
import numpy as np
from keras import backend as K

from keras.datasets import mnist

def affine(z, W, b):
    return np.dot(z, W) + b


def relu(u):
    return np.log(1+np.exp(u))
#     return np.maximum(0,u)


def softmax(u):
    max_u = np.max(u, axis=1, keepdims=True)
    exp_u = np.exp(u-max_u)
    return exp_u/np.sum(exp_u, axis=1, keepdims=True)


def cross_entropy_error(y, t):
    return -np.sum(t * np.log(np.maximum(y, 1e-7)))/y.shape[0]


def softmax_cross_entropy_error_back(y,t):
    return (y-t)/y.shape[0]


def relu_back(dz, u):
    return dz*np.where(u > 0, 1,0)


def affine_back(du ,z ,W ,b):
    dz = np.dot(du, W.T)
    dW = np.dot(z.T, du)
    db = np.dot(np.ones(z.shape[0]).T, du)
    return dz, dW, db

def learn(x, t, W1, b1, W2, b2, W3, b3, lr):
    u1 = affine(x,W1,b1)
    z1 = relu(u1)
    u2 = affine(z1, W2, b2)
    z2 = relu(u2)
    u3 = affine(z2, W3, b3)
    y = softmax(u3)
    
    dy = softmax_cross_entropy_error_back(y,t)
    dz2, dW3, db3 = affine_back(dy, z2, W3 , b3)
    du2 = relu_back(dz2, u2)
    dz1, dW2, db2 = affine_back(du2, z1, W2, b2)
    du1 = relu_back(dz1, u1)
    dx, dW1, db1 = affine_back(du1, x, W1, b1)
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1
    W2 = W2 - lr * dW2
    b2 = b2 - lr * db2
    W3 = W3 - lr * dW3
    b3 = b3 - lr * db3
    return W1, b1, W2, b2, W3, b3

# momentum法で使う、重みの履歴を保存する部分
def momentum_decay(history, dW, lr, momentum=0.9):
    history = momentum * history - lr * dW
    return history

# momentum法の学習メソッド 
def Momentum_learn(x, t, W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3, lr, momentum=0.9):
    u1 = affine(x,W1,b1)
    z1 = relu(u1)
    u2 = affine(z1, W2, b2)
    z2 = relu(u2)
    u3 = affine(z2, W3, b3)
    y = softmax(u3)
    
    dy = softmax_cross_entropy_error_back(y,t)
    dz2, dW3, db3 = affine_back(dy, z2, W3 , b3)
    du2 = relu_back(dz2, u2)
    dz1, dW2, db2 = affine_back(du2, z1, W2, b2)
    du1 = relu_back(dz1, u1)
    dx, dW1, db1 = affine_back(du1, x, W1, b1)
    history_W1 = momentum_decay(history_W1, dW1, lr)
    history_W2 = momentum_decay(history_W2, dW2, lr)
    history_W3 = momentum_decay(history_W3, dW3, lr)
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1
    W2 = W2 - lr * dW2
    b2 = b2 - lr * db2
    W3 = W3 - lr * dW3
    b3 = b3 - lr * db3
    return W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3

# AdaGrad法で使う、重みの履歴を保存する部分
def weight_decay(history, w):
    history += np.square(w)
    return history

# AdaGrad法の学習メソッド 
def AdaGrad_learn(x, t, W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3, lr):
    u1 = affine(x,W1,b1)
    z1 = relu(u1)
    u2 = affine(z1, W2, b2)
    z2 = relu(u2)
    u3 = affine(z2, W3, b3)
    y = softmax(u3)

    dy = softmax_cross_entropy_error_back(y,t)
    dz2, dW3, db3 = affine_back(dy, z2, W3 , b3)
    du2 = relu_back(dz2, u2)
    dz1, dW2, db2 = affine_back(du2, z1, W2, b2)
    du1 = relu_back(dz1, u1)
    dx, dW1, db1 = affine_back(du1, x, W1, b1)
    history_W1 = weight_decay(history_W1, dW1)
    history_W2 = weight_decay(history_W2, dW2)
    history_W3 = weight_decay(history_W3, dW3)

    W1 = W1 - lr * dW1/ (np.sqrt(history_W1)+1e-7)
    b1 = b1 - lr * db1
    W2 = W2 - lr * dW2/ (np.sqrt(history_W2)+1e-7)
    b2 = b2 - lr * db2
    W3 = W3 - lr * dW3/ (np.sqrt(history_W3)+1e-7)
    b3 = b3 - lr * db3

    return W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3


def predict(x, W1, b1, W2, b2, W3, b3):
    u1 = affine(x, W1, b1)
    z1 = relu(u1)
    u2 = affine(z1, W2, b2)
    z2 = relu(u2)
    u3 = affine(z2, W3, b3)
    y = softmax(u3)
    return y



# 正解率
def accuracy_rate(y, t):
    max_y = np.argmax(y, axis=1)
    max_t = np.argmax(t, axis=1)
    return np.sum(max_y == max_t)/y.shape[0]


In [116]:
from keras.datasets import mnist
# keras.datasetsのmnistデータを使った学習の実装
# 書籍ではデータをダウンロードして説明していますが、こちらの方がダウンロードの手間がなくて簡単です

(X_train, b_train), (X_test, b_test) = mnist.load_data()

# 書籍のデータセットのnp.shapeが(60000, 576)であり、kerasのデータセットは(数万, 28,28)なので、整形する必要あ
x_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1]*X_train.shape[2]))
x_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1]*X_test.shape[2]))
t_train = np.zeros((b_train.shape[0], 10))
t_test = np.zeros((b_test.shape[0], 10))


for index in range(len(b_train)):
    t_train[index, int(b_train[index])] = 1
for index in range(len(b_test)):
    t_test[index, int(b_test[index])] = 1



## SGD的手法で学習を進める

In [104]:
nx_train = x_train/255
nx_test = x_test/255

d0 = nx_train.shape[1]
d1 = 100
d2 = 50
d3 = 10

np.random.seed(8)
W1 = np.random.rand(d0, d1) * 0.2 -0.1
W2 = np.random.rand(d1, d2) * 0.2 -0.1
W3 = np.random.rand(d2, d3) * 0.2 -0.1

b1 = np.zeros(d1)
b2 = np.zeros(d2)
b3 = np.zeros(d3)
lr = 0.5

batch_size = 100
epoch = 50
y_train = predict(nx_train, W1, b1, W2, b2, W3, b3)
y_test = predict(nx_test, W1, b1, W2, b2, W3, b3)


train_rate, train_err = accuracy_rate(y_train, t_train), cross_entropy_error(y_train, t_train)
test_rate, test_err = accuracy_rate(y_test, t_test), cross_entropy_error(y_test, t_test)
print("{0:3d} train_rate={1:6.2f}% test_rate={2:6.2f}% train_err={3:8.5f} test_err={4:8.5f}".format((0), train_rate*100, test_rate*100, train_err, test_err))
for i in range(epoch):
    for j in range(0,nx_train.shape[0], batch_size):
        W1, b1, W2, b2, W3, b3 = learn(nx_train[j:j+batch_size], t_train[j:j+batch_size], W1, b1, W2, b2, W3, b3, lr)
    y_train = predict(nx_train, W1, b1, W2, b2, W3, b3)
    y_test = predict(nx_test, W1, b1, W2, b2, W3, b3)
    train_rate, train_err = accuracy_rate(y_train, t_train), cross_entropy_error(y_train, t_train)
    test_rate, test_err = accuracy_rate(y_test, t_test), cross_entropy_error(y_test, t_test)
    print("{0:3d} train_rate={1:6.2f}% test_rate={2:6.2f}% train_err={3:8.5f} test_err={4:8.5f}".format((i+1), train_rate*100, test_rate*100, train_err, test_err))

[[ 0.08026386  0.10248935  0.07929109 ...  0.02781289 -0.0886679
   0.06104786]
 [ 0.00041565  0.10930881  0.10617994 ... -0.04557829 -0.08723505
  -0.04816912]
 [-0.02218244 -0.00056607 -0.03774787 ... -0.05218316 -0.02957538
  -0.06802625]
 ...
 [ 0.04726429 -0.08883908 -0.04127646 ... -0.00674194  0.07857021
  -0.008147  ]
 [ 0.09798906  0.08489334  0.07804969 ...  0.06872335  0.08430461
  -0.07813033]
 [ 0.07454521 -0.01339799  0.01067081 ... -0.01921345 -0.02373026
   0.10860668]]
  0 train_rate= 10.26% test_rate= 10.29% train_err= 2.31935 test_err= 2.31875
  1 train_rate= 92.11% test_rate= 92.34% train_err= 0.25767 test_err= 0.24383
  2 train_rate= 95.31% test_rate= 95.14% train_err= 0.15365 test_err= 0.15337
  3 train_rate= 96.65% test_rate= 96.17% train_err= 0.10769 test_err= 0.12189
  4 train_rate= 97.30% test_rate= 96.63% train_err= 0.08490 test_err= 0.10930
  5 train_rate= 97.67% test_rate= 96.97% train_err= 0.07300 test_err= 0.10179
  6 train_rate= 97.96% test_rate= 96.99% 

# AdaGrad法を用いて学習を進める
* 勾配の履歴の二乗和を記憶し、学習を進めるに連れて学習率を下げていく

In [136]:
# 画像のスケールを0-1に正規化
nx_train = x_train/255
nx_test = x_test/255

# dk 第k層の次元を設定 
d0 = nx_train.shape[1]
d1 = 100
d2 = 50
d3 = 10

np.random.seed(8)
W1 = np.random.rand(d0, d1) * 0.2 -0.1
W2 = np.random.rand(d1, d2) * 0.2 -0.1
W3 = np.random.rand(d2, d3) * 0.2 -0.1

# historyにこれまでの学習を記憶させるために、重みと同じ形状の零行列で初期化
history_W1 = np.zeros_like(W1)
history_W2 = np.zeros_like(W2)
history_W3 = np.zeros_like(W3)
b1 = np.zeros(d1)
b2 = np.zeros(d2)
b3 = np.zeros(d3)
lr = 0.9

batch_size = 100
epoch = 500
y_train = predict(nx_train, W1, b1, W2, b2, W3, b3)
y_test = predict(nx_test, W1, b1, W2, b2, W3, b3)

# メソッドの実装はページ上部を参照
history_W1 = weight_decay(history_W1, W1)
history_W2 = weight_decay(history_W2, W2)
history_W3 = weight_decay(history_W3, W3)

train_rate, train_err = accuracy_rate(y_train, t_train), cross_entropy_error(y_train, t_train)
test_rate, test_err = accuracy_rate(y_test, t_test), cross_entropy_error(y_test, t_test)
print("{0:3d} train_rate={1:6.2f}% test_rate={2:6.2f}% train_err={3:8.5f} test_err={4:8.5f}".format((0), train_rate*100, test_rate*100, train_err, test_err))
for i in range(epoch):
    for j in range(0,nx_train.shape[0], batch_size):
        W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3 = AdaGrad_learn(nx_train[j:j+batch_size], t_train[j:j+batch_size], W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3, lr)
    y_train = predict(nx_train, W1, b1, W2, b2, W3, b3)
    y_test = predict(nx_test, W1, b1, W2, b2, W3, b3)
    train_rate, train_err = accuracy_rate(y_train, t_train), cross_entropy_error(y_train, t_train)
    test_rate, test_err = accuracy_rate(y_test, t_test), cross_entropy_error(y_test, t_test)
    if i%5 ==0:
        print("{0:3d} train_rate={1:6.2f}% test_rate={2:6.2f}% train_err={3:8.5f} test_err={4:8.5f}".format((i+1), train_rate*100, test_rate*100, train_err, test_err))

  0 train_rate= 10.26% test_rate= 10.29% train_err= 2.31935 test_err= 2.31875
  1 train_rate= 28.51% test_rate= 29.30% train_err= 2.05276 test_err= 2.06062
  6 train_rate= 40.37% test_rate= 40.70% train_err= 1.85213 test_err= 1.85329
 11 train_rate= 42.88% test_rate= 43.03% train_err= 1.77157 test_err= 1.77105
 16 train_rate= 44.18% test_rate= 44.30% train_err= 1.72209 test_err= 1.72067
 21 train_rate= 45.10% test_rate= 45.26% train_err= 1.68717 test_err= 1.68519
 26 train_rate= 45.85% test_rate= 45.97% train_err= 1.66059 test_err= 1.65822
 31 train_rate= 46.45% test_rate= 46.60% train_err= 1.63935 test_err= 1.63668
 36 train_rate= 46.99% test_rate= 47.30% train_err= 1.62176 test_err= 1.61888
 41 train_rate= 47.44% test_rate= 47.78% train_err= 1.60684 test_err= 1.60378
 46 train_rate= 47.85% test_rate= 48.19% train_err= 1.59393 test_err= 1.59073
 51 train_rate= 48.16% test_rate= 48.75% train_err= 1.58258 test_err= 1.57927
 56 train_rate= 48.54% test_rate= 49.00% train_err= 1.57248 test

# Momentum法で学習
* 学習エポック間の勾配の変化量に応じて、勾配のベクトルの変化を緩やかにする

In [132]:
# 画像のスケールを0-1に正規化
nx_train = x_train/255
nx_test = x_test/255

# dk 第k層の次元を設定 
d0 = nx_train.shape[1]
d1 = 100
d2 = 50
d3 = 10

np.random.seed(8)
W1 = np.random.rand(d0, d1) * 0.2 -0.1
W2 = np.random.rand(d1, d2) * 0.2 -0.1
W3 = np.random.rand(d2, d3) * 0.2 -0.1

# historyにこれまでの学習を記憶させるために、重みと同じ形状の零行列で初期化
history_W1 = np.zeros_like(W1)
history_W2 = np.zeros_like(W2)
history_W3 = np.zeros_like(W3)

b1 = np.zeros(d1)
b2 = np.zeros(d2)
b3 = np.zeros(d3)
lr = 0.5

batch_size = 100
epoch = 50
y_train = predict(nx_train, W1, b1, W2, b2, W3, b3)
y_test = predict(nx_test, W1, b1, W2, b2, W3, b3)

# メソッドの実装はページ上部を参照
history_W1 = momentum_decay(history_W1, W1, lr)
history_W2 = momentum_decay(history_W2, W2, lr)
history_W3 = momentum_decay(history_W3, W3, lr)

train_rate, train_err = accuracy_rate(y_train, t_train), cross_entropy_error(y_train, t_train)
test_rate, test_err = accuracy_rate(y_test, t_test), cross_entropy_error(y_test, t_test)
print("{0:3d} train_rate={1:6.2f}% test_rate={2:6.2f}% train_err={3:8.5f} test_err={4:8.5f}".format((0), train_rate*100, test_rate*100, train_err, test_err))
for i in range(epoch):
    for j in range(0,nx_train.shape[0], batch_size):
        W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3 = Momentum_learn(nx_train[j:j+batch_size], t_train[j:j+batch_size], W1, b1, W2, b2, W3, b3, history_W1, history_W2, history_W3, lr)
    y_train = predict(nx_train, W1, b1, W2, b2, W3, b3)
    y_test = predict(nx_test, W1, b1, W2, b2, W3, b3)
    train_rate, train_err = accuracy_rate(y_train, t_train), cross_entropy_error(y_train, t_train)
    test_rate, test_err = accuracy_rate(y_test, t_test), cross_entropy_error(y_test, t_test)
    print("{0:3d} train_rate={1:6.2f}% test_rate={2:6.2f}% train_err={3:8.5f} test_err={4:8.5f}".format((i+1), train_rate*100, test_rate*100, train_err, test_err))

  0 train_rate= 10.26% test_rate= 10.29% train_err= 2.31935 test_err= 2.31875
  1 train_rate= 91.65% test_rate= 92.23% train_err= 0.26123 test_err= 0.24636
  2 train_rate= 94.27% test_rate= 94.40% train_err= 0.18031 test_err= 0.18563
  3 train_rate= 96.41% test_rate= 96.15% train_err= 0.11143 test_err= 0.12923
  4 train_rate= 97.09% test_rate= 96.52% train_err= 0.09020 test_err= 0.11592
  5 train_rate= 97.50% test_rate= 96.79% train_err= 0.07624 test_err= 0.10875
  6 train_rate= 97.65% test_rate= 96.91% train_err= 0.07185 test_err= 0.11039
  7 train_rate= 98.05% test_rate= 97.13% train_err= 0.05992 test_err= 0.10503
  8 train_rate= 98.13% test_rate= 97.03% train_err= 0.05823 test_err= 0.11039
  9 train_rate= 98.00% test_rate= 96.91% train_err= 0.06050 test_err= 0.11906
 10 train_rate= 98.39% test_rate= 97.28% train_err= 0.04987 test_err= 0.11197
 11 train_rate= 97.69% test_rate= 96.52% train_err= 0.07051 test_err= 0.14725
 12 train_rate= 98.47% test_rate= 97.07% train_err= 0.04649 test