### 交叉商loss 函數 (cross entropy loss function)
對於 softmax 激活函數的交叉商如下:
$$J(\theta ) = - \frac{1} {m} \sum_{i=1}^{m} y^{(i)} \log(h_{\theta} (x^{(i)})) + (1 - y^{(i)}) \log (1 - h_{\theta} (x^{(i)})) $$
對於 softmax 等函數(S型曲線函數)，使用交叉商loss函數，可以收斂得更快!

In [2]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

#載入數據集
mnist = input_data.read_data_sets("MNIST_data", one_hot = True) 

#每一個批次的大小
batch_size = 100 

#計算一共有多少批次
n_batch = mnist.train.num_examples // batch_size 

#定義兩個placeholder，目的在於 train時候透過 feed 傳入 x_data 與 y_data
x = tf.placeholder(tf.float32, [None, 784]) 
y = tf.placeholder(tf.float32, [None, 10]) 

#建立一個神經網路
#隱藏層
W1 = tf.Variable(tf.random_normal([784, 15]))
b1 = tf.Variable(tf.zeros([1, 15]))
L1 = tf.nn.softmax(tf.matmul(x, W1) + b1) #隱藏層的輸出

#輸出層
W = tf.Variable(tf.zeros([15, 10]))
b = tf.Variable(tf.zeros([1, 10]))
prediction = tf.nn.softmax(tf.matmul(L1, W) + b)

#代價函數 : loss = mean((y - prediction)^2)
#loss = tf.reduce_mean(tf.square(y - prediction))
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = prediction))

#Gradient desent method 
gd = tf.train.AdagradOptimizer(0.31)
#gd = tf.train.GradientDescentOptimizer(0.2)

#最小化 代價函數 (operator) 
train = gd.minimize(loss)

#初始化變數 operator
init = tf.global_variables_initializer()


#結果存在一個 boolean 的變數中
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1)) #argmax 回傳一維張量中最大的值，所在的位置

#求準確率
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 

#開始training
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(300): 
       
        for batch in range(n_batch): #每一個 outer loop 疊代 n_batch 個批次

            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            feed_dict = {x: batch_xs, y: batch_ys} 
            sess.run(train, feed_dict)
        if epoch % 20 == 0:
            #計算一次準確率
            outer_loop_feed_dict = {x: mnist.test.images, y: mnist.test.labels} #testing data feed dictionary
            acc = sess.run(accuracy, outer_loop_feed_dict)
            print("Iter=" + str(epoch) + ", Testing Accuracy=" + str(acc))

Extracting MNIST_data\train-images-idx3-ubyte.gz
Extracting MNIST_data\train-labels-idx1-ubyte.gz
Extracting MNIST_data\t10k-images-idx3-ubyte.gz
Extracting MNIST_data\t10k-labels-idx1-ubyte.gz
Iter=0, Testing Accuracy=0.2917
Iter=20, Testing Accuracy=0.7441
Iter=40, Testing Accuracy=0.8007
Iter=60, Testing Accuracy=0.8201
Iter=80, Testing Accuracy=0.8202
Iter=100, Testing Accuracy=0.8252
Iter=120, Testing Accuracy=0.8229
Iter=140, Testing Accuracy=0.8269
Iter=160, Testing Accuracy=0.8256
Iter=180, Testing Accuracy=0.8267
Iter=200, Testing Accuracy=0.8334
Iter=220, Testing Accuracy=0.9005
Iter=240, Testing Accuracy=0.9043
Iter=260, Testing Accuracy=0.9075
Iter=280, Testing Accuracy=0.9085


### Dropout
在訓練神經網路的時候，對於不一樣的訓練樣本，遮蔽隱藏層的一些神經元，可以減低 overfitting 的可能  
以下是一個沒有 Dropout的例子 (keep_prob = 1.0)， Training Accuracy 比 Test Accuracy 準確許多  
也就是說，這個神經網路已經 Overfitting

In [3]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

#載入數據集
mnist = input_data.read_data_sets("MNIST_data", one_hot = True) 

#每一個批次的大小
batch_size = 100 

#計算一共有多少批次
n_batch = mnist.train.num_examples // batch_size 

#定義兩個placeholder，目的在於 train時候透過 feed 傳入 x_data 與 y_data
x = tf.placeholder(tf.float32, [None, 784]) 
y = tf.placeholder(tf.float32, [None, 10]) 
keep_prob = tf.placeholder(tf.float32) #用來 dropout 的機率

#建立一個神經網路

#隱藏層1
W1 = tf.Variable(tf.truncated_normal([784, 2000], stddev=0.1))
b1 = tf.Variable(tf.zeros([2000]))
L1 = tf.nn.tanh(tf.matmul(x, W1) + b1)
L1_dropout = tf.nn.dropout(L1, keep_prob)


#隱藏層2
W2 = tf.Variable(tf.truncated_normal([2000, 2000], stddev=0.1))
b2 = tf.Variable(tf.zeros([2000]))
L2 = tf.nn.tanh(tf.matmul(L1_dropout, W2) + b2)
L2_dropout = tf.nn.dropout(L2, keep_prob)

#隱藏層3
W3 = tf.Variable(tf.truncated_normal([2000, 1000], stddev=0.1))
b3 = tf.Variable(tf.zeros([1000]))
L3 = tf.nn.tanh(tf.matmul(L2_dropout, W3) + b3)
L3_dropout = tf.nn.dropout(L3, keep_prob)

#輸出層
W4 = tf.Variable(tf.truncated_normal([1000, 10], stddev=0.1))
b4 = tf.Variable(tf.zeros([10]))
prediction = tf.nn.tanh(tf.matmul(L3_dropout, W4) + b4)


#代價函數 : loss = mean((y - prediction)^2)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = prediction))

#Gradient desent method 
gd = tf.train.AdagradOptimizer(0.31)
#gd = tf.train.GradientDescentOptimizer(0.2)

#最小化 代價函數 (operator)
train = gd.minimize(loss)

#初始化變數 operator
init = tf.global_variables_initializer()


#結果存在一個 boolean 的變數中
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1)) #argmax 回傳一維張量中最大的值，所在的位置

#求準確率
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 

#開始training
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(31): 
       
        for batch in range(n_batch): #每一個 outer loop 疊代 n_batch 個批次

            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: 1.0} 
            sess.run(train, feed_dict)
        #計算一次準確率
        train_feed_dict = {x: mnist.train.images, y: mnist.train.labels, keep_prob: 1.0} #train data feed dictionary
        train_acc = sess.run(accuracy, train_feed_dict)
        test_feed_dict = {x: mnist.test.images, y: mnist.test.labels, keep_prob: 1.0} #testing data feed dictionary
        test_acc = sess.run(accuracy, test_feed_dict)          
        print("Iter=" + str(epoch) + ", Training Accuracy=" + str(train_acc) + ", Testing Accuracy=" + str(test_acc))

Extracting MNIST_data\train-images-idx3-ubyte.gz
Extracting MNIST_data\train-labels-idx1-ubyte.gz
Extracting MNIST_data\t10k-images-idx3-ubyte.gz
Extracting MNIST_data\t10k-labels-idx1-ubyte.gz
Iter=0, Training Accuracy=0.9368182, Testing Accuracy=0.9363
Iter=1, Training Accuracy=0.9541636, Testing Accuracy=0.9508
Iter=2, Training Accuracy=0.95910907, Testing Accuracy=0.9537
Iter=3, Training Accuracy=0.96532726, Testing Accuracy=0.9597
Iter=4, Training Accuracy=0.9710182, Testing Accuracy=0.9639
Iter=5, Training Accuracy=0.97187275, Testing Accuracy=0.9632
Iter=6, Training Accuracy=0.9745455, Testing Accuracy=0.9657
Iter=7, Training Accuracy=0.9771091, Testing Accuracy=0.966
Iter=8, Training Accuracy=0.9746364, Testing Accuracy=0.9658
Iter=9, Training Accuracy=0.9768909, Testing Accuracy=0.9669
Iter=10, Training Accuracy=0.9780909, Testing Accuracy=0.9644
Iter=11, Training Accuracy=0.9803636, Testing Accuracy=0.9699
Iter=12, Training Accuracy=0.98392725, Testing Accuracy=0.9737
Iter=13

### 設定 keep_prob = 0.7，採用Dropout 的例子

In [4]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

#載入數據集
mnist = input_data.read_data_sets("MNIST_data", one_hot = True) 

#每一個批次的大小
batch_size = 100 

#計算一共有多少批次
n_batch = mnist.train.num_examples // batch_size 

#定義兩個placeholder，目的在於 train時候透過 feed 傳入 x_data 與 y_data
x = tf.placeholder(tf.float32, [None, 784]) 
y = tf.placeholder(tf.float32, [None, 10]) 
keep_prob = tf.placeholder(tf.float32) #用來 dropout 的機率

#建立一個神經網路

#隱藏層1
W1 = tf.Variable(tf.truncated_normal([784, 2000], stddev=0.1))
b1 = tf.Variable(tf.zeros([2000]))
L1 = tf.nn.tanh(tf.matmul(x, W1) + b1)
L1_dropout = tf.nn.dropout(L1, keep_prob)


#隱藏層2
W2 = tf.Variable(tf.truncated_normal([2000, 2000], stddev=0.1))
b2 = tf.Variable(tf.zeros([2000]))
L2 = tf.nn.tanh(tf.matmul(L1_dropout, W2) + b2)
L2_dropout = tf.nn.dropout(L2, keep_prob)

#隱藏層3
W3 = tf.Variable(tf.truncated_normal([2000, 1000], stddev=0.1))
b3 = tf.Variable(tf.zeros([1000]))
L3 = tf.nn.tanh(tf.matmul(L2_dropout, W3) + b3)
L3_dropout = tf.nn.dropout(L3, keep_prob)

#輸出層
W4 = tf.Variable(tf.truncated_normal([1000, 10], stddev=0.1))
b4 = tf.Variable(tf.zeros([10]))
prediction = tf.nn.tanh(tf.matmul(L3_dropout, W4) + b4)


#代價函數 : loss = mean((y - prediction)^2)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = prediction))

#Gradient desent method 
gd = tf.train.AdagradOptimizer(0.31)
#gd = tf.train.GradientDescentOptimizer(0.2)

#最小化 代價函數 (operator)
train = gd.minimize(loss)

#初始化變數 operator
init = tf.global_variables_initializer()


#結果存在一個 boolean 的變數中
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1)) #argmax 回傳一維張量中最大的值，所在的位置

#求準確率
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 

#開始training
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(31): 
       
        for batch in range(n_batch): #每一個 outer loop 疊代 n_batch 個批次

            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: 0.7} 
            sess.run(train, feed_dict)
        #計算一次準確率
        train_feed_dict = {x: mnist.train.images, y: mnist.train.labels, keep_prob: 1.0} #train data feed dictionary
        train_acc = sess.run(accuracy, train_feed_dict)
        test_feed_dict = {x: mnist.test.images, y: mnist.test.labels, keep_prob: 1.0} #testing data feed dictionary
        test_acc = sess.run(accuracy, test_feed_dict)          
        print("Iter=" + str(epoch) + ", Training Accuracy=" + str(train_acc) + ", Testing Accuracy=" + str(test_acc))

Extracting MNIST_data\train-images-idx3-ubyte.gz
Extracting MNIST_data\train-labels-idx1-ubyte.gz
Extracting MNIST_data\t10k-images-idx3-ubyte.gz
Extracting MNIST_data\t10k-labels-idx1-ubyte.gz
Iter=0, Training Accuracy=0.8870182, Testing Accuracy=0.8944
Iter=1, Training Accuracy=0.9125636, Testing Accuracy=0.9133
Iter=2, Training Accuracy=0.8894, Testing Accuracy=0.895
Iter=3, Training Accuracy=0.9266, Testing Accuracy=0.929
Iter=4, Training Accuracy=0.9251091, Testing Accuracy=0.9259
Iter=5, Training Accuracy=0.9354182, Testing Accuracy=0.9351
Iter=6, Training Accuracy=0.9441091, Testing Accuracy=0.9429
Iter=7, Training Accuracy=0.94243634, Testing Accuracy=0.9425
Iter=8, Training Accuracy=0.9476, Testing Accuracy=0.9486
Iter=9, Training Accuracy=0.9486727, Testing Accuracy=0.9445
Iter=10, Training Accuracy=0.95247275, Testing Accuracy=0.9496
Iter=11, Training Accuracy=0.9529091, Testing Accuracy=0.9506
Iter=12, Training Accuracy=0.9524182, Testing Accuracy=0.9504
Iter=13, Training A

### 作業  
利用這周學到的技巧，讓MINIST 網路的 Test Accuracy 拿到 98% 以上

In [5]:
#自己測出來的，與Ben 老師給的解法不同
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

#載入數據集
mnist = input_data.read_data_sets("MNIST_data", one_hot = True)

#每一個批次的大小
batch_size = 80 

#計算一共有多少批次
n_batch = mnist.train.num_examples // batch_size 

#定義兩個placeholder，目的在於 train時候透過 feed 傳入 x_data 與 y_data
x = tf.placeholder(tf.float32, [None, 784]) 
y = tf.placeholder(tf.float32, [None, 10]) 

#建立一個神經網路

#隱藏層
W1 = tf.Variable(tf.truncated_normal([784, 800], stddev=0.1))
b1 = tf.Variable(tf.zeros([800]))
L1 = tf.nn.tanh(tf.matmul(x, W1) + b1)

#輸出層
W2 = tf.Variable(tf.truncated_normal([800, 10], stddev=0.1))
b2 = tf.Variable(tf.zeros([10]))
prediction = tf.nn.tanh(tf.matmul(L1, W2) + b2)

#代價函數 : loss = mean((y - prediction)^2)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = prediction)) #交叉商代價函數

#Gradient desent method 
gd = tf.train.AdagradOptimizer(0.2)
#gd = tf.train.GradientDescentOptimizer(0.2)

#最小化 代價函數 (operator)
train = gd.minimize(loss)

#初始化變數 operator
init = tf.global_variables_initializer()


#結果存在一個 boolean 的變數中
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1)) #argmax 回傳一維張量中最大的值，所在的位置

#求準確率
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 

#開始training
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(31): 
       
        for batch in range(n_batch): #每一個 outer loop 疊代 n_batch 個批次

            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            feed_dict = {x: batch_xs, y: batch_ys} 
            sess.run(train, feed_dict)
        #計算一次準確率
        train_feed_dict = {x: mnist.train.images, y: mnist.train.labels} #train data feed dictionary
        train_acc = sess.run(accuracy, train_feed_dict)
        test_feed_dict = {x: mnist.test.images, y: mnist.test.labels} #testing data feed dictionary
        test_acc = sess.run(accuracy, test_feed_dict)          
        print("Iter=" + str(epoch) + ", Training Accuracy=" + str(train_acc) + ", Testing Accuracy=" + str(test_acc))


Extracting MNIST_data\train-images-idx3-ubyte.gz
Extracting MNIST_data\train-labels-idx1-ubyte.gz
Extracting MNIST_data\t10k-images-idx3-ubyte.gz
Extracting MNIST_data\t10k-labels-idx1-ubyte.gz
Iter=0, Training Accuracy=0.92867273, Testing Accuracy=0.9303
Iter=1, Training Accuracy=0.94305456, Testing Accuracy=0.9415
Iter=2, Training Accuracy=0.95403636, Testing Accuracy=0.9496
Iter=3, Training Accuracy=0.96285456, Testing Accuracy=0.9586
Iter=4, Training Accuracy=0.96805453, Testing Accuracy=0.9621
Iter=5, Training Accuracy=0.9748, Testing Accuracy=0.9678
Iter=6, Training Accuracy=0.9777091, Testing Accuracy=0.9683
Iter=7, Training Accuracy=0.9809273, Testing Accuracy=0.9725
Iter=8, Training Accuracy=0.98234546, Testing Accuracy=0.9716
Iter=9, Training Accuracy=0.98261815, Testing Accuracy=0.9737
Iter=10, Training Accuracy=0.98554546, Testing Accuracy=0.9749
Iter=11, Training Accuracy=0.9852727, Testing Accuracy=0.9749
Iter=12, Training Accuracy=0.98783636, Testing Accuracy=0.9768
Iter