In [0]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Create network by layers and use gradient descent with computation graph (Backprop, BP)
### 可以把每個操作拆解成局部區域進行微分: Simple examples

超商裡的蘋果 1 顆 100 元, 橘子 1 顆 150 元, 結帳時要抽 10% 營業稅, 如果小明買了 2 顆蘋果和 3 顆橘子, 請問若 ... <br>
1. 多買一顆蘋果，對結帳價錢的變化?
2. 蘋果一顆的價錢漲 1 元時，對結帳價錢的變化?
3. 營業稅多 100% 時，對價錢的變化?

In [0]:
#--------------------forward and backward function-----------------------
class mul_layer():
    """Multiplication node of a computation graph: out = x * y.

    forward() caches both operands because each operand's gradient is the
    upstream gradient times the *other* operand.
    """

    def __init__(self):
        # Operands of the most recent forward() call; None until forward() runs.
        self.x = self.y = None

    def forward(self, x, y):
        """Cache the two operands and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return (d_out/d_x, d_out/d_y) scaled by the upstream gradient dout."""
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y

In [0]:
# ----------- #
# Problem data: 2 apples at 100 each, multiplied by a tax factor of 1.1.
n_apples = 2
price_per_apple = 100
tax = 1.1


# Build network: one multiplication node per product in the graph.
mul_apple_layer = mul_layer()
mul_or_layer=mul_layer()  # not used in this cell (presumably reserved for the orange exercise)
mul_tax_layer = mul_layer()

# forward: (price_per_apple * n_apples) * tax
apple_price = mul_apple_layer.forward(price_per_apple, n_apples)
price = mul_tax_layer.forward(apple_price, tax)

# backward: walk the graph in reverse, starting from d(price)/d(price) = 1
dprice = 1 # because we assume using linear function, which y = x, dy/dx = 1
d_total_apples_price, d_tax = mul_tax_layer.backward(dprice)
# backward() returns gradients in the order of forward()'s arguments:
# first w.r.t. price_per_apple, then w.r.t. n_apples.
d_price_per_apple, d_apple_num = mul_apple_layer.backward(d_total_apples_price)

# results -- numbered to match the three questions in the problem statement:
# 1. one more apple, 2. apple price +1, 3. tax factor +1 (i.e. +100%)
print("final price: %i" % price)
print("1. d_Apple_num: %.2f" % d_apple_num)
print("2. d_price_per_apple: %.2f" % d_price_per_apple)
print("3. d_Tax: %.2f" % d_tax)

final price: 220
1. d_price_per_apple: 2.20
2. d_Apple_num: 110.00
3. d_Tax: 200.00


## Exercise
- 多買一顆橘子，對結帳價錢的變化?
- 橘子一顆的價錢漲 1 元時，對結帳價錢的變化?

# Create network by layers and use BP
開始前再次強調，BP只是對於deep learning在計算偏微分的技巧，是加速計算optimizer的工具，有了BP我們可以批次對不同的參數計算偏微分，
只要計算兩次neural network(forward pass 和 backward pass)再加上一些小計算，就能得到所有參數的偏微分，
取代掉前一個章節取偏微分的算法：每做一次整個neural network的計算(嚴格說起來是兩次)只能算出一個參數的偏微分。


(分別執行完範例後應該就能感受到兩者速度上的差異)


https://www.youtube.com/watch?v=ibJpTrp5mcE


#### 公式重提　
#### 就老師的範例而言
<img src="aaa.jpg">
#### 則:
## $\frac{\partial C(\theta)}{\partial w_1}=\frac{\partial z}{\partial w_1}\frac{\partial C(\theta)}{\partial z}=x_1 \times \sigma'(z) (w_3\times\frac{\partial C(\theta)}{\partial z'}+w_4\times\frac{\partial C(\theta)}{\partial z''} )=$
## $x_1 \times \sigma'(z) (w_3\times\frac{\partial y_1}{\partial z'}\frac{\partial C(\theta)}{\partial y_1}+w_4\times\frac{\partial y_2}{\partial z''}\frac{\partial C(\theta)}{\partial y_2} )$
## $=\text{forward pass 後得到的值}\times \text{backward pass 後得到的值}$

## Real example
#### Define other layer functions
與第一份範例不同，我們除了要列出layer中的計算過程(forward),還要為了BP寫backward的function

btw backward其實就像是反過來的network, activation function變成原來activation function的微分

### part one: sigmoid 
### $sigmoid(z)=a=\frac{1}{1+e^{-z}}$
### $sigmoid'(z)=a(1-a)$

In [0]:
# Define activation(sigmoid)_layer functions
class sigmoid_layer():
    """Element-wise sigmoid activation with a cached output for backprop."""

    def __init__(self):
        # Activation a = sigmoid(z) from the latest forward(); None before that.
        self.out = None

    def forward(self, x):
        """Return 1 / (1 + exp(-x)) and keep it for backward()."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return dout * a * (1 - a), where a is the cached forward output."""
        # dL/dz = dL/da * da/dz, and for the sigmoid da/dz = a * (1 - a).
        activation = self.out
        return dout * (1.0 - activation) * activation
# Define affine_layer functions    
class affine_layer():
    """Fully connected layer: out = x . W + b.

    The layer keeps the W and b objects it is given (no copy) and, after
    backward(), exposes the parameter gradients as self.dW and self.db.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Filled in by forward(): the flattened input and its original shape.
        self.x = None
        self.original_x_shape = None
        # Filled in by backward(): gradients w.r.t. W and b.
        self.dW = None
        self.db = None

    def forward(self, x):
        """Flatten x to (x.shape[0], -1), cache it, and return x . W + b."""
        self.original_x_shape = x.shape
        # Collapse every axis after the first so the input is 2-D.
        self.x = x.reshape(x.shape[0], -1)
        return self.x.dot(self.W) + self.b

    def backward(self, dout):
        """Store dW and db, and return the gradient w.r.t. the input.

        The returned gradient is reshaped to the shape the input had when
        forward() was called.
        """
        self.dW = self.x.T.dot(dout)
        self.db = np.sum(dout, axis = 0)

        grad_input = np.dot(dout, self.W.T)
        return grad_input.reshape(self.original_x_shape)
    
def softmax(x):
    """Numerically stable softmax.

    Parameters
    ----------
    x : array_like
        Scores. A 2-D input is normalised row by row; any other input is
        normalised over all of its elements.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as x whose normalised entries sum to 1.

    Notes
    -----
    The maximum is subtracted before exponentiating. This leaves the result
    unchanged mathematically but keeps np.exp from overflowing (large scores
    would otherwise give inf / inf = nan).
    """
    x = np.asarray(x)

    if x.ndim == 2:
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    if x.size == 0:
        # np.max has no identity for empty input; an empty result is returned as-is.
        return np.exp(x)

    shifted = x - np.max(x)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x)

# define cross_entropy
def cross_entropy(y, t):
    """Mean cross-entropy between predicted probabilities y and targets t.

    y is a (batch, classes) array, or a 1-D array for a single sample.
    t may be one-hot (same number of elements as y) or an array of class
    indices; one-hot targets are converted to indices with argmax.
    """
    # Promote a single sample to a batch of one.
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # Same element count as y means t is one-hot: reduce it to class indices.
    if t.size == y.size:
        t = t.argmax(axis=1)

    n_samples = y.shape[0]
    # Probability the model assigned to the correct class of each sample.
    prob_of_truth = y[np.arange(n_samples), t]
    # The 1e-7 offset keeps log() finite when a probability is exactly 0.
    log_likelihood = np.log(prob_of_truth + 1e-7)
    return -np.sum(log_likelihood) / n_samples

class softmax_with_crossentropy():
    """Output layer that applies softmax and then the cross-entropy loss.

    forward() caches the softmax output and the targets so that backward()
    needs no arguments besides the (optional) upstream gradient.
    """

    def __init__(self):
        self.loss = None
        self.y = None # softmax output
        self.t = None # target (ground-truth)

    def forward(self, x, t):
        """Return the cross-entropy loss of softmax(x) against targets t.

        t may be one-hot or an array of class indices (both are accepted by
        cross_entropy).
        """
        self.t = t
        self.y = softmax(x)  # class probabilities, used as the prediction
        self.loss = cross_entropy(self.y, self.t)

        return self.loss

    def backward(self, dout = 1):
        """Return the gradient of the loss w.r.t. the input of forward().

        Parameters
        ----------
        dout : scalar or array, default 1
            Upstream gradient; the result is scaled by it.

        Notes
        -----
        For softmax followed by cross-entropy the gradient is
        (y - one_hot(t)) / batch_size. Class-index targets must have an
        integer dtype because they are used for indexing.
        """
        batch_size = self.t.shape[0]

        if self.t.size == self.y.size:
            # One-hot targets: the difference can be taken directly.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 at each sample's true class,
            # which equals y - one_hot(t). Copy first so the cached softmax
            # output is left untouched.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx * dout

In [0]:
from collections import OrderedDict # this is a built-in function -- dictionary with order

#ordereddic和一般dict差別的參考範例：https://blog.csdn.net/liangguohuan/article/details/7088304

class Two_layerNet_bp:
    """Two-layer classifier (affine -> sigmoid -> affine) trained with backprop.

    Parameters live in self.params under the keys 'w1', 'b1', 'w2', 'b2'.
    The affine layers are constructed with those same array objects, so an
    in-place update of an entry in self.params is seen by the layers.
    """

    def __init__(self, 
                 input_size, 
                 hidden_size, 
                 output_size, 
                 weight_init_std = 0.01):
        # Weights start as scaled standard-normal noise, biases as zeros.
        self.params = {
            'w1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'w2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        # Hidden layers in forward order; backprop walks them in reverse.
        self.layers = OrderedDict([
            ('affine_1', affine_layer(self.params['w1'], self.params['b1'])),
            ('sigmoid_1', sigmoid_layer()),
            ('affine_2', affine_layer(self.params['w2'], self.params['b2'])),
        ])
        # Output layer: softmax + cross-entropy, kept apart from self.layers
        # so that predict() returns the scores before softmax.
        self.lastlayer = softmax_with_crossentropy()

    def predict(self, x):
        """Run x through every layer in self.layers and return the raw scores."""
        activation = x
        for layer in self.layers.values():
            # Each layer's output is the next layer's input.
            activation = layer.forward(activation)
        return activation

    def loss(self, x, y_true):
        """Return the softmax cross-entropy loss of the network on (x, y_true).

        As a side effect the output layer caches its softmax output and the
        targets, which backward() later reads.
        """
        scores = self.predict(x)
        return self.lastlayer.forward(scores, y_true)

    def compute_acc(self, x, y_true):
        """Return the fraction of samples whose arg-max score matches y_true.

        y_true is reduced with argmax(axis=1), i.e. it is read as one row of
        per-class values per sample.
        """
        predicted = self.predict(x).argmax(axis = 1)
        expected = y_true.argmax(axis = 1)
        return np.sum(predicted == expected) / len(expected)

    def gradient(self, x, t):
        """Return the loss gradients for every parameter via backpropagation.

        The result is a dict with the same keys as self.params.
        """
        # Forward pass: computes the loss and makes every layer cache the
        # values its backward() needs.
        self.loss(x, t)

        # Backward pass: start at the output layer, then visit the hidden
        # layers last-to-first. Each affine layer stores its dW / db.
        upstream = self.lastlayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            upstream = layer.backward(upstream)

        first_affine = self.layers['affine_1']
        second_affine = self.layers['affine_2']
        return {
            'w1': first_affine.dW,
            'b1': first_affine.db,
            'w2': second_affine.dW,
            'b2': second_affine.db,
        }

In [0]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time

# ----------------------------------- data preprocessing -----------------------------------
# Fixed seed so that Restart & Run All reproduces the same split, the same
# initial weights and the same mini-batches.
SEED = 0
np.random.seed(SEED)

digits = load_digits()
x_, y_ = digits.data, digits.target


# one-hot encode the labels (10 digit classes)
y_one_hot = np.zeros((len(y_), 10))
y_one_hot[np.arange(len(y_)), y_] = 1 
# scale the features into 0 - 1 by dividing by the global maximum
x_ = x_ / x_.max()

# split into training / test sets, stratified on the original labels
x_train, x_test, y_train, y_test = train_test_split(
    x_, y_one_hot, test_size = 0.1, stratify = y_, random_state = SEED)

# ----------------------------------- history buffers -----------------------------------
train_loss_list = []  # one entry per iteration
train_acc_list = []   # one entry every 100 iterations
test_acc_list = []    # one entry every 100 iterations

# ----------------------------------- hyper-parameters and model -----------------------------------
iters_num = 5001
train_size = x_train.shape[0] # numbers of training samples
bz = 100 # batch size
lr = 0.1 # learning rate

network = Two_layerNet_bp(input_size=64, hidden_size=25, output_size=10)

# ----------------------------------- training loop -----------------------------------
starttime=time.time()
for i in tqdm(range(iters_num)):
    # draw a random mini-batch (np.random.choice samples with replacement by default)
    batch_mask = np.random.choice(train_size, bz)
    x_batch = x_train[batch_mask]
    y_batch = y_train[batch_mask]
    
    grad = network.gradient(x_batch, y_batch) # already contain a feed-forward processing in this step
    
    for key in ("w1", "b1", "w2", "b2"):
        # In-place update on purpose: the network's affine layers were built
        # with these same arrays, so rebinding params[key] to a new array
        # would leave the layers with stale weights.
        network.params[key] -= lr * grad[key]
        
    # loss on the same batch, measured after the update
    this_loss = network.loss(x_batch, y_batch)
    train_loss_list.append(this_loss)
    
    # record accuracy on the full training and test sets every 100 updates
    if i % 100 == 0:
        train_acc = network.compute_acc(x_train, y_train)
        test_acc = network.compute_acc(x_test, y_test)
        
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
timerange=time.time()-starttime        
print('總共費時：',timerange)  
   

100%|██████████| 5001/5001 [00:02<00:00, 1839.72it/s]

總共費時： 2.7217187881469727





In [0]:
  '''-----------------------------------圖形化---------------------------------------------'''  
#------------------------------------------------------plot------------------------------------------------------
#print("Train accuarcy, Test accuracy | " + str(train_acc) + ", " + str(test_acc))
    
#lose圖

plt.plot(np.arange(len(train_loss_list)), train_loss_list, 'b-')
plt.show()

#training/validation accuracy圖
plt.plot(np.arange(len(train_acc_list)), train_acc_list, 'b-', label = 'training accuracy')
plt.plot(np.arange(len(test_acc_list)), test_acc_list, 'r-', label = 'validation accuracy')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

loss趨勢圖，x座標是參數更新的次數(每次更新使用一個mini-batch的資料)，y軸是loss值


NameError: name 'train_loss_list' is not defined

## 觀看資料相同次數，更新參數相同次數，可以發現BP速度加快非常多!!

## Exercise
分別試著完成以下動作
1. 試著改變hidden size並比較執行結果 (easy 找到關鍵參數修改值即可)
2. 試著再加一層hidden layer並比較執行結果(little hard 得在所有跟layer有關的多個地方做增加) 