In [1]:
import numpy as np

In [2]:
test = np.load('test.npz')
train = np.load('train.npz')

In [3]:
list(test.keys())

['image', 'label']

In [4]:
test['label']
np.unique(test['label'])

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [5]:
y = test['label']

np.zeros((y.size, 10)).shape
np.arange(y.size)

new_y = np.eye(10)[[int(y_) for y_ in y.tolist()]]
new_y[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [6]:
y_test = test['label']
X_test = test['image']
y_train = train['label']
X_train = train['image']

# transfer to one-hot vector
#y_onehot_train = np.zeros((y_train.size, 10))
#y_onehot_train[np.arange(y_train.size), y_train] = 1
#y_onehot_test = np.zeros((y_test.size, 10))
#y_onehot_test[np.arange(y_test.size), y_train] = 1
y_onehot_train = np.eye(10)[[int(y_) for y_ in y_train.tolist()]]
y_onehot_test = np.eye(10)[[int(y_) for y_ in y_test.tolist()]]


y_train = np.moveaxis(y_onehot_train, 1, 0)
print(X_train.shape)
X_train = X_train.reshape(-1, X_train.shape[1] * X_train.shape[2])
print(X_train.shape)
X_train = np.moveaxis(X_train, 1, 0)
print(X_train.shape)

y_test = np.moveaxis(y_onehot_test, 1, 0)
X_test = X_test.reshape(-1, X_test.shape[1] * X_test.shape[2])
X_test = np.moveaxis(X_test, 1, 0)

(12000, 28, 28)
(12000, 784)
(784, 12000)


In [7]:
X_train[X_train < 100] = -1
X_train[X_train >= 100] = 1

In [8]:
y_test[:, 0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [9]:
class Layer:
    def __init__(self, last_layer_num, node_num, activation_function=None, custom_W=None, custom_b=None):
        self.__node_num = node_num  #output_dim
        self.__last_layer_num = last_layer_num  #input_dim

        
        self.__layer_input = None
        self.activation = activation_function
        
        if custom_W is not None:
            self.W_ = custom_W
        else:
            limit = 1 / np.sqrt(self.__last_layer_num)
            self.W_ = np.random.uniform(-limit, limit, (self.__last_layer_num, self.__node_num))
        
        if custom_b is not None:
            self.b_ = custom_b
        else:
            self.b_ = np.zeros((self.__node_num, 1))

    def forward_propagation(self, last_layer):
        self.__layer_input = last_layer
        if self.activation:
            return np.dot(self.W_.T, last_layer) + self.b_
        else:
            return np.dot(self.W_.T, last_layer) + self.b_
        
    
    def back_propagation(self, gradient, learning_rate = 0.0001):
        if self.activation:
            gradient = self.activation.gradient(gradient)
        
        W_temp = self.W_
        
        gradient_W = self.__layer_input.dot(gradient.T)
        gradient_b = np.sum(gradient, axis=1, keepdims=True)
        
        self.W_ = self.W_ - learning_rate * gradient_W
        assert self.W_.shape == gradient_W.shape
        self.b_ = self.b_ - learning_rate * gradient_b
        assert self.b_.shape == gradient_b.shape
        
        accumulated_gradient = W_temp.dot(gradient)
        return accumulated_gradient

In [10]:
# test forward propagation

W = np.array([[1, 2, 3], [1, 2, 3]]).T
b = np.array([0, 1])
data = np.array([[3, 4, 5],[3, 4, 5]]).T

layer = Layer(3, 2, None, W, b)
layer1_output = layer.forward_propagation(data)

print(layer1_output)

layer2 = Layer(2, 3)
print(layer2.W_)
layer2.forward_propagation(layer1_output)

[[26 27]
 [26 27]]
[[ 0.61079314 -0.00506093  0.28955549]
 [-0.19369355  0.15849927 -0.18915272]]


array([[10.84458942, 11.26168901],
       [ 3.98939685,  4.14283519],
       [ 2.61047192,  2.71087469]])

In [11]:
def Softmax(x):
    e_x = np.exp(x - np.max(x))  # 減max會比較穩定,避免overflow
    return e_x / e_x.sum(axis=0)  # -xmax會消掉

In [12]:
class Sigmoid():
    def __call__(self,x):
        # To prevent from overflow
        x = np.clip(x, 1e-15, 1 - 1e-15)
        return 1.0 / (1.0 + np.exp(-x))
    def gradient(self, x):
        return self(x) * (1.0 - self(x))

In [13]:
class ReLU():
    def __call__(self, x):
        return np.maximum(x, 0.0)
    
    def gradient(self, x):
        x[x <= 0] = 0.0
        x[x > 0] = 1.0
        return x


In [14]:
class CrossEntropy():
    def __call__(self, y_hat, y):
        # Avoid division by zero
        y_hat = np.clip(y_hat, 1e-15, 1 - 1e-15)
        # return
        loss_sum = 0
        for n in range(y.shape[0]):
            for k in range(y.shape[1]):
                if y[n, k] == 1:
                    loss_sum += -np.log(y_hat[n, k])
        loss = np.sum(loss_sum) / y.shape[0]
        return loss

    def gradient(self, y_hat, y):
        return y_hat - y

In [15]:
class CrossEntropy2():
    def __call__(self, y_hat, y):
        # Avoid division by zero
        y_hat = np.clip(y_hat, 1e-15, 1 - 1e-15)
        # softmax
        m = y.shape[0]
        p = Softmax(y_hat)
        y = np.argmax(y, axis=1)
        
        log_likelihood = -np.log(p[range(m), y])
        loss = np.sum(log_likelihood) / m
        return loss
    
    def gradient(self, y_hat, y):
        return y_hat - y

In [16]:
 # test 
y = np.array([[1, 0 , 0], [1, 0, 0], [0 , 1, 0]])
y_hat = np.array([[3, 2, 2], [2, 5 ,6], [4 ,2 ,6]])

CrossEntropy()(y_hat, y)


9.992007221626415e-16

In [17]:
np.argmax(y, axis=1)

array([0, 0, 1])

In [18]:
HiddenLayer = Layer(last_layer_num=784, node_num=392, activation_function=Sigmoid())
HiddenLayer2 = Layer(last_layer_num=392, node_num=198, activation_function=Sigmoid())
HiddenLayer3 = Layer(last_layer_num=198, node_num=2, activation_function=Sigmoid())
OutputLayer = Layer(last_layer_num=2, node_num=10)
#loss_func = CrossEntropy()
loss_func = CrossEntropy2()

In [19]:
def train_epoch(X_train, y_train, epoch=10, learning_rate=1e-4):
    """ Gradient Descent """
    for i in range(epoch):
        output1 = HiddenLayer.forward_propagation(X_train)
        theta = OutputLayer.forward_propagation(output1)
        y_hat = Softmax(theta)

        loss = loss_func(y_hat, y_train)

        print(loss)
        #print(y_hat)
        
        gradient = loss_func.gradient(y_hat, y_train)

        gradient = OutputLayer.back_propagation(gradient, learning_rate=learning_rate)
        HiddenLayer.back_propagation(gradient, learning_rate=learning_rate)
    

In [20]:
#train_epoch(X_train, y_train, learning_rate=0.1)

In [23]:
def SGD_train_epoch(X_train, y_train, batch_size=16, epoch=10, learning_rate=1e-10):
    """ Stochastic Gradient Descent """
    batch_num = 0
    all_loss = []
    for i in range(epoch):
        epoch_loss = []
        for start in range(0, X_train.shape[1], batch_size):
            end = min(X_train.shape[1], start + batch_size)
            
            X = X_train[:, start:end]
            y = y_train[:, start:end]
            
            #import ipdb; ipdb.set_trace()
            
            output1 = HiddenLayer.forward_propagation(X)
            output2 = HiddenLayer2.forward_propagation(output1)
            output3 = HiddenLayer3.forward_propagation(output2)
            theta = OutputLayer.forward_propagation(output3)
            
            y_hat = Softmax(theta)

            #loss = loss_func(y_hat, y) / (end - start)
            loss = loss_func(theta, y) / (end - start)
            epoch_loss.append(loss)
            
            
            if loss > 100:
                import ipdb; ipdb.set_trace()
            #print(y_hat)

            gradient = loss_func.gradient(y_hat, y) / (end - start)

            gradient = OutputLayer.back_propagation(gradient, learning_rate=learning_rate)
            gradient = HiddenLayer3.back_propagation(gradient, learning_rate=learning_rate)
            gradient = HiddenLayer2.back_propagation(gradient, learning_rate=learning_rate)
            HiddenLayer.back_propagation(gradient, learning_rate=learning_rate)

        print('Epoch:', i+1, 'Average loss:', sum(epoch_loss)/len(epoch_loss))
            
        all_loss.extend(epoch_loss)

In [24]:
SGD_train_epoch(X_train, y_train, epoch=500)

Epoch: 1 Average loss: 0.14889816700595176
Epoch: 2 Average loss: 0.14889910322543742
Epoch: 3 Average loss: 0.148900039346592
Epoch: 4 Average loss: 0.1489009753966413
Epoch: 5 Average loss: 0.14890191050480797
Epoch: 6 Average loss: 0.1489028456415601
Epoch: 7 Average loss: 0.14890378087553985
Epoch: 8 Average loss: 0.14890471561578866
Epoch: 9 Average loss: 0.14890565017313745
Epoch: 10 Average loss: 0.1489065842285539
Epoch: 11 Average loss: 0.14890751778158445
Epoch: 12 Average loss: 0.1489084513269892
Epoch: 13 Average loss: 0.14890938457596853
Epoch: 14 Average loss: 0.1489103173876182
Epoch: 15 Average loss: 0.14891124987268123
Epoch: 16 Average loss: 0.14891218241182705
Epoch: 17 Average loss: 0.14891311467349497
Epoch: 18 Average loss: 0.148914046380684
Epoch: 19 Average loss: 0.14891497767925888
Epoch: 20 Average loss: 0.14891590892164583
Epoch: 21 Average loss: 0.14891683994861205
Epoch: 22 Average loss: 0.14891777090556663
Epoch: 23 Average loss: 0.14891870172725022
Epoch:

Epoch: 188 Average loss: 0.14906993377220437
Epoch: 189 Average loss: 0.14907083030932466
Epoch: 190 Average loss: 0.1490717268121669
Epoch: 191 Average loss: 0.14907262309116595
Epoch: 192 Average loss: 0.14907351909919325
Epoch: 193 Average loss: 0.14907441482220893
Epoch: 194 Average loss: 0.14907531027548046
Epoch: 195 Average loss: 0.14907620523260193
Epoch: 196 Average loss: 0.14907709995401758
Epoch: 197 Average loss: 0.14907799388143103
Epoch: 198 Average loss: 0.1490788872463811
Epoch: 199 Average loss: 0.14907978051459256
Epoch: 200 Average loss: 0.1490806738310016
Epoch: 201 Average loss: 0.1490815671692914
Epoch: 202 Average loss: 0.14908246086739316
Epoch: 203 Average loss: 0.14908335438842701
Epoch: 204 Average loss: 0.1490842470607822
Epoch: 205 Average loss: 0.1490851397629674
Epoch: 206 Average loss: 0.1490860317488965
Epoch: 207 Average loss: 0.14908692310005456
Epoch: 208 Average loss: 0.14908781415108577
Epoch: 209 Average loss: 0.1490887052037983
Epoch: 210 Average

Epoch: 372 Average loss: 0.14923087754372177
Epoch: 373 Average loss: 0.14923173316120453
Epoch: 374 Average loss: 0.14923258829046673
Epoch: 375 Average loss: 0.14923344267833075
Epoch: 376 Average loss: 0.14923429772502095
Epoch: 377 Average loss: 0.1492351529320637
Epoch: 378 Average loss: 0.1492360088529395
Epoch: 379 Average loss: 0.14923686549971157
Epoch: 380 Average loss: 0.1492377224988818
Epoch: 381 Average loss: 0.14923857965513096
Epoch: 382 Average loss: 0.14923943696866274
Epoch: 383 Average loss: 0.14924029417080792
Epoch: 384 Average loss: 0.14924115086273348
Epoch: 385 Average loss: 0.14924200709524038
Epoch: 386 Average loss: 0.1492428632615258
Epoch: 387 Average loss: 0.1492437192646944
Epoch: 388 Average loss: 0.1492445740956953
Epoch: 389 Average loss: 0.1492454285697761
Epoch: 390 Average loss: 0.14924628257157033
Epoch: 391 Average loss: 0.149247135831147
Epoch: 392 Average loss: 0.1492479894540498
Epoch: 393 Average loss: 0.1492488430266075
Epoch: 394 Average lo

In [None]:
X_train.shape

In [None]:
X_train[:, -4:].shape

In [None]:
HiddenLayer.W_.shape

In [None]:
HiddenLayer.b_.shape

In [None]:
HiddenLayer.forward_propagation(X_train[:, -4:])

In [None]:
y = np.array([[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]])

In [None]:
y.shape[0]
y.size
len(y)

In [None]:
y.shape

In [None]:
sigmoid(3)