In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
test = np.load('test.npz')
train = np.load('train.npz')

In [3]:
list(test.keys())

['image', 'label']

In [4]:
test['image']

array([[[  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        ...,
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.]],

       [[  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        ...,
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.]],

       [[  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ..., 139.,  53.,   0.],
        ...,
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.]],

       ...,

       [[  0.,   0.,   0

In [5]:
test['label']

array([5., 6., 1., ..., 7., 4., 7.])

In [6]:
y_test = test['label'][:100]
X_test = test['image'][:100]
y_train = train['label'][:100]
X_train = train['image'][:100]

lb = preprocessing.LabelBinarizer()

y_train = lb.fit_transform(y_train).reshape(10, -1)
X_train = X_train.reshape(X_train.shape[1] * X_train.shape[2], -1)
y_test = lb.fit_transform(y_test).reshape(10, -1)
X_test = X_test.reshape(X_test.shape[1] * X_test.shape[2], -1)

In [7]:
X_train[X_train >= 100] = 1
X_train[X_train < 100] = 0

In [8]:
class Layer:
    def __init__(self, last_layer_num, node_num, activation_function=None, custom_W=None, custom_b=None):
        self.__node_num = node_num  #output_dim
        self.__last_layer_num = last_layer_num  #input_dim

        
        self.__layer_input = None
        self.activation = activation_function
        
        if custom_W is not None:
            self.W_ = custom_W
        else:
            limit = 1 / np.sqrt(self.__last_layer_num)
            self.W_ = np.random.uniform(-limit, limit, (self.__last_layer_num, self.__node_num))
        
        if custom_b is not None:
            self.b_ = custom_b
        else:
            self.b_ = np.zeros((self.__node_num, 1))

    def forward_propagation(self, last_layer):
        self.__layer_input = last_layer
        if self.activation:
            return np.dot(self.W_.T, last_layer) + self.b_
        else:
            return np.dot(self.W_.T, last_layer) + self.b_
        
    
    def back_propagation(self, gradient, learning_rate = 0.0001):
        if self.activation:
            gradient = self.activation.gradient(gradient)
        
        W_temp = self.W_ 
        
        gradient_W = self.__layer_input.dot(gradient.T)
        gradient_b = gradient
        
        self.W_ = self.W_ - learning_rate * gradient_W
        assert self.W_.shape == gradient_W.shape
        self.b_ = self.b_ - learning_rate * gradient_b
        assert self.b_.shape == gradient_b.shape
        
        accumulated_gradient = W_temp.dot(gradient)
        return accumulated_gradient

In [9]:
# test forward propagation

W = np.array([[1, 2, 3], [1, 2, 3]]).T
b = np.array([0, 1])
data = np.array([[3, 4, 5],[3, 4, 5]]).T

layer = Layer(3, 2, None, W, b)
layer1_output = layer.forward_propagation(data)

print(layer1_output)

layer2 = Layer(2, 3)
print(layer2.W_)
layer2.forward_propagation(layer1_output)

[[26 27]
 [26 27]]
[[-0.30795213 -0.40884523 -0.31074255]
 [ 0.13715907 -0.46785593  0.16232522]]


array([[ -4.44061955,  -4.61141261],
       [-22.79423026, -23.67093142],
       [ -3.85885041,  -4.00726774]])

In [10]:
def Softmax(x):
    e_x = np.exp(x - np.max(x))  # 減max會比較穩定,避免overflow
    return e_x / e_x.sum(axis=0)  # -xmax會消掉

In [11]:
class Sigmoid():
    def __call__(self,x):
        # To prevent from overflow
        x = np.clip(x, 1e-15, 1 - 1e-15)
        return 1.0 / (1.0 + np.exp(-x))
    def gradient(self, x):
        return self(x) * (1.0 - self(x))

In [12]:
class ReLU():
    def __call__(self, x):
        return np.maximum(x, 0.0)
    
    def gradient(self, x):
        x[x <= 0] = 0.0
        x[x > 0] = 1.0
        return x


In [13]:
class CrossEntropy():
    def __call__(self, y_hat, y):
        # Avoid division by zero
        y_hat = np.clip(y_hat, 1e-15, 1 - 1e-15)
        # return
        loss_sum = 0
        for n in range(y.shape[0]):
            for k in range(y.shape[1]):
                if y[n, k] == 1:
                    loss_sum += -np.log(y_hat[n, k])
        loss = np.sum(loss_sum) / y.shape[0]
        return loss

    def gradient(self, y_hat, y):
        return y_hat - y

In [14]:
class CrossEntropy2():
    def __call__(self, y_hat, y):
        # Avoid division by zero
        y_hat = np.clip(y_hat, 1e-15, 1 - 1e-15)
        # softmax
        m = y.shape[0]
        p = Softmax(y_hat)
        y = np.argmax(y, axis=1)
        
        log_likelihood = -np.log(p[range(m), y])
        loss = np.sum(log_likelihood) / m
        return loss
    
    def gradient(self, y_hat, y):
        return y_hat - y

In [15]:
 # test 
y = np.array([[1, 0 , 0], [1, 0, 0], [0 , 1, 0]])
y_hat = np.array([[3, 2, 2], [2, 5 ,6], [4 ,2 ,6]])

CrossEntropy()(y_hat, y)


9.992007221626415e-16

In [16]:
np.argmax(y, axis=1)

array([0, 0, 1])

In [39]:
HiddenLayer = Layer(last_layer_num=784, node_num=2, activation_function=ReLU())
OutputLayer = Layer(last_layer_num=2, node_num=10)
loss_func = CrossEntropy()

In [40]:
def train_epoch(X_train, y_train, epoch=10, learning_rate=1e-4):
    for i in range(epoch):
        output1 = HiddenLayer.forward_propagation(X_train)
        theta = OutputLayer.forward_propagation(output1)
        y_hat = Softmax(theta)

        loss = loss_func(y_hat, y_train)

        print(loss)
        #print(y_hat)
        
        gradient = loss_func.gradient(y_hat, y_train)

        gradient = OutputLayer.back_propagation(gradient, learning_rate=learning_rate)
        HiddenLayer.back_propagation(gradient, learning_rate=learning_rate)
    

In [41]:
train_epoch(X_train, y_train)

23.025850929940425
23.024840093303624
23.023829299302434
23.022818547873076
23.021807838951787
23.02079717247485
23.019786548378498
23.018775966599076
23.017765427072924
23.016754929736397


In [20]:
y = np.array([[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]])

In [21]:
y.shape[0]
y.size
len(y)

3

In [22]:
y.shape

(3, 4)

In [23]:
sigmoid(3)

NameError: name 'sigmoid' is not defined