In [1]:
import numpy as np

In [None]:
class TwoLayerNet:
    """Two-layer fully connected classifier: affine -> ReLU -> affine -> softmax.

    Learnable parameters live in ``self.params`` under the keys
    ``'W1'`` (input_size, hidden_size), ``'b1'`` (hidden_size,),
    ``'W2'`` (hidden_size, output_size) and ``'b2'`` (output_size,).
    """

    def __init__(self, input_size, hidden_size, output_size, std=4e-3):
        """Initialize weights with scaled Gaussian noise and biases with zeros.

        Args:
            input_size: Number of features per input sample.
            hidden_size: Number of units in the hidden layer.
            output_size: Number of classes (columns of the score matrix).
            std: Scale applied to the standard-normal weight initialization.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def loss(self, X, y=None, reg=0.0):
        """Compute the softmax cross-entropy loss and parameter gradients.

        Args:
            X: Array of shape (N, input_size), one sample per row.
            y: Integer class labels of shape (N,), each in
                ``[0, output_size)``. When ``None``, only the forward pass
                is run and the raw class scores are returned.
            reg: L2 regularization strength applied to ``W1`` and ``W2``.

        Returns:
            If ``y`` is ``None``: the score matrix of shape (N, output_size).
            Otherwise a tuple ``(loss, grads)`` where ``loss`` is the mean
            cross-entropy over the batch plus ``0.5 * reg * (|W1|^2 + |W2|^2)``
            and ``grads`` maps each key of ``self.params`` to the gradient of
            ``loss`` with respect to that parameter (same shape).
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        num_samples = X.shape[0]

        # Forward pass: affine -> ReLU -> affine.
        hidden = np.maximum(0, X.dot(W1) + b1)
        scores = hidden.dot(W2) + b2

        if y is None:
            return scores

        # Subtract the per-row maximum before exponentiating; softmax is
        # invariant to this shift and it keeps np.exp from overflowing.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        sum_exp_scores = np.sum(exp_scores, axis=1)

        # Cross-entropy for one-hot labels: log(sum_j e^{s_j}) - s_k,
        # averaged over the batch, plus the L2 penalty.
        rows = np.arange(num_samples)
        loss = np.sum(np.log(sum_exp_scores) - shifted[rows, y]) / num_samples
        loss += 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

        # Backward pass.
        # dL/dscores = softmax(scores) - one_hot(y), averaged over the batch.
        d_scores = exp_scores / sum_exp_scores.reshape(-1, 1)
        d_scores[rows, y] -= 1
        d_scores /= num_samples

        # Output layer: scores = hidden.dot(W2) + b2.
        dW2 = hidden.T.dot(d_scores) + reg * W2
        db2 = np.sum(d_scores, axis=0)

        # Hidden layer: ReLU passes gradient only where its output was positive.
        d_hidden = d_scores.dot(W2.T)
        d_hidden[hidden <= 0] = 0

        # Input layer: pre-activation = X.dot(W1) + b1.
        dW1 = X.T.dot(d_hidden) + reg * W1
        db1 = np.sum(d_hidden, axis=0)

        grads = {
            'W1': dW1,
            'W2': dW2,
            'b1': db1,
            'b2': db2,
        }

        return loss, grads

    def train(self, X_train, y_train, X_val, y_val, learning_rate=1e-4, learning_rate_decay=0.95,
              reg=5e-6, num_iters=10000, batch_size=200, verbose=False):
        """Placeholder for the training loop; not implemented yet.

        Currently does nothing and returns ``None``; all arguments are ignored.
        """
        pass

**cross-entropy loss**:

$loss = -\sum_{i}{y_i}\log\frac{e^{\hat{y_i}}}{\sum_{j}e^{\hat{y_j}}} = \sum_{i}{y_i}\left(\log\sum_{j}e^{\hat{y_j}} - \hat{y_i}\right)$

where $i, j \in [0, 9]$

Only the correct label $k$ has $y_k = 1$; every other $y_i = 0$, so:

$loss = \log\sum_{j}e^{\hat{y_j}} - \hat{y_k}$

**d_loss_scores:**

For $\frac{\partial{loss}}{\partial{\hat{y_i}}}$, two possibilities exist:

if $i = k$: $= \frac{\partial{\log\sum_{j}e^{\hat{y_j}}}}{\partial{\hat{y_i}}}
- 1 = \frac{e^{\hat{y_i}}}{\sum_{j}e^{\hat{y_j}}} - 1$



if $i \ne k$: $= \frac{e^{\hat{y_i}}}{\sum_{j}e^{\hat{y_j}}}$
