# Neural Network (numpy)

In [None]:
from IPython.display import Image
from utils import *
from YourAnswer import *

%matplotlib inline

## Random Seed
- Fix the seed to constrain the randomness so that the results are reproducible

In [None]:
# Seed NumPy's global random number generator so that the np.random calls
# made later in this notebook produce the same sequence on every run.
seed = 0
np.random.seed(seed)

## Data preprocessing

- We will use the CIFAR-10 dataset. <br>
- Load the data as NumPy arrays

In [None]:
# Load the train/test splits together with the mean image.
# NOTE(review): mean_img is added back to the pixels before plotting further
# down, so the returned images are presumably mean-subtracted - confirm in utils.
X_tr, Y_tr, X_te, Y_te, mean_img = get_CIFAR10_data()

# Report the (data, labels) shapes of each split as a quick sanity check.
train_shapes = (X_tr.shape, Y_tr.shape)
test_shapes = (X_te.shape, Y_te.shape)
print('Train data shape : %s,  Train labels shape : %s' % train_shapes)
print('Test data shape : %s,  Test labels shape : %s' % test_shapes)

## Visualize training images
- Check what the data looks like

In [None]:
# Human-readable class names; the integer labels in Y_tr index into this list.
class_names = ['airplane','automobile','bird','cat','deer',
               'dog','frog','horse','ship','truck']

# Pick 18 random training rows (one per subplot of the 3x6 grid below).
# Flooring a uniform sample from [0, 1) scaled by the number of rows always
# gives a valid index in [0, N-1], whatever the size of the training set.
num_samples = 18
images_index = np.int32(np.floor(np.random.rand(num_samples) * X_tr.shape[0]))

# 3x6 grid of axes without tick marks.
fig, axes = plt.subplots(3, 6, figsize=(18, 6),
                         subplot_kw={'xticks': [], 'yticks': []})

fig.subplots_adjust(hspace=0.3, wspace=0.05)

for ax, idx in zip(axes.flat, images_index):
    # Use the first 3072 values of the row (32*32*3 pixels) and add the mean
    # image back before reshaping to height x width x channels.
    # NOTE(review): if the pixel values are floats in 0-255, imshow clips them
    # to [0, 1]; confirm the value range returned by get_CIFAR10_data.
    img = (X_tr[idx,:3072].reshape(32, 32, 3) + mean_img.reshape(32, 32, 3))
    ax.imshow(img)
    ax.set_title(class_names[Y_tr[idx]])
    

## 1. Softmax function

### To do:
- Implement `softmax` in `YourAnswer.py` file <br> 

<center> $\Large p(y=i \mid \mathbf{x}; \theta_{1}, \dots, \theta_{k}) = \frac{\exp(\theta_{i}^{T}\mathbf{x})}{\sum_{j=1}^{k} \exp(\theta_{j}^{T}\mathbf{x})}$  <br><br>
    where $\mathbf{x}\in \mathbb{R}^{d}$ and $y \in \{1, \dots, k\}$ </center>


- The output should be [2.06115362e-09 1.80485138e-35 9.99999998e-01]
- Sum of the softmax output should be 0.9999999999999999

In [None]:
# One row of very large scores: exp(2080) overflows float64, so this check
# only yields the finite values listed above if softmax is numerically stable.
temp_x = np.array([[2060,2000,2080]])
softmax_result1 = softmax(temp_x)
print('Softmax result :\n',softmax_result1)
# The probabilities of the single row should add up to (approximately) 1.
print ('\nSum of the softmax :',np.sum(softmax_result1))

- The output should be <br>
    [[2.06115362e-09   1.80485138e-35   9.99999998e-01] <br>
     [2.06106005e-09   4.53978686e-05   9.99954600e-01]]
 
- Sum of the softmax output should be [1. 1.]

In [None]:
# Two rows of large scores: softmax is expected to normalise each row separately.
temp_x = np.array([[2060,2000,2080],[1010,1020,1030]])
softmax_result2 = softmax(temp_x)
print('Softmax result :\n',softmax_result2)
# Summing along axis 1 gives one total per row; each should be 1.
print ('\nSum of the softmax :',np.sum(softmax_result2,axis=1))

## 2. Cross-Entropy Loss

### To do:
- Implement `cross_entropy_loss` in `YourAnswer.py` file <br> 
- Here, we will use `temp_score` instead of $h_{\theta}(x)$
- Loss function is composed of data loss and regularization loss
<center> $ L(\theta) = \frac{1}{N} \sum_{i=1}^{N} L_{i}(h_{\theta}(x_{i}), y_{i}) + \lambda R(\theta)$ </center> <br><br>

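- We use the cross-entropy loss, where $y_{i}$ is the one-hot target of sample $i$ and $c$ runs over the classes <br> 
   <center>$L_{i} = - \sum_{c} y_{i,c} \log(h_{\theta}(x_{i})_{c})$ </center>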

In [None]:
# Two small 2x3 weight matrices used as stand-ins for the network weights
# in the loss and output-layer checks below.
temp_theta1 = np.array([[0.1, 0.2, 0.3], [-0.5, 0.3, -0.8]])
temp_theta2 = np.array([[0.9, -0.5, 0.3], [0.9, 0.6, -0.8]])

# Weights keyed by name, in insertion order T1 then T2.
thetas = {'T1': temp_theta1, 'T2': temp_theta2}

# Regularization strength passed to the loss together with `thetas`.
reg_term = 0.001

- The output should be 20.72530583694641
- It should not be NaN

In [None]:
# Edge case: the score at the target class is exactly 0, so a naive log(0)
# would give an infinite/NaN loss instead of the finite value stated above.
temp_score0 = np.array([[0.0, 0.0, 0.0]])
temp_target0 = np.array([[0,1,0]])  # one-hot target: class 1
loss0 = cross_entropy_loss(temp_score0, temp_target0, thetas, reg_term)
print('Total Loss for temp_0 =', loss0)

- The output should be 1.2060128009926026
- It should not be NaN

In [None]:
# Single sample: the score assigned to the target class (index 1) is 0.3.
temp_score1 = np.array([[0.1, 0.3, 0.6]])
temp_target1 = np.array([[0,1,0]])  # one-hot target: class 1
loss1 = cross_entropy_loss(temp_score1, temp_target1 , thetas, reg_term)
print('Total Loss for temp_1 =', loss1)

- The output should be 0.7439146816378243
- It should not be NaN

In [None]:
# Batch of three samples; each row of temp_target2 is the one-hot target of
# the matching row of temp_score2 (classes 1, 2 and 0 respectively).
temp_score2 = np.array([[0.1, 0.3, 0.6],[0.2,0.4,0.4],[0.9,0.05,0.05]])
temp_target2 = np.array([[0,1,0],[0,0,1],[1,0,0]])
# Pass the shared reg_term (0.001), like the other loss checks, instead of
# repeating the literal here.
loss2 = cross_entropy_loss(temp_score2, temp_target2 , thetas, reg_term)
print('Total Loss for temp_2 =', loss2)

## 3. Output Layer

### To do:
- Implement `OutputLayer` in `YourAnswer.py` file 
- Output layer is a layer where the softmax score and loss are computed
- Use Cross Entropy Loss this time
- W in the picture below is the same as $\theta$

In [None]:
# Display the output-layer diagram stored next to the notebook.
Image('fig/Output_Layer.png')

In [None]:
# Output layer built from the example weights and regularization strength
# defined for the cross-entropy checks above.
outputlayer = OutputLayer(thetas, reg_term)

- Forward output should be 13.097100867144416
- Backward output should be [ 0.90887517, -0.99999795,  0.09112277]

In [None]:
# Single sample of raw scores with a one-hot target (class 1).
temp_x1 = np.array([[3, -10, 0.7]])
temp_t1 = np.array([[0,1,0]])
# forward returns the loss; backward() takes no arguments, so it presumably
# relies on values cached by the forward call and must run after it.
output_forward1 = outputlayer.forward(temp_x1, temp_t1)
output_backward1 = outputlayer.backward()
print('Forward propagation of output layer :', output_forward1)
print('Backward propagation of output layer :', output_backward1)

- Forward output should be 7.077588386844261
- Backward output should be [[ 3.02958391e-01, -3.33332649e-01,  3.03742579e-02],
       [-3.32509126e-01,  3.32509088e-01,  3.74189683e-08],
       [ 7.26173786e-04,  2.92959414e-01, -2.93685588e-01]]

In [None]:
# Batch of three samples of raw scores with one-hot targets (classes 1, 0, 2).
temp_x2 = np.array([[3, -10, 0.7],[9,15,-1],[-5,1,-1]])
temp_t2 = np.array([[0,1,0],[1,0,0],[0,0,1]])
# forward must be called before backward: backward() takes no arguments and
# presumably uses what forward stored on the layer.
output_forward2 = outputlayer.forward(temp_x2, temp_t2)
output_backward2 = outputlayer.backward()
print('Forward propagation of output layer :', output_forward2)
print('\nBackward propagation of output layer :', output_backward2)

# 4. ReLU

### To do:
- Implement `ReLU` in `YourAnswer.py` file <br><br> 
- ReLU passes positive values through unchanged and sets non-positive values to zero
- We can mathematically describe it as follows :
<center> $\Large ReLU(x) =  max(0,x)$

In [None]:
# ReLU layer instance shared by the two checks below.
relu = ReLU()

- Forward propagation should be [3.  0.  0.7]
- Backward propagation should be [-10   0   0]

In [None]:
# temp_x1 is the layer input; temp_x2 plays the role of the upstream gradient.
temp_x1 = np.array([[3, -10, 0.7]])
temp_x2 = np.array([[-10,1,0]])
# backward is called after forward on purpose: per the expected output above,
# the gradient is zeroed wherever the forward input was not positive.
relu_forward1 = relu.forward(temp_x1)
relu_backward1 = relu.backward(temp_x2)
print('Forward propagation of ReLU :', relu_forward1)
print('Backward propagation of ReLU :', relu_backward1)

- Forward propagation should be 
            [ 3. ,  0. ,  0.7],
            [ 9. , 15. ,  0. ],
            [ 0. ,  1. ,  0. ]
<br>
- Backward propagation should be
            [  3,   0, -10],
            [  5,  -4,   0],
            [  0,  -5,   0]

In [None]:
# temp_x3 is a 3x3 layer input; temp_x4 plays the role of the upstream gradient.
temp_x3 = np.array([[3, -10, 0.7],[9,15,-1],[-5,1,-1]])
temp_x4 = np.array([[3,5,-10],[5,-4,2],[-3,-5,3]])
# Keep forward before backward: the expected backward output depends on which
# entries of temp_x3 were positive.
relu_forward2 = relu.forward(temp_x3)
relu_backward2 = relu.backward(temp_x4)
print('Forward propagation of ReLU :', relu_forward2)
print('\nBackward propagation of ReLU :', relu_backward2)

# 5. Sigmoid

### To do:
- Implement `Sigmoid` in `YourAnswer.py` file <br><br> 
- Sigmoid is an activation function that maps every input value into the range (0, 1)
- The mathematical description is as follows

<center> $\Large  \sigma(x) =  \frac{1}{1+e^{-x}}$ </center>

In [None]:
# Sigmoid layer instance shared by the two checks below.
sigmoid = Sigmoid()

- Forward propagation output should be [9.52574127e-01 4.53978687e-05 6.68187772e-01]
- Backward propagation output should be [ 0.13552998 -0.00045396  0.15519901]

In [None]:
# The same array is used both as the layer input and as the upstream gradient.
temp_x1 = np.array([[3, -10, 0.7]])
# forward runs first; the expected backward values above equal
# temp_x1 * s * (1 - s), where s is the forward output.
sigmoid_forward1 = sigmoid.forward(temp_x1)
sigmoid_backward1 = sigmoid.backward(temp_x1)
print('Forward propagation of sigmoid :',sigmoid_forward1)
print('Backward propagation of sigmoid :',sigmoid_backward1)

- Forward propagation output should be <br>
   [9.52574127e-01 4.53978687e-05 6.68187772e-01], <br>
   [9.99876605e-01 9.99999694e-01 2.68941421e-01], <br>
   [6.69285092e-03 7.31058579e-01 2.68941421e-01]

- Backward propagation output should be  <br>
 [ 1.35529979e-01 -4.53958077e-04  1.55199011e-01], <br>
 [ 1.11041415e-03  4.58853200e-06 -1.96611933e-01], <br>
 [-3.32402834e-02  1.96611933e-01 -1.96611933e-01]

In [None]:
# 3x3 input, reused as the upstream gradient for the backward pass.
temp_x2 = np.array([[3, -10, 0.7],[9,15,-1],[-5,1,-1]])
# Keep forward before backward: the expected backward values are built from
# the forward output of this same input.
sigmoid_forward2 = sigmoid.forward(temp_x2)
sigmoid_backward2 = sigmoid.backward(temp_x2)
print('Forward propagation of sigmoid :',sigmoid_forward2)
print('\nBackward propagation of sigmoid :',sigmoid_backward2)

# 6. Affine

### To do:
- Implement `Affine` in `YourAnswer.py` file <br><br> 
- Affine layer connects the input and output with weights and bias
- It is also called a dense layer or a linear layer

<center>   Affine$(\theta,b) =\Large X\theta + b$ </center> (each row of $X$ is one sample)

- Forward propagation output should be <br>
[ 0.51 -0.39  0.84] <br>
 [-0.07 -0.02  0.02]
 
- Backward propagation output should be <br>
[-0.61  0.28] <br>
 [-0.25 -0.21]

In [None]:
# Example parameters: 2x3 weights and a 1x3 bias row.
temp_theta = np.array([[0.2, -0.3, 0.6], [-0.9, 0.1, -0.4]])
temp_bias = np.array([[0.2, -0.3, 0.6]])
# Two input samples with two features each, and an upstream gradient with the
# same 2x3 shape as the layer output.
temp_x = np.array([[0.2, -0.3], [-0.9, 0.1]])
temp_gradient = np.array([[0.1, 0.5, -0.8], [0.4, 0.7, -0.2]])

affine = Affine(temp_theta, temp_bias)
# forward must run before backward; the expected backward output above is the
# gradient with respect to the input (same 2x2 shape as temp_x).
affine_forward1 = affine.forward(temp_x)
affine_backward1 = affine.backward(temp_gradient)
print('Forward propagation of Affine :\n', affine_forward1)
print('\nBackward propagation of Affine :\n', affine_backward1)

- dT of affine (the gradient with respect to $\theta$) should be <br>
        [-0.34, -0.53,  0.02],
        [ 0.01, -0.08,  0.22]
<br>
- db of affine should be
        [ 0.5,  1.2, -1. ]

In [None]:
# Read the parameter gradients stored on the layer (presumably filled in by
# the affine.backward call in the previous cell).
dt, db = affine.dT, affine.db
print('Gradient of the thetas :\n', dt)
print('\nGradient of the biases :', db)

# 7. TwoLayerNN

### To do:
- Implement `TwoLayerNet` in `YourAnswer.py` file <br><br> 
- Construct a two layer NN
- You need to implement both the forward pass and the backward pass
- Use the functions you implemented so far
- You can find some help on OrderedDict here (https://pymotw.com/2/collections/ordereddict.html)

### Numerical gradient vs Backpropagation

In [None]:
# Small network and a three-sample batch for comparing the analytic gradient
# with the numerical one.
nb_classes = 10
network = TwoLayerNet(input_size=3072, hidden_size=10, output_size=nb_classes, regularization=100)

x_batch = X_tr[:3]

# Flatten the first three labels and turn them into one-hot rows by picking
# the matching rows of the identity matrix.
targets = Y_tr[:3].reshape(-1)
t_batch = np.eye(nb_classes)[targets]

- grad_backprop should run much faster than grad_numerical

In [None]:
# Time both gradient implementations on the same batch. perf_counter is a
# monotonic, high-resolution clock intended for measuring elapsed intervals,
# so the reading is not disturbed by system clock adjustments.
start_time = time.perf_counter()
grad_backprop = network.gradient(x_batch, t_batch)
print("[grad_backprop] running time(sec) : " + str(time.perf_counter() - start_time))

start_time = time.perf_counter()
grad_numerical = network.numerical_gradient(x_batch, t_batch)
print("[grad_numerical] running time(sec) : " + str(time.perf_counter() - start_time))

In [None]:
# For every parameter, print the mean absolute difference between the
# backprop gradient and the numerical gradient.
for key in grad_numerical:
    abs_err = np.abs(grad_backprop[key] - grad_numerical[key])
    diff = np.mean(abs_err)
    print(key, diff, sep=":")

### Convert the CIFAR-10 labels to one-hot vectors
- The labels are stored as integer class indices (0-9) that stand for class names such as deer, horse and automobile
- We need to convert them to one-hot vectors so that they have the same shape as the network output used in the cross-entropy loss
- A one-hot vector is a vector in which exactly one element is 1 and all the others are 0. <br>

  For example : 
      [0, 0, 1, 0 ,0 ] ( O )
      [0, 1, 0, 1, 0 ] ( X )
  

In [None]:
# One-hot encode the labels: row k of the identity matrix is the one-hot
# vector of class k, so indexing it with the flattened labels yields one
# row per sample.
nb_classes = 10
identity = np.eye(nb_classes)

train_targets, test_targets = Y_tr.reshape(-1), Y_te.reshape(-1)

Y_tr_onehot = identity[train_targets]
Y_te_onehot = identity[test_targets]

The input size is chosen by the size of the picture. (In this case, it is 3x32x32) <br>
The hidden size is the size of the hidden layer and it is a value we can choose arbitrarily <br>
The output size is the number of classes we want to classify. (In this case, it is 10)

In [None]:
# Architecture and regularization strength of the two-layer network.
input_size_=3072
hidden_size_=1024
output_size_=10
regularization_ = 0.0001

network = TwoLayerNet(input_size=input_size_, hidden_size=hidden_size_, output_size=output_size_, regularization = regularization_)

# Training schedule.
iters_num = 2000
train_size = X_tr.shape[0]
batch_size = 100
learning_rate = 0.1

# Histories filled in by the training loop.
train_loss_list_two = []
train_acc_list_two = []
test_acc_list_two = []

# Whole number of mini-batches per epoch. Floor division keeps this an int,
# so the `i % iter_per_epoch == 0` test in the training loop still fires
# when train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

In [None]:
# Steps between two accuracy reports. Truncating to an int keeps the modulo
# test below working even if iter_per_epoch is a non-integral float.
steps_per_epoch = max(int(iter_per_epoch), 1)

for i in range(iters_num):

    # Draw a random mini-batch (np.random.choice samples with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = X_tr[batch_mask]
    t_batch = Y_tr_onehot[batch_mask]

    grad = network.gradient(x_batch, t_batch)

    # Plain gradient-descent step on every parameter.
    for key in ('T1', 'b1', 'T2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Keep the weight dictionary pointing at the updated weights.
    for key in ('T1','T2'):
        network.thetas[key] = network.params[key]

    # Loss on the current batch, measured after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list_two.append(loss)

    # Once per epoch (including step 0), record accuracy on the full sets.
    if i % steps_per_epoch == 0:
        train_acc = network.accuracy(X_tr, Y_tr_onehot)
        test_acc = network.accuracy(X_te, Y_te_onehot)
        train_acc_list_two.append(train_acc)
        test_acc_list_two.append(test_acc)

        print("Epoch : ", i // steps_per_epoch + 1, "Training acc : ", round(train_acc,2), "Test acc : ", round(test_acc,2))

In [None]:
# Plot the train/test accuracies recorded for the two-layer network.
model_plot(train_acc_list_two,test_acc_list_two)

# 8. ThreeLayerNN

### To do:
- Implement `ThreeLayerNet` in `YourAnswer.py` file <br><br> 


In [None]:
# Architecture and regularization strength of the three-layer network.
_input_size=3072
_hidden_size1=1024
_hidden_size2=1024
_output_size=10
_regularization= 0.0001

network = ThreeLayerNet(input_size=_input_size, hidden_size1=_hidden_size1, hidden_size2 = _hidden_size2, output_size = _output_size, regularization = _regularization)

# Training schedule.
iters_num = 2000
train_size = X_tr.shape[0]
batch_size = 100
learning_rate = 0.1

# Histories filled in by the training loop.
train_loss_list_three = []
train_acc_list_three = []
test_acc_list_three = []

# Whole number of mini-batches per epoch. Floor division keeps this an int,
# so the per-epoch test below still fires when train_size is not an exact
# multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)


for i in range(iters_num):

    # Draw a random mini-batch (np.random.choice samples with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = X_tr[batch_mask]
    t_batch = Y_tr_onehot[batch_mask]

    grad = network.gradient(x_batch, t_batch)

    # Plain gradient-descent step on every parameter.
    for key in ('T1', 'b1', 'T2', 'b2', 'T3', 'b3'):
        network.params[key] -= learning_rate * grad[key]

    # Keep the weight dictionary pointing at the updated weights. T3 is
    # included because it is updated above just like T1 and T2.
    # NOTE(review): assumes ThreeLayerNet.thetas is meant to hold T3 - confirm.
    for key in ('T1', 'T2', 'T3'):
        network.thetas[key] = network.params[key]

    # Loss on the current batch, measured after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list_three.append(loss)

    # Once per epoch (including step 0), record accuracy on the full sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(X_tr, Y_tr_onehot)
        test_acc = network.accuracy(X_te, Y_te_onehot)
        train_acc_list_three.append(train_acc)
        test_acc_list_three.append(test_acc)

        print("Epoch : ", i // iter_per_epoch + 1, "Training acc : ", round(train_acc,2), "Test acc : ", round(test_acc,2))
        

In [None]:
# Plot the train/test accuracies recorded for the three-layer network.
model_plot(train_acc_list_three,test_acc_list_three)

# 9. Selecting hyperparameters
- Apply various hyperparameters to the TwoLayerNN and check the results
- Find the hyperparameters that yield the best test accuracy

In [None]:
# Grid of hyperparameters to try for the two-layer network.
hidden_size_arr= [512, 1024, 2048]
regularization_arr = [0.1, 0.01, 0.001]

# Settings shared by every run of the search.
learning_rate = 0.1
iters_num = 2000
_input_size= 3072
_output_size= 10

train_size = X_tr.shape[0]
batch_size = 100

# Whole number of mini-batches per epoch. Floor division keeps this an int,
# so the per-epoch test below still fires when train_size is not an exact
# multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

# best_result = [best test accuracy, its hidden size, its regularization].
best_result = np.zeros((3,))

for _hidden_size in hidden_size_arr:
    for _regularization in regularization_arr:

        print ("\nHidden size : " +str(_hidden_size) + "  Regularization : " + str(_regularization)+"\n")

        # Fresh network for every hyperparameter combination.
        network = TwoLayerNet(input_size=_input_size, hidden_size=_hidden_size, output_size=_output_size, regularization = _regularization)

        for i in range(iters_num):

            # Draw a random mini-batch (np.random.choice samples with replacement).
            batch_mask = np.random.choice(train_size, batch_size)
            x_batch = X_tr[batch_mask]
            t_batch = Y_tr_onehot[batch_mask]

            grad = network.gradient(x_batch, t_batch)

            # Plain gradient-descent step on every parameter.
            for key in ('T1', 'b1', 'T2', 'b2'):
                network.params[key] -= learning_rate * grad[key]

            # Keep the weight dictionary pointing at the updated weights.
            for key in ('T1','T2'):
                network.thetas[key] = network.params[key]

            # Once per epoch (including step 0), evaluate on the full sets and
            # remember the combination with the highest test accuracy so far.
            if i % iter_per_epoch == 0:
                train_acc = network.accuracy(X_tr, Y_tr_onehot)
                test_acc = network.accuracy(X_te, Y_te_onehot)
                print("Epoch : ", i // iter_per_epoch + 1, "Training acc : ", round(train_acc,3), "Test acc : ", round(test_acc,3))
                if test_acc > best_result[0]:
                    best_result[0] = test_acc
                    best_result[1] = _hidden_size
                    best_result[2] = _regularization

In [None]:
# best_result holds [best test accuracy, hidden size, regularization] in a
# float array, so the hidden size is converted back to an int for display.
print ('Best test accuracy : ', best_result[0])
print ('Best hyperparameter : \n Hidden size : ' + str(int(best_result[1])) + '\n Regularization : ' + str(best_result[2]))