# Upgrading our MNIST Network

In [25]:
using MLDatasets
train_x, train_y = MNIST.traindata()
test_x,  test_y  = MNIST.testdata();

(images, labels) = (reshape(train_x[:,:,1:1000], (28*28, 1000)), train_y[1:1000])
one_hot_labels = zeros(10,length(labels))
for (i,l) in enumerate(labels)
    one_hot_labels[l+1, i] = 1.0
end
labels = one_hot_labels

test_images = reshape(test_x, (28*28, size(test_x,3)))
test_labels = zeros((10, size(test_x,3)))

for (i,l) in enumerate(test_y)
    test_labels[l+1, i] = 1.0
end

using Random
Random.seed!(1)

tanh2deriv(output) = 1 - output^2

function softmax(x)
    temp = exp.(x)
    return temp ./ sum(temp, dims=2)
end

alpha, iterations, hidden_size = (2, 300, 100)
pixels_per_image, num_labels = (784, 10)
batch_size = 100

weights_0_1 = 0.2 .* rand(pixels_per_image,hidden_size) .- 0.1
weights_1_2 = 0.2 .* rand(hidden_size,num_labels) .- 0.1

for j = 1:iterations
    Error, Correct_cnt = (0.0, 0)
    
    for i = 1:batch_size:size(images, 2)-batch_size
        batch_start, batch_end = i, i+batch_size-1
        layer_0 = images[:, batch_start:batch_end]
        layer_1 = tanh.(layer_0' * weights_0_1)
        dropout_mask = bitrand(size(layer_1))
        layer_1 .*= (dropout_mask .* 2)
        layer_2 = softmax(layer_1 * weights_1_2)
        
        Error += sum((labels[:, batch_start:batch_end]' .- layer_2) .^ 2)
        
        for k=1:batch_size
            Correct_cnt += Int(argmax(layer_2[k, :]) == argmax(labels[:, batch_start+k-1]))
        end
        layer_2_delta = (labels[:, batch_start:batch_end]' .- layer_2) ./ (batch_size * size(layer_2)[1])
        layer_1_delta = (layer_2_delta * weights_1_2') .* tanh2deriv.(layer_1)

        layer_1_delta .*= dropout_mask

        weights_1_2 += alpha .* layer_1' * layer_2_delta
        weights_0_1 += alpha .* layer_0 * layer_1_delta
        
    end
        
    if (j % 10 == 0)
        test_Error, test_Correct_cnt = (0.0, 0)
        for i = 1:size(test_images, 2)
            layer_0 = test_images[:, i]
            layer_1 = tanh.(layer_0' * weights_0_1)
            layer_2 = layer_1 * weights_1_2

            test_Error += sum((test_labels[:, i]' .- layer_2) .^ 2)
            test_Correct_cnt += Int(argmax(layer_2[1,:]) == argmax(test_labels[:, i]))
        end
        println("I: $(j) Train accuracy: $(Correct_cnt/size(images, 2)) Test-Acc:: $(test_Correct_cnt/size(test_images, 2))")
    end
end



I: 10 Train accuracy: 0.432 Test-Acc:: 0.5506
I: 20 Train accuracy: 0.591 Test-Acc:: 0.6536
I: 30 Train accuracy: 0.652 Test-Acc:: 0.706
I: 40 Train accuracy: 0.681 Test-Acc:: 0.7361
I: 50 Train accuracy: 0.705 Test-Acc:: 0.7611
I: 60 Train accuracy: 0.725 Test-Acc:: 0.7816
I: 70 Train accuracy: 0.737 Test-Acc:: 0.7941
I: 80 Train accuracy: 0.743 Test-Acc:: 0.8035
I: 90 Train accuracy: 0.765 Test-Acc:: 0.811
I: 100 Train accuracy: 0.781 Test-Acc:: 0.8164
I: 110 Train accuracy: 0.778 Test-Acc:: 0.8225
I: 120 Train accuracy: 0.787 Test-Acc:: 0.8272
I: 130 Train accuracy: 0.782 Test-Acc:: 0.8314
I: 140 Train accuracy: 0.788 Test-Acc:: 0.834
I: 150 Train accuracy: 0.792 Test-Acc:: 0.8367
I: 160 Train accuracy: 0.802 Test-Acc:: 0.8406
I: 170 Train accuracy: 0.802 Test-Acc:: 0.8435
I: 180 Train accuracy: 0.801 Test-Acc:: 0.8459
I: 190 Train accuracy: 0.811 Test-Acc:: 0.8474
I: 200 Train accuracy: 0.81 Test-Acc:: 0.8481
I: 210 Train accuracy: 0.805 Test-Acc:: 0.8498
I: 220 Train accuracy: 0.8