In [None]:
using MLDatasets
# Load the MNIST training and test splits (image stacks plus digit labels).
train_x, train_y = MNIST.traindata()
test_x, test_y = MNIST.testdata()

# Only the first 1000 training samples are used; images are indexed along dimension 3.
images = train_x[:, :, 1:1000]
labels = train_y[1:1000]

# One-hot encode the training digits: digit d sets row d + 1 of that sample's column.
one_hot_labels = zeros(10, length(labels))
for sample in eachindex(labels)
    one_hot_labels[labels[sample] + 1, sample] = 1.0
end
labels = one_hot_labels

# The test images stay in `test_x` as loaded; no flattened `test_images` copy is built.
# The test digits are one-hot encoded the same way, one column per test image.
test_labels = zeros(10, size(test_x, 3))
for sample in eachindex(test_y)
    test_labels[test_y[sample] + 1, sample] = 1.0
end

using Random
# Seed the global RNG so the `rand`/`bitrand` draws used later in this cell
# (weight initialisation, dropout masks) are reproducible from run to run.
Random.seed!(1)

"""
    tanh2deriv(output)

Return `1 - output^2`, i.e. the derivative of `tanh` expressed in terms of the
already-computed activation `output = tanh(x)`. Written for scalars; broadcast
with `tanh2deriv.(A)` to apply it element-wise.
"""
function tanh2deriv(output)
    squared = output^2
    return 1 - squared
end

"""
    softmax(x)

Softmax along the first dimension of `x`: every column (or the whole vector) is
exponentiated and divided by its own sum, so each column of the result sums to 1.

The per-column maximum is subtracted before exponentiating. This leaves the
result mathematically unchanged but keeps `exp` from overflowing to `Inf`
(and the quotient from becoming `NaN`) when the inputs are large.

Empty input is returned as an empty floating-point array of the same shape.
"""
function softmax(x)
    # `maximum` over an empty slice throws, so handle the degenerate case first.
    isempty(x) && return exp.(x)
    shifted = exp.(x .- maximum(x, dims=1))
    return shifted ./ sum(shifted, dims=1)
end

# Optimisation settings: step size, number of passes over the data, mini-batch size.
alpha = 2
iterations = 300
batch_size = 128

# Problem dimensions: 28×28 inputs (784 pixels) and ten output labels.
pixels_per_image = 784
num_labels = 10
input_rows = 28
input_cols = 28

# Convolution layer: sixteen kernels of 3×3 weights each.
kernel_rows = 3
kernel_cols = 3
num_kernels = 16

# Hidden width = (input_rows - kernel_rows) * (input_cols - kernel_cols) window
# positions, each producing one value per kernel.
num_positions = nothing  # placeholder removed below; see hidden_size
hidden_size = (input_rows - kernel_rows) * (input_cols - kernel_cols) * num_kernels

# Small random initial weights centred on zero: kernels within about ±0.01,
# output weights within about ±0.1. `kernels` is drawn first, then `weights_1_2`.
kernels = rand(num_kernels, kernel_rows * kernel_cols) .* 0.02 .- 0.01
weights_1_2 = rand(num_labels, hidden_size) .* 0.2 .- 0.1

"""
    get_image_section(layer, row_from, row_to, col_from, col_to)

Copy the window `row_from:row_to` × `col_from:col_to` out of every image in
`layer` (images are indexed along the third dimension) and return it as a
4-D array of size `(rows, cols, 1, n_images)`. The singleton third dimension
lets several windows be concatenated along it later.
"""
function get_image_section(layer, row_from, row_to, col_from, col_to)
    height = row_to - row_from + 1
    width = col_to - col_from + 1
    window = layer[row_from:row_to, col_from:col_to, :]
    return reshape(window, height, width, 1, :)
end

"""
    _extract_patches(layer_0, kernel_rows, kernel_cols)

Cut every `kernel_rows × kernel_cols` window out of the image(s) in `layer_0`
(a single 2-D image or a stack indexed along dimension 3) and return
`(flattened_input, num_images)`.

`flattened_input` has one column per (window position, image) pair and
`kernel_rows * kernel_cols` rows. Window start positions run from 1 to
`size - kernel size` on each axis, row-major over (row, column), which is the
position count `hidden_size` is derived from.
"""
function _extract_patches(layer_0, kernel_rows, kernel_cols)
    row_starts = 1:(size(layer_0, 1) - kernel_rows)
    col_starts = 1:(size(layer_0, 2) - kernel_cols)
    num_images = size(layer_0, 3)
    # Preallocate instead of collecting into an untyped vector and splatting into `cat`.
    patches = similar(layer_0, kernel_rows, kernel_cols,
                      length(row_starts) * length(col_starts), num_images)
    position = 0
    for row_start in row_starts, col_start in col_starts
        position += 1
        patches[:, :, position:position, :] = get_image_section(
            layer_0,
            row_start, row_start + kernel_rows - 1,
            col_start, col_start + kernel_cols - 1,
        )
    end
    return reshape(patches, (kernel_rows * kernel_cols, :)), num_images
end

# Train for `iterations` epochs of mini-batch gradient descent, evaluating on the
# full test set after every epoch. `kernels` and `weights_1_2` are updated in place.
for j in 1:iterations
    train_correct_cnt = 0
    train_seen = 0  # samples actually scored this epoch (a trailing partial batch is skipped)

    # `+ 1` so that a final full batch ending exactly on the last sample is not dropped.
    for batch_start in 1:batch_size:(size(images, 3) - batch_size + 1)
        batch_end = batch_start + batch_size - 1
        layer_0 = images[:, :, batch_start:batch_end]
        batch_labels = labels[:, batch_start:batch_end]

        # Forward pass: convolution as one matrix product over all patches.
        flattened_input, num_images = _extract_patches(layer_0, kernel_rows, kernel_cols)
        kernel_output = kernels * flattened_input
        layer_1 = tanh.(reshape(kernel_output, (:, num_images)))

        # Dropout: zero a random half of the units and double the rest.
        dropout_mask = bitrand(size(layer_1))
        layer_1 .*= dropout_mask .* 2
        layer_2 = softmax(weights_1_2 * layer_1)

        for k in 1:batch_size
            train_correct_cnt +=
                argmax(view(layer_2, :, k)) == argmax(view(batch_labels, :, k))
        end
        train_seen += batch_size

        # Backward pass. Deltas are (target - prediction), so adding
        # `alpha * delta * input'` is the descent step for every weight matrix.
        layer_2_delta = (batch_labels .- layer_2) ./ (batch_size * size(layer_2, 2))
        layer_1_delta = (weights_1_2' * layer_2_delta) .* tanh2deriv.(layer_1)
        layer_1_delta .*= dropout_mask

        # The output weights are updated with the layer's *input* (`layer_1`),
        # not with the delta that was back-propagated through them.
        weights_1_2 .+= alpha .* (layer_2_delta * layer_1')

        # Same sign convention as above: the kernel update is added, not subtracted.
        l1d_reshape = reshape(layer_1_delta, size(kernel_output))
        kernels .+= alpha .* (l1d_reshape * flattened_input')
    end

    test_correct_cnt = 0
    for i in 1:size(test_x, 3)
        layer_0 = test_x[:, :, i]
        flattened_input, num_images = _extract_patches(layer_0, kernel_rows, kernel_cols)
        kernel_output = kernels * flattened_input
        # No dropout at evaluation time: all hidden units are used, unscaled.
        layer_1 = tanh.(reshape(kernel_output, (:, num_images)))
        layer_2 = weights_1_2 * layer_1
        # `layer_2` is a 10×1 matrix; `vec` makes `argmax` return a plain integer
        # index that can be compared with the label's index.
        test_correct_cnt += argmax(vec(layer_2)) == argmax(view(test_labels, :, i))
    end

    println("I: $(j) Train accuracy: $(train_correct_cnt/max(train_seen, 1)) Test-Acc:: $(test_correct_cnt/size(test_x, 3))")
end

In [None]:
# Scratch cell: give the hidden-layer delta the same shape as `kernel_output`.
# NOTE(review): relies on `layer_1_delta` and `kernel_output` already existing as globals.
l1d_reshape = reshape(layer_1_delta, size(kernel_output))

In [None]:
# Shape check of the product of the reshaped delta with the transposed patch matrix.
size(l1d_reshape * flattened_input')

In [None]:
# Dimensions of the test image array.
size(test_x)

In [None]:
# Small random 3×3×2 array for experimenting with `reshape`.
a = rand(3,3,2)

In [None]:
# Insert a singleton third dimension; the trailing `:` lets `reshape` infer the
# last dimension (3×3×1×2 for the 3×3×2 array created above).
reshape(a, (3,3,1,:))

In [None]:
# Dimensions of the test image array (repeated check).
size(test_x)

In [None]:
# Rows 1-10 and columns 1-10 of the first 100 training images.
b = get_image_section(images[:,:,1:100], 1,10,1,10)

In [None]:
# Shape check of the kernel product.
size(kernel_output)

In [None]:
# First two entries of `sects`.
# NOTE(review): presumably the patch list built during the forward pass — `sects`
# must exist as a global for this cell to run; confirm.
a = sects[1]
b = sects[2]

In [None]:
# Size of `a` (whatever it was last assigned to in this session).
size(a)

In [None]:
# Size after concatenating every entry of `sects` along dimension 3.
size(cat(sects...,dims=3))

In [None]:
# Concatenate the sections along dimension 3, then flatten to a matrix whose
# column count is the product of dimensions 3 and 4 (row count is inferred).
expanded_input = cat(sects...,dims=3)
flattened_input = reshape(expanded_input, (:, size(expanded_input, 3)*size(expanded_input, 4)))

In [None]:
# Dimensions of the kernel matrix.
size(kernels)

In [None]:
# Apply all kernels to all flattened patches in one matrix product.
kernel_output = kernels * flattened_input

In [None]:
# Regroup the kernel outputs into one column per entry along dimension 4 of
# `expanded_input`, then apply tanh element-wise.
layer_1 = tanh.(reshape(kernel_output, (:, size(expanded_input, 4))))

In [None]:
# Output-layer pre-activations.
layer_2 = weights_1_2 * layer_1

In [None]:
# Apply the dropout mask in place and double the surviving units.
# NOTE(review): `dropout_mask` is only assigned in the following cell, so this
# cell depends on that one having been executed first.
layer_1 .*= dropout_mask .* 2

In [None]:
# Random Boolean mask with the same shape as `layer_1`.
dropout_mask = bitrand(size(layer_1))

In [None]:
# Normalise `layer_2` with `softmax1`.
# NOTE(review): `softmax1` is defined in a later cell; run that cell first.
layer_2 = softmax1(layer_2)

In [None]:
# Sum of the first column of `layer_2`.
layer_2[:,1]|>sum

In [None]:
"""
    softmax1(x)

Softmax along the first dimension of `x`: every column (or the whole vector) is
exponentiated and divided by its own sum, so each column of the result sums to 1.

The per-column maximum is subtracted before exponentiating. This leaves the
result mathematically unchanged but keeps `exp` from overflowing to `Inf`
(and the quotient from becoming `NaN`) when the inputs are large.

Empty input is returned as an empty floating-point array of the same shape.
"""
function softmax1(x)
    # `maximum` over an empty slice throws, so handle the degenerate case first.
    isempty(x) && return exp.(x)
    shifted = exp.(x .- maximum(x, dims=1))
    return shifted ./ sum(shifted, dims=1)
end

In [None]:
# Count how many samples of the current batch are classified correctly.
# (Julia translation of the Python/NumPy snippet that used to live in this cell.)
# Samples are columns here, and both `batch_start` and `k` are 1-based, so sample
# `k` of the batch is column `batch_start + k - 1` of `labels`.
# NOTE(review): `correct_cnt`, `batch_start`, `layer_2` and `labels` must already
# be defined as globals before this cell is run.
for k in 1:batch_size
    labelset = labels[:, batch_start + k - 1]
    _inc = Int(argmax(layer_2[:, k]) == argmax(labelset))
    global correct_cnt += _inc
end

In [None]:
# Shape check: product of `layer_2_delta` with the transposed `layer_1_delta`.
# NOTE(review): both operands are assigned in the cells below, so this cell
# depends on those having been executed first.
size(layer_2_delta * layer_1_delta' )

In [None]:
# Push the output delta back through the output weights (no activation
# derivative applied here); the trailing `;` suppresses the cell output.
layer_1_delta = weights_1_2' * layer_2_delta;

In [None]:
# Output delta for the first 128 samples: (one-hot label - prediction), divided
# by `batch_size` times the number of columns of `layer_2`.
layer_2_delta = (labels[:, 1:128] .- layer_2) ./ (batch_size * size(layer_2, 2))

In [None]:
# Dimensions of the output weight matrix.
size(weights_1_2)