In [2]:
# \bar C̄
# \dot Ċ
# \ddot C̈
using Zygote

In [25]:
"""
    MNIST(train_datas, train_labels, test_datas, test_labels)

Container bundling the four arrays of an MNIST-style dataset: the training samples
with their labels and the test samples with their labels.

Any `AbstractArray` is accepted for each field. The concrete array type of every
field is captured in a type parameter, so code that reads the fields stays
type-stable instead of working on abstractly typed values.
"""
struct MNIST{TD<:AbstractArray,TL<:AbstractArray,SD<:AbstractArray,SL<:AbstractArray}
    train_datas::TD
    train_labels::TL
    test_datas::SD
    test_labels::SL
end

"""
    load_mnist(filepath="/Users/yanghanlin/Desktop/julia-2-week/dataset/";
               ntrain=60000, ntest=10000, npixels=28 * 28)

Load the four MNIST files found in directory `filepath` and return them as an `MNIST`.

The files read are `train-labels-idx1-ubyte`, `train-images-idx3-ubyte`,
`t10k-labels-idx1-ubyte` and `t10k-images-idx3-ubyte`. The first 8 bytes of each
label file and the first 16 bytes of each image file are skipped without being
validated. Labels are returned as a `Vector{UInt8}`; images as a
`Vector{Vector{UInt8}}` with `npixels` raw bytes per image.

# Keywords
- `ntrain`: number of training samples to read.
- `ntest`: number of test samples to read.
- `npixels`: number of bytes per image.

# Throws
- `ErrorException` if a file ends before the requested number of bytes was read.
- Whatever `open` throws when a file cannot be opened.
"""
function load_mnist(
    filepath="/Users/yanghanlin/Desktop/julia-2-week/dataset/";
    ntrain::Integer=60000,
    ntest::Integer=10000,
    npixels::Integer=28 * 28,
)
    # Read exactly `count` bytes; a shorter result means the file is truncated.
    function read_exact(io, count, path)
        bytes = read(io, count)
        length(bytes) == count || error(
            "unexpected end of file in $(path): " *
            "wanted $(count) bytes, got $(length(bytes))",
        )
        return bytes
    end

    # Label file layout as consumed here: 8 header bytes, then one byte per label.
    function read_labels(name, count)
        # joinpath works whether or not `filepath` carries a trailing separator.
        path = joinpath(filepath, name)
        return open(path, "r") do io
            skip(io, 8)
            read_exact(io, count, path)
        end
    end

    # Image file layout as consumed here: 16 header bytes, then `npixels` bytes per image.
    function read_images(name, count)
        path = joinpath(filepath, name)
        return open(path, "r") do io
            skip(io, 16)
            Vector{UInt8}[read_exact(io, npixels, path) for _ in 1:count]
        end
    end

    train_labels = read_labels("train-labels-idx1-ubyte", ntrain)
    train_datas = read_images("train-images-idx3-ubyte", ntrain)
    test_labels = read_labels("t10k-labels-idx1-ubyte", ntest)
    test_datas = read_images("t10k-images-idx3-ubyte", ntest)
    return MNIST(train_datas, train_labels, test_datas, test_labels)
end

# Load the dataset once, then print the sizes of the test and training image
# collections back to back on a single line.
mnist = load_mnist()
let shapes = (size(mnist.test_datas), size(mnist.train_datas))
    println(shapes...)
end

(10000,)(60000,)


In [1]:
"""
    sigmoid(𝒛)

Return the logistic function `1 / (1 + exp(-𝒛))` of the scalar `𝒛`.
"""
function sigmoid(𝒛)
    denominator = 1.0 + exp(-𝒛)
    return 1.0 / denominator
end
"""
    softmax(data)

Return the softmax of `data`: `exp.(data)` normalised so that the entries sum to one.

The maximum entry is subtracted before exponentiating. This leaves the result
mathematically unchanged but keeps `exp` from overflowing to `Inf` (which would
otherwise produce `NaN`) for large inputs. The normalising sum is computed once
instead of once per element. Empty input yields an empty result.
"""
function softmax(data)
    # `maximum` is undefined for empty collections; there is nothing to normalise.
    isempty(data) && return exp.(data)
    shifted = exp.(data .- maximum(data))
    return shifted ./ sum(shifted)
end

"""
    Network(weights, biases, activation)

A feed-forward network description: `weights` and `biases` are collections indexed
by layer, and `activation` is the callable applied element-wise after each layer.

The concrete types of all three fields are captured as type parameters so that
field access is type-stable. Any array type and any callable are accepted.
"""
struct Network{W<:AbstractArray,B<:AbstractArray,F}
    weights::W
    biases::B
    activation::F
end

"""
    initnetwork(sizes, activation)

Build a `Network` whose consecutive layer widths are given by `sizes`.

For every adjacent pair `(sizes[k], sizes[k + 1])` a `sizes[k + 1] × sizes[k]`
weight matrix and a bias vector of length `sizes[k + 1]` are drawn with `rand`.
`activation` is stored in the network unchanged.
"""
function initnetwork(sizes::Union{Tuple, AbstractArray}, activation::Any)
    layer_weights = []
    layer_biases = []
    nlayers = length(sizes) - 1
    for layer in 1:nlayers
        fan_in = sizes[layer]
        fan_out = sizes[layer + 1]
        # Weights are drawn before the biases of the same layer.
        push!(layer_weights, rand(fan_out, fan_in))
        push!(layer_biases, rand(fan_out))
    end
    return Network(layer_weights, layer_biases, activation)
end

"""
    loss(net, data, label)

Run `data` through every layer of `net`, apply `softmax` to the final activations
and return the mean squared error against the one-hot encoding of `label`.

Class indices start at 0, so `label == 0` selects the first output entry.
"""
function loss(net::Network, data::AbstractArray, label)
    signal = data
    for layer in 1:length(net.weights)
        signal = net.activation.(net.weights[layer] * signal + net.biases[layer])
    end
    probabilities = softmax(signal)
    # The output width is taken from the last bias vector; classes are numbered from 0
    # to match MNIST digit labels 0-9.
    nclasses = length(net.biases[end])
    target = [class == label ? 1.0 : 0.0 for class in 0:(nclasses - 1)]
    residual = probabilities - target
    return sum(residual .^ 2 / length(probabilities))
end

"""
    batch_loss(net, one_batch_datas, one_batch_labels)

Return the average of `loss(net, sample, label)` over all samples of one batch.
Samples and labels are paired by position.
"""
function batch_loss(net::Network, one_batch_datas::AbstractArray, one_batch_labels::AbstractArray)
    nsamples = length(one_batch_datas)
    # Left-to-right accumulation starting from 0.0.
    summed = foldl(1:nsamples; init=0.0) do acc, idx
        acc + loss(net, one_batch_datas[idx], one_batch_labels[idx])
    end
    return summed / nsamples
end

"""
    shuffle(len, batch_size)

Return a uniformly random permutation of `1:len` as a vector.

`batch_size` is used only for validation: `len` must be divisible by it.

The permutation is produced with a Fisher–Yates shuffle, so every index takes part
regardless of `len`. A fixed number of random swaps would leave most of a long
index list in its original order.

# Throws
- `ErrorException` if `len` is not a multiple of `batch_size`.
"""
function shuffle(len, batch_size)
    len % batch_size != 0 && error("batch_size parameter wrong")
    batches = collect(1:len)
    # Fisher–Yates: position i receives a uniformly chosen element of batches[1:i].
    for i in len:-1:2
        j = rand(1:i)
        batches[i], batches[j] = batches[j], batches[i]
    end
    return batches
end

"""
    update(net, one_batch_datas, one_batch_labels; learning_rate=0.1, ϵ=0.00001)

Perform one gradient-descent step on `net` using the given batch.

The gradient of `batch_loss` with respect to every weight and bias is estimated by
forward finite differences with step `ϵ`. All gradients are evaluated at the same
starting point; afterwards every layer is moved by `-learning_rate * gradient`.
The arrays inside `net.weights` and `net.biases` are modified in place.

Returns `nothing`.
"""
function update(
    net::Network,
    one_batch_datas::AbstractArray,
    one_batch_labels::AbstractArray;
    learning_rate=0.1,
    ϵ=0.00001,
)
    base_loss = batch_loss(net, one_batch_datas, one_batch_labels)

    # Forward-difference estimate of d(batch_loss)/d(entry) for every entry of every
    # array in `params` (which must be `net.weights` or `net.biases`).
    function estimate_gradients(params)
        grads = map(zero, params)
        for layer in eachindex(params), i in eachindex(params[layer])
            original = params[layer][i]
            params[layer][i] = original + ϵ
            perturbed_loss = batch_loss(net, one_batch_datas, one_batch_labels)
            # Restore the saved value; `(x + ϵ) - ϵ` is not guaranteed to equal `x`.
            params[layer][i] = original
            grads[layer][i] = (perturbed_loss - base_loss) / ϵ
        end
        return grads
    end

    weight_grads = estimate_gradients(net.weights)
    bias_grads = estimate_gradients(net.biases)

    # Step against the gradient in every layer, not only the last one.
    for layer in eachindex(net.weights)
        net.weights[layer] .-= learning_rate .* weight_grads[layer]
    end
    for layer in eachindex(net.biases)
        net.biases[layer] .-= learning_rate .* bias_grads[layer]
    end
    return nothing
end

"""
    train(net, datas, labels, mini_batches; epochs=20)

Train `net` in place for `epochs` passes over the data.

In every pass the sample indices are reshuffled with `shuffle`, split into
consecutive groups of `mini_batches` samples, and each group is handed to `update`
together with the labels of the same samples. A progress line is printed after
every group.

# Throws
- `DimensionMismatch` if `datas` and `labels` differ in length.
- `ErrorException` (from `shuffle`) if `length(datas)` is not a multiple of
  `mini_batches`.
"""
function train(
    net::Network,
    datas::AbstractArray,
    labels::AbstractArray,
    mini_batches::Int64;
    epochs::Integer=20,
)
    len = length(datas)
    length(labels) == len || throw(
        DimensionMismatch("got $(len) samples but $(length(labels)) labels"),
    )
    rounds = div(len, mini_batches)
    for sround_index in 1:epochs
        index_batches = shuffle(len, mini_batches)
        for x in 0:(rounds - 1)
            sind = x * mini_batches + 1
            eind = sind + mini_batches - 1
            batch_indices = index_batches[sind:eind]
            one_batch_datas = datas[batch_indices]
            # Labels must come from `labels`, picked with the same shuffled indices.
            one_batch_labels = labels[batch_indices]
            update(net, one_batch_datas, one_batch_labels)
            println("rounds $(sround_index) $(sind)-$(eind) has updated over.")
        end
    end
    return nothing
end

# Build a network with layer widths 784 -> 300 -> 100 -> 10 and sigmoid activations.
network = initnetwork([784, 300, 100, 10], sigmoid)
# Example calls, currently disabled: evaluate the loss on one training sample, and
# train on the full training set with batches of 30 samples.
#loss(network, mnist.train_datas[3], mnist.train_labels[3])
#train(network, mnist.train_datas, mnist.train_labels, 30)

Network(Any[[0.4413361169379262 0.4833344564494011 … 0.5838152403520882 0.7548351085188092; 0.5473357351732486 0.955858461188932 … 0.6868857827205286 0.37147868569241504; … ; 0.31478076520964327 0.4054403837430205 … 0.318559927429068 0.0011768274174845939; 0.0340892763029077 0.41791397359988447 … 0.2994023368598915 0.8922630047572884], [0.8008054347810603 0.5120252114070658 … 0.6915723254969965 0.9731168736749531; 0.4264208394612492 0.45985464075511406 … 0.33061107330112005 0.5116013921459024; … ; 0.08807712279080482 0.844810192903988 … 0.11782260345904172 0.0683175627627366; 0.46409452296463694 0.7612055332747163 … 0.06166663588574761 0.2177929536447527], [0.28352289184533497 0.24801943005848925 … 0.8410612646185245 0.2749781640581823; 0.7186764564600143 0.35769651927479185 … 0.7809215344190703 0.07711913931912528; … ; 0.5117611635749404 0.963042176706915 … 0.959441666647699 0.2567896722288636; 0.03602084467714839 0.1584638189983094 … 0.558497900403804 0.4250423752285326]], Any[[0.770

In [39]:
# Gradient check setup: a small network with layer widths 20 -> 10 -> 5 -> 2, one
# random input vector of length 20 and a fixed label.
net_test = initnetwork([20, 10, 5, 2], sigmoid)
data = rand(20)
label = 1
# Zygote's gradient of `loss`; `[1]` keeps the part belonging to the first argument
# (the network).
zygote_grad = gradient(loss, net_test, data, label)[1]

"""
    my_gradient(loss_function, net, data, label)

Estimate the gradient of `loss_function(net, data, label)` with respect to every
entry of `net.weights` and `net.biases` by forward finite differences with step
`0.00001`.

Returns a tuple `(weights_dx, biases_dx)` whose containers have the same layout as
`net.weights` and `net.biases`. Each parameter is perturbed in place during the
computation and restored to exactly its original value afterwards.
"""
function my_gradient(loss_function, parameters...)
    net, data, label = parameters
    ϵ = 0.00001
    base_loss = loss_function(net, data, label)

    # Forward-difference derivative for every entry of every array in `params`
    # (which must be `net.weights` or `net.biases`).
    function forward_differences(params)
        # The copy only provides a container of matching layout; every entry is
        # overwritten below.
        grads = deepcopy(params)
        for layer in eachindex(params), i in eachindex(params[layer])
            original = params[layer][i]
            params[layer][i] = original + ϵ
            perturbed_loss = loss_function(net, data, label)
            # Restore the saved value; `(x + ϵ) - ϵ` is not guaranteed to equal `x`.
            params[layer][i] = original
            grads[layer][i] = (perturbed_loss - base_loss) / ϵ
        end
        return grads
    end

    weights_dx = forward_differences(net.weights)
    biases_dx = forward_differences(net.biases)
    return (weights_dx, biases_dx)
end

my_grad = my_gradient(loss, net_test, data, label)

# Compare the Zygote weight gradients with the finite-difference ones. Each layer's
# difference is flattened into a column and the columns are stacked, so this works
# for any layer sizes; `diff` ends up as a single column of squared differences.
# NOTE(review): the global name `diff` shadows `Base.diff` in this session; it is
# kept so later cells that read it keep working.
diff = reduce(
    vcat,
    [reshape(z - m, :, 1) for (z, m) in zip(zygote_grad[:weights], my_grad[1])],
) .^ 2
# Largest squared discrepancy between the two gradient estimates.
println(maximum(diff))


4.67606532906383e-15
