In [1]:
using Pkg
Pkg.activate("..")

[32m[1m  Activating[22m[39m project at `~/Library/CloudStorage/OneDrive-Personal/Documents/Studia/Semestr 8/Algorytmy w inżynierii danych/Projekt/KamienMilowy2/KM2_Piotr_Szczerba`


In [3]:
include("../autodiff/graph.jl")
include("../autodiff/forward.jl")
include("../autodiff/backward.jl")
include("../autodiff/operators.jl")
include("../neuralnet/dataloader.jl")

using JLD2
using Random
using Statistics
using Printf

# Read all four arrays with a single `load` call so the prepared IMDB file is
# opened and parsed once instead of once per variable. With several names,
# `load` returns the values as a tuple in the order the names are given.
X_train, y_train, X_test, y_test = load(
    "../data/imdb_dataset_prepared.jld2", "X_train", "y_train", "X_test", "y_test"
)

# Labels are converted to Float32 before they are handed to the data loader.
y_train = Float32.(y_train)
y_test = Float32.(y_test)

# Mini-batch iterator over the training split. The 64 is presumably the batch
# size (it matches the 64-column `x`/`y` placeholders) — confirm in dataloader.jl.
dataset = DataLoader(X_train, y_train, 64, shuffle=true)

# Layer widths; the input width is the first dimension of the training matrix.
input_neurons = first(size(X_train))
hidden_neurons = 32
output_neurons = 1

# Small offset added inside both logarithms to keep their arguments away from zero.
ϵ = Constant(1e-7)

"""
    binary_cross_entropy_loss(y, ŷ)

Return the mean over all elements of `-(y * log(ŷ + ϵ) + (1 - y) * log(1 - ŷ + ϵ))`,
where `y` holds the targets, `ŷ` the predictions, and `ϵ` is the global offset
defined above. All arithmetic is written with broadcasting operators.
"""
function binary_cross_entropy_loss(y, ŷ)
    return mean(
        Constant(-1.0) .* (
            y .* log.(ŷ .+ ϵ) .+
            (Constant(1.0) .- y) .* log.(Constant(1.0) .- ŷ .+ ϵ)
        ),
    )
end

# Trainable parameters of the two-layer network.
# `wh` is drawn from a standard normal scaled by sqrt(2 / input_neurons)
# (He-style scaling); `wo` is an unscaled standard normal; both biases start at zero.
wh = Variable(randn(hidden_neurons, input_neurons) * sqrt(2 / input_neurons), name="wh")
wo = Variable(randn(output_neurons, hidden_neurons), name="wo")
bh = Variable(zeros(hidden_neurons, 1), name="bh")
bo = Variable(zeros(output_neurons, 1), name="bo")

# Placeholders that the training loop overwrites in place with each mini-batch.
# They are defined exactly once, with one column per sample of a 64-sample batch;
# the earlier single-sample definitions were overwritten before ever being used.
y = Variable(zeros(1, 64), name="y")  # max batch size
x = Variable(zeros(input_neurons, 64), name="x")

"""
    dense(w, b, x, activation)

Apply one fully connected layer: multiply the input `x` by the weights `w`,
add the bias `b` with broadcasting, and apply `activation` element-wise.
"""
function dense(w, b, x, activation)
    return activation.(w * x .+ b)
end

"""
    net(x, wh, bh, wo, bo)

Forward pass of the two-layer network: a `relu` dense layer with weights `wh`
and bias `bh`, followed by a `σ` dense layer with weights `wo` and bias `bo`.
The two intermediate results are labelled "x̂" and "ŷ"; the second is returned.
"""
function net(x, wh, bh, wo, bo)
    hidden = dense(wh, bh, x, relu)
    hidden.name = "x̂"

    prediction = dense(wo, bo, hidden, σ)
    prediction.name = "ŷ"

    return prediction
end

"""
    loss(x, y, wh, bh, wo, bo)

Run `net` on `x` with the given parameters and compare its prediction with the
targets `y` using `binary_cross_entropy_loss`. The loss result is labelled
"loss". Returns the tuple `(loss, prediction)`.
"""
function loss(x, y, wh, bh, wo, bo)
    prediction = net(x, wh, bh, wo, bo)
    objective = binary_cross_entropy_loss(y, prediction)
    objective.name = "loss"
    return objective, prediction
end

# Training hyper-parameters: number of passes over the data and the step size
# used in the Adam update.
epochs = 5
lr = 0.001

# Adam bookkeeping, one entry per trainable parameter:
# (first-moment estimate, second-moment estimate, number of updates so far).
# Both moment buffers start as zeros with the same shape as the parameter.
adam_state = Dict{Variable, Tuple{Array, Array, Int}}()
for param in (wh, wo, bh, bo)
    shape = size(param.output)
    adam_state[param] = (zeros(shape), zeros(shape), 0)
end


# Train the hand-wired network for `epochs` passes over `dataset`. After every
# mini-batch the four parameters are updated with Adam; after every epoch the
# elapsed time, mean training loss and training accuracy are printed.
for epoch in 1:epochs
    # Adam hyper-parameters and the parameter list do not change between
    # mini-batches, so they are set up once per epoch instead of once per batch.
    β1 = 0.9
    β2 = 0.999
    ε = 1e-8
    net_params = (wh, wo, bh, bo)

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # The timer has its own name. It must not share a variable with the Adam
    # update counter used inside the block, otherwise one variable alternates
    # between an Int counter and a Float64 duration.
    epoch_time = @elapsed begin
        for (xb, yb) in dataset
            batch_size = size(xb, 2)

            # Copy the batch into the preallocated placeholders.
            # NOTE(review): `.=` requires `xb`/`yb` to match the 64-column
            # placeholders; a shorter final batch would throw — confirm the
            # DataLoader never yields one.
            x.output .= xb
            y.output .= yb

            # Build the graph for this batch, order its nodes, and evaluate it.
            L, ŷ_node = loss(x, y, wh, bh, wo, bo)
            graph = topological_sort(L)
            lval = forward!(graph)

            # Clear the parameter gradients before the backward pass.
            for param in net_params
                param.gradient = nothing
            end
            backward!(graph)

            # ADAM
            for param in net_params
                g = param.gradient
                if size(g) != size(param.output)
                    # The gradient has extra columns compared with the parameter:
                    # collapse the second dimension.
                    # NOTE(review): `mean` rescales by 1/columns compared with a
                    # sum — confirm which one matches how `backward!` reports
                    # gradients of broadcast operands.
                    g = dropdims(mean(g; dims=2); dims=2)
                end

                m, v, n_updates = adam_state[param]
                n_updates += 1

                # Exponential moving averages of the gradient and its square,
                # updated in place inside the stored buffers.
                m .= β1 .* m .+ (1 - β1) .* g
                v .= β2 .* v .+ (1 - β2) .* (g .^ 2)

                # Bias-corrected moment estimates.
                m_hat = m ./ (1 - β1^n_updates)
                v_hat = v ./ (1 - β2^n_updates)

                param.output .-= lr .* m_hat ./ (sqrt.(v_hat) .+ ε)

                adam_state[param] = (m, v, n_updates)
            end

            # Metrics use the prediction computed by the forward pass above,
            # thresholded at 0.5.
            ŷ = ŷ_node.output
            predictions = ŷ .> 0.5
            targets = y.output .> 0.5
            total_correct += count(predictions .== targets)
            total_loss += lval[1] * batch_size
            total_samples += batch_size
        end
    end

    avg_loss = total_loss / total_samples
    avg_acc = total_correct / total_samples

    println(@sprintf("Epoch: %d (%.2fs) \tTrain: (loss: %.4f, acc: %.4f)",
                     epoch, epoch_time, avg_loss, avg_acc))
end

Epoch: 1 (6.10s) 	Train: (loss: 0.5818, acc: 0.8061)
Epoch: 2 (2.25s) 	Train: (loss: 0.3451, acc: 0.9307)
Epoch: 3 (2.95s) 	Train: (loss: 0.2210, acc: 0.9606)
Epoch: 4 (2.37s) 	Train: (loss: 0.1507, acc: 0.9779)
Epoch: 5 (2.28s) 	Train: (loss: 0.1062, acc: 0.9889)


In [64]:
include("../autodiff/flux_like_api.jl")

In [65]:
# Same two-layer architecture as the hand-wired `net`, expressed with the
# `Chain`/`Dense` API: input_neurons -> hidden_neurons with `relu`, then
# hidden_neurons -> output_neurons with `σ`.
model = Chain(
    Dense(input_neurons, hidden_neurons, relu),
    Dense(hidden_neurons, output_neurons, σ)
)

Chain((Dense{typeof(relu)}(Variable(Float32[0.005698695 -0.003132612 … 0.015349576 0.016572604; 0.015555462 -0.013308286 … 0.007920059 0.011096501; … ; -0.015656058 -0.004709352 … -0.0025984442 0.008416636; -0.014263206 0.015561428 … -0.004636731 0.0023076062], nothing, "weight"), Variable([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], nothing, "bias"), Main.relu), Dense{typeof(σ)}(Variable(Float32[-0.19049272 -0.09341225 … -0.22478186 0.08881199], nothing, "weight"), Variable([0.0], nothing, "bias"), Main.σ)))

In [66]:
# Fresh zero-filled placeholders for the targets (1 x 64) and the inputs
# (input_neurons x 64); the training loops copy each mini-batch into them.
y = Variable(zeros(1, 64), name="y")  # max batch size
x = Variable(zeros(input_neurons, 64), name="x")

Variable([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], nothing, "x")

In [46]:
"""
    loss(model, x, y)

Call `model` on `x` and compare its prediction with the targets `y` using
`binary_cross_entropy_loss`. The loss result is labelled "loss".
Returns the tuple `(loss, prediction)`.
"""
function loss(model, x, y)
    prediction = model(x)
    objective = binary_cross_entropy_loss(y, prediction)
    objective.name = "loss"
    return objective, prediction
end

loss (generic function with 2 methods)

In [62]:
# Adam bookkeeping for every parameter reported by `trainable(model)`:
# (first-moment estimate, second-moment estimate, number of updates so far).
# Both moment buffers start as zeros with the same shape as the parameter.
adam_state = Dict{Variable,Tuple{Array,Array,Int}}()
for param in trainable(model)
    shape = size(param.output)
    adam_state[param] = (zeros(shape), zeros(shape), 0)
end

In [63]:
# Sanity check: number of parameters that received an Adam state entry.
length(adam_state)

4

In [None]:
# Train `model` for `epochs` passes over `dataset`. After every mini-batch the
# parameters returned by `trainable(model)` are updated with Adam; after every
# epoch the elapsed time, mean training loss and training accuracy are printed.
for epoch in 1:epochs
    # Adam hyper-parameters do not change between mini-batches, so they are set
    # once per epoch instead of once per batch.
    β1 = 0.9
    β2 = 0.999
    ε = 1e-8

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # The timer has its own name. It must not share a variable with the Adam
    # update counter used inside the block, otherwise one variable alternates
    # between an Int counter and a Float64 duration.
    epoch_time = @elapsed begin
        for (xb, yb) in dataset
            batch_size = size(xb, 2)

            # Copy the batch into the preallocated placeholders.
            # NOTE(review): `.=` requires `xb`/`yb` to match the 64-column
            # placeholders; a shorter final batch would throw — confirm the
            # DataLoader never yields one.
            x.output .= xb
            y.output .= yb

            L, ŷ_node = loss(model, x, y)

            # TODO: Sort the graph only once
            graph = topological_sort(L)

            lval = forward!(graph)

            # Clear the parameter gradients before the backward pass.
            for param in trainable(model)
                param.gradient = nothing
            end
            backward!(graph)

            # ADAM
            for param in trainable(model)
                g = param.gradient
                if size(g) != size(param.output)
                    # The gradient has extra columns compared with the parameter:
                    # collapse the second dimension.
                    # NOTE(review): `mean` rescales by 1/columns compared with a
                    # sum — confirm which one matches how `backward!` reports
                    # gradients of broadcast operands.
                    g = dropdims(mean(g; dims=2); dims=2)
                end

                m, v, n_updates = adam_state[param]
                n_updates += 1

                # Exponential moving averages of the gradient and its square,
                # updated in place inside the stored buffers.
                m .= β1 .* m .+ (1 - β1) .* g
                v .= β2 .* v .+ (1 - β2) .* (g .^ 2)

                # Bias-corrected moment estimates.
                m_hat = m ./ (1 - β1^n_updates)
                v_hat = v ./ (1 - β2^n_updates)

                param.output .-= lr .* m_hat ./ (sqrt.(v_hat) .+ ε)

                adam_state[param] = (m, v, n_updates)
            end

            # Metrics use the prediction computed by the forward pass above,
            # thresholded at 0.5.
            ŷ = ŷ_node.output
            predictions = ŷ .> 0.5
            targets = y.output .> 0.5
            total_correct += count(predictions .== targets)
            total_loss += lval[1] * batch_size
            total_samples += batch_size
        end
    end

    avg_loss = total_loss / total_samples
    avg_acc = total_correct / total_samples

    println(@sprintf("Epoch: %d (%.2fs) \tTrain: (loss: %.4f, acc: %.4f)",
        epoch, epoch_time, avg_loss, avg_acc))

end

Epoch: 1 (2.99s) 	Train: (loss: 0.6494, acc: 0.7895)
Epoch: 2 (3.20s) 	Train: (loss: 0.4663, acc: 0.9114)
Epoch: 3 (3.27s) 	Train: (loss: 0.3023, acc: 0.9403)
Epoch: 4 (3.15s) 	Train: (loss: 0.2062, acc: 0.9626)
Epoch: 5 (3.21s) 	Train: (loss: 0.1475, acc: 0.9756)


In [67]:
# Create the optimiser with its default settings and let `setup` build the
# per-parameter state for `model` (replaces the hand-written dictionary above).
opt = Adam()
adam_state = setup(opt, model)

Dict{Variable, Tuple{Array, Array, Int64}} with 4 entries:
  Variable(Float32[-0.1904… => ([0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0], 0)
  Variable(Float32[0.00569… => ([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 …
  Variable([0.0, 0.0, 0.0,… => ([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
  Variable([0.0], nothing,… => ([0.0], [0.0], 0)

In [69]:
# Train `model` for `epochs` passes over `dataset`, delegating the parameter
# update to `update!(opt, adam_state, model)`, and print the elapsed time, mean
# training loss and training accuracy after every epoch.
for epoch in 1:epochs
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    epoch_seconds = @elapsed for (xb, yb) in dataset
        n_batch = size(xb, 2)

        # Copy the current mini-batch into the shared placeholders.
        x.output .= xb
        y.output .= yb

        objective, pred_node = loss(model, x, y)

        # TODO: Sort the graph only once
        sorted_nodes = topological_sort(objective)
        loss_value = forward!(sorted_nodes)

        # Clear the parameter gradients, backpropagate, then take one step.
        foreach(param -> (param.gradient = nothing), trainable(model))
        backward!(sorted_nodes)
        update!(opt, adam_state, model)

        # A prediction counts as correct when it falls on the same side of 0.5
        # as its target.
        n_correct += count((pred_node.output .> 0.5) .== (y.output .> 0.5))
        loss_sum += loss_value[1] * n_batch
        n_seen += n_batch
    end

    mean_loss = loss_sum / n_seen
    accuracy = n_correct / n_seen

    summary_line = @sprintf("Epoch: %d (%.2fs) \tTrain: (loss: %.4f, acc: %.4f)",
        epoch, epoch_seconds, mean_loss, accuracy)
    println(summary_line)
end

Epoch: 1 (3.83s) 	Train: (loss: 0.6453, acc: 0.7745)
Epoch: 2 (3.20s) 	Train: (loss: 0.4581, acc: 0.9136)
Epoch: 3 (3.51s) 	Train: (loss: 0.2980, acc: 0.9413)
Epoch: 4 (3.30s) 	Train: (loss: 0.2038, acc: 0.9629)
Epoch: 5 (4.21s) 	Train: (loss: 0.1459, acc: 0.9751)
