In [None]:
using Flux: Chain, Dense, relu, ADAM, mse, gradient, params, Optimise
using Statistics: mean
include("preprocessing.jl")

In [None]:
"""
    prepare_data(input_dir::String)

Read every entry of `input_dir` as a CSV table and turn it into one training sample.

Tables with fewer than 500 rows are skipped. For each remaining table only the
first 500 rows are used: all columns except the last are converted to a
`Matrix{Float32}` input, and the sum of the last column over those rows is the
matching output.

# Returns
A tuple `(inputs, outputs)` where `inputs` is a `Vector{Matrix{Float32}}` and
`outputs` is the adjoint (a `1 × N` row) of the vector of per-file sums. Both are
empty when no table has enough rows.
"""
function prepare_data(input_dir::String)
    nrows = 500
    inputs = Matrix{Float32}[]
    # Element type is deliberately left open: it follows whatever the last CSV
    # column holds, so the summed values are passed through unconverted.
    outputs = []

    # NOTE(review): every directory entry is handed to CSV.read, including any
    # non-CSV file that may live in `input_dir` — confirm this is intended.
    for csv in readdir(input_dir)
        df = CSV.read(joinpath(input_dir, csv), DataFrame)
        size(df, 1) < nrows && continue
        push!(inputs, Matrix{Float32}(df[1:nrows, 1:end-1]))
        push!(outputs, sum(df[1:nrows, end]))
    end

    # `maximum` throws on an empty collection, so the "no usable file" case is
    # handled explicitly and yields empty results instead of an error.
    max_length = isempty(inputs) ? 0 : maximum(size(input, 1) for input in inputs)

    # Zero-pad every input up to the longest one. All inputs are cut to `nrows`
    # rows above, so this currently appends zero rows and only copies the data.
    padded_inputs = Matrix{Float32}[]
    for input in inputs
        rows_to_pad = max_length - size(input, 1)
        push!(padded_inputs, vcat(input, zeros(Float32, rows_to_pad, size(input, 2))))
    end

    return (padded_inputs, outputs')
end

In [None]:
# Load the prepared samples from the anomalous-data directory.
inputs, outputs = prepare_data("assets/anomalous")

# Row count of each sample, used to report the range of sequence lengths.
rows = map(arr -> size(arr, 1), inputs)
numCSVs = length(inputs)
SEQ_MIN, SEQ_MAX = extrema(rows)

# Summary of the loaded data; the column count is taken from the first sample.
println(
    "Number of CSVs: $numCSVs ",
    "Min Rows: $SEQ_MIN ",
    "Max Rows: $SEQ_MAX ",
    "Number of columns: $(size(inputs[1], 2))",
)
println("Output dim: $(size(outputs))")

In [None]:
"""
    train_model(inputs, outputs, epochs)

Train a three-layer dense network (4 → 64 → 16 → 1, `relu` on the hidden layers)
with ADAM (learning rate 0.01) and mean squared error, and return the trained model.

# Arguments
- `inputs`: indexable collection of sample matrices. Each sample is transposed
  before being fed to the model, so it presumably has one row per time step and
  4 feature columns — confirm against the data loader.
- `outputs`: collection of targets, one per sample; `length(outputs)` sets the
  number of batches per epoch.
- `epochs`: number of passes over the data.

# Returns
The trained `Chain`.
"""
function train_model(inputs, outputs, epochs)
    # `using Flux: Chain, ...` imports only the listed names, not the module name
    # `Flux` itself, so the imported names are used unqualified here.
    model = Chain(
        Dense(4, 64, relu),
        Dense(64, 16, relu),
        Dense(16, 1),
    )

    opt = ADAM(0.01)
    loss(x, y) = mse(model(x), y)

    # The parameter collection does not change between steps; build it once.
    ps = params(model)

    batch_size = 1
    num_batches = div(length(outputs), batch_size)

    for epoch in 1:epochs
        epoch_loss = 0.0
        for i in 1:num_batches
            idx = ((i - 1) * batch_size + 1):(i * batch_size)
            # Transpose so that features run along the first dimension.
            x_batch = hcat(inputs[idx]...)'
            y_batch = reshape(outputs[idx], 1, length(outputs[idx]))
            grads = gradient(() -> loss(x_batch, y_batch), ps)
            Optimise.update!(opt, ps, grads)
            # Loss is re-evaluated after the step, so the reported value
            # reflects the updated parameters.
            epoch_loss += loss(x_batch, y_batch)
        end
        @show epoch, epoch_loss / num_batches
    end

    return model
end


In [None]:
# Train on the prepared data for 10 epochs; the call evaluates to the trained model.
train_model(inputs, outputs, 10)