In [1]:
using Flux, Base
include("preprocessing.jl")

reconstruct_midi_file (generic function with 1 method)

In [2]:
function prepare_data(input_dir::String)
    inputs = []
    outputs = []

    for csv in readdir(input_dir)
        df = CSV.read(joinpath(input_dir, csv), DataFrame)
        if size(df)[1] < 30
            continue
        end
        push!(inputs, df[:, 1:end-1])
        push!(outputs, sum(df[:, end]))
    end
    
    (inputs, outputs)
end

inputs, outputs = prepare_data("assets/anomalous")

(Any[[1m940×4 DataFrame[0m
[1m Row [0m│[1m note  [0m[1m velocity [0m[1m position [0m[1m duration [0m
     │[90m Int64 [0m[90m Int64    [0m[90m Int64    [0m[90m Int64    [0m
─────┼─────────────────────────────────────
   1 │    60        45         0       935
   2 │    33        17         0       935
   3 │    69        41       440       495
   4 │    36        25       440       495
   5 │    40        17       605       330
   6 │    45        33       715       220
   7 │    55        41       825       770
   8 │    45        25       990       605
   9 │    64        41       990       605
  10 │    67        49      1100       495
  11 │    52        25      1100       495
  ⋮  │   ⋮       ⋮         ⋮         ⋮
 931 │    81        45     73480      1148
 932 │    76        33     73480       220
 933 │    95        49     73645       275
 934 │    78        33     73700       220
 935 │    84        21     73920       550
 936 │    93        41     73975    

In [15]:
rows = [size(arr, 1)[1] for arr in inputs]
numCSVs = size(inputs)[1]
SEQ_MIN = minimum(rows)
SEQ_MAX = maximum(rows)

println("Number of CSVs: $numCSVs " * "Min Rows: $SEQ_MIN " * "Max Rows: $SEQ_MAX " * "Number of columns: $(size(inputs[1])[2])")
println("Output dim: $(size(outputs))")

Number of CSVs: 10146 Min Rows: 32 Max Rows: 17225 Number of columns: 4
Output dim: (10146,)


In [13]:
using Flux

function train_model(inputs, outputs, epochs)
    input_size = 4
    output_size = 1
    hidden_size = 128
    seq_length = 100
    num_train = 8000
    num_val = length(inputs) - num_train

    # Preprocess the data
    data = [(inputs[i:i+seq_length-1,:], outputs[i+seq_length]) for i in 1:num_train-seq_length+1]
    data_val = [(inputs[i:i+seq_length-1,:], outputs[i+seq_length]) for i in num_train+1:num_train+num_val-seq_length+1]
    X_train = hcat([data[i][1]' for i in 1:num_train-seq_length+1]...)'
    Y_train = [data[i][2] for i in 1:num_train-seq_length+1]
    X_val = hcat([data_val[i][1]' for i in 1:num_val-seq_length+1]...)'
    Y_val = [data_val[i][2] for i in 1:num_val-seq_length+1]
    X_mean = mean(X_train, dims=1)
    X_std = std(X_train, dims=1)
    X_train = (X_train .- X_mean) ./ X_std
    X_val = (X_val .- X_mean) ./ X_std

    # Define the model
    model = Flux.Chain(
        LSTM(input_size, hidden_size),
        LSTM(hidden_size, hidden_size),
        Dense(hidden_size, output_size),
        sigmoid
    )

    # Define the loss function
    loss(x, y) = mean((model(x) .- y).^2)

    # Define the optimizer
    lr = 0.001
    decay = 0.1^(1/epochs)
    opt = ADAM(lr)

    # Train the model
    for epoch in 1:epochs
        Flux.train!()
        for i in 1:num_val-seq_length+1
            x = X_val[i:i+seq_length-1,:]'
            y = Y_val[i]
            val_loss += loss(x, y)
        end
        @show epoch, val_loss/(num_val-seq_length+1)
        Flux.lrdecay!(opt, decay)
    end
    
    return model
end    


train_model (generic function with 1 method)

In [14]:
train_model(inputs, outputs, 10)

LoadError: BoundsError: attempt to access 10146-element Vector{Any} at index [10147]