In [40]:
using MIDI, CSV, DataFrames, StatsBase, DataFramesMeta

In [41]:
"""
    find_midi_files(directory::AbstractString) -> Vector{String}

Recursively collect the paths of all MIDI files below `directory`.

An entry counts as a MIDI file when it is not a directory and its name ends with
`.mid` or `.midi` (case-sensitive). Entries are visited in the order returned by
`readdir`, and a subdirectory's matches are inserted at the position of that
subdirectory. Every returned path is prefixed with `directory`.
"""
function find_midi_files(directory::AbstractString)
    midi_files = String[]

    for item in readdir(directory)
        item_path = joinpath(directory, item)

        if isdir(item_path)
            # Grow the result in place instead of re-allocating it with `vcat`
            # for every subdirectory.
            append!(midi_files, find_midi_files(item_path))
        elseif endswith(item, ".mid") || endswith(item, ".midi")
            push!(midi_files, item_path)
        end
    end

    return midi_files
end

find_midi_files (generic function with 1 method)

In [42]:
"""
    parse_midi_files(midi_dir::String, output_dir::String)

Convert every MIDI file found by `find_midi_files(midi_dir)` into a CSV file in
`output_dir`.

Only the second track of each file is read. Each note of that track becomes one
row with the columns `note` (pitch), `velocity` and `time`. The `time` column
holds the note's *duration*; note start positions are not written.

The CSV is named after the MIDI file with its extension replaced by `.csv`.
`output_dir` is created if it does not exist. Files with fewer than two tracks
are skipped with a warning instead of aborting the whole batch.
"""
function parse_midi_files(midi_dir::String, output_dir::String)
    # CSV.write does not create missing parent directories.
    mkpath(output_dir)

    for midi_file in find_midi_files(midi_dir)
        midi_data = load(midi_file)

        # The notes are taken from the second track; a file without one cannot
        # be converted.
        if length(midi_data.tracks) < 2
            @warn "Skipping MIDI file without a second track" midi_file
            continue
        end
        notes = getnotes(midi_data.tracks[2])

        df = DataFrame(
            note=[note.pitch for note in notes],
            velocity=[note.velocity for note in notes],
            time=[note.duration for note in notes],
        )

        # Swap only the real extension: a plain text replacement of ".mid" would
        # turn "x.midi" into "x.csvi" and also rewrite ".mid" inside the stem.
        stem = first(splitext(basename(midi_file)))
        CSV.write(joinpath(output_dir, stem * ".csv"), df)
    end

    return nothing
end


parse_midi_files (generic function with 1 method)

In [43]:
# Convert the MIDI files found under "midiData" into CSV files in "csvData";
# `@time` reports how long the whole batch takes.
@time parse_midi_files("midiData", "csvData")

In [44]:
"""
    reconstruct_midi_file(csv_file::String, midi_file::String)

Rebuild a single-track MIDI file from a note table stored as CSV.

The CSV must provide `note`, `velocity` and `time` columns. Rows are laid out
back to back: each note starts where the previous one ended and lasts `time`
(rounded to the nearest integer). The track is named "reconstructed track" and
written to `midi_file`. The result of `writeMIDIFile` is returned.
"""
function reconstruct_midi_file(csv_file::String, midi_file::String)
    table = CSV.read(csv_file, DataFrame)

    # Place the notes end to end, remembering where the next one has to start.
    sequence = Notes()
    position = 0
    for row in eachrow(table)
        duration = round(Int, row.time)
        push!(sequence, Note(row.note, row.velocity, position, duration))
        position += duration
    end

    # Wrap the notes in a named track.
    melody = MIDITrack()
    addnotes!(melody, sequence)
    addtrackname!(melody, "reconstructed track")

    # Put the track into a fresh MIDI file and write it out.
    midi = MIDIFile()
    push!(midi.tracks, melody)
    return writeMIDIFile(midi_file, midi)
end


reconstruct_midi_file (generic function with 1 method)

In [45]:
# Rebuild a MIDI file from a CSV in the "csvData" directory and write it to
# "reconstructed.mid".
reconstruct_midi_file("csvData/alb_esp1.csv", "reconstructed.mid")

MIDIFile (format=1, tpq=960) with tracks:
 reconstructed track


In [56]:
"""
    add_synthetic_anomalies(csv_file::String, anomaly_percentage::Float64)

Perturb a random subset of the rows of a note CSV and write the result to the
`anomalous` directory.

`anomaly_percentage` is the *fraction* (0 to 1) of rows to select. The first
three columns of the CSV are treated as pitch, velocity and duration, in that
order. For every selected row one deviation per column is drawn from a pool
that mixes non-zero offsets with zeros, so a selected row may end up with
between zero and three changed values. The added `anomalies` column stores how
many non-zero deviations were drawn for each row (0 for untouched rows).

After perturbation the first two columns are clamped to `0:127` and the third
column is kept non-negative.

The output is written to `anomalous/<name>_<anomaly_percentage>.csv`, where
`<name>` is the input file name without its extension. The directory is created
if needed. Returns the value of `CSV.write`.

# Throws
- `ArgumentError` if `anomaly_percentage` is outside `[0, 1]`.
"""
function add_synthetic_anomalies(csv_file::String, anomaly_percentage::Float64)
    0 <= anomaly_percentage <= 1 || throw(
        ArgumentError("anomaly_percentage must be within [0, 1], got $anomaly_percentage"),
    )

    df = CSV.read(csv_file, DataFrame)
    n_rows = size(df, 1)

    # Per-row count of injected deviations; stays 0 for rows that are not selected.
    df.anomalies = zeros(Int64, n_rows)

    num_anomalies = round(Int, n_rows * anomaly_percentage)
    indices = sample(1:n_rows, num_anomalies, replace=false)

    # Deviation pools, built once. The zero padding sets the chance that a
    # selected row keeps a given column unchanged.
    note_pool = vcat(-14:14, zeros(Int, 28))
    velocity_pool = vcat(-40:-20, zeros(Int, 40), 20:40)
    time_pool = vcat(-500:-50, zeros(Int, 900), 50:500)

    for idx in indices
        deviation = (rand(note_pool), rand(velocity_pool), rand(time_pool))
        for (col, delta) in enumerate(deviation)
            df[idx, col] += delta
        end
        df.anomalies[idx] = count(!iszero, deviation)
    end

    # Only the first two columns are limited to 0:127. Clamping the whole table
    # would also cap the third column at 127, although its deviations range up
    # to 500; that column only needs to stay non-negative.
    df[!, 1] = clamp.(df[!, 1], 0, 127)
    df[!, 2] = clamp.(df[!, 2], 0, 127)
    df[!, 3] = max.(df[!, 3], 0)

    # `splitext` removes the extension safely; slicing off the last four bytes
    # throws on names whose last character before ".csv" is multi-byte.
    output_dir = "anomalous"
    mkpath(output_dir)
    stem = first(splitext(basename(csv_file)))
    csv_output_file = joinpath(output_dir, string(stem, "_", anomaly_percentage, ".csv"))

    return CSV.write(csv_output_file, df)
end

ErrorException: syntax: invalid escape sequence

In [55]:
# Create one anomalous copy of every CSV in "csvData" for each anomaly fraction
# from 0.05 to 0.95 in steps of 0.05.
# Entries that are not CSV files (hidden files, subdirectories, ...) are skipped
# so they are never handed to the CSV reader.
csv_files = filter(name -> endswith(name, ".csv"), readdir("csvData"))
for csv_file in csv_files, anomaly_percentage in 0.05:0.05:0.95
    add_synthetic_anomalies(joinpath("csvData", csv_file), anomaly_percentage)
end


InterruptException: InterruptException:

In [48]:
# Report, for every CSV in "anomalous", the share of rows whose `anomalies`
# count is non-zero.
# Entries that are not CSV files are skipped so they are never handed to the
# CSV reader.
csv_files = filter(name -> endswith(name, ".csv"), readdir("anomalous"))
for csv_file in csv_files
    df = CSV.read(joinpath("anomalous", csv_file), DataFrame)

    # Rows with at least one injected deviation.
    non_zero_count = count(!iszero, df.anomalies)

    # Share of such rows among all rows (not a ratio of non-zero to zero rows).
    prop_non_zero = non_zero_count / length(df.anomalies)

    println("Proportion of rows with non-zero anomalies in $csv_file: $prop_non_zero")
end