In [1]:
using MIDI, CSV, DataFrames, StatsBase, DataFramesMeta

In [41]:
function find_midi_files(directory::String)
    # Get a list of all the files and directories in the directory
    items = readdir(directory)

    # Initialize an empty vector to store the MIDI files
    midi_files = Vector{String}()

    # Loop through each item in the directory
    for item in items
        # Get the full path to the item
        item_path = joinpath(directory, item)

        # If the item is a directory, recursively search it for MIDI files
        if isdir(item_path)
            midi_files = vcat(midi_files, find_midi_files(item_path))
        # If the item is a MIDI file, add it to the list
        elseif endswith(item, ".mid") || endswith(item, ".midi")
            push!(midi_files, item_path)
        end
    end

    # Return the list of MIDI files
    return midi_files
end

find_midi_files (generic function with 1 method)

In [42]:
function parse_midi_file(midi_file::String, output_dir::String)
    if midi_file[end] == '/'
        midi_file = midi_file[1:end-1]
    end

    # Parse the MIDI file
    midi_data = load(midi_file)

    # Initialize empty vectors to store the parsed data
    noteNames = []
    velocities = []
    positions = []
    durations = []

    for track in midi_data.tracks
        # Loop through the MIDI events and extract the relevant data
        notes = getnotes(track)
        for note in notes
            push!(noteNames, note.pitch)
            push!(velocities, note.velocity)
            push!(positions, note.position)
            push!(durations, note.duration)
        end
    end

    # Construct a DataFrame from the parsed data
    df = DataFrame(
        note=noteNames,
        velocity=velocities,
        position=positions,
        duration=durations
    )

    # Write the DataFrame to a CSV file
    csvName = replace(basename(midi_file)[1:end-4], "." => "_") * ".csv"
    csv_file = joinpath(output_dir, csvName)
    CSV.write(csv_file, df)
end

function parse_midi_files(midi_dir::String, output_dir::String)
    # Get a list of all the MIDI files in the directory
    midi_files = find_midi_files(midi_dir)
    midi_files = [replace(file, "\\" => "/") for file in midi_files]
    # Loop through the MIDI files and parse them
    for file in midi_files
        try
            parse_midi_file(file, output_dir)

        catch e
            println(e, file)
        end
    end
end

parse_midi_files (generic function with 1 method)

In [43]:
@time parse_midi_files("midiData", "csvData")

In [44]:
function reconstruct_midi_file(csv_file::String, midi_file::String)
    # Read the CSV file into a DataFrame
    df = CSV.read(csv_file, DataFrame)

    # Convert the parsed data into Note objects
    notes = Notes()
    for i in 1:size(df, 1)
        # Calculate the time since the last MIDI event

        # Create a Note object for the note-on event
        note = Note(df.note[i], df.velocity[i], df.position[i], df.duration[i])
        push!(notes, note)

        # Update the time counter
    end

    # Create a MIDI track from the Note objects
    track = MIDITrack()
    addnotes!(track, notes)
    addtrackname!(track, "reconstructed track")

    # Create a MIDI file from the track
    file = MIDIFile()
    push!(file.tracks, track)
    writeMIDIFile(midi_file, file)
end

reconstruct_midi_file (generic function with 1 method)

In [45]:
reconstruct_midi_file("csvData/alb_esp1.csv", "reconstructed.mid")

MIDIFile (format=1, tpq=960) with tracks:
 reconstructed track


In [12]:
function add_synthetic_anomalies(csv_file::String, anomaly_percentage::Float64)
    # Load the CSV file into a DataFrame
    df = CSV.read(csv_file, DataFrame)
    # Add a column to track anomalies
    df.anomalies = zeros(Int64, size(df, 1))
    
    # Generate a matrix of deviations for each row
    num_anomalies = round(Int, size(df, 1) * anomaly_percentage)
    deviations = zeros(size(df))

    indices = sample(1:size(df, 1), num_anomalies, replace=false)

    # Generate random values for each column with the specified probability distributions
    for idx in indices
        deviations[idx, 1] = rand(vcat(-14:14, zeros(28)))
        deviations[idx, 2] = rand(vcat(-40:-20, zeros(40), 20:40))
        deviations[idx, 3] = rand(vcat(-300:-50, zeros(900), 50:300))
        deviations[idx, 4] = rand(vcat(-100:-50, zeros(1000), 50:1000))
    end

    # Apply the deviations to the selected rows using broadcasting
    df[indices, :] .= df[indices, :] .+ deviations[indices, :]

    # Use the map function to count non-zero elements in each row of the selected indices of deviations
    non_zero_counts = [length(findall(!iszero, row)) for row in eachrow(deviations[indices, :])]


    # Assign the non_zero_counts to the anomalies column of the df DataFrame at the specified indices

    df.anomalies[indices] .= non_zero_counts    
    # Clamp the values between 0 and 127
    df[!, Not([:position, :duration])] .= clamp.(df[!, Not([:position, :duration])], 0, 127)

    # Write the DataFrame to the anomalous directory
    csv_file_name = splitdir(csv_file)[end][1:end-4]
    csv_output_file = joinpath("assets/anomalous", "$csv_file_name" * "_" * "$anomaly_percentage.csv")
    display(df)

    CSV.write(csv_output_file, df)
end

add_synthetic_anomalies (generic function with 1 method)

In [13]:
csv_files = readdir("assets/csvData")
for csv_file in csv_files
    for anomaly_percentage in range(0.05; stop=0.95, step=0.05)
        add_synthetic_anomalies(joinpath("assets/csvData", csv_file), anomaly_percentage)
    end
end

Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,55,41,825,770,0
8,45,25,990,605,0
9,64,41,990,605,0
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,55,41,825,770,0
8,45,25,990,605,0
9,64,41,990,605,0
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,66,82,0,935,2
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,55,41,825,770,0
8,45,25,990,605,0
9,64,41,990,605,0
10,67,49,1322,495,1


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,45,4,825,770,2
8,45,25,990,1436,1
9,64,41,990,605,0
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,74,24,69,935,3
2,33,17,0,935,0
3,69,41,440,495,0
4,30,2,310,570,4
5,40,17,605,330,0
6,45,33,715,220,0
7,55,41,825,770,0
8,45,58,990,1096,2
9,64,41,990,605,0
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,55,41,825,770,0
8,45,25,990,605,0
9,64,41,1043,605,1
10,67,49,1100,900,1


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,74,45,-144,1073,3
2,19,52,0,935,2
3,69,61,694,1495,3
4,36,25,440,495,0
5,49,17,605,586,2
6,45,33,715,220,0
7,55,41,825,770,0
8,48,25,934,605,2
9,64,41,990,605,0
10,71,49,878,495,2


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,55,41,825,770,0
8,45,25,990,605,0
9,64,8,990,605,1
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,70,69,208,1382,4
2,24,17,110,935,2
3,69,41,440,495,0
4,36,25,440,495,0
5,47,17,605,1204,2
6,38,60,715,220,2
7,55,41,825,770,0
8,33,62,990,724,3
9,65,41,990,605,1
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,41,440,495,0
4,36,25,440,495,0
5,40,53,605,330,1
6,37,0,715,425,3
7,55,41,825,770,0
8,45,25,990,605,0
9,64,41,990,605,0
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,47,0,1811,2
3,74,41,572,495,2
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,715,220,0
7,55,64,825,685,2
8,45,25,812,605,1
9,72,41,763,605,2
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,1411,1
2,33,17,0,935,0
3,69,41,572,1306,2
4,36,25,440,495,0
5,40,17,605,330,0
6,45,69,715,220,1
7,55,19,825,1172,2
8,45,25,990,605,0
9,50,79,990,605,2
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,17,0,935,0
3,69,20,440,495,1
4,36,25,440,495,0
5,40,17,605,330,0
6,41,33,452,288,3
7,55,41,825,770,0
8,45,48,1248,1439,3
9,64,41,990,605,0
10,67,19,1358,495,2


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,56,13,225,1716,4
2,33,17,0,1775,1
3,67,76,506,495,3
4,36,61,440,1183,2
5,40,17,605,330,0
6,43,10,715,220,2
7,55,41,825,770,0
8,54,25,990,605,1
9,64,17,990,543,2
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,0,935,0
2,33,0,0,935,1
3,69,41,440,495,0
4,36,25,440,495,0
5,40,17,605,330,0
6,37,66,997,1007,4
7,60,73,825,1043,3
8,47,25,990,605,1
9,57,41,990,1104,2
10,67,80,1100,495,1


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,45,108,935,1
2,33,17,0,935,0
3,69,41,440,1011,1
4,36,52,440,495,1
5,38,44,605,244,3
6,41,73,630,220,3
7,68,73,947,1043,4
8,54,25,990,605,1
9,76,9,808,605,3
10,72,49,1179,662,3


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,69,45,0,1477,2
2,29,17,-191,1004,3
3,71,41,440,743,2
4,47,51,440,1283,3
5,30,40,605,330,2
6,45,33,835,220,1
7,55,41,825,770,0
8,45,25,990,605,0
9,70,41,990,605,1
10,67,49,1100,495,0


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,17,0,1111,2
2,41,17,0,1839,2
3,69,11,440,495,1
4,36,25,440,495,0
5,40,17,605,330,0
6,45,33,997,1075,2
7,55,66,825,770,1
8,40,49,990,551,3
9,54,41,1067,605,2
10,67,73,1347,495,2


Row,note,velocity,position,duration,anomalies
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64
1,60,65,149,1266,3
2,33,43,0,935,1
3,69,11,706,952,3
4,32,25,440,495,1
5,26,0,605,330,2
6,45,33,440,220,1
7,55,11,825,1016,2
8,36,25,990,811,2
9,70,6,990,605,2
10,65,87,1325,495,3


In [48]:
csv_files = readdir("anomalous")
for csv_file in csv_files
    df = CSV.read(joinpath("anomalous", csv_file), DataFrame)
    # count the number of non-zero values
    non_zero_count = count(!iszero, df.anomalies)
    
    # calculate the proportion of non-zero values to zero values
    prop_non_zero = non_zero_count / length(df.anomalies)

    # print the result
    println("Proportion of non-zero values to zero values $csv_file: $prop_non_zero")
end