In [46]:
import mido
import os
import re
from pathlib import Path
import json
import numpy as np

In [47]:
# Root of the extracted MAESTRO MIDI dataset, relative to this notebook.
dataset_folder_path = '../datasets/maestro-v3.0.0-midi'

# Keep only sub-directories whose name is exactly four digits (e.g. '2004').
# fullmatch rejects names that merely *start* with four digits, and the isdir
# check skips stray files that happen to have a four-digit name.
folder_data = os.listdir(dataset_folder_path)
valid_folder_data = [
    folder
    for folder in folder_data
    if re.fullmatch(r'\d{4}', folder)
    and os.path.isdir(os.path.join(dataset_folder_path, folder))
]
print(valid_folder_data)

['2013', '2014', '2015', '2008', '2006', '2009', '2017', '2018', '2011', '2004']


In [48]:
def process_midi_file_json(midi_file_path):
    """Read a MIDI file and return a JSON string describing it.

    The JSON object has two keys:

    * ``"metadata"`` - the file's type, ticks per beat, length and number of
      tracks, as reported by ``mido.MidiFile``.
    * ``"messages"`` - ``str()`` of every message, track after track.

    Args:
        midi_file_path: Path handed to ``mido.MidiFile``.

    Returns:
        The serialized JSON document as a ``str``.
    """
    parsed = mido.MidiFile(midi_file_path)

    summary = {
        "type": parsed.type,
        "ticks_per_beat": parsed.ticks_per_beat,
        "length": parsed.length,
        "track_count": len(parsed.tracks),
    }

    # Flatten all tracks into one list, preserving track order.
    rendered = []
    for track in parsed.tracks:
        for message in track:
            rendered.append(str(message))

    return json.dumps({"metadata": summary, "messages": rendered})

def save_midi_data(midi_data, output_folder, filename):
    """Write ``midi_data`` to ``<output_folder>/<filename>.json``.

    Args:
        midi_data: The document to store. A ``str`` (such as the output of
            ``process_midi_file_json``) is written verbatim. A ``dict`` or
            ``list`` is serialized with ``json.dumps`` so the file is valid
            JSON rather than a Python repr. Anything else is written as
            ``str(midi_data)``.
        output_folder: Destination directory; created (with parents) if it
            does not exist.
        filename: Base name of the output file; ``.json`` is appended.
    """
    # exist_ok=True avoids the race between checking for the folder and
    # creating it, and makes repeated calls safe.
    os.makedirs(output_folder, exist_ok=True)

    if isinstance(midi_data, (dict, list)):
        payload = json.dumps(midi_data)
    else:
        payload = str(midi_data)

    # Pin the encoding so the result does not depend on the platform default.
    with open(f'{output_folder}/{filename}.json', 'w', encoding='utf-8') as f:
        f.write(payload)

def process_midi_folder_json(folder_path, output_folder, extensions=('.mid', '.midi')):
    """Convert every MIDI file in ``folder_path`` to JSON in ``output_folder``.

    Only regular files whose name ends with one of ``extensions``
    (case-insensitive) are processed, so stray entries such as ``.DS_Store``
    or sub-directories are skipped instead of being handed to the MIDI parser.
    Files are visited in sorted order for a reproducible run.

    Args:
        folder_path: Directory to scan (not recursive).
        output_folder: Directory the JSON files are written to.
        extensions: File-name suffixes treated as MIDI files.

    Each output is named after the full source file name, extension included,
    with ``.json`` appended by ``save_midi_data``.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    for entry in sorted(os.listdir(folder_path)):
        midi_file_path = f'{folder_path}/{entry}'
        if not entry.lower().endswith(suffixes):
            continue
        if not os.path.isfile(midi_file_path):
            continue
        midi_data = process_midi_file_json(midi_file_path)
        save_midi_data(midi_data, output_folder, entry)


In [50]:
# Integer code for each MIDI message type that gets its own code; the code is
# the position of the name in the tuple below. Types that are not listed are
# looked up with a default of -1 by process_midi_file_values.
message_type_map = {
    type_name: code
    for code, type_name in enumerate((
        "set_tempo",
        "time_signature",
        "end_of_track",
        "control_change",
        "note_on",
        "program_change",
    ))
}

# Column index of every field inside one encoded row; the index is the
# position of the field name in the tuple below.
array_data_position_map = {
    field_name: column
    for column, field_name in enumerate((
        # File-level values (filled in on the header row only).
        "general_type",
        "ticks_per_beat",
        "file_length",
        "track_count",
        # Code taken from message_type_map.
        "message_type",
        # time_signature fields.
        "numerator",
        "denominator",
        "clocks_per_click",
        "notated_32nd_notes_per_beat",
        # Timing of the message.
        "time",
        # set_tempo field.
        "tempo",
        # Fields carried by non-meta messages.
        "channel",
        "note",
        "velocity",
        "control",
        "value",
        "program",
    ))
}

def process_midi_file_values(midi_file_path, dtype=None):
    """Encode a MIDI file as a list of fixed-width numeric rows.

    The first row is a file header holding the MIDI type, ticks per beat,
    length and track count. Every following row is one message, track after
    track, laid out according to ``array_data_position_map``. Slots that do
    not apply to a row stay at -1.

    * Non-meta messages get their code from ``message_type_map`` (-1 when the
      type is not listed) plus whichever of channel / note / velocity /
      control / value / program the message carries.
    * Of the meta messages only ``time_signature``, ``set_tempo`` and
      ``end_of_track`` are decoded; any other meta message keeps
      ``message_type`` -1.
    * Every row records the message's ``time``, including rows for meta
      messages that are not decoded, so no timing is dropped from the
      sequence.

    Args:
        midi_file_path: Path handed to ``mido.MidiFile``.
        dtype: Optional NumPy dtype for the rows. ``None`` (default) keeps
            whatever ``fixed_array`` returns - integer rows, in which a
            fractional file length would be truncated. Pass e.g.
            ``np.float64`` to keep it.

    Returns:
        A list of 1-D NumPy arrays, header row first.
    """
    # Row width follows the column map instead of a hard-coded 17.
    row_width = max(array_data_position_map.values()) + 1

    # Attributes copied from non-meta messages when present.
    channel_fields = ("channel", "note", "velocity", "control", "value", "program")
    # Attributes copied from each decoded meta message type.
    meta_fields = {
        "time_signature": (
            "numerator",
            "denominator",
            "clocks_per_click",
            "notated_32nd_notes_per_beat",
        ),
        "set_tempo": ("tempo",),
        "end_of_track": (),
    }

    def new_row():
        row = fixed_array(row_width)
        return row if dtype is None else row.astype(dtype)

    def put(row, field, value):
        row[array_data_position_map[field]] = value

    midi_file = mido.MidiFile(midi_file_path)

    header = new_row()
    put(header, "general_type", midi_file.type)
    put(header, "ticks_per_beat", midi_file.ticks_per_beat)
    # NOTE(review): with the default integer rows a non-integer length is
    # truncated on assignment; use dtype=np.float64 to keep the fraction.
    put(header, "file_length", midi_file.length)
    put(header, "track_count", len(midi_file.tracks))

    messages = [header]

    for track in midi_file.tracks:
        for msg in track:
            row = new_row()
            if msg.is_meta:
                if msg.type in meta_fields:
                    put(row, "message_type", message_type_map.get(msg.type, -1))
                    for field in meta_fields[msg.type]:
                        put(row, field, getattr(msg, field))
            else:
                put(row, "message_type", message_type_map.get(msg.type, -1))
                for field in channel_fields:
                    if hasattr(msg, field):
                        put(row, field, getattr(msg, field))
            # Recorded for every message: skipping it for undecoded meta
            # messages would lose their share of the timeline (presumably a
            # delta to the previous message - TODO confirm).
            put(row, "time", msg.time)
            messages.append(row)

    return messages

def fixed_array(size, fill_value=-1, dtype=None):
    """Return a new 1-D NumPy array of ``size`` elements set to ``fill_value``.

    Args:
        size: Number of elements.
        fill_value: Value placed in every slot; -1 by default, which the
            callers in this notebook use as the "not set" marker.
        dtype: Optional NumPy dtype. ``None`` (default) lets NumPy derive it
            from ``fill_value``, i.e. an integer array for the default -1.
            Pass a float dtype when non-integer values must be stored without
            truncation.

    Returns:
        The newly allocated array.
    """
    return np.full(size, fill_value, dtype=dtype)


def save_midi_values(midi_values, output_folder, filename):
    """Save encoded MIDI rows as a NumPy ``.npy`` file.

    Args:
        midi_values: Sequence of rows (for example the list returned by
            ``process_midi_file_values``); converted with ``np.array`` before
            saving.
        output_folder: Destination directory; created (with parents) if it
            does not exist.
        filename: Base name of the output file. ``np.save`` appends ``.npy``
            when the name does not already end with it.
    """
    # exist_ok=True avoids the race between checking for the folder and
    # creating it, and makes repeated calls safe.
    os.makedirs(output_folder, exist_ok=True)
    np.save(f'{output_folder}/{filename}', np.array(midi_values))

def process_midi_folder_values(folder_path, output_folder, extensions=('.mid', '.midi')):
    """Encode every MIDI file in ``folder_path`` and save it to ``output_folder``.

    Only regular files whose name ends with one of ``extensions``
    (case-insensitive) are processed, so stray entries such as ``.DS_Store``
    or sub-directories are skipped instead of being handed to the MIDI parser.
    Files are visited in sorted order for a reproducible run.

    Args:
        folder_path: Directory to scan (not recursive).
        output_folder: Directory the encoded arrays are written to.
        extensions: File-name suffixes treated as MIDI files.

    Each output is named after the full source file name, extension included;
    ``save_midi_values`` stores it as a NumPy file.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    for entry in sorted(os.listdir(folder_path)):
        midi_file_path = f'{folder_path}/{entry}'
        if not entry.lower().endswith(suffixes):
            continue
        if not os.path.isfile(midi_file_path):
            continue
        midi_values = process_midi_file_values(midi_file_path)
        save_midi_values(midi_values, output_folder, entry)

In [None]:
# Spot-check the encoder on a single file. Only a short preview is printed;
# dumping every row would flood the notebook output.
sample_midi_path = f'{dataset_folder_path}/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi'
messages = process_midi_file_values(sample_midi_path)
print(f'{len(messages)} rows (header row first); showing the first 5:')
print(np.array(messages[:5]))

In [26]:
# Process all midi files into JSON

# Create midi output folder if it doesn't exist.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists test.
output_folder = './midi-processed'
os.makedirs(output_folder, exist_ok=True)

# One output sub-folder per dataset folder, mirroring the input layout.
for folder in valid_folder_data:
    print("processing folder: ", folder)
    folder_path = f'{dataset_folder_path}/{folder}'
    process_midi_folder_json(folder_path, f'{output_folder}/{folder}')

processing folder:  2013
processing folder:  2014
processing folder:  2015
processing folder:  2008
processing folder:  2006
processing folder:  2009
processing folder:  2017
processing folder:  2018
processing folder:  2011
processing folder:  2004


In [40]:
# Process all midi files into values

# Create midi output folder if it doesn't exist.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists test.
output_folder = './midi-processed-values'
os.makedirs(output_folder, exist_ok=True)

# One output sub-folder per dataset folder, mirroring the input layout.
for folder in valid_folder_data:
    print("processing folder: ", folder)
    folder_path = f'{dataset_folder_path}/{folder}'
    process_midi_folder_values(folder_path, f'{output_folder}/{folder}')

processing folder:  2013
processing folder:  2014
processing folder:  2015
processing folder:  2008
processing folder:  2006
processing folder:  2009
processing folder:  2017
processing folder:  2018
processing folder:  2011
processing folder:  2004
