In [3]:
import mido
import os
import re
from pathlib import Path
import json

In [4]:
# Root folder of the local maestro-v3.0.0-midi dataset copy.
dataset_folder_path = '/Users/gabriel.alvarado/Repos/thinkai/datasets/maestro-v3.0.0-midi'

# Keep only the entries whose names start with four digits; every other
# entry found in the dataset root is ignored.
folder_data = os.listdir(dataset_folder_path)
valid_folder_data = list(filter(re.compile(r'\d{4}').match, folder_data))
print(valid_folder_data)

['2013', '2014', '2015', '2008', '2006', '2009', '2017', '2018', '2011', '2004']


In [8]:
def process_midi_file_json(midi_file_path):
    """Load a MIDI file and serialize it to a JSON string.

    Args:
        midi_file_path: Path handed to ``mido.MidiFile``.

    Returns:
        A JSON string with two keys:
        ``"metadata"`` (the file's ``type``, ``ticks_per_beat``, ``length``
        and ``track_count``) and ``"messages"`` (``str()`` of every message,
        with all tracks concatenated in track order).
    """
    parsed = mido.MidiFile(midi_file_path)

    metadata = {
        "type": parsed.type,
        "ticks_per_beat": parsed.ticks_per_beat,
        "length": parsed.length,
        "track_count": len(parsed.tracks),
    }

    # Flatten all tracks into a single list of message strings.
    message_strings = []
    for track in parsed.tracks:
        message_strings.extend(str(message) for message in track)

    return json.dumps({"metadata": metadata, "messages": message_strings})

def save_midi_data(midi_data, output_folder, filename):
    """Write ``midi_data`` to ``<output_folder>/<filename>.json``.

    Args:
        midi_data: Content to store; it is converted with ``str()`` before
            being written.
        output_folder: Destination folder, created (including parents) when
            it does not exist yet.
        filename: Base name of the output file; ``.json`` is appended to it.
    """
    destination = f'{output_folder}/{filename}.json'

    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)

    with open(destination, 'w') as out_file:
        out_file.write(str(midi_data))

def process_midi_folder_json(folder_path, output_folder):
    """Convert every entry of ``folder_path`` to JSON and save it.

    Each name returned by ``os.listdir(folder_path)`` is passed to
    ``process_midi_file_json`` and the result is stored with
    ``save_midi_data`` under the same name.

    Args:
        folder_path: Folder whose entries are processed.
        output_folder: Folder handed to ``save_midi_data`` for the results.
    """
    for entry_name in os.listdir(folder_path):
        serialized = process_midi_file_json(f'{folder_path}/{entry_name}')
        save_midi_data(serialized, output_folder, entry_name)


In [44]:
# Integer code for each encoded MIDI message type; a name's code is its
# position in the tuple below.
message_type_map = {
    type_name: type_code
    for type_code, type_name in enumerate((
        "set_tempo",
        "time_signature",
        "end_of_track",
        "control_change",
        "note_on",
        "program_change",
    ))
}

def process_midi_file_values(midi_file_path):
    """Encode a MIDI file as file-level metadata plus per-message value lists.

    Args:
        midi_file_path: Path handed to ``mido.MidiFile``.

    Returns:
        A tuple ``(global_metadata, messages)`` where

        * ``global_metadata`` is ``[type, ticks_per_beat, length, track_count]``
        * ``messages`` is a list with one list of values per encoded message,
          all tracks concatenated in track order. Every list starts with the
          message's code from ``message_type_map`` (``-1`` for non-meta types
          missing from the map) and ends with ``msg.time``:

          - non-meta message: code, then whichever of ``channel``, ``note``,
            ``velocity``, ``control``, ``value``, ``program`` the message has
            (in that order), then time
          - ``time_signature``: code, numerator, denominator,
            clocks_per_click, notated_32nd_notes_per_beat, time
          - ``set_tempo``: code, tempo, time
          - ``end_of_track``: code, time

        Any other meta message (e.g. ``track_name``, ``key_signature``) is
        skipped and contributes nothing to ``messages``.
    """
    midi_file = mido.MidiFile(midi_file_path)

    global_metadata = [
        midi_file.type,
        midi_file.ticks_per_beat,
        midi_file.length,
        len(midi_file.tracks),
    ]

    # Optional attributes of non-meta messages, in the order they are emitted.
    optional_fields = ('channel', 'note', 'velocity', 'control', 'value', 'program')

    messages = []
    for track in midi_file.tracks:
        for msg in track:
            # Reset for every message. Without this, an unhandled meta message
            # would re-append the previous message's values, or raise
            # UnboundLocalError when it is the very first message.
            message_values = None

            if not msg.is_meta:
                message_values = [message_type_map.get(msg.type, -1)]
                message_values.extend(
                    getattr(msg, field)
                    for field in optional_fields
                    if hasattr(msg, field)
                )
                message_values.append(msg.time)
            elif msg.type == 'time_signature':
                message_values = [
                    message_type_map.get("time_signature", -1),
                    msg.numerator,
                    msg.denominator,
                    msg.clocks_per_click,
                    msg.notated_32nd_notes_per_beat,
                    msg.time,
                ]
            elif msg.type == 'set_tempo':
                message_values = [
                    message_type_map.get("set_tempo", -1),
                    msg.tempo,
                    msg.time,
                ]
            elif msg.type == 'end_of_track':
                message_values = [
                    message_type_map.get("end_of_track", -1),
                    msg.time,
                ]
            # Every other meta message type is intentionally left unencoded.

            if message_values is not None:
                messages.append(message_values)

    return global_metadata, messages


def save_midi_values(midi_values, output_folder, filename):
    """Write ``midi_values`` as JSON to ``<output_folder>/<filename>``.

    Args:
        midi_values: JSON-serializable data to store.
        output_folder: Destination folder, created (including parents) when
            it does not exist yet.
        filename: Name of the output file, used as-is (no extension is
            appended).
    """
    destination = f'{output_folder}/{filename}'

    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)

    with open(destination, 'w') as out_file:
        out_file.write(json.dumps(midi_values))

def process_midi_folder_values(folder_path, output_folder):
    """Encode every entry of ``folder_path`` as values and save it.

    Each name returned by ``os.listdir(folder_path)`` is passed to
    ``process_midi_file_values`` and the result is stored with
    ``save_midi_values`` under the same name.

    Args:
        folder_path: Folder whose entries are processed.
        output_folder: Folder handed to ``save_midi_values`` for the results.
    """
    for entry_name in os.listdir(folder_path):
        encoded = process_midi_file_values(f'{folder_path}/{entry_name}')
        save_midi_values(encoded, output_folder, entry_name)

In [45]:
# Try the value encoder on a single file from the 2004 folder.
# Despite its name, `messages` receives the whole return value of
# process_midi_file_values, i.e. the (global_metadata, messages) tuple.
# NOTE(review): the saved output of this cell is an AttributeError about a
# `metadata` attribute, which the current process_midi_file_values never
# reads - the output looks stale; re-run the cell to confirm.
messages = process_midi_file_values(f'{dataset_folder_path}/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi')
print(messages)

AttributeError: 'MidiFile' object has no attribute 'metadata'

In [None]:
# Process all MIDI files into JSON: every folder listed in `valid_folder_data`
# is converted with process_midi_folder_json and written to a same-named
# subfolder of `output_folder`.

# Create the root output folder (relative to the current working directory)
# if it doesn't exist yet.
output_folder = './midi-processed'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for folder in valid_folder_data:
    # Progress log, one line per processed folder.
    print("processing folder: ", folder)
    folder_path = f'{dataset_folder_path}/{folder}'
    process_midi_folder_json(folder_path, f'{output_folder}/{folder}')

In [41]:
# Process all MIDI files into value lists: every folder listed in
# `valid_folder_data` is converted with process_midi_folder_values and written
# to a same-named subfolder of `output_folder`.

# Create the root output folder (relative to the current working directory)
# if it doesn't exist yet.
output_folder = './midi-processed-values'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for folder in valid_folder_data:
    # Progress log, one line per processed folder.
    print("processing folder: ", folder)
    folder_path = f'{dataset_folder_path}/{folder}'
    process_midi_folder_values(folder_path, f'{output_folder}/{folder}')

processing folder:  2013
processing folder:  2014
processing folder:  2015
processing folder:  2008
processing folder:  2006
processing folder:  2009
processing folder:  2017
processing folder:  2018
processing folder:  2011
processing folder:  2004
