In [46]:
import mido
import os
import re
from pathlib import Path
import json
import numpy as np

In [30]:
dataset_folder_path_midi = '../datasets/maestro-v3.0.0-midi'
dataset_folder_path_spec = './spectrograms'
folder_data = os.listdir(dataset_folder_path_midi)
sample_midi = f'{dataset_folder_path_midi}/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi'
valid_folder_data = [folder for folder in folder_data if re.match(r'\d{4}', folder)]


In [33]:
from PIL import Image
import numpy as np
import torch

def grab_spectrogram_from_midi(midi_path):
    spectrogram_file_path = f'{dataset_folder_path_spec}/{midi_path.split("/")[-2]}/{midi_path.split("/")[-1].replace(".midi", ".png")}'
    image = Image.open(spectrogram_file_path).convert('L')
    image = np.array(image) / 255.0  
    image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)
    return image


In [31]:
message_type_map = {
    "set_tempo": 0,
    "time_signature": 1,
    "end_of_track": 2,
    "control_change": 3,
    "note_on": 4,
    "program_change": 5,
}

array_data_position_map = {
    "general_type": 0,
    "ticks_per_beat": 1,
    "file_length": 2,
    "track_count": 3,
    "message_type": 4,
    "numerator": 5,
    "denominator": 6,
    "clocks_per_click": 7,
    "notated_32nd_notes_per_beat": 8,
    "time": 9,
    "tempo": 10,
    "channel": 11,
    "note": 12,
    "velocity": 13,
    "control": 14,
    "value": 15,
    "program": 16,
}

In [36]:
def process_midi_file_values(midi_file_path):
    messages = []

    midi_file = mido.MidiFile(midi_file_path)
    matching_spectrogram = grab_spectrogram_from_midi(midi_file_path)

    time_steps = matching_spectrogram.shape[1]
    total_duration = midi_file.length
    ticks_per_beat = midi_file.ticks_per_beat

    midi_representation = np.full((time_steps, 17), -1, np.float32)

    global_metadata_array =  np.full((time_steps, 17), -1, np.float32)
    global_metadata_array[array_data_position_map["general_type"]] = midi_file.type
    global_metadata_array[array_data_position_map["ticks_per_beat"]] = midi_file.ticks_per_beat
    global_metadata_array[array_data_position_map["file_length"]] = midi_file.length
    global_metadata_array[array_data_position_map["track_count"]] = len(midi_file.tracks)

    messages.append(global_metadata_array)

    # Initialize absolute time
    absolute_time_seconds = 0
    tempo = 500000  # Default tempo (microseconds per beat)

    for track in midi_file.tracks:
        for msg in track:
            # Handle tempo changes
            if msg.type == 'set_tempo':
                tempo = msg.tempo

            # Convert absolute time to seconds
            absolute_time_seconds += mido.tick2second(msg.time, ticks_per_beat, tempo)

            # Map message to time step
            time_step = int((absolute_time_seconds / total_duration) * time_steps)
            if time_step >= time_steps:
                continue  # Skip messages beyond the spectrogram duration

            # Create message array
            message_array = np.full(17, -1, dtype=np.float32)

            if msg.is_meta:
                if msg.type == 'time_signature':
                    message_array[array_data_position_map["message_type"]] = message_type_map["time_signature"]
                    message_array[array_data_position_map["numerator"]] = msg.numerator
                    message_array[array_data_position_map["denominator"]] = msg.denominator
                elif msg.type == 'set_tempo':
                    message_array[array_data_position_map["message_type"]] = message_type_map["set_tempo"]
                    message_array[array_data_position_map["tempo"]] = msg.tempo
                elif msg.type == 'end_of_track':
                    message_array[array_data_position_map["message_type"]] = message_type_map["end_of_track"]
            else:
                message_array[array_data_position_map["message_type"]] = message_type_map.get(msg.type, -1)
                if hasattr(msg, 'channel'):
                    message_array[array_data_position_map["channel"]] = msg.channel
                if hasattr(msg, 'note'):
                    message_array[array_data_position_map["note"]] = msg.note / 127.0  # Normalize note
                if hasattr(msg, 'velocity'):
                    message_array[array_data_position_map["velocity"]] = msg.velocity / 127.0  # Normalize velocity
                if hasattr(msg, 'control'):
                    message_array[array_data_position_map["control"]] = msg.control / 127.0  # Normalize control
                if hasattr(msg, 'value'):
                    message_array[array_data_position_map["value"]] = msg.value / 127.0  # Normalize value
                if hasattr(msg, 'program'):
                    message_array[array_data_position_map["program"]] = msg.program / 127.0  # Normalize program
            messages.append(message_array)

            # Update MIDI representation
            midi_representation[time_step] = np.maximum(
                midi_representation[time_step], message_array
            )
    return midi_representation

def save_midi_values(midi_values, output_folder, filename):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    np.save(f'{output_folder}/{filename}', np.array(midi_values))

def process_midi_folder_values(folder_path, output_folder):
    midi_files = os.listdir(folder_path)
    for midi_file in midi_files:
        midi_file_path = f'{folder_path}/{midi_file}'
        midi_values = process_midi_file_values(midi_file_path)
        save_midi_values(midi_values, output_folder, midi_file)

In [37]:
print(process_midi_file_values(sample_midi))

[[-1.         -1.         -1.         ...  0.52755904  0.984252
   0.        ]
 [-1.         -1.         -1.         ...  0.503937    0.984252
  -1.        ]
 [-1.         -1.         -1.         ...  0.52755904  0.8267717
  -1.        ]
 ...
 [-1.         -1.         -1.         ...  0.52755904  1.
  -1.        ]
 [-1.         -1.         -1.         ...  0.503937    1.
  -1.        ]
 [-1.         -1.         -1.         ...  0.503937    0.9448819
  -1.        ]]


In [None]:
from PIL import Image
import numpy as np
import torch

def load_spectrogram(image_path):
    image = Image.open(image_path).convert('L')  # Convert to grayscale
    image = np.array(image) / 255.0  # Normalize to [0, 1]
    image = torch.tensor(image, dtype=torch.float32).unsqueeze(0)  # Add channel dimension
    return image

sample_file_midi = f'{dataset_folder_path_midi}/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi'
sample_file_spec = 'spectrograms/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.png'
spectrogram = load_spectrogram(sample_file_spec)
spectrogram_time_steps = spectrogram.shape[2]
total_duration = (spectrogram_time_steps - 1) * 512 / 22050


messages = process_midi_file_values(sample_file_midi, spectrogram_time_steps, total_duration)

# Print the processed MIDI messages
print(messages)

In [39]:
# Process all midi files into values

# Create midi output folder if it doesn't exist
output_folder = './midi-processed-values'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for folder in valid_folder_data:
    print("processing folder: ", folder)
    folder_path = f'{dataset_folder_path_midi}/{folder}'
    process_midi_folder_values(folder_path, f'{output_folder}/{folder}')

processing folder:  2013
processing folder:  2014
processing folder:  2015
processing folder:  2008
processing folder:  2006
processing folder:  2009
processing folder:  2017
processing folder:  2018
processing folder:  2011
processing folder:  2004
