### Convert MAESTRO data to financial time bars

MAESTRO is Google Magenta's dataset of 200+ hours of annotated piano performance MIDI data.

In [1]:
from __future__ import division
from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scp
import magenta
import os
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

PLOT_WIDTH = 1200
PLOT_HEIGHT = 800

def hheader(x):
    print("#########################################")
    print("### {}".format(x))
    print("#########################################")

# Magenta dependencies:
# https://github.com/magenta/magenta

# Magenta uses pretty_midi to deal with midi files
import pretty_midi

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [2]:
INPUT_PATH = "data/maestro/maestro-v3.0.0/"
add_input_path = lambda x: "{}/{}".format(INPUT_PATH, x).replace("//", "/")
OUTPUT_PATH = "data_processed/maestro/"
add_output_path = lambda x: "{}/{}".format(OUTPUT_PATH, x).replace("//", "/")
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

### Read in MIDI metadata and pieces

In [3]:
### read in MIDI metadata
midi_metadata = pd.read_csv(add_input_path("maestro-v3.0.0.csv"))
midi_metadata.head()

Unnamed: 0,canonical_composer,canonical_title,split,year,midi_filename,audio_filename,duration
0,Alban Berg,Sonata Op. 1,train,2018,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,698.66116
1,Alban Berg,Sonata Op. 1,train,2008,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,759.518471
2,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433
3,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588
4,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508


In [4]:
import os

midi_files = []
for root, dirs, files in os.walk(INPUT_PATH):
    for file in files:
        if file.endswith(".midi"):
            print(os.path.join(root, file))
            midi_files.append(os.path.join(root, file))

data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_06_Track06_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_08_Track08_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_10_Track10_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_05_R1_2004_01_ORIG_MID--AUDIO_05_R1_2004_02_Track02_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_05_R1_2004_01_ORIG_MID--AUDIO_05_R1_2004_03_Track03_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_05_R1_2004_02-03_ORIG_MID--AUDIO_05_R1_2004_06_Track06_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_07_R1_2004_01_ORIG_MID--AUDIO_07_R1_2004_02_Track02_wav.midi
data/maestro/maestro-v3.0.0/2004\MIDI-Unprocessed_SMF_07_R1_2004_

### Process pieces by converting into time bars

In [5]:
"""
Process pieces
"""

all_dfs = []
for t in range(len(midi_files)):
    curr_file = midi_files[t]
    print("Processing file {} / {} ...".format(t+1, len(midi_files)))
    
    midi_test = pretty_midi.PrettyMIDI(curr_file)
    solo_piano_part = midi_test.instruments[0]
    df_notes = pd.DataFrame([(n.start, n.end, n.pitch, n.velocity, n.duration) for n in solo_piano_part.notes],
                            columns=['start', 'end', 'pitch', 'velocity', 'duration'])
    ### get metadata
    df_metadata = midi_metadata[midi_metadata['midi_filename']==curr_file.replace(INPUT_PATH, "").replace("\\", "/")]
    df_metadata = df_metadata.rename(columns={"duration": "total_duration"})
    df_metadata.drop(columns="audio_filename", inplace=True) # don't need audio for now
    curr_filename = df_metadata['midi_filename'].values[0].replace("/", "__")
    
    ### combine (so have metadata features for each row)
    df_curr = pd.merge(df_notes, df_metadata, how="cross")
    
    ### need to make sure sorted in order of notes (start)
    df_curr = df_curr.sort_values(by='start')
    
    """ Process each piece """
    
    ### Make sure everything starts at t=0
    ### TODO(echow): will have to update the audio files to also mention this
    start_time = float(df_curr['start'].head(1))
    df_curr['start_offset_before_first_note'] = start_time
    df_curr['start'] = df_curr['start'].apply(lambda x: x - start_time)
    df_curr['end'] = df_curr['end'].apply(lambda x: x - start_time)
    df_curr['total_duration'] = df_curr['total_duration'].apply(lambda x: x - start_time)
    
    """ We have to do some sampling here, because otherwise it is difficult to construct good features.
        So, the sampling here will coincide with the music being played in real-time.
    """
    
    """ Version 1: time bars
        (easiest to play back in real-time)
    """
    EVERY_N_SEC = 1
    df_curr_time = df_curr.copy()
    df_curr_time['start_sec'] =  (df_curr_time['start'] / EVERY_N_SEC).apply(np.floor).astype(int) * EVERY_N_SEC
    # round to N digits
    tmp = df_curr_time.select_dtypes(include=[np.number])
    df_curr_time.loc[:, tmp.columns] = np.round(tmp, decimals=5)
    # add filename
    df_curr_time['curr_filename'] = curr_filename
    
    ### aggregate by sample
    ### this is different from the more advanced features we will create - this is only for working
    ### with the raw music data at a usable (sampled) level
    def str_concat(x): return(','.join([str(s) for s in x]))
    df_curr_time_agg = df_curr_time.groupby(['start_sec'], as_index=False).agg({
        'start': [np.min, np.mean, np.median, np.max, str_concat],
        'end': [np.min, np.mean, np.median, np.max, str_concat],
        'pitch': [np.min, np.mean, np.median, np.max, str_concat],
        'velocity': [np.min, np.mean, np.median, np.max, str_concat],
        'duration': [np.min, np.mean, np.median, np.max, str_concat],
        # metadata features - same for all observations
        'canonical_composer': [pd.Series.mode],
        'split': [pd.Series.mode],
        'year': [pd.Series.mode],
        'total_duration': [pd.Series.mode],
        'curr_filename': [pd.Series.mode] })
    df_curr_time_agg.columns = ['_'.join([cc for cc in c if len(cc) > 0]).replace("amin", "min").replace("amax", "max")
                     for c in list(df_curr_time_agg.columns)]
    
    ### add missing time bars (just for consistency time series analysis - doesn't affect quality of data)
    df_curr_time_agg = pd.merge(df_curr_time_agg,
                             pd.DataFrame(range(0, df_curr_time_agg.shape[0], 1), columns=["start_sec"]), on="start_sec",
                            how='right').sort_values(by="start_sec")
    
    ### save
    curr_fp = curr_filename.replace(".midi", ".csv")
    df_curr_time_agg.to_csv(add_output_path(curr_fp))
    all_dfs.append(df_curr_time_agg)
    
    """ Version 2: information bars
        Skip this and volume bars for now - hard to play back in real-time.
    """
    
    
    """ Version 3: volume bars
        Skip this and volume bars for now - hard to play back in real-time.
    """
    
    
    print("... converted!")

Processing file 1 / 1276 ...
... converted!
Processing file 2 / 1276 ...
... converted!
Processing file 3 / 1276 ...
... converted!
Processing file 4 / 1276 ...
... converted!
Processing file 5 / 1276 ...
... converted!
Processing file 6 / 1276 ...
... converted!
Processing file 7 / 1276 ...
... converted!
Processing file 8 / 1276 ...
... converted!
Processing file 9 / 1276 ...
... converted!
Processing file 10 / 1276 ...
... converted!
Processing file 11 / 1276 ...
... converted!
Processing file 12 / 1276 ...
... converted!
Processing file 13 / 1276 ...
... converted!
Processing file 14 / 1276 ...
... converted!
Processing file 15 / 1276 ...
... converted!
Processing file 16 / 1276 ...
... converted!
Processing file 17 / 1276 ...
... converted!
Processing file 18 / 1276 ...
... converted!
Processing file 19 / 1276 ...
... converted!
Processing file 20 / 1276 ...
... converted!
Processing file 21 / 1276 ...
... converted!
Processing file 22 / 1276 ...
... converted!
Processing file 23 

... converted!
Processing file 182 / 1276 ...
... converted!
Processing file 183 / 1276 ...
... converted!
Processing file 184 / 1276 ...
... converted!
Processing file 185 / 1276 ...
... converted!
Processing file 186 / 1276 ...
... converted!
Processing file 187 / 1276 ...
... converted!
Processing file 188 / 1276 ...
... converted!
Processing file 189 / 1276 ...
... converted!
Processing file 190 / 1276 ...
... converted!
Processing file 191 / 1276 ...
... converted!
Processing file 192 / 1276 ...
... converted!
Processing file 193 / 1276 ...
... converted!
Processing file 194 / 1276 ...
... converted!
Processing file 195 / 1276 ...
... converted!
Processing file 196 / 1276 ...
... converted!
Processing file 197 / 1276 ...
... converted!
Processing file 198 / 1276 ...
... converted!
Processing file 199 / 1276 ...
... converted!
Processing file 200 / 1276 ...
... converted!
Processing file 201 / 1276 ...
... converted!
Processing file 202 / 1276 ...
... converted!
Processing file 203

... converted!
Processing file 361 / 1276 ...
... converted!
Processing file 362 / 1276 ...
... converted!
Processing file 363 / 1276 ...
... converted!
Processing file 364 / 1276 ...
... converted!
Processing file 365 / 1276 ...
... converted!
Processing file 366 / 1276 ...
... converted!
Processing file 367 / 1276 ...
... converted!
Processing file 368 / 1276 ...
... converted!
Processing file 369 / 1276 ...
... converted!
Processing file 370 / 1276 ...
... converted!
Processing file 371 / 1276 ...
... converted!
Processing file 372 / 1276 ...
... converted!
Processing file 373 / 1276 ...
... converted!
Processing file 374 / 1276 ...
... converted!
Processing file 375 / 1276 ...
... converted!
Processing file 376 / 1276 ...
... converted!
Processing file 377 / 1276 ...
... converted!
Processing file 378 / 1276 ...
... converted!
Processing file 379 / 1276 ...
... converted!
Processing file 380 / 1276 ...
... converted!
Processing file 381 / 1276 ...
... converted!
Processing file 382

... converted!
Processing file 541 / 1276 ...
... converted!
Processing file 542 / 1276 ...
... converted!
Processing file 543 / 1276 ...
... converted!
Processing file 544 / 1276 ...
... converted!
Processing file 545 / 1276 ...
... converted!
Processing file 546 / 1276 ...
... converted!
Processing file 547 / 1276 ...
... converted!
Processing file 548 / 1276 ...
... converted!
Processing file 549 / 1276 ...
... converted!
Processing file 550 / 1276 ...
... converted!
Processing file 551 / 1276 ...
... converted!
Processing file 552 / 1276 ...
... converted!
Processing file 553 / 1276 ...
... converted!
Processing file 554 / 1276 ...
... converted!
Processing file 555 / 1276 ...
... converted!
Processing file 556 / 1276 ...
... converted!
Processing file 557 / 1276 ...
... converted!
Processing file 558 / 1276 ...
... converted!
Processing file 559 / 1276 ...
... converted!
Processing file 560 / 1276 ...
... converted!
Processing file 561 / 1276 ...
... converted!
Processing file 562

... converted!
Processing file 720 / 1276 ...
... converted!
Processing file 721 / 1276 ...
... converted!
Processing file 722 / 1276 ...
... converted!
Processing file 723 / 1276 ...
... converted!
Processing file 724 / 1276 ...
... converted!
Processing file 725 / 1276 ...
... converted!
Processing file 726 / 1276 ...
... converted!
Processing file 727 / 1276 ...
... converted!
Processing file 728 / 1276 ...
... converted!
Processing file 729 / 1276 ...
... converted!
Processing file 730 / 1276 ...
... converted!
Processing file 731 / 1276 ...
... converted!
Processing file 732 / 1276 ...
... converted!
Processing file 733 / 1276 ...
... converted!
Processing file 734 / 1276 ...
... converted!
Processing file 735 / 1276 ...
... converted!
Processing file 736 / 1276 ...
... converted!
Processing file 737 / 1276 ...
... converted!
Processing file 738 / 1276 ...
... converted!
Processing file 739 / 1276 ...
... converted!
Processing file 740 / 1276 ...
... converted!
Processing file 741

... converted!
Processing file 899 / 1276 ...
... converted!
Processing file 900 / 1276 ...
... converted!
Processing file 901 / 1276 ...
... converted!
Processing file 902 / 1276 ...
... converted!
Processing file 903 / 1276 ...
... converted!
Processing file 904 / 1276 ...
... converted!
Processing file 905 / 1276 ...
... converted!
Processing file 906 / 1276 ...
... converted!
Processing file 907 / 1276 ...
... converted!
Processing file 908 / 1276 ...
... converted!
Processing file 909 / 1276 ...
... converted!
Processing file 910 / 1276 ...
... converted!
Processing file 911 / 1276 ...
... converted!
Processing file 912 / 1276 ...
... converted!
Processing file 913 / 1276 ...
... converted!
Processing file 914 / 1276 ...
... converted!
Processing file 915 / 1276 ...
... converted!
Processing file 916 / 1276 ...
... converted!
Processing file 917 / 1276 ...
... converted!
Processing file 918 / 1276 ...
... converted!
Processing file 919 / 1276 ...
... converted!
Processing file 920

... converted!
Processing file 1077 / 1276 ...
... converted!
Processing file 1078 / 1276 ...
... converted!
Processing file 1079 / 1276 ...
... converted!
Processing file 1080 / 1276 ...
... converted!
Processing file 1081 / 1276 ...
... converted!
Processing file 1082 / 1276 ...
... converted!
Processing file 1083 / 1276 ...
... converted!
Processing file 1084 / 1276 ...
... converted!
Processing file 1085 / 1276 ...
... converted!
Processing file 1086 / 1276 ...
... converted!
Processing file 1087 / 1276 ...
... converted!
Processing file 1088 / 1276 ...
... converted!
Processing file 1089 / 1276 ...
... converted!
Processing file 1090 / 1276 ...
... converted!
Processing file 1091 / 1276 ...
... converted!
Processing file 1092 / 1276 ...
... converted!
Processing file 1093 / 1276 ...
... converted!
Processing file 1094 / 1276 ...
... converted!
Processing file 1095 / 1276 ...
... converted!
Processing file 1096 / 1276 ...
... converted!
Processing file 1097 / 1276 ...
... converted

... converted!
Processing file 1252 / 1276 ...
... converted!
Processing file 1253 / 1276 ...
... converted!
Processing file 1254 / 1276 ...
... converted!
Processing file 1255 / 1276 ...
... converted!
Processing file 1256 / 1276 ...
... converted!
Processing file 1257 / 1276 ...
... converted!
Processing file 1258 / 1276 ...
... converted!
Processing file 1259 / 1276 ...
... converted!
Processing file 1260 / 1276 ...
... converted!
Processing file 1261 / 1276 ...
... converted!
Processing file 1262 / 1276 ...
... converted!
Processing file 1263 / 1276 ...
... converted!
Processing file 1264 / 1276 ...
... converted!
Processing file 1265 / 1276 ...
... converted!
Processing file 1266 / 1276 ...
... converted!
Processing file 1267 / 1276 ...
... converted!
Processing file 1268 / 1276 ...
... converted!
Processing file 1269 / 1276 ...
... converted!
Processing file 1270 / 1276 ...
... converted!
Processing file 1271 / 1276 ...
... converted!
Processing file 1272 / 1276 ...
... converted

In [11]:
# TODO: rerun with the above code to fill in missing start_sec bars

In [12]:
""" Also create a big music stream from all those MIDI files concatenated,
    representing one continuous stream of music
"""

df_all_time_agg = pd.concat([df.assign(piece="piece_{}".format(dx)) for dx, df in enumerate(all_dfs)])
columns = df_all_time_agg.columns.tolist()[-1:] + df_all_time_agg.columns.tolist()[:-1]
df_all_time_agg = df_all_time_agg[columns]

### preview
display(df_all_time_agg.head())
print(df_all_time_agg.shape)

### save
all_fp = "maestro_full_music_stream.csv"
df_all_time_agg.to_csv(add_output_path(all_fp))
print("Wrote to: {}".format(add_output_path(all_fp)))

# df_all_time_agg.groupby(['piece']).size().sort_values()

Unnamed: 0,piece,start_sec,start_min,start_mean,start_median,start_max,start_str_concat,end_min,end_mean,end_median,end_max,end_str_concat,pitch_min,pitch_mean,pitch_median,pitch_max,pitch_str_concat,velocity_min,velocity_mean,velocity_median,velocity_max,velocity_str_concat,duration_min,duration_mean,duration_median,duration_max,duration_str_concat,canonical_composer_mode,split_mode,year_mode,total_duration_mode,curr_filename_mode
0,piece_0,0,0.0,0.542394,0.617185,0.94479,"0.0,0.18646,0.19583,0.37083,0.54062,0.69375,0....",0.09687,0.749271,0.71823,1.43021,"0.09687,0.40417,0.70104,0.53854,0.66042,0.7354...",55.0,66.0,69.0,74.0,71557159627267745772,44.0,60.3,58.0,77.0,60445455527656686177,0.04167,0.206875,0.14375,0.53958,"0.09687,0.21771,0.50521,0.16771,0.11979,0.0416...",Johann Sebastian Bach,train,2004.0,967.16405,2004__MIDI-Unprocessed_SMF_02_R1_2004_01-05_OR...
1,piece_0,1,1.00521,1.432576,1.46562,1.9875,"1.00521,1.05625,1.07917,1.23542,1.43021,1.4656...",1.08958,1.704828,1.58125,2.35521,"1.08958,1.42708,1.1875,1.41667,1.49687,1.67083...",59.0,67.909091,67.0,74.0,7472676671647274596266,35.0,56.363636,58.0,68.0,5160575868354768506363,0.06667,0.272254,0.19271,0.76875,"0.08438,0.37083,0.10833,0.18125,0.06667,0.2052...",Johann Sebastian Bach,train,2004.0,967.16405,2004__MIDI-Unprocessed_SMF_02_R1_2004_01-05_OR...
2,piece_0,2,2.15729,2.560937,2.616665,2.9,"2.15729,2.36042,2.52708,2.70625,2.71458,2.9",2.50938,3.062153,2.903645,4.23542,"4.23542,2.50938,2.71354,2.93333,2.87396,3.10729",59.0,71.333333,72.5,79.0,677174785979,58.0,68.333333,69.5,78.0,586678775873,0.14896,0.501215,0.196875,2.07812,"2.07812,0.14896,0.18646,0.22708,0.15938,0.20729",Johann Sebastian Bach,train,2004.0,967.16405,2004__MIDI-Unprocessed_SMF_02_R1_2004_01-05_OR...
3,piece_0,3,3.08021,3.500116,3.43125,3.98542,"3.08021,3.08958,3.25625,3.42604,3.43125,3.6041...",3.24062,3.698147,3.6125,4.21458,"3.25,3.24062,3.42812,3.59167,3.6125,3.80521,3....",57.0,69.666667,72.0,79.0,766079745979725771,47.0,64.333333,68.0,76.0,765468705963704772,0.15104,0.198033,0.17396,0.3875,"0.16979,0.15104,0.17188,0.16563,0.18125,0.2010...",Johann Sebastian Bach,train,2004.0,967.16405,2004__MIDI-Unprocessed_SMF_02_R1_2004_01-05_OR...
4,piece_0,4,4.17917,4.543751,4.541145,4.92396,"4.17917,4.18542,4.34792,4.54062,4.54167,4.7177...",4.35833,4.792836,4.80573,5.26562,"4.35833,4.52187,4.54271,4.70417,5.26562,4.9072...",55.0,68.125,70.0,81.0,7266817255716959,61.0,67.75,68.5,73.0,7361737067706662,0.04688,0.249089,0.184375,0.72396,"0.17917,0.33646,0.19479,0.16354,0.72396,0.1895...",Johann Sebastian Bach,train,2004.0,967.16405,2004__MIDI-Unprocessed_SMF_02_R1_2004_01-05_OR...


(671603, 32)
Wrote to: data_processed/maestro/maestro_full_music_stream.csv
