### Convert MAESTRO data to financial time bars

MAESTRO is Google Magenta's dataset of 200+ hours of annotated piano performance MIDI data.

In [3]:
from __future__ import division
from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scp
import magenta
import os
%matplotlib inline

# Widen pandas' display limits so wide/long frames print without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

# Plot dimensions shared by the notebook.
PLOT_WIDTH, PLOT_HEIGHT = 1200, 800

def hheader(x):
    """Print ``x`` as a banner: a rule of ``#``, the line ``### x``, and the rule again."""
    rule = "#########################################"
    banner_lines = [rule, "### {}".format(x), rule]
    print("\n".join(banner_lines))

# Magenta dependencies:
# https://github.com/magenta/magenta

# Magenta uses pretty_midi to deal with midi files
import pretty_midi

In [4]:
# Root directories for the raw dataset and for the generated CSVs.
INPUT_PATH = "data/saarland/"
OUTPUT_PATH = "data_processed/saarland/"


def add_input_path(x):
    """Return ``x`` prefixed with INPUT_PATH.

    The pieces are joined with "/" and each "//" is then collapsed to "/",
    so the trailing slash of INPUT_PATH does not produce a doubled separator.
    """
    return "{}/{}".format(INPUT_PATH, x).replace("//", "/")


def add_output_path(x):
    """Return ``x`` prefixed with OUTPUT_PATH (same joining rule as add_input_path)."""
    return "{}/{}".format(OUTPUT_PATH, x).replace("//", "/")


# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(OUTPUT_PATH, exist_ok=True)

### Read in MIDI metadata and pieces

In [None]:
### load the dataset's metadata CSV from the input directory
metadata_csv_path = add_input_path("maestro-v3.0.0.csv")
midi_metadata = pd.read_csv(metadata_csv_path)

### last expression of the cell: preview the first rows
midi_metadata.head()

In [None]:
import os

### Collect the path of every ".midi" file found anywhere under INPUT_PATH.
### os.walk yields directory entries in a filesystem-dependent order; sorting
### makes the file order (and hence the piece numbering derived from it later)
### reproducible across runs and machines.
midi_files = []
for root, dirs, files in os.walk(INPUT_PATH):
    dirs.sort()  # in-place sort controls the order in which sub-directories are visited
    for file in sorted(files):
        if file.endswith(".midi"):
            midi_path = os.path.join(root, file)
            print(midi_path)
            midi_files.append(midi_path)

### Process pieces by converting into time bars

In [None]:
"""
Process pieces
"""

### Width of one time bar, in seconds.
EVERY_N_SEC = 1


def str_concat(x):
    """Join the values of ``x`` into a single comma-separated string."""
    return ','.join([str(s) for s in x])


### Per-bar aggregations. String names are used for the built-in reductions so the
### resulting column suffixes are always min/mean/median/max, independent of the
### installed numpy/pandas versions (np.min used to be reported as "amin").
_note_stats = ['min', 'mean', 'median', 'max', str_concat]
_bar_aggregations = {
    'start': _note_stats,
    'end': _note_stats,
    'pitch': _note_stats,
    'velocity': _note_stats,
    'duration': _note_stats,
    # metadata features - same for all observations
    'canonical_composer': [pd.Series.mode],
    'split': [pd.Series.mode],
    'year': [pd.Series.mode],
    'total_duration': [pd.Series.mode],
    'curr_filename': [pd.Series.mode],
}

### One aggregated frame per successfully processed piece, in processing order.
all_dfs = []
for t, curr_file in enumerate(midi_files):
    print("Processing file {} / {} ...".format(t+1, len(midi_files)))

    midi_test = pretty_midi.PrettyMIDI(curr_file)
    if not midi_test.instruments or not midi_test.instruments[0].notes:
        # nothing to aggregate; skipping avoids a crash on the first-note lookup below
        print("... skipped (no notes in first instrument): {}".format(curr_file))
        continue
    solo_piano_part = midi_test.instruments[0]
    df_notes = pd.DataFrame([(n.start, n.end, n.pitch, n.velocity, n.duration) for n in solo_piano_part.notes],
                            columns=['start', 'end', 'pitch', 'velocity', 'duration'])

    ### get metadata
    ### the metadata key is the file path relative to INPUT_PATH, with forward slashes
    metadata_key = curr_file.replace(INPUT_PATH, "").replace("\\", "/")
    df_metadata = midi_metadata[midi_metadata['midi_filename'] == metadata_key]
    if df_metadata.empty:
        raise ValueError("No metadata row with midi_filename == {!r} (file: {})".format(metadata_key, curr_file))
    df_metadata = df_metadata.rename(columns={"duration": "total_duration"})
    df_metadata = df_metadata.drop(columns="audio_filename")  # don't need audio for now
    curr_filename = df_metadata['midi_filename'].values[0].replace("/", "__")

    ### combine (so have metadata features for each row)
    df_curr = pd.merge(df_notes, df_metadata, how="cross")

    ### need to make sure sorted in order of notes (start)
    df_curr = df_curr.sort_values(by='start')

    ### --- Process each piece ---

    ### Make sure everything starts at t=0
    ### TODO(echow): will have to update the audio files to also mention this
    start_time = float(df_curr['start'].iloc[0])  # earliest note onset (frame is sorted by start)
    df_curr['start_offset_before_first_note'] = start_time
    df_curr['start'] = df_curr['start'] - start_time
    df_curr['end'] = df_curr['end'] - start_time
    df_curr['total_duration'] = df_curr['total_duration'] - start_time

    ### We have to do some sampling here, because otherwise it is difficult to construct
    ### good features. So, the sampling here will coincide with the music being played
    ### in real-time.

    ### Version 1: time bars (easiest to play back in real-time)
    df_curr_time = df_curr.copy()
    ### each note is assigned to the bar that contains its (shifted) start time
    df_curr_time['start_sec'] = (df_curr_time['start'] / EVERY_N_SEC).apply(np.floor).astype(int) * EVERY_N_SEC
    # round to N digits
    tmp = df_curr_time.select_dtypes(include=[np.number])
    df_curr_time.loc[:, tmp.columns] = np.round(tmp, decimals=5)
    # add filename
    df_curr_time['curr_filename'] = curr_filename

    ### aggregate by sample
    ### this is different from the more advanced features we will create - this is only for working
    ### with the raw music data at a usable (sampled) level
    df_curr_time_agg = df_curr_time.groupby(['start_sec'], as_index=False).agg(_bar_aggregations)
    ### flatten the (column, statistic) MultiIndex into "column_statistic" names
    df_curr_time_agg.columns = ['_'.join([cc for cc in c if len(cc) > 0])
                                for c in list(df_curr_time_agg.columns)]

    ### add missing time bars (just for consistency time series analysis - doesn't affect quality of data)
    ### The grid must run up to the LAST bar that contains notes. Sizing it by the number of
    ### non-empty bars (as a right-join key) would drop every bar beyond that count.
    last_bar_index = int(round(df_curr_time_agg['start_sec'].max() / EVERY_N_SEC))
    df_all_bars = pd.DataFrame(
        {"start_sec": np.round(np.arange(last_bar_index + 1) * EVERY_N_SEC, decimals=5)})
    df_curr_time_agg = pd.merge(df_curr_time_agg, df_all_bars, on="start_sec",
                                how='right').sort_values(by="start_sec")

    ### save
    curr_fp = curr_filename.replace(".midi", ".csv")
    df_curr_time_agg.to_csv(add_output_path(curr_fp))
    all_dfs.append(df_curr_time_agg)

    ### Version 2: information bars
    ###   Skip this and volume bars for now - hard to play back in real-time.

    ### Version 3: volume bars
    ###   Skip this and information bars for now - hard to play back in real-time.

    print("... converted!")

In [None]:
""" Also create a big music stream from all those MIDI files concatenated,
    representing one continuous stream of music
"""

### label every bar with the position of its piece in all_dfs, then stack the pieces
labelled_pieces = [
    piece_df.assign(piece="piece_{}".format(piece_idx))
    for piece_idx, piece_df in enumerate(all_dfs)
]
df_all_time_agg = pd.concat(labelled_pieces)

### rotate the column order so the last column moves to the front
column_names = df_all_time_agg.columns.tolist()
columns = column_names[-1:] + column_names[:-1]
df_all_time_agg = df_all_time_agg[columns]

### add cumulative start time as well for continuous streaming
### this is so we don't get weird time stop and restarts between piece boundaries
### NOTE: no code for this step exists in this cell yet.

### preview
display(df_all_time_agg.head())
print(df_all_time_agg.shape)

### save
all_fp = "maestro_full_music_stream.csv"
stream_csv_path = add_output_path(all_fp)
df_all_time_agg.to_csv(stream_csv_path)
print("Wrote to: {}".format(stream_csv_path))

# df_all_time_agg.groupby(['piece']).size().sort_values()