## Note-level dataset generation

This notebook uses raw data from the MAESTRO dataset to build per-piece note sequences (saved as pickled pandas DataFrames) suitable for training deep neural networks.

In [1]:
#### START HERE ####

# Root folder that every cell below reads from and writes to.
# Make sure this path to the data folder is correct before running anything else.
dataFolder = "data"

### Computing Note Information

In [20]:
import os

import numpy as np
import pandas as pd
from IPython.display import clear_output
import pretty_midi as pm

import expression_modeling as m 

def preprocess(folder, file, outfile, piece_id, include_header=None):
    """Convert one performance MIDI file into note rows and write them as CSV.

    The MIDI file ``<folder>/<file>.midi`` is loaded with pretty_midi, turned
    into a note dataframe by ``m.buildNoteDataframeFromPerfMidi`` and tagged
    with a ``pieceId`` column before being written out.

    Args:
        folder: Directory that contains the MIDI file.
        file: File name without the ``.midi`` extension.
        outfile: Open, writable file object to append to, or ``None`` to
            create ``maestro.csv`` in the current working directory.
        piece_id: Value stored in the ``pieceId`` column of every row.
        include_header: Whether to write the CSV column names. When left as
            ``None`` it defaults to ``True`` for a newly created file and to
            ``False`` when an existing ``outfile`` was passed in.

    Returns:
        The file object that was written to. When it was created here it is
        returned still open, so the caller is responsible for closing it.
    """
    midi_path = os.path.join(folder, file + '.midi')
    notes = m.buildNoteDataframeFromPerfMidi(pm.PrettyMIDI(midi_path))
    notes['pieceId'] = piece_id

    creates_file = outfile is None
    if include_header is None:
        # A fresh file needs column names; an existing one already has them.
        include_header = creates_file

    if creates_file:
        outfile = open('maestro.csv', 'w+')
        notes.to_csv(outfile, header=include_header)
    else:
        notes.to_csv(outfile, mode='a', header=include_header)
    return outfile

# Walk every MAESTRO year folder, convert each MIDI file to note rows appended
# to maestroFull.csv, and log the "<piece id>,<year>/<file name>" pairs in
# maestroFull_ids.csv so later cells can map ids back to files.
with open(os.path.join(dataFolder, 'maestroFull.csv'), 'w+') as outfile, open(os.path.join(dataFolder, 'maestroFull_ids.csv'), 'w+') as idfile:
    header = True  # only the very first piece writes the CSV column names
    for year in ['2004', '2006', '2008', '2009', '2011', '2013', '2014', '2015', '2017']:
        year_dir = os.path.join(dataFolder, 'maestro', year)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # position-based piece ids below reproducible between runs and machines.
        files = sorted(
            os.path.splitext(f)[0]
            for f in os.listdir(year_dir)
            if os.path.splitext(f)[1] == '.midi'
        )
        for i, f in enumerate(files):
            # Piece id = year * 1000 + position of the file within its year.
            piece_id = int(year) * 1000 + i
            clear_output(wait=True)
            print("Processing " + year + " piece " + str(i+1) + '/' + str(len(files)))
            idfile.write("{},{}\n".format(piece_id, year + '/' + f))
            preprocess(year_dir, f, outfile, piece_id, include_header=header)
            header = False

Processing 2017 piece 140/140


## Sequence Generation (Features)

The following cells use the CSV file produced above to format the data into per-piece sequences of notes, where each note is described by its index in a pitch vocabulary and a set of note features.

### Loading Note Information

In [21]:
import os
import numpy as np
import pandas as pd

# Load the note table written by the previous section.
notes_csv = os.path.join(dataFolder, 'maestroFull.csv')
with open(notes_csv, 'r') as csv_handle:
    df = pd.read_csv(csv_handle)

# 'Unnamed: 0' is the dataframe index that to_csv stored as a nameless first
# column; it carries no note information, so it is discarded.
df = df.drop(columns=['Unnamed: 0'])
print('initial size: ' + str(len(df)))

initial size: 6176803


### Mapping vocabulary

In [22]:
import pickle

# Collect the pitch of every note. itertuples() yields one-element tuples, so
# each vocabulary entry (and each key of the pickled dictionary) is a tuple
# of the form (pitch,).
pitches = list(df.loc[:,['pitch']].itertuples(index=False, name=None))
voc = list(set(pitches))
print('vocabulary size = ' + str(len(voc)) + ' + 4 ctl. words = ' + str(4 + len(voc)))
lex_to_ix = { lex:i+1 for i,lex in enumerate(voc) }  # index 0 is vacant for masking

with open(os.path.join(dataFolder, 'mF_pitch_dict.data'), 'wb') as filehandle:
    pickle.dump(lex_to_ix, filehandle)

# Replace raw pitches by their vocabulary index, reusing the list built above
# instead of materializing it a second time. Pitches missing from the
# dictionary fall back to the first index after the vocabulary.
# NOTE: this overwrites df['pitch'] in place, so re-running this cell without
# reloading df would build a vocabulary of indices rather than of pitches.
unknown_ix = len(lex_to_ix) + 1
df['pitch'] = [lex_to_ix.get(pitch, unknown_ix) for pitch in pitches]

vocabulary size = 88 + 4 ctl. words = 92


### Picking Training / Validation / Test sets:

In [23]:
import os

# Official MAESTRO metadata (one row per performance, including its split) and
# the id <-> file-name table written while building maestroFull.csv.
with open(os.path.join(dataFolder, 'maestro', 'maestro-v1.0.0.csv')) as f:
    mlst = pd.read_csv(f)
with open(os.path.join(dataFolder, 'maestroFull_ids.csv')) as f:
    mids = pd.read_csv(f, names=['id', 'name'])

# File names of each official split, with the extension removed so they match
# the "<year>/<file name>" entries of the id table.
names_tr = [os.path.splitext(n)[0] for n in mlst.loc[mlst['split'] == 'train'].midi_filename]
names_v = [os.path.splitext(n)[0] for n in mlst.loc[mlst['split'] == 'validation'].midi_filename]
names_ts = [os.path.splitext(n)[0] for n in mlst.loc[mlst['split'] == 'test'].midi_filename]

# Sets give constant-time membership tests in the loop below.
train_names = set(names_tr)
val_names = set(names_v)
test_names = set(names_ts)

training_pieces = []
validation_pieces = []
test_pieces = []
# The loop variable is deliberately not called `m`: that name is the alias of
# the expression_modeling module used by preprocess(), and rebinding it here
# would break any later call to that function.
for piece in mids.itertuples():
    if piece.name in train_names:
        training_pieces.append(piece.id)
    elif piece.name in val_names:
        validation_pieces.append(piece.id)
    elif piece.name in test_names:
        test_pieces.append(piece.id)
    else:
        # Pieces missing from the metadata are reported and kept for training.
        print(piece.name + ' not in any set.')
        training_pieces.append(piece.id)

print('Split dataset into:')
print(str(len(training_pieces)) + ' training pieces')
print(str(len(validation_pieces)) + ' validation pieces')
print(str(len(test_pieces)) + ' test pieces')

Split dataset into:
954 training pieces
105 validation pieces
125 test pieces


### Arranging data for sequential training and saving dataset

In [24]:
import pickle

def sequencer(df, one_hot_cols=None):
    """Split a note dataframe into one start/end-delimited sequence per piece.

    For every distinct ``pieceId`` the rows of that piece are taken in
    dataframe order, the optional categorical columns are one-hot encoded,
    ``velocity`` is standardized with that piece's own mean and standard
    deviation, and an ``<SOS>`` row and an ``<END>`` row are placed around the
    notes. The token rows are all zeros except for ``pitch``, which is set to
    ``len(lex_to_ix) + 3`` (``<SOS>``) and ``len(lex_to_ix) + 2`` (``<END>``).

    Reads the module-level ``lex_to_ix`` dictionary for the vocabulary size.

    Args:
        df: Note dataframe with at least ``pieceId``, ``pitch`` and
            ``velocity`` columns. It is not modified.
        one_hot_cols: Optional iterable of column names to replace by their
            ``pd.get_dummies`` indicator columns.

    Returns:
        A list with one ``((features, targets, moments), piece_id, 0)`` entry
        per piece. ``features`` is the sequence dataframe without
        ``velocity``, ``targets`` holds the standardized ``velocity`` column,
        and ``moments`` is a ``(1, 2)`` array with the mean and standard
        deviation used for that piece. The trailing ``0`` is a constant.
    """
    feats = ['velocity']     # columns standardized per piece
    out_cols = ['velocity']  # columns moved out of the inputs into the targets

    sequences = []
    for p in set(df.pieceId):
        d = df.loc[df.pieceId == p, :].drop(columns=['pieceId'])

        # convert categories to one-hot
        if one_hot_cols:
            for attrib in one_hot_cols:
                d = pd.concat([d, pd.get_dummies(d[attrib], prefix=attrib)], axis=1)
                d = d.drop(columns=[attrib])

        # instance standardization for relevant features
        aux = d.loc[:, feats]
        moments = np.zeros((aux.shape[1], 2))
        moments[:, 0] = aux.mean().to_numpy()
        std = aux.std().to_numpy()
        # A single-note piece has a NaN standard deviation and a
        # constant-velocity piece a zero one. Dividing by either would fill
        # the targets with NaN/inf, so a unit scale is used instead, which
        # maps those pieces to all-zero targets.
        moments[:, 1] = np.where(np.isfinite(std) & (std > 0), std, 1.0)
        # Whole-column assignment replaces the column, so an integer velocity
        # column becomes float instead of hitting an in-place dtype conflict.
        d[feats] = (aux - moments[:, 0]) / moments[:, 1]

        # <END> and <SOS> token rows
        end = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
        end["pitch"] = len(lex_to_ix) + 2
        start = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
        start["pitch"] = len(lex_to_ix) + 3
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        d = pd.concat([start, d, end])

        # separate output features
        y = d.loc[:, out_cols].copy()
        d = d.drop(columns=out_cols)

        sequences.append(((d, y, moments), p, 0))
    return sequences

def standardize(df, moments=None, cols=None):
    """Standardize the selected columns of ``df`` in place.

    Args:
        df: Dataframe to modify; the selected columns are overwritten.
        moments: Optional array with one ``[mean, std]`` row per selected
            column. When omitted, the moments are computed from ``df``.
        cols: Column selector passed to ``df.loc[:, cols]``. When omitted,
            every ``float64`` column is selected.

    Returns:
        ``(moments, cols)``: the moments that were applied and the selector
        that was used, so both can be reused on other dataframes and for
        reverting predictions.
    """
    if cols is None:
        cols = (df.dtypes == 'float64')

    selected = df.loc[:, cols]

    if moments is None:
        # One row per column: [mean, std].
        moments = np.zeros((selected.shape[1], 2))
        moments[:, 0] = selected.mean().to_numpy()
        moments[:, 1] = selected.std().to_numpy()

    offsets, scales = moments[:, 0], moments[:, 1]
    df.loc[:, cols] = (selected - offsets) / scales
    return moments, cols


# Split the note table by piece membership. Each subset is an independent
# copy because standardize() below overwrites columns in place.
train = df.loc[df.pieceId.isin(training_pieces), :].copy()
val = df.loc[df.pieceId.isin(validation_pieces), :].copy()
test = df.loc[df.pieceId.isin(test_pieces), :].copy()

# Fit the timing-feature moments on the training pieces only, apply them to
# the validation pieces, and store them so predictions can be reverted later.
moments, cols = standardize(train, cols=['onsetDiff', 'durationSecs'])
standardize(val, moments=moments, cols=cols)
with open(os.path.join(dataFolder, 'mF_normalizer.data'), 'wb') as filehandle:
    pickle.dump((moments, cols), filehandle)

train_seq = sequencer(train)
val_seq = sequencer(val)

# Persist the training and validation sequences.
print('Saving data')
for file_name, seqs in (('mF_train_sequences.data', train_seq),
                        ('mF_val_sequences.data', val_seq)):
    with open(os.path.join(dataFolder, file_name), 'wb') as filehandle:
        pickle.dump(seqs, filehandle)

# The test pieces reuse the training-set moments computed above.
standardize(test, moments=moments, cols=cols)
test_seq = sequencer(test)
with open(os.path.join(dataFolder, 'mF_test_sequences.data'), 'wb') as filehandle:
    pickle.dump(test_seq, filehandle)
print('Finished.')

Saving data
Finished.
