#### Note-level dataset generation

This notebook uses raw data from the MusicNet dataset to build per-piece note sequences (pickled lists of pandas DataFrames) suitable for training deep neural networks.

**Before running:** Make sure to run the "Levels Computation" notebook to produce the numpy array files with global audio levels.

If you intend to train a model on a remote server, you do not need to upload the whole MusicNet dataset: run the configuration and preprocessing cells locally to generate the per-note CSV file, upload that file, and continue from the CSV-loading cell on the server.

In [19]:
#### START HERE ####

# Root folder for inputs and outputs. Keep the trailing slash: the other cells
# build paths by plain string concatenation (e.g. dataFolder + 'levels/').
dataFolder = 'data/'  # make sure the path to data folder is correct
# True  -> process the training labels (each piece is also transposed -3..+3)
#          and (re)create the vocabulary and normalizer files.
# False -> process the test labels, reusing the saved vocabulary and normalizer.
is_training_set = True

In [14]:
import os

import numpy as np
import pandas as pd
from IPython.display import clear_output

import expression_modeling as m 

def preprocess(labelsDir, instruments={41, 42, 43, 72, 74}, csvname='per_note_train', outfile=None):
    """Write one note-level CSV covering every label file found in ``labelsDir``.

    For each ``*.csv`` label file that contains at least one of ``instruments``,
    the piece is assembled with the ``expression_modeling`` helpers and a
    note-level dataframe is produced per instrument. When the notebook-level flag
    ``is_training_set`` is true, one dataframe is produced for every transposition
    in -3..+3; otherwise only the untransposed one. The rows are appended to the
    output file together with ``transposition`` and ``pieceId`` columns.

    Relies on the notebook globals ``dataFolder`` and ``is_training_set``.

    Parameters
    ----------
    labelsDir : str
        Directory holding the MusicNet label files, named ``<pieceId>.csv``.
    instruments : set of int
        Instrument codes to keep; pieces with none of them are skipped.
        The default set is only read, never mutated.
    csvname : str
        Base name (no extension) of the CSV created inside ``dataFolder`` when
        ``outfile`` is None.
    outfile : file object or None
        Already-open text handle to append to. Rows written to a caller-supplied
        handle never include a header line.

    Returns
    -------
    file object or None
        The open output handle (the caller must close it), or None if
        ``outfile`` was None and no piece contained a requested instrument.
    """
    # os.listdir returns every entry in arbitrary order: keep only label files
    # (a stray file would break the int(...) piece-id parsing below) and sort
    # them so the generated CSV is reproducible across machines.
    dataset = sorted(name for name in os.listdir(labelsDir) if name.endswith('.csv'))
    opened_here = False  # True once this call has opened the output file itself

    try:
        for i, label_file in enumerate(dataset):
            print('processing piece ' + str(i+1) + '/' + str(len(dataset)), end='\r')

            # load the symbolic information from the dataset
            notearray = np.genfromtxt(os.path.join(labelsDir, label_file), delimiter=',', names=True,
                                      dtype=['i', 'i', 'i', 'i', 'f', 'f', '|U40'])

            #  check if piece contains any desired instrument
            csv_instruments = set(notearray['instrument'])
            csv_desired_instruments = csv_instruments.intersection(instruments)
            if not csv_desired_instruments:
                continue

            #  load levels (generated by "Levels computation" notebook)
            levels = np.load(dataFolder + 'levels/' + label_file.replace('.csv', '_global_lvls.npy'))

            # piece key estimation (only major and minor for now)
            isMajor, key, llhoodM, llhoodm = m.estimateKey(notearray['note'])
            mode = m.Mode.major if isMajor else m.Mode.minor

            piece = m.Piece(key=key, mode=mode, name=label_file)
            piece.dynMean = np.mean(levels)
            piece.dynStd = np.std(levels)
            # First/last rows are taken as the piece boundaries.
            # NOTE(review): assumes rows are ordered by start time - confirm.
            piece.startTime = notearray['start_time'][0]
            piece.startBeat = notearray['start_beat'][0]
            piece.endTime = notearray['end_time'][-1]
            # NOTE(review): presumably 'end_beat' holds a duration in beats, hence the sum - confirm.
            piece.endBeat = notearray['start_beat'][-1] + notearray['end_beat'][-1]
            # Levels are z-scored per piece before being attached to the notes.
            # NOTE(review): 44100 is presumably the audio sample rate - confirm against buildNoteParts.
            piece.parts = m.buildNoteParts(notearray, (levels - piece.dynMean)/piece.dynStd, 44100,
                                           csv_desired_instruments)

            # Training data is augmented with transpositions; test data is not.
            transpositions = range(-3, 4) if is_training_set else (0,)
            frames = []
            for inst in csv_desired_instruments:
                for tr in transpositions:
                    di = m.buildNoteLevelDataframe(piece, inst, transpose=tr)
                    di['transposition'] = tr
                    frames.append(di)
            df = pd.concat(frames, ignore_index=True)
            df['pieceId'] = int(label_file[0:-4])  # file name without the '.csv' suffix

            if outfile is None:
                # newline='' as pandas requires for file objects; otherwise
                # platforms with '\r\n' line endings get doubled line breaks.
                outfile = open(dataFolder + csvname + '.csv', 'w+', newline='')
                opened_here = True
                df.to_csv(outfile)
            else:
                df.to_csv(outfile, mode='a', header=False)
    except BaseException:
        # Do not leak the handle we opened if a piece fails half-way through.
        if opened_here:
            outfile.close()
        raise
    return outfile

clear_output()
if is_training_set:
    print('Begin training set')
    f = preprocess('./data/musicnet/train_labels')
else:
    print('Begin processing test set')
    f = preprocess('./data/musicnet/test_labels', csvname='per_note_test')

# preprocess returns None when no piece contained a requested instrument,
# in which case no file was opened and there is nothing to close.
if f is not None:
    f.close()
else:
    print('Warning: no piece matched the requested instruments; no CSV was written.')


Begin processing test set
processing piece 10/10

In [20]:
import numpy as np
import pandas as pd

np.random.seed(1728)

#  read the per-note csv produced by the preprocessing cell
csv_name = 'per_note_train.csv' if is_training_set else 'per_note_test.csv'
with open(dataFolder + csv_name, 'r') as path:
    df = pd.read_csv(path)

# 'Unnamed: 0' is the per-piece row index that to_csv wrote alongside the data
df = df.drop(columns=['Unnamed: 0'])

# cast the discrete features to categoricals with a fixed, explicit category list
for column, categories in (
    ('bassNote', list(range(0, 12))),
    ('metricStrength', list(range(0, 4))),
    ('instrument', [41, 42, 43, 72, 74]),
):
    df[column] = df[column].astype(pd.CategoricalDtype(categories))

print('initial size: ' + str(len(df)))


initial size: 2385495


In [21]:
import pickle

# (pitch, bassNote) pair of every note, in row order. Built once here and reused
# for both the vocabulary and the lookup (it spans the whole dataframe).
melodies = list(df.loc[:,['pitch', 'bassNote']].itertuples(index=False, name=None))

if is_training_set:
    #  generate vocabulary of (pitch, bassNote)
    print("Mapping melody vocabulary.")
    voc = list(set(melodies))
    print('vocabulary size = ' + str(len(voc)))
    lex_to_ix = { lex:i+1 for i,lex in enumerate(voc) } # index 0 is vacant for masking

    with open(dataFolder + 'note_sequences_dict.data', 'wb') as filehandle:
        pickle.dump(lex_to_ix, filehandle)
else:
    # NOTE: pickle.load can execute arbitrary code; only load a dictionary file
    # that was produced by a trusted run of the training branch above.
    with open(dataFolder + 'note_sequences_dict.data', 'rb') as filehandle:
        lex_to_ix = pickle.load(filehandle)

# Pairs missing from the vocabulary share one out-of-vocabulary index, placed
# right after the last assigned index (assigned indices are 1..len(lex_to_ix)).
oov_ix = len(lex_to_ix) + 1
df['melody'] = [lex_to_ix.get(pair, oov_ix) for pair in melodies]
df.drop(['pitch', 'bassNote'], axis=1, inplace=True)

Mapping melody vocabulary.
vocabulary size = 830


In [22]:
def sequencer(df, one_hot_cols=None):
    """Split the note-level dataframe into per-piece lists of note sequences.

    One sequence is produced for every (piece, instrument, transposition)
    combination that actually has rows; transpositions are visited in the
    order -3..+3. The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'pieceId', 'instrument', 'transposition',
        'startTime', 'durationSecs' and the four output columns 'timingDev',
        'timingDevLocal', 'localTempo', 'peakLevel'.
    one_hot_cols : list of str, optional
        Columns to replace by their ``pd.get_dummies`` expansion (prefixed
        with the column name, appended after the remaining columns).

    Returns
    -------
    list of (piece_seq, pieceId)
        ``piece_seq`` is a list of ``(X, y, transposition, instrument)`` tuples,
        where ``X`` holds the input features and ``y`` the four output columns.
    """
    meta_cols = ['pieceId', 'transposition', 'startTime', 'durationSecs']
    out_cols = ['timingDev', 'timingDevLocal', 'localTempo', 'peakLevel']

    sequences = []
    #  list the pieces
    for piece_id in set(df.pieceId):
        piece_rows = df.loc[df.pieceId == piece_id, :]
        piece_seq = []
        # list the instruments
        for inst in set(piece_rows.instrument):
            inst_rows = piece_rows.loc[piece_rows.instrument == inst, :]
            for tr in range(-3, 4):
                rows = inst_rows.loc[inst_rows.transposition == tr, :]
                if rows.empty:
                    # e.g. test data only carries transposition 0; do not
                    # emit empty sequences for the missing transpositions
                    continue
                # non-inplace drops: `rows` is a slice of `df`, and dropping
                # in place on it triggers SettingWithCopyWarning
                features = rows.drop(columns=meta_cols)
                #  convert categories to one-hot
                if one_hot_cols:
                    for attrib in one_hot_cols:
                        dummies = pd.get_dummies(features[attrib], prefix=attrib)
                        features = pd.concat([features.drop(columns=[attrib]), dummies], axis=1)
                targets = features.loc[:, out_cols]
                features = features.drop(columns=out_cols)
                piece_seq.append((features, targets, tr, inst))
        sequences.append((piece_seq, piece_id))
    return sequences

def standardize(df, moments=None, cols=None):
    """Z-score the selected columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to normalize; the selected columns are overwritten.
    moments : numpy.ndarray of shape (n_selected_columns, 2), optional
        Column 0 holds the means and column 1 the standard deviations to use.
        When None they are computed from ``df``.
    cols : column selector accepted by ``df.loc[:, cols]``, optional
        Columns to normalize. When None, every 'float64' column is selected.

    Returns
    -------
    (moments, cols)
        The moments and column selector that were used, so the same transform
        can be applied to other data or reverted on predictions.
    """
    if cols is None:
        cols = (df.dtypes == 'float64')
    nums = df.loc[:,cols]
    if moments is None:
        moments = np.zeros((nums.shape[1],2)) # output mean and std for reverting predictions
        moments[:,0] = nums.mean().to_numpy()
        std = nums.std().to_numpy()
        # A constant column has std 0 (NaN for a single row); store 1 instead
        # so that normalizing it yields zeros rather than NaN/inf.
        moments[:,1] = np.where(std > 0, std, 1.0)
    # Guard again at use time: supplied moments may still contain a zero std.
    scale = np.where(moments[:,1] > 0, moments[:,1], 1.0)
    df.loc[:, cols] = (nums - moments[:,0]) / scale
    return moments, cols

# The normalizer (moments + column selector) is fitted on the training set and
# stored so that the test set is scaled with exactly the same statistics.
normalizer_path = dataFolder + 'normalizer.data'
if not is_training_set:
    with open(normalizer_path, 'rb') as filehandle:
        moments, cols = pickle.load(filehandle)
    standardize(df, moments=moments, cols=cols)
else:
    moments, cols = standardize(df)
    with open(normalizer_path, 'wb') as filehandle:
        pickle.dump((moments, cols), filehandle)

# split into per-piece sequences, expanding the categorical features to one-hot
sequences = sequencer(df, one_hot_cols=['metricStrength', 'instrument'])

print("Number of pieces: " + str(len(sequences)))

#  check for NaNs
# nans = np.argwhere(np.isnan(X))

#  eliminate NaNs
# okrows = np.logical_not(np.logical_or(np.isnan(X).any(axis=(1,2)), np.isnan(Y).any(axis=(1,2))))
# X = X[okrows,:,:]
# Y = Y[okrows,:,:]
# moments = moments[okrows,:,:]
# pd_idx = pd_idx[okrows,:]
# print("dataset size without NaN: " + str(X.shape))

Number of pieces: 167


In [23]:
import pickle

#  Save the sequences; training and test runs go to separate files
sequences_file = 'note_sequences.data' if is_training_set else 'note_sequences_test.data'
with open(dataFolder + sequences_file, 'wb') as filehandle:
    pickle.dump(sequences, filehandle)