## Note-level dataset generation

This notebook uses raw data from the MusicNet dataset to set up sequential numpy arrays suitable for training deep neural networks.

**Before running:** Make sure to run the "Levels Computation" notebook to produce the numpy array files with global audio levels.

If you intend to train a model on a remote server, you do not need to upload the whole MusicNet dataset: run the first two cells locally to generate the note-level CSV file, upload that file to the server, and continue from the "Sequence Generation (Features)" section there.

In [2]:
#### START HERE ####

# Root folder for the notebook's inputs and outputs; the cells below build their
# file paths from it with os.path.join.
dataFolder = 'data/'  # make sure the path to data folder is correct

### Computing Note Information

In [8]:
import os
import json

import numpy as np
import pandas as pd
from IPython.display import clear_output

import expression_modeling as m 

def preprocess(labelsDir, csv, outfile=None, include_header=False, include_transp=True):
    """Build the note-level dataframe of one piece and write it out as CSV.

    Parameters
    ----------
    labelsDir : str
        Folder containing the per-piece label file ``<csv>.csv``.
    csv : str
        Piece identifier (file name without extension). It is also used to look up
        the levels file and the time-signature row, and is stored as ``int(csv)``
        in the 'pieceId' column.
    outfile : file object, optional
        Open, writable text file to write the rows to. When None, a new file
        ``<dataFolder>/<csv>.csv`` is created and written with a header row.
    include_header : bool
        Whether to write the CSV header row; only used when ``outfile`` is given.
    include_transp : bool
        If True, the dataframe is built for every transposition in -3..3,
        otherwise only for transposition 0.

    Returns
    -------
    file object
        The file that was written to. When it was opened here it is returned
        still open, so the caller is responsible for closing it.
    """
    # load the symbolic information from the dataset
    notearray = np.genfromtxt(os.path.join(labelsDir, csv + '.csv'), delimiter=',', names=True, dtype=['i', 'i', 'i', 'i', 'f', 'f', '|U40'])

    # sort by score time first for correct parsing
    notearray.sort(order=['start_beat', 'start_time'])

    # load levels (generated by "Levels computation" notebook)
    levels = np.load(os.path.join(dataFolder, 'levels', csv + '_global_lvls.npy'))

    # piece key estimation (only major and minor for now)
    isMajor, key, llhoodM, llhoodm = m.estimateKey(notearray['note'])
    mode = m.Mode.major if isMajor else m.Mode.minor

    # load time signature information and keep only the row(s) of this piece
    timesigdata = np.genfromtxt(os.path.join(dataFolder, 'musicnet', 'timesig.csv'), delimiter=',', names=True, dtype=['|U10', 'i', 'i', 'i'])
    timesigdata = timesigdata[timesigdata['id'] == csv]

    piece = m.Piece(key=key, mode=mode, name=csv)
    piece.beats_per_measure = timesigdata['upper'][0]
    piece.time_sig_type = timesigdata['lower'][0]
    piece.first_down_beat = timesigdata['pickup'][0]
    piece.dynMean = np.mean(levels)
    piece.dynStd = np.std(levels)
    piece.startTime = notearray['start_time'][0]
    piece.startBeat = notearray['start_beat'][0]
    piece.endTime = notearray['end_time'][-1]
    # NOTE(review): adding 'end_beat' to 'start_beat' presumes that the 'end_beat'
    # column holds a length in beats rather than an absolute position - confirm.
    piece.endBeat = notearray['start_beat'][-1] + notearray['end_beat'][-1]
    # levels are standardized with the piece-wide mean/std before building the part;
    # 44100 is presumably the audio sampling rate - TODO confirm against buildPart
    piece.part = m.buildPart(notearray, (levels - piece.dynMean)/piece.dynStd, 44100)

    # Always collect the per-transposition frames in a list: pd.concat requires an
    # iterable of frames, so handing it a bare DataFrame (the former
    # include_transp=False path) raised a TypeError.
    transpositions = range(-3, 4) if include_transp else [0]
    frames = []
    for tr in transpositions:
        di = m.buildNoteLevelDataframe(piece, transpose=tr)
        di['transposition'] = tr
        frames.append(di)
    df = pd.concat(frames, ignore_index=True)
    df['pieceId'] = int(csv)

    if outfile is None:
        # newline='' is what pandas asks for when to_csv receives a file object;
        # without it, line endings get doubled on platforms that translate '\n'.
        outfile = open(os.path.join(dataFolder, csv + '.csv'), 'w+', newline='')
        df.to_csv(outfile)
        # the handle is returned open, so push the rows to disk right away
        outfile.flush()
    else:
        # the caller owns the handle; rows are written at its current position
        df.to_csv(outfile, mode='a', header=include_header)
    return outfile

# The JSON index maps each sonata to the list of its movement ids.
with open(os.path.join(dataFolder, 'musicnet', 'LvB_violinSonatas.json')) as f:
    LvB = json.load(f)

# flatten the per-sonata lists into a single list of movements
mvt_list = [mvt for mvts in LvB.values() for mvt in mvts]

# newline='' is required by pandas when to_csv is handed a file object; otherwise
# line endings are doubled on platforms that translate '\n' on write.
with open(os.path.join(dataFolder, 'LvB_violinSonatas.csv'), 'w+', newline='') as outfile:
    for i, mvt in enumerate(mvt_list):
        # wait=True delays clearing until the next message is printed (no flicker)
        clear_output(wait=True)
        print("Processing piece " + str(i+1) + '/' + str(len(mvt_list)))
        # all pieces share one CSV, so only the first one writes the header row
        preprocess(os.path.join(dataFolder, 'musicnet', 'train_labels'), str(mvt), outfile, include_header=(i == 0))

Processing piece 21/21


## Sequence Generation (Features)

The following cells use the CSV file produced above to format the data into sequences of notes containing a pitch vocabulary and a set of musicologically relevant features about the note.

### Loading Note Information

In [3]:
import os
import numpy as np
import pandas as pd

# Load the note-level CSV produced by the preprocessing section.
with open(os.path.join(dataFolder, 'LvB_violinSonatas.csv'), 'r') as csv_file:
    df = pd.read_csv(csv_file)

# Drop the saved index column plus the 'harmony' and 'bassNote' columns, and
# declare 'instrument' and 'metricStrength' as categoricals with fixed category sets.
df = df.drop(columns=['Unnamed: 0', 'harmony', 'bassNote']).astype({
    'instrument': pd.CategoricalDtype([1, 41]),
    'metricStrength': pd.CategoricalDtype([0, 1, 2, 3]),
})
print('initial size: ' + str(len(df)))

initial size: 623623


### Mapping pitch vocabulary

In [4]:
import pickle

# Every vocabulary entry is a 1-tuple holding the value of the 'pitch' column.
pitches = list(df.loc[:, ['pitch']].itertuples(index=False, name=None))
voc = list(set(pitches))
print('vocabulary size = ' + str(len(voc)))

# index 0 is vacant for masking, so numbering starts at 1
lex_to_ix = dict(zip(voc, range(1, len(voc) + 1)))

# Store the mapping next to the dataset.
with open(os.path.join(dataFolder, 'LvB_pitch_dict.data'), 'wb') as filehandle:
    pickle.dump(lex_to_ix, filehandle)

# Replace each pitch by its vocabulary index; entries missing from the
# dictionary would get the index right after the last known one.
unknown_ix = len(lex_to_ix) + 1
df['pitch'] = [lex_to_ix.get(entry, unknown_ix) for entry in pitches]

vocabulary size = 81


### Arranging data for sequential training

In [5]:
import json

def sequencer(df, one_hot_cols=None, include_transp=True, vocab_size=None):
    """Split a note-level dataframe into one sequence per piece and transposition.

    Parameters
    ----------
    df : pandas.DataFrame
        Note-level data. Must contain the columns 'pieceId', 'transposition',
        'pitch' and every column named in ``outCols`` below.
    one_hot_cols : list of str, optional
        Columns to replace by one-hot indicator columns named ``<column>_<value>``.
    include_transp : bool
        If True, a sequence is produced for each transposition in -3..3,
        otherwise only for transposition 0.
    vocab_size : int, optional
        Size of the pitch vocabulary, used to number the <END> (``vocab_size + 2``)
        and <SOS> (``vocab_size + 3``) tokens. Defaults to ``len(lex_to_ix)``,
        the notebook-level pitch dictionary.

    Returns
    -------
    list of ((x, y, moments), pieceId, transposition)
        ``x`` holds the input features and ``y`` the output features, both with an
        <SOS> row first and an <END> row last. ``moments`` is a (3, 2) array with
        the mean (column 0) and standard deviation (column 1) used to standardize
        'localTempo', 'peakLevel' and 'ioiRatio' within that sequence.
        Piece/transposition combinations without any rows are skipped.
    """
    if vocab_size is None:
        vocab_size = len(lex_to_ix)

    feats = ['localTempo', 'peakLevel', 'ioiRatio']
    outCols = ['ioiRatio', 'timingDev', 'timingDevLocal', 'localTempo', 'peakLevel', 'startTime', 'durationSecs']
    transps = range(-3, 4) if include_transp else [0]

    sequences = []
    # list the pieces
    for p in set(df.pieceId):
        dp = df.loc[df.pieceId == p, :]
        for tr in transps:
            d = dp.loc[dp.transposition == tr, :].copy()
            if d.empty:
                # nothing stored for this transposition: a sequence made only of
                # the two tokens, with NaN moments, would be useless downstream
                continue
            d = d.drop(['pieceId', 'transposition'], axis=1)

            # convert categories to one-hot
            if one_hot_cols:
                for attrib in one_hot_cols:
                    # dtype=float keeps the indicator columns numeric once the
                    # all-zero float token rows are concatenated below (boolean
                    # indicators would be upcast to object there)
                    dummies = pd.get_dummies(d[attrib], prefix=attrib, dtype=float)
                    d = pd.concat([d.drop([attrib], axis=1), dummies], axis=1)

            # instance standardization for relevant features
            aux = d.loc[:, feats]
            moments = np.zeros((aux.shape[1], 2))
            moments[:, 0] = aux.mean().to_numpy()
            moments[:, 1] = aux.std().to_numpy()
            d.loc[:, feats] = (aux - moments[:, 0]) / moments[:, 1]

            # <END> and <SOS> token rows: all zeros except for a reserved pitch index
            end = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
            end["pitch"] = vocab_size + 2
            start = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
            start["pitch"] = vocab_size + 3
            # DataFrame.append no longer exists in pandas 2.x; one concat adds both
            # tokens and leaves the row index exactly as the two-step version did
            d = pd.concat([start, d, end])

            # separate output features
            y = d.loc[:, outCols].copy()
            d = d.drop(outCols, axis=1)

            sequences.append(((d, y, moments), p, tr))
    return sequences

def standardize(df, moments=None, cols=None):
    """Standardize the selected columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the selected columns are overwritten.
    moments : numpy.ndarray, optional
        Array with one row per selected column holding its mean (column 0) and
        standard deviation (column 1). Computed from ``df`` when omitted.
    cols : column selector, optional
        Columns to standardize. When omitted, a boolean mask selecting the
        float64 columns is used.

    Returns
    -------
    (moments, cols)
        The moments that were applied and the column selector that was used, so
        both can be reused on other frames or to revert predictions.
    """
    selected = (df.dtypes == 'float64') if cols is None else cols
    values = df.loc[:, selected]
    if moments is None:
        # mean and std side by side, one row per column
        moments = np.column_stack((values.mean().to_numpy(), values.std().to_numpy()))
    centred = values - moments[:, 0]
    df.loc[:, selected] = centred / moments[:, 1]
    return moments, selected


# Reload the index that maps each sonata to the list of its movement ids.
with open(os.path.join(dataFolder, 'musicnet', 'LvB_violinSonatas.json')) as f:
    LvB = json.load(f)


# Separate Training / Validation / Test:

val_pieces = LvB["1"]  # arbitrary choice
test_pieces = LvB["2"]  # arbitrary choice
held_out = list(val_pieces) + list(test_pieces)
# every movement that belongs to neither held-out sonata is used for training
training_pieces = [mvt for sonata in LvB.values() for mvt in sonata if mvt not in held_out]

train = df[df.pieceId.isin(training_pieces)].copy()
val = df[df.pieceId.isin(val_pieces)].copy()
test = df[df.pieceId.isin(test_pieces)].copy()


# Prepare training sequences

# the moments are computed on the training rows only and stored for later use
norm_cols = ['beatDiff', 'duration', 'ioi', 'startTime', 'durationSecs', 'timingDev', 'timingDevLocal']
moments, cols = standardize(train, cols=norm_cols)
with open(os.path.join(dataFolder, 'LvB_normalizer.data'), 'wb') as filehandle:
    pickle.dump((moments, cols), filehandle)
train_seq = sequencer(train, one_hot_cols=['instrument'])


# Prepare validation and test sequences

# held-out rows are scaled with the training moments, not with their own
for held_out_df in (val, test):
    standardize(held_out_df, moments=moments, cols=cols)
val_seq = sequencer(val, one_hot_cols=['instrument'], include_transp=False)
test_seq = sequencer(test, one_hot_cols=['instrument'], include_transp=False)


### Saving dataset

In [6]:
import pickle

# Pickle each split's sequence list into its own file inside the data folder.
for file_name, split_sequences in (
    ('LvB_train_sequences.data', train_seq),
    ('LvB_val_sequences.data', val_seq),
    ('LvB_test_sequences.data', test_seq),
):
    with open(os.path.join(dataFolder, file_name), 'wb') as filehandle:
        pickle.dump(split_sequences, filehandle)