## Note-level dataset generation

This notebook uses raw data from the MusicNet dataset to set up sequential numpy arrays suitable for training deep neural networks.

**Before running:** Run the "Levels Computation" notebook first. It produces the numpy array files with global audio levels (`levels/<piece id>_global_lvls.npy` inside the data folder) that this notebook loads for every piece.

In [10]:
#### START HERE ####

# Root folder for every input and output of this notebook; all paths in the
# cells below are built with os.path.join(dataFolder, ...).
dataFolder = 'data/'  # make sure the path to data folder is correct
# Number of (train, validation) splits drawn by the
# "Picking Training / Validation / Test sets" cell.
num_folds = 1

### Computing Note Information

In [2]:
import os
import json
import re

import numpy as np
import pandas as pd
from IPython.display import clear_output

import expression_modeling as m 

def preprocess(labelsDir, csv, outfile=None, include_header=False, include_transp=True):
    """Build the note-level table of one piece and write it out as CSV.

    Reads the label file ``<labelsDir>/<csv>.csv`` and the matching levels array
    ``<dataFolder>/levels/<csv>_global_lvls.npy``, builds an
    ``expression_modeling.Piece`` from them and serialises the note dataframe
    returned by ``m.buildSimpleNoteDataframe``.

    Parameters
    ----------
    labelsDir : str
        Folder containing the label CSV files.
    csv : str
        Piece identifier, i.e. the label file name without ``.csv``. It must be
        parseable by ``int`` because it is stored in the ``pieceId`` column.
    outfile : file object, optional
        Open, writable text handle to write to. When omitted, a new file
        ``<dataFolder>/<csv>.csv`` is created and written with a header.
    include_header : bool
        Whether to emit the CSV header row (only used when ``outfile`` is given).
    include_transp : bool
        If True, one copy of the piece is generated for each transposition in
        ``range(-3, 4)``; otherwise only the untransposed copy is generated.

    Returns
    -------
    file object
        The handle that was written to. When this function opened it, the
        handle is returned still open and the caller must close it.
    """
    # load the symbolic information from the dataset
    # (np.atleast_1d: genfromtxt yields a 0-d array for a single-row file,
    #  which could not be indexed with [0] / [-1] below)
    notearray = np.atleast_1d(np.genfromtxt(
        os.path.join(labelsDir, csv + '.csv'),
        delimiter=',',
        names=True,
        dtype=['i', 'i', 'i', 'i', 'f', 'f', '|U40']))

    # sort by score time first for correct parsing
    notearray.sort(order=['start_beat', 'start_time'])

    # load levels (generated by "Levels computation" notebook)
    levels = np.load(os.path.join(dataFolder, 'levels', csv + '_global_lvls.npy'))

    piece = m.Piece(name=csv)
    piece.dynMean = np.mean(levels)
    piece.dynStd = np.std(levels)
    piece.startTime = notearray['start_time'][0]
    piece.startBeat = notearray['start_beat'][0]
    # NOTE(review): takes the end time of the last note in score order, which is
    # not necessarily the latest end time in the piece - confirm this is intended.
    piece.endTime = notearray['end_time'][-1]
    # The 'end_beat' column is added to 'start_beat' here, i.e. it is treated as
    # a length in beats rather than an absolute position - TODO confirm against
    # the dataset documentation.
    piece.endBeat = notearray['start_beat'][-1] + notearray['end_beat'][-1]
    # levels are standardized per piece before being handed to the part builder;
    # 44100 is presumably the audio sample rate - verify against m.buildPart.
    piece.part = m.buildPart(notearray, (levels - piece.dynMean)/piece.dynStd, 44100)

    # Always collect the per-transposition frames in a list: pd.concat requires
    # an iterable of frames and raises TypeError when handed a bare DataFrame,
    # which is what the untransposed path used to pass.
    transpositions = range(-3, 4) if include_transp else [0]
    frames = []
    for tr in transpositions:
        di = m.buildSimpleNoteDataframe(piece, transpose=tr)
        di['transposition'] = tr
        frames.append(di)
    df = pd.concat(frames, ignore_index=True)
    df['pieceId'] = int(csv)

    if outfile is None:
        # newline='' as recommended by pandas for file objects passed to to_csv,
        # so line terminators are not translated a second time on Windows.
        outfile = open(os.path.join(dataFolder, csv + '.csv'), 'w+', newline='')
        df.to_csv(outfile)
    else:
        df.to_csv(outfile, mode='a', header=include_header)
    return outfile

# select pieces containing violin
csvfolder = os.path.join(dataFolder, 'musicnet', 'train_labels')

# Matches a row whose third comma-separated field is 41 (the value this
# notebook uses to identify violin rows in the label files).
violin_row = re.compile(r'^(.*?,){2}\s*?41\s*?,(.*?,){3}', re.MULTILINE)

dataset = []
for label_file in os.listdir(csvfolder):
    # read each label file inside a `with` so its handle is closed right away
    # instead of leaking one open file per piece
    with open(os.path.join(csvfolder, label_file), 'r') as labels:
        if violin_row.search(labels.read()):
            dataset.append(label_file)

# newline='' as recommended by pandas when a file object is passed to to_csv
with open(os.path.join(dataFolder, 'musicnet_violin_IE1.csv'), 'w+', newline='') as outfile:
    # only the first piece writes the CSV header; the rest are appended below it
    header = True
    for i, mvt in enumerate(dataset):
        clear_output(wait=True)
        print("Processing piece " + str(i+1) + '/' + str(len(dataset)))
        # mvt[:-4] strips the '.csv' extension to obtain the piece id
        preprocess(csvfolder, mvt[:-4], outfile, include_header=header)
        header = False

Processing piece 123/123


## Sequence Generation (Features)

The following cells use the CSV file produced above to format the data into sequences of notes. Each note is encoded as an index into a pitch vocabulary plus a set of musicologically relevant features about that note.

### Loading Note Information

In [13]:
import os
import numpy as np
import pandas as pd

# load the note table written by the "Computing Note Information" section
with open(os.path.join(dataFolder, 'musicnet_violin_IE1.csv'), 'r') as csv_file:
    df = pd.read_csv(csv_file)

# Input Encoding I --> minimal, i.e.: no musicological info
# ('Unnamed: 0' is the index column that was serialised along with the data)
df = df.drop(columns=['Unnamed: 0'])

# make `instrument` categorical over every instrument value found in the table
instrs = set(df.instrument)
instrument_dtype = pd.CategoricalDtype(instrs)
df['instrument'] = df['instrument'].astype(instrument_dtype)

print('initial size: ' + str(df.shape[0]))

initial size: 3672648


### Mapping vocabulary

In [15]:
import pickle

# using mapping from maestro dataset (88 keys + 4 ctrl. words)
# NOTE: pickle.load can execute arbitrary code contained in the file - only
# load a mapping file that comes from a trusted source.
with open(os.path.join(dataFolder, 'mF_pitch_dict.data'), 'rb') as filehandle:
    lex_to_ix = pickle.load(filehandle)

# Each lookup key is a 1-tuple (pitch,), as produced by itertuples on the
# single 'pitch' column. The list is built once (it used to be built twice,
# which meant two full passes over the whole table).
pitches = list(df.loc[:,['pitch']].itertuples(index=False, name=None))

# pitches missing from the vocabulary share a single "unknown" index
unknown_ix = len(lex_to_ix) + 1

# NOTE: this overwrites df['pitch'] with vocabulary indices, so running this
# cell a second time without reloading df would look up already-mapped values.
df['pitch'] = [lex_to_ix.get(p, unknown_ix) for p in pitches]

### Picking Training / Validation / Test sets:

In [16]:
import os
import json
import random
import re  # used below; imported here so this cell does not rely on an earlier cell

random.seed(777)

csvfolder = os.path.join(dataFolder, 'musicnet', 'train_labels')

# Matches a row whose third comma-separated field is 41 (same selection rule
# as the one used to build the note CSV).
violin_row = re.compile(r'^(.*?,){2}\s*?41\s*?,(.*?,){3}', re.MULTILINE)

# NOTE: os.listdir returns names in arbitrary order, so for a given seed the
# split below still depends on the directory listing order of the machine.
# The order is deliberately left untouched here to keep existing splits stable.
all_pieces = []  # piece ids (file name without '.csv'); avoids shadowing builtin `all`
for label_file in os.listdir(csvfolder):
    # `with` closes each label file instead of leaking its handle
    with open(os.path.join(csvfolder, label_file), 'r') as labels:
        if violin_row.search(labels.read()):
            all_pieces.append(label_file[0:-4])

folds = []
test_pieces = []

# reserve 10% for test
# (the drawn piece is no longer stored in a variable called `m`, which used to
#  overwrite the `expression_modeling` module alias of the first section)
test_sz = int(len(all_pieces) / 10)
for i in range(test_sz):
    picked = all_pieces.pop(random.randint(0, len(all_pieces) - 1))
    test_pieces.append(picked)
print('Test set pieces: ' + str(test_pieces))

for i in range(num_folds):
    train = all_pieces.copy()
    val = []
    # another 10% of what remains for validation
    val_sz = int(len(train) / 10)
    for j in range(val_sz):
        picked = train.pop(random.randint(0, len(train) - 1))
        val.append(picked)
    folds.append((train, val))

print('Val. set IDs: ' + str([f[1] for f in folds]))

Test set pieces: ['2140', '1923', '2186', '2397', '1872', '2627', '1828', '2433', '2148', '2288', '1788', '2178']
Val. set IDs: [['2398', '2562', '2573', '1739', '2166', '1919', '2497', '2482', '1805', '2243', '1729']]


### Arranging data for sequential training and saving dataset

In [17]:
import pickle

def sequencer(df, one_hot_cols=None, include_transp=True):
    """Split a note table into one token-delimited sequence per piece and transposition.

    Parameters
    ----------
    df : pandas.DataFrame
        Note table with at least the columns ``pieceId``, ``transposition``,
        ``pitch`` and the output columns listed in ``outCols`` below.
    one_hot_cols : list of str, optional
        Columns to replace by their one-hot encoding (``pd.get_dummies`` with
        the column name as prefix).
    include_transp : bool
        If True, sequences are built for each transposition in ``range(-3, 4)``;
        otherwise only for transposition 0.

    Returns
    -------
    list
        One entry ``((x, y, moments), pieceId, transposition)`` per non-empty
        (piece, transposition) subset. ``x`` holds the input columns, ``y`` the
        output columns, and ``moments`` is a ``(3, 2)`` array with the mean
        (column 0) and standard deviation (column 1) that were used to
        standardize ``localTempo``, ``peakLevel`` and ``ioiRatio`` within that
        sequence.

    Notes
    -----
    Reads the module-level ``lex_to_ix`` vocabulary to derive the pitch indices
    of the <END> (``len(lex_to_ix) + 2``) and <SOS> (``len(lex_to_ix) + 3``)
    rows. All other columns of those two rows are zero.
    """
    feats = ['localTempo', 'peakLevel', 'ioiRatio']
    outCols = ['ioiRatio', 'timingDev', 'timingDevLocal', 'localTempo', 'peakLevel', 'startTime', 'durationSecs']
    transps = range(-3, 4) if include_transp else [0]

    sequences = []
    # list the pieces
    for p in set(df.pieceId):
        dp = df.loc[df.pieceId == p, :]
        for tr in transps:
            d = dp.loc[dp.transposition == tr, :].drop(['pieceId', 'transposition'], axis=1)
            if d.empty:
                # nothing recorded for this transposition: skip it rather than
                # emit a sequence made only of control tokens with NaN moments
                continue

            # convert categories to one-hot
            if one_hot_cols:
                for attrib in one_hot_cols:
                    # dtype pinned to uint8 (the historical pandas default) so the
                    # columns stay numeric once the float token rows are added;
                    # the bool default of pandas >= 2.0 would yield object columns
                    dummies = pd.get_dummies(d[attrib], prefix=attrib, dtype=np.uint8)
                    d = pd.concat([d.drop([attrib], axis=1), dummies], axis=1)

            # instance standardization for relevant features
            aux = d.loc[:, feats]
            moments = np.zeros((aux.shape[1], 2))
            moments[:, 0] = aux.mean().to_numpy()
            moments[:, 1] = aux.std().to_numpy()
            d.loc[:, feats] = (aux - moments[:, 0]) / moments[:, 1]

            # <END> token row
            end = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
            end["pitch"] = len(lex_to_ix) + 2

            # <SOS> token row
            start = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
            start["pitch"] = len(lex_to_ix) + 3

            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            d = pd.concat([start, d, end])

            # separate output features
            y = d.loc[:, outCols].copy()
            d = d.drop(outCols, axis=1)

            sequences.append(((d, y, moments), p, tr))
    return sequences

def standardize(df, moments=None, cols=None):
    """Z-score the selected columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the selected columns are overwritten.
    moments : numpy.ndarray, optional
        Array of shape ``(n_selected, 2)`` holding the mean (column 0) and the
        standard deviation (column 1) to apply. When omitted, both are computed
        from ``df`` itself.
    cols : column selector, optional
        Anything accepted by ``df.loc[:, cols]``. When omitted, every column
        whose dtype is ``float64`` is selected.

    Returns
    -------
    tuple
        ``(moments, cols)`` - the statistics that were applied and the selector
        that was used, so the same transform can be repeated on other frames
        or reverted later.
    """
    selector = cols if cols is not None else (df.dtypes == 'float64')
    selected = df.loc[:, selector]

    if moments is None:
        # output mean and std for reverting predictions
        moments = np.empty((selected.shape[1], 2))
        moments[:, 0] = selected.mean().to_numpy()
        moments[:, 1] = selected.std().to_numpy()

    centre = moments[:, 0]
    scale = moments[:, 1]
    df.loc[:, selector] = (selected - centre) / scale
    return moments, selector


# Separate Training / Validation / Test:

# `pieceId` is stored as an integer in the note table, while the split lists
# hold the ids as strings taken from file names. Recent pandas versions do not
# match '2140' against 2140 in `isin`, which would silently yield empty
# subsets, so the ids are converted to int before filtering.
test_ids = [int(p) for p in test_pieces]
test = df.loc[df.pieceId.isin(test_ids), :].copy()

moments = None
cols = None
for i, (training_pieces, val_pieces) in enumerate(folds):

    train_ids = [int(p) for p in training_pieces]
    val_ids = [int(p) for p in val_pieces]
    train = df.loc[df.pieceId.isin(train_ids), :].copy()
    val = df.loc[df.pieceId.isin(val_ids), :].copy()

    # Standardization: statistics come from the training split only and are
    # then re-applied unchanged to the validation split
    moments, cols = standardize(train, cols=['beatDiff', 'duration', 'ioi', 'startTime', 'durationSecs', 'timingDev', 'timingDevLocal'])
    standardize(val, moments=moments, cols=cols)
    # keep the statistics so the standardization can be reverted later
    with open(os.path.join(dataFolder, 'MNv_I_normalizer_fold_' + str(i) + '.data'), 'wb') as filehandle:
        pickle.dump((moments, cols), filehandle)

    # transpositions are kept for training only; validation uses transposition 0
    train_seq = sequencer(train, one_hot_cols=['instrument'])
    val_seq = sequencer(val, one_hot_cols=['instrument'], include_transp=False)

    #  Save arrays
    print('Saving fold ' + str(i))
    with open(os.path.join(dataFolder, 'MNv_I_train_sequences_fold_' + str(i) + '.data'), 'wb') as filehandle:
        pickle.dump(train_seq, filehandle)
    with open(os.path.join(dataFolder, 'MNv_I_val_sequences_fold_' + str(i) + '.data'), 'wb') as filehandle:
        pickle.dump(val_seq, filehandle)

# Prepare test sequences
print('Saving test data')
# NOTE: if `folds` is empty, moments and cols are still None here and the test
# set is standardized with its own statistics over all float64 columns.
standardize(test, moments=moments, cols=cols) # using last fold moments (it's good enough)
test_seq = sequencer(test, one_hot_cols=['instrument'], include_transp=False)
with open(os.path.join(dataFolder, 'MNv_I_test_sequences.data'), 'wb') as filehandle:
    pickle.dump(test_seq, filehandle)
print('Finished.')

Saving fold 0
Saving test data
Finished.
