### Note-level sequence generation

This notebook uses note-level data from the MusicNet dataset to set up sequential arrays suitable for training deep neural networks.

**Before running:** Generate the required CSV files (`per_note_train.csv` and/or `per_note_test.csv`) in the data folder using the note-level processing notebook.

In [17]:
#### START HERE ####
#  User settings: adjust these two values before running the cells below.

#  Folder that holds the csv inputs and receives the pickled outputs. It is
#  prepended verbatim to every file name, so keep the trailing slash.
dataFolder = "data/"

#  True  -> process the training csv and (re)create the vocabulary/normalizer files.
#  False -> process the test csv, reusing the files written by a training run.
is_training_set = True

### Loading Note Information

In [3]:
import numpy as np
import pandas as pd

#  Seed NumPy's global random generator.
np.random.seed(1728)

#  Read the note-level csv that matches the split being processed.
csv_name = 'per_note_train.csv' if is_training_set else 'per_note_test.csv'
with open(dataFolder + csv_name, 'r') as path:
    df = pd.read_csv(path)

#  Columns that are discarded before any further processing.
unused_cols = [
    'Unnamed: 0',
    'bassNote',
    'probChord_I', 'probChord_II', 'probChord_III', 'probChord_IV',
    'probChord_V', 'probChord_VI', 'probChord_VII',
    'isDissonance',
]
df.drop(unused_cols, axis=1, inplace=True)

#  Declare the instrument codes as a fixed set of categories; values outside
#  this list become missing after the cast.
instrument_dtype = pd.CategoricalDtype([41, 42, 43, 72, 74])
df['instrument'] = df['instrument'].astype(instrument_dtype)

print('initial size: {}'.format(len(df)))


initial size: 40838


### Mapping melodic and harmonic vocabularies

In [4]:
import pickle

def _index_vocabulary(items):
    """Map each distinct item to a 1-based index, in order of first appearance.

    Index 0 is never assigned, so it stays vacant for masking. Assigning
    indices by first appearance (rather than by iterating a ``set``) makes the
    mapping depend only on the input order, so regenerating the vocabulary from
    the same csv always yields the same indices.
    """
    lex_to_ix = {}
    for lex in items:
        if lex not in lex_to_ix:
            lex_to_ix[lex] = len(lex_to_ix) + 1
    return lex_to_ix


#  Every entry is a 1-tuple, e.g. (pitch,); these tuples are the keys of the
#  pickled dictionaries.
melodies = list(df.loc[:, ['pitch']].itertuples(index=False, name=None))
harmonies = list(df.loc[:, ['harmony']].itertuples(index=False, name=None))

if is_training_set:
    #  generate the melody (pitch) and harmony vocabularies from the training data

    print("Mapping melody vocabulary.")
    m_lex_to_ix = _index_vocabulary(melodies)
    voc_mel = list(m_lex_to_ix)
    print('vocabulary size = ' + str(len(voc_mel)))

    print("Mapping harmony vocabulary.")
    h_lex_to_ix = _index_vocabulary(harmonies)
    voc_harm = list(h_lex_to_ix)
    print('vocabulary size = ' + str(len(voc_harm)))

    with open(dataFolder + 'mel_harm_dict.data', 'wb') as filehandle:
        pickle.dump((m_lex_to_ix, h_lex_to_ix), filehandle)
else:
    #  Reuse the vocabularies written by a training run.
    #  NOTE(security): pickle.load can execute arbitrary code; only load a
    #  mel_harm_dict.data file that you generated yourself.
    with open(dataFolder + 'mel_harm_dict.data', 'rb') as filehandle:
        m_lex_to_ix, h_lex_to_ix = pickle.load(filehandle)

#  Replace the raw values by their vocabulary index. Entries that are not in
#  the vocabulary all share the index len(vocabulary) + 1.
oov_harmony = len(h_lex_to_ix) + 1
oov_melody = len(m_lex_to_ix) + 1
df['harmony'] = [h_lex_to_ix.get(h, oov_harmony) for h in harmonies]
df['melody'] = [m_lex_to_ix.get(m, oov_melody) for m in melodies]
df.drop(['pitch'], axis=1, inplace=True)

Mapping melody vocabulary.
vocabulary size = 657


### Arranging data for sequential training

In [13]:
def sequencer(df, one_hot_cols=None, melody_vocab=None, harmony_vocab=None,
              transpositions=range(-3, 4)):
    """Split a note table into one sequence per (piece, instrument, transposition).

    Parameters
    ----------
    df : pandas.DataFrame
        Note table. Must contain the columns ``pieceId``, ``instrument``,
        ``transposition``, ``melody``, ``harmony`` and the output columns
        ``ioiRatio``, ``timingDev``, ``timingDevLocal``, ``localTempo``,
        ``peakLevel``, ``startTime``, ``durationSecs``. It is not modified.
    one_hot_cols : list of str, optional
        Columns replaced by one-hot columns named ``<col>_<value>``.
    melody_vocab, harmony_vocab : sized, optional
        Vocabularies whose lengths determine the <END> token indices
        (``len(vocab) + 2``). When omitted, the module-level ``m_lex_to_ix``
        and ``h_lex_to_ix`` are used.
    transpositions : iterable, optional
        Values of the ``transposition`` column to extract, in output order.

    Returns
    -------
    list of (piece_seq, pieceId)
        ``piece_seq`` is a list of ``(x, y, transposition, instrument, moments)``
        where ``x`` holds the input features, ``y`` the output features, and
        ``moments`` is a ``(2, 2)`` array with the mean (column 0) and the
        divisor (column 1) used to standardize ``localTempo`` and ``peakLevel``
        within that sequence. Both ``x`` and ``y`` end with an <END> row.

    Notes
    -----
    A (piece, instrument, transposition) combination without any rows still
    produces a sequence; it contains only the <END> row and NaN means.
    """
    #  Fall back to the notebook-level vocabularies when none are passed in.
    if melody_vocab is None:
        melody_vocab = m_lex_to_ix
    if harmony_vocab is None:
        harmony_vocab = h_lex_to_ix
    end_melody = len(melody_vocab) + 2
    end_harmony = len(harmony_vocab) + 2

    feats = ['localTempo', 'peakLevel']
    outCols = ['ioiRatio', 'timingDev', 'timingDevLocal', 'localTempo',
               'peakLevel', 'startTime', 'durationSecs']

    sequences = []
    for p in set(df.pieceId):
        piece_seq = []
        dp = df.loc[df.pieceId == p, :]
        for i in set(dp.instrument):
            di = dp.loc[dp.instrument == i, :]
            for tr in transpositions:
                #  Work on an independent copy so the caller's frame is untouched.
                d = di.loc[di.transposition == tr, :].copy()
                d = d.drop(columns=['pieceId', 'transposition'])

                #  convert categories to one-hot
                for attrib in one_hot_cols or ():
                    #  Float one-hot columns keep a numeric dtype once the
                    #  float <END> row is concatenated below (pandas >= 2.0
                    #  would otherwise produce bool columns).
                    dummies = pd.get_dummies(d[attrib], prefix=attrib, dtype=float)
                    d = pd.concat([d.drop(columns=[attrib]), dummies], axis=1)

                #  instance standardization for relevant features
                aux = d.loc[:, feats]
                moments = np.zeros((aux.shape[1], 2))
                moments[:, 0] = aux.mean().to_numpy()
                std = aux.std().to_numpy()
                #  A zero (constant feature) or NaN (fewer than two notes)
                #  deviation would turn the feature into NaN/inf; divide by 1.
                moments[:, 1] = np.where(std > 0, std, 1.0)
                d[feats] = (aux - moments[:, 0]) / moments[:, 1]

                #  separate output features
                y = d.loc[:, outCols].copy()
                d = d.drop(columns=outCols)

                #  add <END> token to sequence: an all-zero row whose melody and
                #  harmony carry the dedicated end indices
                endx = pd.DataFrame(np.zeros((1, d.shape[1])), columns=d.columns)
                endy = pd.DataFrame(np.zeros((1, y.shape[1])), columns=y.columns)
                endx["melody"] = end_melody
                endx["harmony"] = end_harmony
                #  DataFrame.append was removed in pandas 2.0; concat is the
                #  equivalent that works across versions.
                d = pd.concat([d, endx])
                y = pd.concat([y, endy])

                piece_seq.append((d, y, tr, i, moments))
        sequences.append((piece_seq, p))
    return sequences

def standardize(df, moments=None, cols=None):
    """Standardize selected columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with ``(value - mean) / std``.
    moments : numpy.ndarray of shape (n_cols, 2), optional
        Column 0 holds the means and column 1 the divisors to apply. When
        omitted, both are computed from ``df``.
    cols : column selector, optional
        Anything accepted by ``df.loc[:, cols]`` (e.g. a list of labels or a
        boolean mask over the columns). Defaults to every ``float64`` column.

    Returns
    -------
    (moments, cols)
        The moments that were applied (for reverting predictions or reusing
        them on another split) and the column selector that was used.
    """
    if cols is None:
        cols = (df.dtypes == 'float64')
    nums = df.loc[:, cols]

    if moments is None:
        moments = np.zeros((nums.shape[1], 2))  # output mean and std for reverting predictions
        moments[:, 0] = nums.mean().to_numpy()
        std = nums.std().to_numpy()
        #  A zero (constant column) or NaN (fewer than two rows) deviation
        #  would turn the whole column into NaN/inf; use 1 as the divisor.
        moments[:, 1] = np.where(std > 0, std, 1.0)

    if nums.shape[1] == 0:
        #  Nothing selected: leave df untouched.
        return moments, cols

    #  Guard the divisor as well when the moments were supplied by the caller,
    #  without modifying the caller's array.
    scale = np.asarray(moments[:, 1], dtype=float)
    scale = np.where(scale > 0, scale, 1.0)

    #  Replace whole columns (instead of writing through .loc) so integer
    #  columns can take the float result without an incompatible-dtype set.
    df[nums.columns] = (nums - moments[:, 0]) / scale
    return moments, cols

#  Global standardization: the moments are computed on the training split and
#  stored, then loaded and re-applied when processing the test split.
normalizer_path = dataFolder + 'normalizer.data'

if not is_training_set:
    #  NOTE: pickle.load can execute arbitrary code; only load a file you created.
    with open(normalizer_path, 'rb') as filehandle:
        moments, cols = pickle.load(filehandle)
    standardize(df, moments=moments, cols=cols)
else:
    moments, cols = standardize(
        df,
        cols=['duration', 'ioi', 'ioiRatio', 'startTime', 'durationSecs',
              'timingDev', 'timingDevLocal'],
    )
    with open(normalizer_path, 'wb') as filehandle:
        pickle.dump((moments, cols), filehandle)

#  Build the per-piece sequences, one-hot encoding the instrument column.
sequences = sequencer(df, one_hot_cols=['instrument'])

print("Number of pieces: {}".format(len(sequences)))


Number of pieces: 7


### Saving dataset

In [None]:
import pickle

#  Pickle the sequences under the file name of the split being processed
if is_training_set:
    sequences_file = 'mel_harm_sequences.data'
else:
    sequences_file = 'mel_harm_sequences_test.data'

with open(dataFolder + sequences_file, 'wb') as filehandle:
    pickle.dump(sequences, filehandle)