In [2]:
import os

import numpy as np
import pandas as pd
from IPython.display import clear_output

import expression_modeling as m

def preprocess(labelsDir, instruments={41, 42, 43, 72, 74}, csvname='per_note', outfile=None,
               levelsDir='data/levels', outDir='data'):
    """Build the per-note table for every label CSV found in ``labelsDir``.

    For each ``<id>.csv`` label file that contains at least one of the desired
    instruments, the matching ``<id>_global_lvls.npy`` levels array is loaded,
    an ``m.Piece`` is assembled, one dataframe per desired instrument is built
    with ``m.buildNoteLevelDataframe`` and the concatenated result (plus a
    ``pieceId`` column) is written as CSV.

    Parameters
    ----------
    labelsDir : str
        Directory holding the label files. Only entries ending in ``.csv`` are
        processed, in sorted order so that the output is reproducible.
    instruments : set of int
        Instrument codes to keep; pieces containing none of them are skipped.
        The default set is only read, never modified.
    csvname : str
        Base name (without extension) of the CSV created when ``outfile`` is None.
    outfile : file object or None
        Already-open text handle to append to (rows are written without a
        header). When None, ``<outDir>/<csvname>.csv`` is created on the first
        piece that yields data and the header is written once.
    levelsDir : str
        Directory containing the ``*_global_lvls.npy`` files.
    outDir : str
        Directory in which the output CSV is created.

    Returns
    -------
    file object or None
        The still-open output handle (the caller must close it), or None when
        ``outfile`` was None and no piece contained a desired instrument.

    Raises
    ------
    Whatever the loading/modelling calls raise. A handle opened by this call
    is closed before the exception propagates; a handle supplied by the caller
    is left untouched.
    """
    # os.listdir returns entries in arbitrary order and may contain stray files
    # (editor checkpoints, OS metadata), so filter and sort explicitly.
    dataset = sorted(name for name in os.listdir(labelsDir) if name.endswith('.csv'))
    opened_here = None  # handle created by this call, tracked so it can be closed on failure

    try:
        for i, csv in enumerate(dataset):
            print('processing piece ' + str(i+1) + '/' + str(len(dataset)), end='\r')
            stem = os.path.splitext(csv)[0]

            # load the symbolic information from the dataset
            # atleast_1d: genfromtxt returns a 0-d array for a file with a single data row,
            # which cannot be iterated or indexed with [0] / [-1] below.
            notearray = np.atleast_1d(np.genfromtxt(
                os.path.join(labelsDir, csv), delimiter=',', names=True,
                dtype=['i', 'i', 'i', 'i', 'f', 'f', '|U40']))

            #  check if piece contains any desired instrument
            csv_instruments = set(notearray['instrument'])
            csv_desired_instruments = csv_instruments.intersection(instruments)
            if not csv_desired_instruments:
                continue

            #  load levels (generated by "Levels computation" notebook)
            levels = np.load(os.path.join(levelsDir, stem + '_global_lvls.npy'))

            # piece key estimation (only major and minor for now);
            # the two likelihood values returned by estimateKey are not used here
            isMajor, key, _llhoodM, _llhoodm = m.estimateKey(notearray['note'])
            mode = m.Mode.major if isMajor else m.Mode.minor

            piece = m.Piece(key=key, mode=mode, name=csv)
            piece.dynMean = np.mean(levels)
            piece.dynStd = np.std(levels)
            # first/last rows are used as the piece boundaries
            # (assumes the label rows are ordered in time — TODO confirm)
            piece.startTime = notearray['start_time'][0]
            piece.startBeat = notearray['start_beat'][0]
            piece.endTime = notearray['end_time'][-1]
            # NOTE(review): adds 'end_beat' to 'start_beat', i.e. treats 'end_beat' as a
            # length in beats rather than an absolute position — confirm against the dataset.
            piece.endBeat = notearray['start_beat'][-1] + notearray['end_beat'][-1]
            # levels are z-scored with the piece's own mean/std before being handed over;
            # 44100 is passed through as-is (presumably the audio sample rate — TODO confirm)
            piece.parts = m.buildNoteParts(notearray, (levels - piece.dynMean)/piece.dynStd, 44100, csv_desired_instruments)

            frames = [m.buildNoteLevelDataframe(piece, inst) for inst in csv_desired_instruments]
            df = pd.concat(frames, ignore_index=True)
            df['pieceId'] = int(stem)

            if outfile is None:
                # first piece with data: create the file and write the header row
                opened_here = outfile = open(os.path.join(outDir, csvname + '.csv'), 'w+')
                df.to_csv(outfile)
            else:
                df.to_csv(outfile, mode='a', header=False)
    except BaseException:
        # do not leak a handle the caller never got to see
        if opened_here is not None:
            opened_here.close()
        raise
    return outfile

# Training-set run (disabled).
# NOTE(review): with the default csvname this call would write 'data/per_note.csv',
# while the loading cell further down reads 'data/per_note_train.csv' — confirm which
# name is intended before re-enabling.
# clear_output()
# print('Begin training set')
# f = preprocess('./data/musicnet/train_labels')
# f.close()

print('Begin processing test set')
f = preprocess('./data/musicnet/test_labels', csvname='per_note_test')
# preprocess returns None when no piece contained a desired instrument,
# in which case no file was opened and there is nothing to close
if f is not None:
    f.close()


Begin processing test set
processing piece 10/10

In [3]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so later cells are reproducible
np.random.seed(1728)

#  read csv
# The context manager closes the handle as soon as the table has been parsed
# (the original left it open for the lifetime of the kernel).
with open('data/per_note_train.csv', 'r') as path:
    df = pd.read_csv(path)
# 'Unnamed: 0' is the index column stored in the CSV; dropping it (rather than using it
# as the index) leaves the default 0..N-1 row index on the frame.
df = df.drop(columns=['Unnamed: 0'])
# Fix the category sets explicitly so that one-hot encoding always yields the same
# columns regardless of which values occur; values outside a set become NaN.
df['pitch'] = df['pitch'].astype(pd.CategoricalDtype(list(range(36, 109))))
df['bassNote'] = df['bassNote'].astype(pd.CategoricalDtype(list(range(0, 12))))
df['metricStrength'] = df['metricStrength'].astype(pd.CategoricalDtype(list(range(0, 4))))
df['instrument'] = df['instrument'].astype(pd.CategoricalDtype([41, 42, 43, 72, 74]))
print('initial size: ' + str(len(df)))


initial size: 340785


In [4]:
def _safe_std(frame):
    """Column-wise sample std of ``frame`` with zero/NaN entries replaced by 1.0."""
    std = frame.std()
    # NaN > 0 is False, so both constant columns (std 0) and one-row frames (std NaN) map to 1.0
    return std.where(std > 0, 1.0)


def sequencer(df):
    """Turn the flat per-note table into zero-padded per-(instrument, piece) sequences.

    For every instrument/piece pair present in ``df`` the rows are taken in
    their existing order, the columns ``pieceId``, ``startTime``,
    ``durationSecs`` and ``peakLevel`` are dropped, every ``float64`` column is
    standardised with that sequence's own mean and sample std, and the columns
    ``metricStrength``, ``pitch``, ``bassNote`` and ``instrument`` are one-hot
    encoded. The columns ``timingDev``, ``timingDevLocal`` and ``localTempo``
    form the targets; everything else forms the inputs.

    A column that is constant within a sequence (or a sequence of one note)
    has a std of 0 (or NaN); dividing by it would fill the column with NaN, so
    such a std is replaced by 1.0 and the column standardises to zeros. The
    same guarded std is stored in ``moments`` so ``y * std + mean`` still
    recovers the original target values.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-note table containing the columns named above. The one-hot columns
        are expected to be categorical with fixed categories so that every
        sequence yields the same feature columns.

    Returns
    -------
    X : ndarray, float64, shape (n_sequences, max_len, n_features)
        Input features, padded with zeros after each sequence's end.
    Y : ndarray, float64, shape (n_sequences, max_len, 3)
        Standardised targets, padded with zeros.
    moments : ndarray, shape (n_sequences, 3, 2)
        Per-sequence target mean (``[..., 0]``) and std (``[..., 1]``), computed
        before standardisation, for reverting predictions.
    pd_idx : ndarray, int32, shape (n_sequences, max_len)
        Index labels of ``df`` for every sequence position; padding is -10000.

    Raises
    ------
    KeyError
        If a required column is missing.
    IndexError
        If ``df`` yields no sequence at all (e.g. it has no rows).
    """
    dropCols = ['pieceId', 'startTime', 'durationSecs', 'peakLevel']
    outCols = ['timingDev', 'timingDevLocal', 'localTempo']
    oneHotCols = ['metricStrength', 'pitch', 'bassNote', 'instrument']
    padIndex = -10000  # marks padded positions in pd_idx

    sequences = []
    maxLen = 0
    #  list the instruments
    for ins in set(df.instrument):
        di = df.loc[df.instrument == ins, :]
        # list the pieces
        for p in set(di.pieceId):
            d = di.loc[di.pieceId == p, :]
            maxLen = max(maxLen, len(d))
            d = d.drop(columns=dropCols)

            # output mean and std for reverting predictions (taken before scaling)
            outs = d.loc[:, outCols]
            seqMoments = np.zeros((len(outCols), 2))
            seqMoments[:, 0] = outs.mean().to_numpy()
            seqMoments[:, 1] = _safe_std(outs).to_numpy()

            #  standardize features (float64 columns only; categoricals are untouched)
            isFloat = d.dtypes == 'float64'
            nums = d.loc[:, isFloat]
            d.loc[:, isFloat] = (nums - nums.mean()) / _safe_std(nums)

            #  convert categories to one-hot; each group is appended at the end, in this order
            for attrib in oneHotCols:
                dummies = pd.get_dummies(d[attrib], prefix=attrib)
                d = pd.concat([d.drop(columns=[attrib]), dummies], axis=1)

            y = d.loc[:, outCols]
            x = d.drop(columns=outCols)
            sequences.append((seqMoments, x, y))

    nSeq = len(sequences)
    X = np.zeros((nSeq, maxLen, len(sequences[0][1].columns)), dtype='float64')
    Y = np.zeros((nSeq, maxLen, len(sequences[0][2].columns)), dtype='float64')
    moments = np.zeros((nSeq, len(outCols), 2))
    pd_idx = np.full((nSeq, maxLen), padIndex, dtype='int32')
    for i, (mm, x, y) in enumerate(sequences):
        # explicit float conversion: the one-hot columns are bool/uint8 depending on pandas version
        X[i, 0:len(x), :] = x.to_numpy(dtype='float64')
        Y[i, 0:len(y), :] = y.to_numpy(dtype='float64')
        moments[i, :, :] = mm
        pd_idx[i, 0:len(x)] = x.index
    return X, Y, moments, pd_idx
        
#  reshape the flat per-note table into padded per-(instrument, piece) sequences
X, Y, moments, pd_idx = sequencer(df)
print("dataset size: {}".format(X.shape))

#  optional NaN diagnostics / clean-up, kept disabled:
#  locate NaNs
# nans = np.argwhere(np.isnan(X))

#  drop every sequence that has a NaN anywhere in its inputs or targets
# okrows = np.logical_not(np.logical_or(np.isnan(X).any(axis=(1,2)), np.isnan(Y).any(axis=(1,2))))
# X = X[okrows,:,:]
# Y = Y[okrows,:,:]
# moments = moments[okrows,:,:]
# pd_idx = pd_idx[okrows,:]
# print("dataset size without NaN: " + str(X.shape))

dataset size: (357, 2459, 104)


In [None]:
#  Persist the sequence arrays, one .npy file per array
for npy_path, array in (
    ('data/X_sequential_per_note.npy', X),
    ('data/Y_sequential_per_note.npy', Y),
    ('data/Y_moments.npy', moments),
    ('data/dataframe_idx.npy', pd_idx),
):
    np.save(npy_path, array)