# Find Your Rhythm Input Data Transformation

Transform audio input and label data into arrays for consumption by a neural network ML model.

Labeled input data sources:

* MDB
* IDMT-SMT
* e-GMD

In [1]:
import pickle
import librosa
import numpy as np
import IPython.display as ipd
import os
import shutil
import random
import sys
import shutil

In [2]:
# import utility functions
sys.path.append('../progs')
from utility_functions import *

As all labels will be standardized to a MIDI pitch, define the drum MIDI pitches

In [3]:
# Lookup table: MIDI pitch number -> drum instrument name.
# Insertion order is kept identical to the original definition because
# later cells sort label counts with a stable sort (ties keep this order).
midi_classes = {
    35: 'Acoustic Bass Drum',
    36: 'Bass Drum 1',
    37: 'Side Stick',
    38: 'Acoustic Snare',
    39: 'Hand Clap',
    40: 'Electric Snare',
    41: 'Low Floor Tom',
    42: 'Closed Hi Hat',
    43: 'High Floor Tom',
    44: 'Pedal Hi-Hat',
    45: 'Low Tom',
    46: 'Open Hi-Hat',
    47: 'Low-Mid Tom',
    48: 'Hi-Mid Tom',
    49: 'Crash Cymbal 1',
    50: 'High Tom',
    51: 'Ride Cymbal 1',
    52: 'Chinese Cymbal',
    53: 'Ride Bell',
    54: 'Tambourine',
    55: 'Splash Cymbal',
    56: 'Cowbell',
    57: 'Crash Cymbal 2',
    58: 'Vibraslap',
    59: 'Ride Cymbal 2',
    60: 'Hi Bongo',
    61: 'Low Bongo',
    62: 'Mute Hi Conga',
    63: 'Open Hi Conga',
    64: 'Low Conga',
    65: 'High Timbale',
    66: 'Low Timbale',
    67: 'High Agogo',
    68: 'Low Agogo',
    69: 'Cabasa',
    70: 'Maracas',
    71: 'Short Whistle',
    72: 'Long Whistle',
    73: 'Short Guiro',
    74: 'Long Guiro',
    75: 'Claves',
    76: 'Hi Wood Block',
    77: 'Low Wood Block',
    78: 'Mute Cuica',
    79: 'Open Cuica',
    80: 'Mute Triangle',
    81: 'Open Triangle',
    # to be removed
    22: 'Unknown 22',
    26: 'Unknown 26',
}

Read in the labels for each input dataset

Labels take the structure:

_{source: [song index, [onset time, midi pitch, offset time, duration, velocity, time signature numerator, time signature denominator, beat number, measure number]]}_

with each song index referring to the order in which a song was labeled and serving as a key to the song-index dictionary. If a particular label element is not available in a song annotation file, then the element is assigned a NumPy `nan` value.

The song-index dictionary takes the structure:

*{source: {song index: audio file name}}*

In [4]:
# read in label dict/list from GCP
from google.cloud import storage

# path to the service-account JSON used to authenticate GCP bucket reads
auth_json = '../gcp_bucket_auth/fyr-bucket-auth.json'
# name of the GCP bucket that is read from throughout this notebook
bucket_name = 'fyr-audio-data'
# prefix inside the bucket that is prepended to the label pickle file names
gcp_filepath = 'raw-audio/'

In [5]:
# Pickle file names (relative to gcp_filepath) for each labeled source:
#   song_dict_files -> {song index: audio file name} look-up dicts
#   song_list_files -> annotated label data
song_dict_files = {'idmt': 'IDMTdict.pkl','mdb': 'MDBdict.pkl','egmd': 'eGMDdict.pkl'}
                   #'sf': 'SoundFontFiles/eGMDdict.pkl'}
song_list_files = {'idmt': 'IDMTlabels.pkl', 'mdb': 'MDBlabels.pkl','egmd': 'eGMDlabels.pkl'}
                   #'sf': 'SoundFontFiles/eGMDlabels.pkl'}

label_data = {}
label_dict = {}

# both containers are rebuilt from the bucket on every run of this cell,
# so the in-place name fixes below are never applied twice
for source in song_dict_files:
    label_data_filepath = gcp_filepath + song_list_files[source]
    label_dict_filepath = gcp_filepath + song_dict_files[source]
    
    label_data[source] = readGcpPkl(auth_json, bucket_name, label_data_filepath)
    label_dict[source] = readGcpPkl(auth_json, bucket_name, label_dict_filepath)

# hack for MDB file suffix
mdb_suffix = '_Drum'

# iterate over the actual song-index keys instead of range(len(...)),
# which silently assumed the keys are exactly 0..n-1
for song_key in label_dict['mdb']:
    label_dict['mdb'][song_key] += mdb_suffix
    
# hack for IDMT file suffix
for song_key in label_dict['idmt']:
    # remove .xml suffix from IDMT file names
    label_dict['idmt'][song_key] = label_dict['idmt'][song_key].replace('.xml','')

In [6]:
# update label data items to be dict instead of list
# necessary for indexing that is not dependent on order of list items
#egmd_data_dict = {item[0] : item[1] for item in label_data['egmd'] if item[0] < 40000}
#sf_data_dict = {item[0] : item[1] for item in label_data['sf']}

def _labelsToDict(entries):
    """
    Convert a list of [song index, labels] items into {song index: labels}.
    
    A dict is returned unchanged so that re-running this cell after the
    conversion has already happened does not fail.
    """
    if isinstance(entries, dict):
        return entries
    return {item[0] : item[1] for item in entries}

idmt_data_dict = _labelsToDict(label_data['idmt'])
mdb_data_dict = _labelsToDict(label_data['mdb'])

#label_data['egmd'] = egmd_data_dict #eGMD files have been fixed
#label_data['sf'] = sf_data_dict
label_data['idmt'] = idmt_data_dict
label_data['mdb'] = mdb_data_dict

In [7]:
# Quick label count

# start every known MIDI pitch at zero so each class has an entry
label_counts = dict.fromkeys(midi_classes, 0)

# tally the midi pitch (second element) of every annotated label in every source
for source, songs in label_data.items():
    for audio in songs.values():
        for labels in audio:
            label_counts[labels[1]] += 1

# show only the classes that occur, most frequent first
ranked_counts = sorted(label_counts.items(), key=lambda item: item[1], reverse=True)
{midi_classes[k]: v for k, v in ranked_counts if v > 0}

### Function for creating numpy arrays from raw data and labels

In [34]:
def audioDataTransform(wav_file, auth_json, bucket,
                       source, representation, 
                       labels, label_data, label_dict,
                       segment_length=5, sr=22050):
    """
    Takes as input a GCP path to wav file, GCP auth json, bucket name,
    data source, array representation type, 
    instrument labels, label data, song index dict,
    segment length in seconds, and sampling rate to generate numpy arrays
    used for input into neural net models
    
    wav_file = GCP path to wav file
    auth_json = str filepath to json authentication for GCP bucket read/writes
    bucket = str bucket name
    source = 'mdb', 'idmt', or 'egmd' - necessary due to different 
             file name formats
    representation = 'stft': short time fourier transform, 
                     'cqt': constant Q transform
                     'mel_stft': log-based mel spectrogram
    labels = list of unique instrument labels (MIDI pitches)
    label_data = dictionary of annotated labels
    label_dict = dictionary of song indexes and song names
    segment_length = length of each segment, in seconds (default is 5 seconds);
                     float values are truncated to an integer
    sr = sampling rate passed to the wav reader and used for every
         time/frame conversion (default is 22050). The 'mel_stft' window and
         hop sizes were chosen for 44100 Hz audio, so pass sr=44100 with it.
    
    
    Outputs:
    arr_out = amplitude array of shape (number of frames,  frequency bins),
              zero-padded at the end to a whole number of segments
    label_out = Onset array of shape (number of frames,  instrument onsets)
    
    If the wav file can not be read, a message is printed and two empty
    arrays of shape (0,) are returned.
    
    Raises:
    ValueError if representation is not one of the supported values
    IndexError if the wav file name is not present in label_dict[source]
    KeyError if an annotated midi pitch is not contained in labels
    """
    if representation not in ('stft', 'cqt', 'mel_stft'):
        raise ValueError("representation must be 'stft', 'cqt', or 'mel_stft', got: {}".format(representation))
       
    # convert segment_length into integer if a float value is provided
    segment_length = int(segment_length)
    # load the wav file using utility function into an audio time series array
    try:
        wav = readGcpWav(auth_json, bucket, wav_file, sr = sr)
    except Exception:
        # best effort: an unreadable file is reported and skipped by the caller,
        # but KeyboardInterrupt / SystemExit are no longer swallowed
        print('Unable to read file:', wav_file)
        return np.empty(0), np.empty(0)
        
    # default feature parameters
    # NOTE(review): librosa.stft and librosa.cqt below are called with their
    # own library defaults; n_fft here only feeds the segment-length
    # conversion - confirm the values agree with those defaults
    n_fft = 1024
    hop_length = 512
    
    # create an array representation of the wav audio data
    # each array is transposed so that rows represent frames in time and
    # features (columns) are frequency bins
    if representation == 'stft':
        data = np.absolute(librosa.stft(wav)).transpose()
        
    elif representation == 'cqt':
        data = np.absolute(librosa.cqt(wav, sr=sr)).transpose()
        
    else:
        # 'mel_stft'
        # parameters are borrowed from the paper here: https://goo.gl/magenta/e-gmd-paper
        # the audio was loaded at `sr`, so `sr` is no longer overwritten with
        # 44100 here; doing so mislabeled audio loaded at any other rate
        n_fft = 2048
        hop_length = 441
        data = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=250).transpose()
        
    # pad the end of the audio with silence depending on the sequence length
    number_frames = data.shape[0]
    segment_length_frames = librosa.time_to_frames(segment_length, sr=sr, hop_length=hop_length, n_fft=n_fft)
    # calculate how much silence should be added in last segment
    remaining_frames = int(segment_length_frames-(number_frames % segment_length_frames))
    # if the audio can not fill the frames of last segment, silence is padded on to the end of the segment
    if remaining_frames != segment_length_frames:
        # attach zeros to the end of the array to make sure two songs are not merged within one segment 
        data = np.append(data,np.zeros((remaining_frames,data.shape[1])),axis=0)
        
    
    # create an array representation of audio onset labels
    # label array shape will be (number_frames, number_labels) 
    # each array in the second dimension (number_labels) will have 1 if there is an onset for that instrument

    # create an array of zeroes to match the (possibly padded) length of the data array
    label_arr = np.zeros((data.shape[0],len(labels)))
        
    # find the corresponding labels for the given wav_file
    # (file name without directory and without the 4 character '.wav' extension)
    song_name = wav_file.split('/')[-1][:-4]
    matches = [key for key, name in label_dict[source].items() if name == song_name]
    if not matches:
        raise IndexError('No entry in label_dict[{!r}] for audio file: {}'.format(source, wav_file))
    song_index = matches[0]
        
    # enumerate labels so that instrument array can be indexed
    label_to_int = {label: column for column, label in enumerate(labels)}
    
    # given a song index, iterate through each label to get the onset time and midi pitch
    for label in label_data[source][song_index]:
        # label array is formatted in frames, so convert onset time to frame index value
        # the hop length must match the one used for the features above,
        # otherwise onsets drift away from the audio frames (mel_stft uses 441, not 512)
        index_row = librosa.time_to_frames(label[0],sr=sr,hop_length=hop_length)
        # find column number for the drum instrument type based on enumerated labels (label_to_int)
        index_col = label_to_int[label[1]]
        # update the label array with a value of 1 at the:
        #        frame associated with the onset time (first dimension of label_arr)
        #        array associated with the midi label (second dimension of label_arr)
        
        ### FIX FOR LABELS WITH ONSET TIMES OUTSIDE OF THE AUDIO LENGTH###
        # the lower bound stops a negative frame index from wrapping around
        # and marking an onset at the end of the array
        if 0 <= index_row < label_arr.shape[0]:
            label_arr[index_row,index_col] = 1
    
    
    arr_out = np.asarray(data)
    label_out = np.asarray(label_arr)
    return arr_out, label_out

### Functions for splitting MDB, IDMT, and e-GMD files into training, validation, and test files

In [59]:
# GCP bucket path to wav files
egmd_wavfile_dir = 'raw-audio/eGMD/eGMD-wavfiles/'
egmd_sf_wavfile_dir = 'raw-audio/eGMD/eGMD-SoundFont-wavfiles/'
# the MDB and IDMT paths were previously assigned to each other's variable,
# so mdb_wavfiles held the IDMT files and idmt_wavfiles held the MDB files
mdb_wavfile_dir = 'raw-audio/MDBDrums/'
idmt_wavfile_dir = 'raw-audio/IDMT/'

# get file names from GCP bucket
storage_client = storage.Client.from_service_account_json(auth_json)

# set of labeled eGMD song names, built once so the membership test below
# does not rescan every dict value for every blob
egmd_labeled_names = set(label_dict['egmd'].values())

egmd_wavfiles = [blob.name for blob in storage_client.list_blobs(bucket_name,prefix='raw-audio/eGMD/') 
                 if blob.name.endswith('.wav') 
                 and blob.name.split('/')[-1][:-4] in egmd_labeled_names # only pull files in label dict
                 and blob.name.split('/')[-2] != 'sr9000' # ignore degraded sample rate files
                 # genre and beat restrictions for testing
                 and blob.name.split('_')[-3] == 'beat'
                 and blob.name.split('_')[-5][:11] == 'soul-groove'
                ] 

#egmd_sf_wavfiles = [blob.name for blob in storage_client.list_blobs(bucket_name,prefix=egmd_sf_wavfile_dir)
#                    if blob.name[-4:] == '.wav']

mdb_wavfiles = [blob.name for blob in storage_client.list_blobs(bucket_name,prefix=mdb_wavfile_dir)
                if blob.name.endswith('.wav')]

idmt_wavfiles = [blob.name for blob in storage_client.list_blobs(bucket_name,prefix=idmt_wavfile_dir)
                if blob.name.endswith('.wav')]

# combine all files
combined_wavfiles = mdb_wavfiles + idmt_wavfiles + egmd_wavfiles # + egmd_sf_wavfiles

In [39]:

def fileShuffleSplit(wavfiles, train_p=0.7, val_p=0.2, test_p=0.1):
    """
    Given an input list of wav files, shuffle a copy of the list and output
    datasets corresponding to the proportion parameters
    
    The input sequence itself is left untouched.
    
    Inputs:
      wavfiles = list (or any other iterable) of .wav files
      train_p = proportion of the .wav files that should be used in the training dataset
      val_p = proportion of the .wav files that should be used in the validation dataset
      test_p = proportion of the .wav files that should be used in the test dataset
               (only used to validate the total; the test dataset receives
               every file left over after the training and validation slices)
      
    Outputs:
      train_wavfiles = slice of wavfiles list corresponding to the train proportion parameter 
      val_wavfiles = slice of wavfiles list corresponding to the validation proportion parameter
      test_wavfiles = slice of wavfiles list corresponding to the test proportion parameter
      
    Raises:
      ValueError if the proportions do not total 1.0 (to two decimals) or if
      the three slices do not account for every input file
    """
    if round(train_p + val_p + test_p, 2) != 1.0:
        raise ValueError('Split proportions train_p, val_p, test_p must total to 1.0')
    
    # shuffle a private copy so the caller's list keeps its original order
    shuffled = list(wavfiles)
    random.shuffle(shuffled)

    # slice sizes are truncated, so rounding remainders fall into the test slice
    train_index = int(len(shuffled) * train_p)
    val_index = int(len(shuffled) * val_p)

    train_wavfiles = shuffled[:train_index]
    val_wavfiles = shuffled[train_index : (train_index + val_index)]
    test_wavfiles = shuffled[(train_index + val_index) :]

    if len(shuffled) != (len(train_wavfiles) + len(val_wavfiles) + len(test_wavfiles)):
        raise ValueError('Train-Validation-Test file splits do not sum to total number of files')
    
    return train_wavfiles, val_wavfiles, test_wavfiles

In [95]:
def audioDataArrays(wavfiles, auth_json, bucket, representation,
                   labels, label_data, label_dict,
                   segment_length=5, sr=22050,
                   output_dir='model_inputs/',
                   train_p=0.8, val_p=0.1, test_p=0.1):
    """
    Given a list of wavfiles, representation, labels list, label_data, label_dict,
    segment length, and sample rate, write training, validation, and test
    datasets with labels and audio look-up dicts to a local directory

    Inputs:
    wavfiles = list of GCP paths to wav files
    auth_json = str filepath to json authentication for GCP bucket read/writes
    bucket = str bucket name
    representation = 'stft': short time fourier transform, 
                    'cqt': constant Q transform
                    'mel_stft': log-based mel spectrogram
    labels = list of unique instrument labels (MIDI pitches)
    label_data = dictionary of annotated labels
    label_dict = dictionary of song indexes and song names
    segment_length = length of each segment, in seconds (default is 5 seconds)
    sr = sampling rate (default is 22050)
    output_dir = local directory the files are written to; created if it
                 does not exist (default is 'model_inputs/')
    train_p, val_p, test_p = proportions of wavfiles used for the training,
                 validation, and test datasets (defaults are 0.8, 0.1, 0.1)

    Outputs:
    Nothing is returned. For each of train, val, and test the following
    files are written to output_dir:

    <split>_x.npy = an array of concatenated data from all wavfiles 
          shape: (frames, frequency bins)
    <split>_y.npy = an array of concatenated labels from all wavfiles
          shape: (frames, instrument labels)
    <split>_dict.pkl = an audio name index dict: {audio first frame: audio name}

    labels.pkl (the labels as a list) is written as well.
    """
    trainfiles, valfiles, testfiles = fileShuffleSplit(wavfiles, train_p=train_p, val_p=val_p, test_p=test_p)

    # file name suffix -> label source, checked in this order
    suffix_sources = (
        ('_Drum.wav', 'mdb'),   # MDB files
        ('#MIX.wav', 'idmt'),   # IDMT files
        ('.midi.wav', 'egmd'),  # eGMD files
    )

    def arrayOutputs(input_files):
        """
        Function to help with the repetitive nature of outputting separate 
        arrays for training, validation, and test datasets

        Inputs:
        input_files = list of wav files

        Outputs:
        arr_x = an array of concatenated data from all wavfiles 
              shape: (frames, frequency bins)
        arr_y = an array of concatenated labels from all wavfiles
              shape: (frames, instrument labels)

        arr_dict = an audio name index dict: {audio first frame: audio name}

        When no file could be transformed, arr_x and arr_y are empty arrays
        of shape (0,) and arr_dict is empty.
        """
        data_parts = []
        label_parts = []
        arr_dict = {}
        audio_first_frame = 0

        for file in input_files:
            source = next((src for suffix, src in suffix_sources if file.endswith(suffix)), None)
            if source is None:
                # previously this reused the source of the prior file (or
                # raised NameError on the first file)
                print('Unrecognized file name format, skipping:', file)
                continue

            input_data, input_labels = audioDataTransform(wav_file=file, auth_json=auth_json, bucket=bucket,
                                                          source=source, representation = representation, 
                                                          labels = labels, label_data = label_data, label_dict = label_dict,
                                                          segment_length=segment_length, sr=sr)
            
            if input_data.shape == (0,): # an empty array is returned for unreadable files
                continue

            # collect the pieces and concatenate once at the end instead of
            # re-copying the growing array for every file
            data_parts.append(input_data)
            label_parts.append(input_labels)

            arr_dict[audio_first_frame] = file
            audio_first_frame += input_data.shape[0]

            completed = len(data_parts)
            print('{}% complete. {} out of {} files'.format(
                round((completed/len(input_files))*100,2),completed,len(input_files)
                 ))

        if not data_parts:
            # e.g. an empty split for very small file lists
            return np.empty(0), np.empty(0), arr_dict

        arr_x = np.concatenate(data_parts, axis=0)
        arr_y = np.concatenate(label_parts, axis=0)
        return arr_x, arr_y, arr_dict


    # utilize the above function for training, validation, and test
    # array outputs locally (will be uploaded to GCP and cleaned up)
    # exist_ok lets the cell be re-run without removing the directory first
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_files in (('train', trainfiles), ('val', valfiles), ('test', testfiles)):
        arr_x, arr_y, arr_dict = arrayOutputs(split_files)
        np.save(os.path.join(output_dir, split_name + '_x.npy'), arr_x)
        np.save(os.path.join(output_dir, split_name + '_y.npy'), arr_y)
        with open(os.path.join(output_dir, split_name + '_dict.pkl'), 'wb') as dict_file:
            pickle.dump(arr_dict, dict_file)

        # release the arrays of this split before building the next one
        del arr_x, arr_y, arr_dict
    
    # output labels as pkl file
    with open(os.path.join(output_dir, 'labels.pkl'), 'wb') as labels_file:
        pickle.dump(list(labels), labels_file)


In [60]:
# adjust number of eGMD files
n = 10
random.shuffle(egmd_wavfiles)
#combined_wavfiles = mdb_wavfiles + idmt_wavfiles + egmd_sf_wavfiles[:n] # + egmd_wavfiles[:n]

In [67]:
# label count restricted to the first n eGMD files
label_counts = dict.fromkeys(midi_classes, 0)
egmd_song_names = label_dict['egmd']

for audio in egmd_wavfiles[:n]:
    # file name without directory and without the '.wav' extension
    audio_name = audio.split('/')[-1][:-4]
    song_index = [key for key, name in egmd_song_names.items() if name == audio_name][0]

    # tally the midi pitch (second element) of each annotated label
    for labels in label_data['egmd'][song_index]:
        label_counts[labels[1]] += 1

# show only the classes that occur, most frequent first
ranked_counts = sorted(label_counts.items(), key=lambda item: item[1], reverse=True)
{midi_classes[k]: v for k, v in ranked_counts if v > 0}

{'Closed Hi Hat': 1252,
 'Acoustic Bass Drum': 616,
 'Acoustic Snare': 525,
 'Tambourine': 384,
 'Side Stick': 86,
 'High Tom': 12,
 'Low-Mid Tom': 8,
 'Open Hi-Hat': 4,
 'Pedal Hi-Hat': 2}

In [97]:
import time

time_start = time.time()

# reduces number of classes
# keep only the MIDI pitches that were counted at least once, most frequent first
ranked_counts = sorted(label_counts.items(), key=lambda item: item[1], reverse=True)
labels = {k: v for k, v in ranked_counts if v > 0}.keys()

audioDataArrays(
    egmd_wavfiles[:n], auth_json, bucket_name,
    representation='mel_stft',
    labels=labels,
    label_data=label_data,
    label_dict=label_dict,
    segment_length=5,
    sr=44100,
)


# GCP output path formatted with number of egmd files
#output_path = 'model_inputs/mdb_idmt_egmd{}'.format(n)

time_end = time.time()
elapsed_seconds = round(time_end - time_start, 2)

print('Train, Validation, and Test arrays (with dicts) created in', 
      elapsed_seconds, 'seconds')

12.5% complete. 1 out of 8 files
25.0% complete. 2 out of 8 files
37.5% complete. 3 out of 8 files
50.0% complete. 4 out of 8 files
62.5% complete. 5 out of 8 files
75.0% complete. 6 out of 8 files
87.5% complete. 7 out of 8 files
100.0% complete. 8 out of 8 files
100.0% complete. 1 out of 1 files
100.0% complete. 1 out of 1 files
Train, Validation, and Test arrays (with dicts) created in 16.14 seconds


In [102]:
# remove temporary local directory
# deletes the local 'model_inputs/' directory and everything inside it
# NOTE(review): the cells that list and upload 'model_inputs' come after this
# one, so a top-to-bottom run removes the files before they can be uploaded - confirm intended order
shutil.rmtree('model_inputs/')

In [None]:
# Unable to read file: raw-audio/eGMD/eGMD-wavfiles/drummer9_session1_26_rock_90_beat_4-4_26.midi.wav
# Unable to read file: raw-audio/eGMD/eGMD-wavfiles/drummer9_session1_26_rock_90_beat_4-4_26.midi.wav
# Unable to read file: raw-audio/eGMD/eGMD-wavfiles/drummer9_session1_29_rock_90_beat_4-4_23.midi.wav

In [None]:
# output path tagged with the number of eGMD files (n)
# presumably the destination prefix inside the GCP bucket for the upload - TODO confirm
output_path = 'model_inputs/mdb_idmt_egmd_SFsr{}'.format(n)

In [None]:
!ls model_inputs

In [None]:
#!gsutil -m cp model_inputs/train_x.npy gs://{bucket_name}/{output_path}