This notebook splits each WAV file into fixed-length partitions and saves a log-mel spectrogram (as a `.npy` file) for each partition.

Many hyperparameters are chosen based on: https://arxiv.org/pdf/1710.11153.pdf (Onsets and Frames)

In [4]:
# Load libraries

import librosa
import numpy as np
import os
from tqdm import tqdm

In [2]:
# Configuration: values a user may want to tune before running the notebook.

# Directory that is scanned for WAV files (the current working directory).
path = os.getcwd()

# Sampling rate used when loading WAV files.
# 22050 is the default; 16000 is used here to match the paper.
sampling_rate = 16000

# Number of frequency bins in each spectrogram.
num_freq_bins = 229

# Length, in seconds, of each partition a song is split into.
partition_size = 20

### Processing WAV Files

In [5]:
# Convert every WAV file found in `path` into fixed-length log-mel spectrograms.
#
# Each song is cut into consecutive, non-overlapping partitions of
# `partition_size` seconds; trailing audio shorter than a full partition is
# dropped. Partition i (1-based) of `<name>.wav` is saved alongside the source
# file as `<name>_<i>.npy`.

# Number of audio samples in one partition (loop-invariant).
samples_per_partition = partition_size * sampling_rate

for file in tqdm(os.listdir(path)):

    if not file.endswith('.wav'):
        continue

    # Load WAV file, resampled to `sampling_rate`.
    # os.listdir() returns bare file names, so join them with `path`; otherwise
    # loading only works when `path` happens to be the current working directory.
    audio, _ = librosa.load(os.path.join(path, file), sr=sampling_rate)

    # Integer floor division: only whole partitions are kept.
    num_partitions = len(audio) // samples_per_partition

    # Output files share the source file's location and base name (extension removed).
    output_stem = os.path.join(path, os.path.splitext(file)[0])

    # Compute spectrogram on each partition of the song and save it
    for i in range(num_partitions):

        audio_start = i * samples_per_partition
        audio_end = audio_start + samples_per_partition
        mel_spec = librosa.feature.melspectrogram(y=audio[audio_start:audio_end], sr=sampling_rate,
                                                  n_mels=num_freq_bins)
        log_mel_spec = librosa.power_to_db(mel_spec)  # convert the power spectrogram to decibels (log scale)

        # The name already ends in '.npy', so np.save() writes to exactly this path.
        np.save(output_stem + '_' + str(i + 1) + '.npy', log_mel_spec)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:12<00:00,  1.46it/s]


Saved spectrograms will be used as inputs to the network.