# Data Preparation for spleeter

In [57]:
from pathlib import Path
import os
from pydub import AudioSegment
import shutil
import scaper
import stempeg
import musdb
import numpy as np
from tqdm import tqdm
from scipy.io.wavfile import write


In [3]:
from pathlib import Path

# Locations of the Scaper source material. A background folder has to be
# provided even though this notebook never uses background audio.
fg_folder = Path('F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground').expanduser()
bg_folder = Path('F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\background').expanduser()

# Create both folders (and any missing parents); a no-op if they already exist.
fg_folder.mkdir(parents=True, exist_ok=True)
bg_folder.mkdir(parents=True, exist_ok=True)

In [5]:
# Open MUSDB at the dataset root. With download=True musdb fetches the
# 7-second sample dataset into `root` (see the log output of this cell).
mus_train = musdb.DB(root="F:\\ApprecentishipProgram\\audioUpskilling\\dataset",download=True)

Downloading MUSDB 7s Sample Dataset to F:\ApprecentishipProgram\audioUpskilling\dataset...
Done!


In [43]:
# Grab the stems of the first track as a quick sanity check.
test_audio = mus_train[0].stems

In [56]:
# Inspect the first stem (index 0); the output shows two columns of samples.
test_audio[0]

array([[-8.84704590e-02, -3.79333496e-02],
       [-1.07421875e-01, -5.22460938e-02],
       [-8.42590332e-02, -5.48400879e-02],
       ...,
       [-8.23974609e-04,  3.05175781e-05],
       [-8.54492188e-04,  7.01904297e-04],
       [-8.54492188e-04,  1.31225586e-03]])

In [58]:
# Write that first stem to test.wav in the working directory at 44.1 kHz.
write('test.wav', 44100, test_audio[0])

In [36]:
# Stem names, paired by position with `item.stems` in the export loop below.
# NOTE(review): assumes this matches musdb's stem ordering — confirm.
keys = ['mixture', 'drums', 'bass', 'other', 'vocals']

In [59]:
# Export every stem of every MUSDB track to <fg_folder>/<stem name>/<song>.wav,
# i.e. one folder per stem label, which is what Scaper reads as foreground
# labels later in this notebook.

# Create the label folders once, up front, instead of once per stem per track.
for key in keys:
    (fg_folder / key).mkdir(parents=True, exist_ok=True)

for item in tqdm(mus_train):
    song_name = item.name
    # Use the track's own sample rate rather than assuming 44.1 kHz.
    sample_rate = int(item.rate)
    # `keys` and `item.stems` are paired by position.
    for key, val in zip(keys, item.stems):
        # Append the extension by string concatenation so that dots inside a
        # song title are never mistaken for a file suffix.
        src_path = str(fg_folder / key / song_name) + '.wav'
        # Store float32: scipy writes float64 arrays as 64-bit float WAVs,
        # which double the file size and are less widely supported.
        write(src_path, sample_rate, val.astype(np.float32))
    

100%|██████████| 144/144 [02:31<00:00,  1.05s/it]


In [60]:
# For each label folder under fg_folder, report how many files it holds and
# show the first five file names in sorted order.
for entry in os.listdir(fg_folder):
    if entry[0] == '.':
        continue  # ignore system folders

    names = os.listdir(os.path.join(fg_folder, entry))
    print(f"\n{entry}\tfolder contains {len(names)} audio files:\n")

    preview = sorted(names)[:5]
    for name in preview:
        print(f"\t\t{name}")
    print("\t\t...")


bass	folder contains 144 audio files:

		A Classic Education - NightOwl.wav
		AM Contra - Heart Peripheral.wav
		ANiMAL - Clinic A.wav
		ANiMAL - Easy Tiger.wav
		ANiMAL - Rockshow.wav
		...

drums	folder contains 144 audio files:

		A Classic Education - NightOwl.wav
		AM Contra - Heart Peripheral.wav
		ANiMAL - Clinic A.wav
		ANiMAL - Easy Tiger.wav
		ANiMAL - Rockshow.wav
		...

mixture	folder contains 144 audio files:

		A Classic Education - NightOwl.wav
		AM Contra - Heart Peripheral.wav
		ANiMAL - Clinic A.wav
		ANiMAL - Easy Tiger.wav
		ANiMAL - Rockshow.wav
		...

other	folder contains 144 audio files:

		A Classic Education - NightOwl.wav
		AM Contra - Heart Peripheral.wav
		ANiMAL - Clinic A.wav
		ANiMAL - Easy Tiger.wav
		ANiMAL - Rockshow.wav
		...

vocals	folder contains 144 audio files:

		A Classic Education - NightOwl.wav
		AM Contra - Heart Peripheral.wav
		ANiMAL - Clinic A.wav
		ANiMAL - Easy Tiger.wav
		ANiMAL - Rockshow.wav
		...


In [61]:
import scaper

# Seed for the Scaper object's random state, so sampling starts from a known state.
seed = 123 # integer or np.random.RandomState(<integer>)

# Scaper object for 5-second soundscapes. Foreground stems come from
# fg_folder; bg_folder is passed because a background path is required too.
sc = scaper.Scaper(
    duration=5.0,
    fg_path=str(fg_folder),
    bg_path=str(bg_folder),
    random_state=seed
)

In [62]:
# Show every attribute currently set on the Scaper object.
vars(sc)

{'duration': 5.0,
 'sr': 44100,
 'ref_db': -12,
 'n_channels': 1,
 'fade_in_len': 0.01,
 'fade_out_len': 0.01,
 'fg_spec': [],
 'bg_spec': [],
 'fg_path': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground',
 'bg_path': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\background',
 'fg_labels': ['bass', 'drums', 'mixture', 'other', 'vocals'],
 'bg_labels': [],
 'protected_labels': [],
 'random_state': RandomState(MT19937) at 0x2A2CFF4CC40}

In [63]:
# Mixture settings. Per the attribute dump above, sr=44100 and n_channels=1
# are already the current values; only ref_db changes (from -12 to -20).
sc.sr = 44100
sc.n_channels = 1
sc.ref_db = -20

In [64]:
# Add one event per stem type. Only the label differs between events; every
# other parameter is the same distribution tuple for each stem.
labels = ['vocals', 'drums', 'bass', 'other']

shared_event_spec = dict(
    source_file=('choose', []),             # choose the source file randomly from all files in the folder
    source_time=('uniform', 0, 7),          # sample the source (stem) audio starting at a time between 0-7
    event_time=('const', 0),                # always add the stem at time 0 in the mixture
    event_duration=('const', sc.duration),  # set the stem duration to match the mixture duration
    snr=('uniform', -5, 5),                 # choose an SNR for the stem uniformly between -5 and 5 dB
    pitch_shift=('uniform', -2, 2),         # apply a random pitch shift between -2 and 2 semitones
    time_stretch=('uniform', 0.8, 1.2),     # apply a random time stretch between 0.8 (faster) and 1.2 (slower)
)

for label in labels:
    # set the label value explicitly using a constant
    sc.add_event(label=('const', label), **shared_event_spec)

In [65]:
# Generate the soundscape: returns the mixture audio, its JAMS annotation,
# a simple annotation list, and the audio of the individual stems.
mixture_audio, mixture_jam, annotation_list, stem_audio_list = sc.generate()



In [66]:
# The Scaper annotation is the first one in the JAMS object with the 'scaper' namespace.
ann = mixture_jam.annotations.search(namespace='scaper')[0]

# Show the concrete parameter values that were sampled for every event.
for event in ann:
    sampled = event.value
    print(f"\n{sampled['label']}:\n")
    print(sampled)


vocals:

{'label': 'vocals', 'source_file': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground\\vocals\\Speak Softly - Broken Man.wav', 'source_time': 0.409114508046898, 'event_time': 0, 'event_duration': 5.0, 'snr': 0.5131476908289123, 'role': 'foreground', 'pitch_shift': 0.8778758791422523, 'time_stretch': 0.9144557339801518}

drums:

{'label': 'drums', 'source_file': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground\\drums\\The Mountaineering Club - Mallory.wav', 'source_time': 0.9480389917468239, 'event_time': 0, 'event_duration': 4.496357975721296, 'snr': 0.7969429702261008, 'role': 'foreground', 'pitch_shift': -1.4401969494784619, 'time_stretch': 1.1120111047648316}

bass:

{'label': 'bass', 'source_file': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground\\bass\\Cnoc An Tursa - Bannockburn.wav', 'source_time': 1.3148022976795373, 'event_time': 0, 'event_duration': 5.0, 'snr': -0.6142775532037561, 'role': 'foreground', 'pitch_shift': -1

In [67]:
# Run this cell in a Jupyter notebook to get an interactive visualization of the annotation.
ann

Unnamed: 0,time,duration,value,confidence
0,0.000,4.572,label  vocals  source_file  F:\ApprecentishipProgram\audioUpskilling\dataset\foreground\vocals\Speak Softly - Broken Man.wav  source_time  0.409114508046898  event_time  0  event_duration  5.0  snr  0.5131476908289123  role  foreground  pitch_shift  0.8778758791422523  time_stretch  0.9144557339801518,1.0
label,vocals,,,
source_file,F:\ApprecentishipProgram\audioUpskilling\dataset\foreground\vocals\Speak Softly - Broken Man.wav,,,
source_time,0.409114508046898,,,
event_time,0,,,
event_duration,5.0,,,
snr,0.5131476908289123,,,
role,foreground,,,
pitch_shift,0.8778758791422523,,,
time_stretch,0.9144557339801518,,,

0,1
label,vocals
source_file,F:\ApprecentishipProgram\audioUpskilling\dataset\foreground\vocals\Speak Softly - Broken Man.wav
source_time,0.409114508046898
event_time,0
event_duration,5.0
snr,0.5131476908289123
role,foreground
pitch_shift,0.8778758791422523
time_stretch,0.9144557339801518

0,1
label,drums
source_file,F:\ApprecentishipProgram\audioUpskilling\dataset\foreground\drums\The Mountaineering Club - Mallory.wav
source_time,0.9480389917468239
event_time,0
event_duration,4.496357975721296
snr,0.7969429702261008
role,foreground
pitch_shift,-1.4401969494784619
time_stretch,1.1120111047648316

0,1
label,bass
source_file,F:\ApprecentishipProgram\audioUpskilling\dataset\foreground\bass\Cnoc An Tursa - Bannockburn.wav
source_time,1.3148022976795373
event_time,0
event_duration,5.0
snr,-0.6142775532037561
role,foreground
pitch_shift,-1.7612884135617266
time_stretch,0.9372712064603478

0,1
label,other
source_file,F:\ApprecentishipProgram\audioUpskilling\dataset\foreground\other\Punkdisco - Oral Hygiene.wav
source_time,1.2537614074370218
event_time,0
event_duration,4.818993961211062
snr,-0.5974282193592373
role,foreground
pitch_shift,-1.6650940617456627
time_stretch,1.0375609598696094


In [68]:
# Generate again with fix_clipping=True so the rendered audio is kept from clipping.
# Note: the annotation printed by the next cell differs from the earlier run.
mixture_audio, mixture_jam, annotation_list, stem_audio_list = sc.generate(fix_clipping=True)



In [69]:
# Re-extract the Scaper annotation for the newly generated mixture.
ann = mixture_jam.annotations.search(namespace='scaper')[0]

# Print the parameters sampled for each event.
for event in ann:
    sampled = event.value
    print(f"\n{sampled['label']}:\n")
    print(sampled)


vocals:

{'label': 'vocals', 'source_file': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground\\vocals\\Leaf - Come Around.wav', 'source_time': 1.183983573124711, 'event_time': 0, 'event_duration': 4.937145034561059, 'snr': 3.4943179407778953, 'role': 'foreground', 'pitch_shift': 0.897821299442541, 'time_stretch': 1.0127310348387464}

drums:

{'label': 'drums', 'source_file': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground\\drums\\The Scarlet Brand - Les Fleurs Du Mal.wav', 'source_time': 0.7524159089875544, 'event_time': 0, 'event_duration': 5.0, 'snr': 1.547213074698198, 'role': 'foreground', 'pitch_shift': -0.5047942927997342, 'time_stretch': 0.9430915675236087}

bass:

{'label': 'bass', 'source_file': 'F:\\ApprecentishipProgram\\audioUpskilling\\dataset\\foreground\\bass\\Cristina Vane - So Easy.wav', 'source_time': 1.137931815927327, 'event_time': 0, 'event_duration': 5.0, 'snr': -4.078950600549248, 'role': 'foreground', 'pitch_shift': -0.2651953092

In [70]:
from IPython.display import Audio, display

# Render an audio player for the mixture at the Scaper object's sample rate.
# NOTE(review): the transpose presumably puts channels first for Audio — confirm.
display(Audio(data=mixture_audio.T, rate=sc.sr))

In [71]:
# Fetch the Scaper annotation from the JAMS object.
ann = mixture_jam.annotations.search(namespace='scaper')[0]

# Walk the annotated events and the stem audio in parallel, printing each
# stem's label and SNR and rendering a player for it.
for obs, stem_audio in zip(ann.data, stem_audio_list):
    info = obs.value
    print(f"Instrument: {info['label']} at SNR: {info['snr']:.2f}")
    display(Audio(data=stem_audio.T, rate=sc.sr))

Instrument: vocals at SNR: 3.49


Instrument: drums at SNR: 1.55


Instrument: bass at SNR: -4.08


Instrument: other at SNR: -1.88


In [72]:
# 1. Define a random seed
random_state = 123

# 2. Create a Scaper object
sc = scaper.Scaper(
    duration=5.0,
    fg_path=str(fg_folder),
    bg_path=str(bg_folder),
    random_state=random_state
)

# 3. Set sample rate, reference dB, and channels (mono)
sc.sr = 44100
sc.ref_db = -20
sc.n_channels = 1

# 4. Define a template of probabilistic event parameters
event_parameters = {
    'label': ('const', 'vocals'),
    'source_file': ('choose', []),
    'source_time': ('uniform', 0, 7),
    'event_time': ('const', 0),
    'event_duration': ('const', sc.duration),
    'snr': ('uniform', -5, 5),
    'pitch_shift': ('uniform', -2, 2),
    'time_stretch': ('uniform', 0.8, 1.2)
}

# 5. Instantiate the template once to randomly choose a song,
#    a start time for the sources, a pitch shift and a time
#    stretch. These values must remain COHERENT across all stems.

# Add an event based on the probabilistic template
sc.add_event(**event_parameters)

# Instantiate the event to sample concrete values.
# NOTE: _instantiate_event is a private Scaper method.
event = sc._instantiate_event(sc.fg_spec[0])

# 6. Reset the Scaper object's event specification
sc.reset_fg_event_spec()

# 7. Replace the distributions for source time, pitch shift and
#    time stretch with the constant values we just sampled, to
#    ensure our added events (stems) are coherent.
#    NOTE: the source_file has also been sampled, and we'll keep
#    the sampled file to denote which song we'll be mixing.
event_parameters['source_time'] = ('const', event.source_time)
event_parameters['pitch_shift'] = ('const', event.pitch_shift)
event_parameters['time_stretch'] = ('const', event.time_stretch)

# Split the sampled path into <stems root>/<label>/<file name>. Rebuilding
# the path from these parts swaps ONLY the label folder. A plain
# str.replace('vocals', label) would also rewrite "vocals" wherever else it
# appears in the path, e.g. in a song title or a parent directory name.
sampled_source = Path(event.source_file)
stems_root = sampled_source.parent.parent

# 8. Iterate over the four stems (vocals, drums, bass, other) and
#    add COHERENT events.
labels = ['vocals', 'drums', 'bass', 'other']

for label in labels:

    # Set the label to the stem we are adding
    event_parameters['label'] = ('const', label)

    # All stems of a song share the same file name, so the matching stem for
    # this label lives in the sibling folder named after the label.
    coherent_source_file = str(stems_root / label / sampled_source.name)
    event_parameters['source_file'] = ('const', coherent_source_file)

    # Add the event using the modified, COHERENT, event parameters
    sc.add_event(**event_parameters)



In [73]:
def generate_and_play(sc):
    """
    Generate one soundscape from a configured Scaper object and render
    audio players for the mixture and for each of its stems.

    Parameters
    ----------
    sc : scaper.Scaper
        Scaper object whose event specification has already been set up.
        Its `sr` attribute is used as the playback sample rate.

    Returns
    -------
    None
        Everything is shown via print/display.
    """
    mix, jam, _, stems = sc.generate(fix_clipping=True)
    rate = sc.sr

    print("Mixture:")
    display(Audio(data=mix.T, rate=rate))

    # The Scaper annotation is stored in the JAMS object under the 'scaper' namespace.
    scaper_ann = jam.annotations.search(namespace='scaper')[0]

    # Annotated events and stem audio are walked in parallel.
    for observation, stem in zip(scaper_ann.data, stems):
        details = observation.value
        print(f"Instrument: {details['label']} at SNR: {details['snr']:.2f}")
        display(Audio(data=stem.T, rate=rate))

In [74]:
# Generate and listen to one mixture from the coherent event spec built above.
generate_and_play(sc)

Mixture:




Instrument: vocals at SNR: -0.77


Instrument: drums at SNR: 4.81


Instrument: bass at SNR: 1.85


Instrument: other at SNR: -0.19


In [75]:
# Template of probabilistic event parameters shared by incoherent() and
# coherent(). Both functions copy it and override 'label' (and, for coherent
# mixtures, the source file/time, pitch shift and time stretch).
template_event_parameters = {
    'label': ('const', 'vocals'),        # placeholder label; replaced per stem
    'source_file': ('choose', []),       # pick randomly among the label's files
    'source_time': ('uniform', 0, 7),    # start offset within the source stem
    'event_time': ('const', 0),          # stems always start at time 0 in the mixture
    'event_duration': ('const', 5.0),    # same value as the Scaper duration used below
    'snr': ('uniform', -5, 5),           # per-stem SNR in dB
    'pitch_shift': ('uniform', -2, 2),   # semitones
    'time_stretch': ('uniform', 0.8, 1.2)
}


def incoherent(fg_folder, bg_folder, event_template, seed):
    """
    Build and generate an INCOHERENT MUSDB18 mixture (audio + annotations).

    Each of the four stems is sampled independently from the template, so the
    stems may come from different songs and are not temporally aligned.

    Parameters
    ----------
    fg_folder : str or path-like
        Path to the foreground source material for MUSDB18
    bg_folder : str or path-like
        Path to the background material for MUSDB18 (empty folder)
    event_template : dict
        Dictionary containing a template of probabilistic event parameters.
        It is not modified.
    seed : int or np.random.RandomState()
        Seed for the Scaper object's random state. Different seeds generate
        different mixtures for the same source material and event template.

    Returns
    -------
    mixture_audio : np.ndarray
        Audio signal for the mixture
    mixture_jam : JAMS object
        JAMS annotation for the mixture
    annotation_list : list
        Simple annotation in list format
    stem_audio_list : list
        List containing the audio signals of the stems that comprise the mixture
    """
    soundscape = scaper.Scaper(
        duration=5.0,
        fg_path=str(fg_folder),
        bg_path=str(bg_folder),
        random_state=seed
    )

    # Sample rate, reference dB, and channels (mono)
    soundscape.sr = 44100
    soundscape.ref_db = -20
    soundscape.n_channels = 1

    # One event per stem type: the template with only the label overridden.
    for stem_label in ('vocals', 'drums', 'bass', 'other'):
        stem_event = {**event_template, 'label': ('const', stem_label)}
        soundscape.add_event(**stem_event)

    # Generate the mixture while preventing audio clipping.
    return soundscape.generate(fix_clipping=True)


def coherent(fg_folder, bg_folder, event_template, seed):
    """
    Build and generate a COHERENT MUSDB18 mixture (audio + annotations).

    Stems in COHERENT mixtures come from the same song and are temporally
    aligned: the song, source time, pitch shift and time stretch are sampled
    once and then reused for all four stems.

    Parameters
    ----------
    fg_folder : str or path-like
        Path to the foreground source material for MUSDB18. It must contain
        one folder per stem label, with every stem of a song stored under the
        same file name.
    bg_folder : str or path-like
        Path to the background material for MUSDB18 (empty folder)
    event_template : dict
        Dictionary containing a template of probabilistic event parameters.
        It is not modified.
    seed : int or np.random.RandomState()
        Seed for the Scaper object's random state. Different seeds generate
        different mixtures for the same source material and event template.

    Returns
    -------
    mixture_audio : np.ndarray
        Audio signal for the mixture
    mixture_jam : JAMS object
        JAMS annotation for the mixture
    annotation_list : list
        Simple annotation in list format
    stem_audio_list : list
        List containing the audio signals of the stems that comprise the mixture
    """

    # Create scaper object and seed random state
    sc = scaper.Scaper(
        duration=5.0,
        fg_path=str(fg_folder),
        bg_path=str(bg_folder),
        random_state=seed
    )

    # Set sample rate, reference dB, and channels (mono)
    sc.sr = 44100
    sc.ref_db = -20
    sc.n_channels = 1

    # Copy the template so we can change it
    event_parameters = event_template.copy()

    # Instantiate the template once to randomly choose a song,
    # a start time for the sources, a pitch shift and a time
    # stretch. These values must remain COHERENT across all stems.
    # NOTE: _instantiate_event is a private Scaper method.
    sc.add_event(**event_parameters)
    event = sc._instantiate_event(sc.fg_spec[0])

    # Reset the Scaper object's event specification
    sc.reset_fg_event_spec()

    # Replace the distributions for source time, pitch shift and
    # time stretch with the constant values we just sampled, to
    # ensure our added events (stems) are coherent.
    event_parameters['source_time'] = ('const', event.source_time)
    event_parameters['pitch_shift'] = ('const', event.pitch_shift)
    event_parameters['time_stretch'] = ('const', event.time_stretch)

    # Split the sampled path into <stems root>/<label>/<file name>. Rebuilding
    # the path from these parts swaps ONLY the label folder. A plain
    # str.replace('vocals', label) would also rewrite "vocals" wherever else
    # it appears in the path (song title, parent directory) and would do
    # nothing at all if the template's label were not 'vocals'.
    sampled_source = Path(event.source_file)
    stems_root = sampled_source.parent.parent

    # Iterate over the four stems (vocals, drums, bass, other) and
    # add COHERENT events.
    labels = ['vocals', 'drums', 'bass', 'other']
    for label in labels:

        # Set the label to the stem we are adding
        event_parameters['label'] = ('const', label)

        # All stems of a song share the same file name, so the matching stem
        # for this label lives in the sibling folder named after the label.
        coherent_source_file = str(stems_root / label / sampled_source.name)
        event_parameters['source_file'] = ('const', coherent_source_file)

        # Add the event using the modified, COHERENT, event parameters
        sc.add_event(**event_parameters)

    # Generate and return the mixture audio, stem audio, and annotations
    return sc.generate(fix_clipping=True)

In [76]:
# Sanity check before generating: show both source folders, a blank line,
# and the event template, one item per line.
print(fg_folder, bg_folder, "", template_event_parameters, sep="\n")

F:\ApprecentishipProgram\audioUpskilling\dataset\foreground
F:\ApprecentishipProgram\audioUpskilling\dataset\background

{'label': ('const', 'vocals'), 'source_file': ('choose', []), 'source_time': ('uniform', 0, 7), 'event_time': ('const', 0), 'event_duration': ('const', 5.0), 'snr': ('uniform', -5, 5), 'pitch_shift': ('uniform', -2, 2), 'time_stretch': ('uniform', 0.8, 1.2)}


In [77]:
# Generate and play 3 coherent mixtures, one per seed.
# NOTE: `sc` below is the notebook-level Scaper object from an earlier cell,
# not the one created inside coherent(); both are set to 44100 Hz.
for seed in (1, 2, 3):

    mixture_audio, mixture_jam, annotation_list, stem_audio_list = coherent(
        fg_folder=fg_folder,
        bg_folder=bg_folder,
        event_template=template_event_parameters,
        seed=seed,
    )

    display(Audio(data=mixture_audio.T, rate=sc.sr))







In [78]:
# Generate and play 3 incoherent mixtures, one per seed.
# NOTE: `sc` below is the notebook-level Scaper object from an earlier cell,
# not the one created inside incoherent(); both are set to 44100 Hz.
for seed in (1, 2, 3):

    mixture_audio, mixture_jam, annotation_list, stem_audio_list = incoherent(
        fg_folder=fg_folder,
        bg_folder=bg_folder,
        event_template=template_event_parameters,
        seed=seed,
    )

    display(Audio(data=mixture_audio.T, rate=sc.sr))





