In [4]:
# import mir_eval
# a = mir_eval.io.load_labeled_intervals('/home/georgid/Documents/medleyDB/Annotations/AClassicEducation_NightOwl_SOURCEID.lab',delimiter=',')
# vocal_intervals = []
# vocal_labels = []
# for idx, label in enumerate(a[1]):
#     if label in VOCALS:
#         vocal_intervals.append(a[0][idx].tolist())
#         vocal_labels.append(label)
#         print label
# print vocal_intervals
# mir_eval.util.sort_labeled_intervals(vocal_intervals, vocal_labels)


# mir_eval.util.merge_labeled_intervals(vocal_intervals,vocal_labels)



import medleydb as mdb
import numpy
import mir_eval
import csv

ONLY_BOUNDARY = 0 # if set, store only the beginning of interval as a boundary, if 0, add end of interval

VOCALS = ["male singer", "female singer", "male speaker", "female speaker",
          "male rapper", "female rapper", "beatboxing", "vocalists"]
def samples_to_intervals(activations, min_diff = None):
    '''
    glue samples in time to intervals
    
    Parameters
    ----------------------
    activations: shape (n,2) 
        list of activation samples: (timestamp, confidence)
    min_diff:
        the minimal time, at which to split samples e.g. start a new time interval 
    
    Returns
    ----------------------------------------
    voiced_intervals: shape (n,2)
        list of (start_ts, end_ts)
    '''
    
    if min_diff == None:
        hop_size =  activations[1][0] - activations[0][0] + 0.001 # add 0.001 to avoid rounding differences
        min_diff = hop_size
    voiced_intervals = []
    start_ts = activations[0][0] # most recent start ts
    prev_sample = activations[0] # sample at previous ts
    
    for sample in activations[1:]:
        time_diff_prev = sample[0] - prev_sample[0]
        if time_diff_prev > min_diff: 
            voiced_intervals.append((start_ts, prev_sample[0])) # complete interval with most recent start_ts  
            start_ts = sample[0] # assign new start_ts
        prev_sample = sample
    voiced_intervals.append((start_ts, activations[-1,0])) # complete interval at last sample
    return voiced_intervals


def create_vocal_boundaries(track_name, activation_threshold=0.5, min_diff=0.5):
    '''
    Createa annotation of vocal boundaries. 
    take activations from http://medleydb.weebly.com/description.html#instrumentactivations with confidence > activation_threshold. 
    This replicates the way Source ID annotations are generated, but we allow to glue intervals within some min_diff (e.g. 0.5 seconds)  between consecutive intervals 
    '''
    mtrack = mdb.MultiTrack(track_name )
    predominant_stem = mtrack.predominant_stem
    print predominant_stem.instrument[0]
    
    activations = []
    if predominant_stem.instrument[0] in VOCALS:
        print ' track {}'.format(track_name)
        predominant_index = predominant_stem.stem_idx
        activation_conf = mtrack.activation_conf_from_stem(predominant_index)
        activation_conf_array = numpy.array(activation_conf)
        activations = activation_conf_array[numpy.where(activation_conf_array[:,1]>= activation_threshold)] 
    
    
    
    if len(activations) == 0:
        print 'no vocal activations for track {}'.format(track_name)
        return [],[],[]
    
    voiced_intervals = samples_to_intervals(activations, min_diff)
    
    ####### to boundaries
    voiced_boundaries = mir_eval.util.intervals_to_boundaries(voiced_intervals)
    voiced_boundaries = numpy.insert(voiced_boundaries, 0, 0) # prepend 0
    
    ##### assign vocal and novocal labels
    boundary_labels = ['vocal'] * len(voiced_boundaries)
    for i in range(0,len(boundary_labels),2):
        boundary_labels[i] = 'novocal'
    
    if not ONLY_BOUNDARY:
#         numpy.append(voiced_boundaries,mtrack.duration) # TODO: add boundary from vocal to last no-vocal
        all_intervals = mir_eval.util.boundaries_to_intervals(voiced_boundaries)
        boundary_labels.append('novocal')
    return voiced_boundaries, all_intervals, boundary_labels



In [39]:
################### example ussage: 1. create_vocal_boundaries and 2. store to csv

import glob
import os
path_vocals = '/Users/joro/Downloads/medleyDB_mono_vocal/'
vocal_tracks =  glob.glob(path_vocals + "*.wav")
vocal_track_names = []

for vocal_track in vocal_tracks:
        vocal_track_names.append(os.path.basename(vocal_track)[:-8]) # without .wav

for track_name in vocal_track_names:
    # 1. create_vocal_boundaries 
    voiced_boundaries, all_intervals,  labels = create_vocal_boundaries(track_name)
    
    ##### 2. store to csv 
    file_URI  = path_vocals + track_name +  '_MIX.csv'
    with open(file_URI,'w') as f:
        writer = csv.writer(f, delimiter=',')
        if ONLY_BOUNDARY:
            for boundary,  bd_label in zip(voiced_boundaries, labels):
                writer.writerow([boundary, bd_label])
        else:
            for interval, interval_label in zip(all_intervals, labels):
                print [interval[0], interval[1], interval_label]
                writer.writerow([interval[0], interval[1], interval_label])
    
#     raw_input('press for next track')

male singer
 track AClassicEducation_NightOwl
[[  0.00000000e+00   4.64000000e-02]
 [  4.64000000e-02   2.92570000e+00]
 [  2.92570000e+00   5.15480000e+00]
 [  5.15480000e+00   8.31270000e+00]
 [  8.31270000e+00   9.98460000e+00]
 [  9.98460000e+00   2.21054000e+01]
 [  2.21054000e+01   2.42416000e+01]
 [  2.42416000e+01   2.75853000e+01]
 [  2.75853000e+01   2.94429000e+01]
 [  2.94429000e+01   5.75855000e+01]
 [  5.75855000e+01   5.85143000e+01]
 [  5.85143000e+01   7.57435000e+01]
 [  7.57435000e+01   7.74618000e+01]
 [  7.74618000e+01   9.69665000e+01]
 [  9.69665000e+01   1.33096800e+02]
 [  1.33096800e+02   1.40016300e+02]
 [  1.40016300e+02   1.41502400e+02]
 [  1.41502400e+02   1.58035000e+02]
 [  1.58035000e+02   1.59335300e+02]
 [  1.59335300e+02   1.68437600e+02]]
HERE
[0.0, 0.046399999999999997, 'novocal']
[0.046399999999999997, 2.9257, 'vocal']
[2.9257, 5.1547999999999998, 'novocal']
[5.1547999999999998, 8.3126999999999995, 'vocal']
[8.3126999999999995, 9.9846000000000004

ValueError: operands could not be broadcast together with shapes (9,) (8,) 

In [11]:
voiced_intervals = numpy.array([[1,2],[5,6]])
voiced_boundaries = mir_eval.util.intervals_to_boundaries(voiced_intervals)
voiced_boundaries = numpy.insert(voiced_boundaries, 0, 0) # prepend 0
voiced_boundaries = numpy.append(voiced_boundaries,100) # TODO: add boundary from vocal to last no-vocal

all_intervals = mir_eval.util.boundaries_to_intervals(voiced_boundaries)
non_vocal_intervals = all_intervals[::2,:]
for interval in non_vocal_intervals:
    print interval[0]
                                    

0
2
6
