In [None]:
import glob

# import essentia
# import essentia.standard as es
import librosa
import librosa.display
import madmom
import matplotlib.pyplot as plt
import mir_eval
import numpy as np
import pandas as pd

import IPython.display as ipd
# Default size (width, height in inches) for every figure in this notebook.
plt.rcParams["figure.figsize"] = (15, 10)

In [None]:
import utils

In [None]:
# CONSTANTS
# Sample rate in Hz: recordings are resampled to it on load and it is the `sr`
# handed to the librosa / mir_eval calls below.
FS = 44_100

In [None]:
# dataset available in https://www.eumus.edu.uy/candombe/datasets/ISMIR2015/dataset.html
# Extension-less path of every recording; ".wav" (audio) or ".csv" (beat
# annotations) is appended where the files are read. Sorted because glob
# returns matches in arbitrary, filesystem-dependent order.
file_path = sorted(wav[:-len('.wav')] for wav in glob.glob('../datasets/candombe/*.wav'))

In [None]:
# Per-recording beat-tracking evaluation, cached on disk so that re-running the
# notebook does not have to analyse the whole dataset again.
# Both branches define `dataset_results`, which the sorting cell below reads.
CACHE_FILE = 'candombe_analysis.csv'


def _parse_cached_value(raw):
    """Turn one cell of the cache CSV back into a number or a float array.

    Scalars and sequences share a column in the cache, so cells can come back
    from pandas as text. Sequence cells look like "[0.5, 1.0]" (written by this
    cell) or "[0.5 1.0]" / "(0.5, 1.0)" (files written by the previous version
    of this cell); anything else is expected to be a single number.

    Cells that cannot be parsed (e.g. an array that NumPy abbreviated with
    "..." when it was written) are returned unchanged; delete the cache file
    and re-run to regenerate them.
    """
    if not isinstance(raw, str):
        return raw
    text = raw.strip()
    try:
        if text.startswith(('[', '(')) and text.endswith((']', ')')):
            return np.array(text[1:-1].replace(',', ' ').split(), dtype=float)
        return float(text)
    except ValueError:
        return raw


if not glob.glob(CACHE_FILE):

    dataset_results = {}
    # NOTE: the madmom (RNN + DBN downbeat tracker) comparison is currently
    # disabled; only librosa is evaluated.

    for file in file_path:
        print(f"processing {file}")
        x, fs = librosa.load(f"{file}.wav", mono=True, sr=FS)
        x_df = pd.read_csv(f"{file}.csv", names=["timestamp", "beat"])
        ground_truth = x_df['timestamp'].values

        # calculate the beats using librosa approach
        # (keyword arguments: recent librosa releases reject positional y/sr)
        bpm, beat_frame = librosa.beat.beat_track(y=x, sr=FS)
        librosa_timestamps = librosa.frames_to_time(beat_frame, sr=FS)

        cmlc_librosa, cmlt_librosa, amlc_librosa, amlt_librosa = mir_eval.beat.continuity(
            ground_truth, librosa_timestamps
        )

        dataset_results[file] = {
            "ground_truth_beats": ground_truth,
            "librosa_beats": librosa_timestamps,
            "f_score_librosa": mir_eval.beat.f_measure(ground_truth, librosa_timestamps),
            "p_score_librosa": mir_eval.beat.p_score(ground_truth, librosa_timestamps),
            # the ratio of the longest continuously correct segment to the length of the input
            "cmlc_librosa": cmlc_librosa,
            # the total number of correct beats at the correct metrical level
            "cmlt_librosa": cmlt_librosa,
            "amlc_librosa": amlc_librosa,
            "amlt_librosa": amlt_librosa,
            # builds a Gaussian window and scores how close (from 0 to 1) each
            # estimated beat is to the reference value
            "cemgil_librosa": mir_eval.beat.cemgil(ground_truth, librosa_timestamps, cemgil_sigma=0.04),
            "bpm_librosa": bpm
        }

    # Write plain Python floats / lists instead of NumPy reprs: str(ndarray) is
    # rounded, line-wrapped and abbreviated for long arrays, so it cannot be
    # read back reliably.
    serializable = {
        name: {metric: np.asarray(value, dtype=float).tolist() for metric, value in metrics.items()}
        for name, metrics in dataset_results.items()
    }
    pd.DataFrame(serializable).to_csv(CACHE_FILE)
else:
    print("loading file")
    candombe_csv = pd.read_csv(CACHE_FILE, index_col=0)
    # Columns are recordings and rows are metrics, so to_dict() yields
    # {recording: {metric: cell}}; convert the text cells back to numbers.
    dataset_results = {
        name: {metric: _parse_cached_value(value) for metric, value in metrics.items()}
        for name, metrics in candombe_csv.to_dict().items()
    }
    

Perguntas
- o que significa um método detectar menos beats do que outro?
    - talvez esteja errando os primeiros beats?
- os métodos "erram" de maneira consistente?

In [None]:
# Order the recordings from worst to best librosa F-measure; the dict keeps
# that order because insertion order is preserved.
dataset_result = dict(
    sorted(dataset_results.items(), key=lambda entry: entry[1]["f_score_librosa"])
)

In [None]:
# One line per recording: path -> librosa F-measure, in the sorted order.
for name, metrics in dataset_result.items():
    print(f'{name} -> {metrics["f_score_librosa"]}')

# worst f-score songs analysis

## csic.1995_ansina2_04 <a class="anchor" id="csic-1995-ansina2-04"></a>

In [None]:
song = '../datasets/candombe/csic.1995_ansina2_04'

# Decode the recording as mono at the notebook-wide sample rate.
x, fs = librosa.load(f"{song}.wav", sr=FS, mono=True)

# Ground truth, librosa beats and scores stored for this recording.
tmp = dataset_result[song]

# Click used for librosa's estimates: a 0.1 s, 500 Hz sine burst with an
# exponential decay (time constant of 0.01 s), so it can be told apart from
# the click used for the annotated beats.
click_samples = np.arange(FS*.1)
wrong_click_sound = np.sin(2*np.pi*click_samples*500/(1.*FS)) * np.exp(-click_samples/(FS*.01))

# Click tracks as long as the recording, so they can be added to it sample by sample.
n_samples = len(x)
clicks_truth = mir_eval.sonify.clicks(tmp["ground_truth_beats"], FS, click=None, length=n_samples)
wrong_clicks = mir_eval.sonify.clicks(tmp["librosa_beats"], FS, click=wrong_click_sound, length=n_samples)

In [None]:
# Player for the recording on its own.
ipd.Audio(x, rate=FS)

In [None]:
# Names of the entries stored for this recording.
tmp.keys()

In [None]:
# P-score, F-measure and CMLt of librosa's beats for this recording.
tmp['p_score_librosa'], tmp['f_score_librosa'], tmp['cmlt_librosa']

In [None]:
# Recording mixed with clicks at the annotated (ground-truth) beats.
ipd.Audio(x+clicks_truth, rate=FS)

In [None]:
# Recording mixed with both click tracks: annotated beats and librosa's estimates.
ipd.Audio(x+clicks_truth+wrong_clicks, rate=FS)

In [None]:
# Recording mixed with clicks at librosa's estimated beats only.
ipd.Audio(x+wrong_clicks, rate=FS)

In [None]:
# Annotated vs. librosa beats, drawn by the helper in the local `utils` module.
# NOTE(review): the trailing 10, 13 are presumably the excerpt start/end in
# seconds — confirm against utils.plot_comparison.
utils.plot_comparison(x, FS, tmp['ground_truth_beats'], tmp['librosa_beats'], 10, 13)

### onset detection using subbands

In [None]:
# calculate onset on each subband
# `channels` lists the bin edges, so five edges give four sub-band envelopes.
onset_subbands = librosa.onset.onset_strength_multi(y=x, sr=FS, channels=[0, 32, 64, 96, 128])

# Track beats independently on each sub-band envelope. `sr` is passed by
# keyword because recent librosa releases no longer accept it positionally.
beat_frame_0, beat_frame_1, beat_frame_2, beat_frame_3 = (
    librosa.beat.beat_track(onset_envelope=envelope, sr=FS)[1] for envelope in onset_subbands
)
librosa_timestamps_0, librosa_timestamps_1, librosa_timestamps_2, librosa_timestamps_3 = (
    librosa.frames_to_time(frames, sr=FS)
    for frames in (beat_frame_0, beat_frame_1, beat_frame_2, beat_frame_3)
)

# Excerpt to plot, in seconds.
start = 5
end = 10

fig, ax = plt.subplots(nrows=1, sharex=True)
ax.plot(x[start*FS:end*FS], label='waveform')

# One set of vertical lines per beat sequence. The x axis is in samples
# relative to `start`, so beat times are windowed, shifted and scaled by FS.
# Every series gets its own colour so that the legend is unambiguous.
for label, beat_times, color, linestyle in (
    ('groundtruth', tmp["ground_truth_beats"], 'r', '--'),
    ('subband_0', librosa_timestamps_0, 'g', '--'),
    ('subband_1', librosa_timestamps_1, 'b', '--'),
    ('subband_2', librosa_timestamps_2, 'tab:orange', 'dotted'),
    ('subband_3', librosa_timestamps_3, 'tab:purple', '-.'),
):
    in_window = (beat_times >= start) & (beat_times <= end)
    ax.vlines((beat_times[in_window] - start) * FS, 0, 1,
              alpha=0.5, color=color, linestyle=linestyle, label=label)

ax.legend()

In [None]:
# Same analysis with two wider sub-bands (bin edges 0, 64, 128). This
# overwrites the four-band `onset_subbands` computed in the previous cell.
onset_subbands = librosa.onset.onset_strength_multi(y=x, sr=FS, channels=[0, 64, 128])
onset_subbands.shape

In [None]:
# Beat tracking on each of the two wide sub-bands. `sr` is passed by keyword
# because recent librosa releases no longer accept it positionally.
# This rebinds librosa_timestamps_0 / _1 from the four-band cell above.
_, beat_frame_0 = librosa.beat.beat_track(onset_envelope=onset_subbands[0], sr=FS)
librosa_timestamps_0 = librosa.frames_to_time(beat_frame_0, sr=FS)

_, beat_frame_1 = librosa.beat.beat_track(onset_envelope=onset_subbands[1], sr=FS)
librosa_timestamps_1 = librosa.frames_to_time(beat_frame_1, sr=FS)

# Excerpt to plot, in seconds.
start = 5
end = 10

fig, ax = plt.subplots(nrows=1, sharex=True)
ax.plot(x[start*FS:end*FS], label='waveform')

# The x axis is in samples relative to `start`, so beat times are windowed,
# shifted and scaled by FS before being drawn.
for label, beat_times, color in (
    ('groundtruth', tmp["ground_truth_beats"], 'r'),
    ('subband_0', librosa_timestamps_0, 'g'),
    ('subband_1', librosa_timestamps_1, 'b'),
):
    in_window = (beat_times >= start) & (beat_times <= end)
    ax.vlines((beat_times[in_window] - start) * FS, 0, 1,
              alpha=0.5, color=color, linestyle='--', label=label)

ax.legend()

In [None]:
# Magnitude spectrogram of the whole recording.
D = np.abs(librosa.stft(x))

fig, ax = plt.subplots(nrows=2, sharex=True)

# specshow assumes sr=22050 unless told otherwise; the audio was loaded at FS,
# so `sr=FS` is required for the time and frequency axes to be correct.
img1 = librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
                         sr=FS, y_axis='log', x_axis='time', ax=ax[0])
ax[0].set(title='Power spectrogram')
ax[0].label_outer()
fig.colorbar(img1, ax=[ax[0]], format="%+2.f dB")

# Onset strength per sub-band, on the same (shared) time axis.
img2 = librosa.display.specshow(onset_subbands, sr=FS, x_axis='time', ax=ax[1])
ax[1].set(ylabel='Sub-bands', title='Sub-band onset strength')
fig.colorbar(img2, ax=[ax[1]])

In [None]:
# Continuity scores (CMLc, CMLt, AMLc, AMLt) of the sub-band 0 beats.
# When the cells run in order, librosa_timestamps_0 is the two-band result.
mir_eval.beat.continuity(tmp['ground_truth_beats'], librosa_timestamps_0)

In [None]:
# F-measure of the sub-band 0 beats next to the stored full-signal F-measure.
mir_eval.beat.f_measure(tmp['ground_truth_beats'], librosa_timestamps_0), tmp['f_score_librosa']

In [None]:
# F-measure of the sub-band 1 beats next to the stored full-signal F-measure.
mir_eval.beat.f_measure(tmp['ground_truth_beats'], librosa_timestamps_1), tmp['f_score_librosa']

In [None]:
# F-measure of the sub-band 2 beats next to the stored full-signal F-measure.
# NOTE(review): librosa_timestamps_2 is only assigned in the four-band cell,
# whereas _0 and _1 were reassigned by the two-band cell, so the sub-band
# numbers in this group of cells do not all refer to the same band split.
mir_eval.beat.f_measure(tmp['ground_truth_beats'], librosa_timestamps_2), tmp['f_score_librosa']

In [None]:
# Continuity scores (CMLc, CMLt, AMLc, AMLt) of the sub-band 2 beats
# (librosa_timestamps_2 comes from the four-band cell).
mir_eval.beat.continuity(tmp['ground_truth_beats'], librosa_timestamps_2)

In [None]:
# F-measure of the sub-band 3 beats (from the four-band cell) next to the
# stored full-signal F-measure.
mir_eval.beat.f_measure(tmp['ground_truth_beats'], librosa_timestamps_3), tmp['f_score_librosa']

## csic.1995_ansina2_01 <a class="anchor" id="csic-1995-ansina2-01"></a>

In [None]:
song = '../datasets/candombe/csic.1995_ansina2_01'

# Decode the recording as mono at the notebook-wide sample rate.
x, fs = librosa.load(f"{song}.wav", sr=FS, mono=True)

# Ground truth, librosa beats and scores stored for this recording.
tmp = dataset_result[song]

# Click used for librosa's estimates: a 0.1 s, 500 Hz sine burst with an
# exponential decay (time constant of 0.01 s), so it can be told apart from
# the click used for the annotated beats.
click_samples = np.arange(FS*.1)
wrong_click_sound = np.sin(2*np.pi*click_samples*500/(1.*FS)) * np.exp(-click_samples/(FS*.01))

# Click tracks as long as the recording, so they can be added to it sample by sample.
n_samples = len(x)
clicks_truth = mir_eval.sonify.clicks(tmp["ground_truth_beats"], FS, click=None, length=n_samples)
wrong_clicks = mir_eval.sonify.clicks(tmp["librosa_beats"], FS, click=wrong_click_sound, length=n_samples)

In [None]:
# Annotated beats that fall within the first 10 seconds.
reference_beats = tmp["ground_truth_beats"]
teste = reference_beats[reference_beats <= 10]

In [None]:
# Onset-strength (novelty) curve of the first 10 seconds only, and the time
# stamp of each of its frames.
onset_env = librosa.onset.onset_strength(y=x[0:10*FS], sr=FS)
times = librosa.times_like(onset_env, sr=FS)

# Beats tracked on that excerpt alone. `sr` is passed by keyword because recent
# librosa releases no longer accept it positionally.
_, beat_frames = librosa.beat.beat_track(onset_envelope=onset_env, sr=FS)
beats = librosa.frames_to_time(beat_frames, sr=FS)

# Estimated vs. annotated beats over the novelty curve, which is normalised
# so that it fits the 0-1 height of the beat markers.
plt.vlines(beats[beats <= 10], 0, 1, alpha=0.5, color='r', linestyle='--', label='librosa')
plt.vlines(teste, 0, 1, alpha=0.5, color='g', linestyle='--', label='truth')
plt.plot(times, librosa.util.normalize(onset_env), label='novelty function')
plt.legend()

In [None]:
# Annotated vs. librosa beats, drawn by the helper in the local `utils` module.
# NOTE(review): the trailing 0, 10 are presumably the excerpt start/end in
# seconds — confirm against utils.plot_comparison.
utils.plot_comparison(x, FS, tmp['ground_truth_beats'], tmp['librosa_beats'], 0, 10)

In [None]:
# NOTE(review): this call is identical to the one in the previous cell —
# confirm whether a different excerpt was intended.
utils.plot_comparison(x, FS, tmp['ground_truth_beats'], tmp['librosa_beats'], 0, 10)

In [None]:
# Magnitude spectrogram of the first 20 seconds.
X = np.abs(librosa.stft(x[:20*FS]))

# X is already a magnitude, so no second np.abs is needed.
S_db = librosa.amplitude_to_db(X, ref=np.max)

# specshow assumes sr=22050 unless told otherwise; the audio was loaded at FS,
# so `sr=FS` is required for the time and frequency axes to be correct.
librosa.display.specshow(S_db, sr=FS, x_axis='time', y_axis='log')

In [None]:
# Player for the recording on its own.
ipd.Audio(x, rate=FS)

In [None]:
# Recording mixed with both click tracks: annotated beats and librosa's estimates.
ipd.Audio(x+clicks_truth+wrong_clicks, rate=FS)

In [None]:
# Recording mixed with clicks at the annotated (ground-truth) beats.
ipd.Audio(x+clicks_truth, rate=FS)

In [None]:
# Recording mixed with clicks at librosa's estimated beats only.
ipd.Audio(x+wrong_clicks, rate=FS)

In [None]:
# A recording for comparison with the worst-scoring ones above: one on which
# librosa performs well.
song = '../datasets/candombe/zavala.muniz.2014_52'

# Decode the recording as mono at the notebook-wide sample rate.
x, fs = librosa.load(f"{song}.wav", sr=FS, mono=True)

# Ground truth, librosa beats and scores stored for this recording.
tmp = dataset_result[song]

# Click used for librosa's estimates: a 0.1 s, 500 Hz sine burst with an
# exponential decay (time constant of 0.01 s), so it can be told apart from
# the click used for the annotated beats.
click_samples = np.arange(FS*.1)
wrong_click_sound = np.sin(2*np.pi*click_samples*500/(1.*FS)) * np.exp(-click_samples/(FS*.01))

# Click tracks as long as the recording, so they can be added to it sample by sample.
n_samples = len(x)
clicks_truth = mir_eval.sonify.clicks(tmp["ground_truth_beats"], FS, click=None, length=n_samples)
wrong_clicks = mir_eval.sonify.clicks(tmp["librosa_beats"], FS, click=wrong_click_sound, length=n_samples)

In [None]:
# Annotated vs. librosa beats, drawn by the helper in the local `utils` module.
# NOTE(review): the trailing 37, 47 are presumably the excerpt start/end in
# seconds — confirm against utils.plot_comparison.
utils.plot_comparison(x, FS, tmp['ground_truth_beats'], tmp['librosa_beats'], 37, 47)

In [None]:
# Same comparison with the last two arguments narrowed to 45, 47.
# NOTE(review): presumably a zoom into the final 2 s of the previous plot — confirm.
utils.plot_comparison(x, FS, tmp['ground_truth_beats'], tmp['librosa_beats'], 45, 47)

In [None]:
# Player for the recording on its own.
ipd.Audio(x, rate=FS)

In [None]:
# Recording mixed with clicks at the annotated (ground-truth) beats.
ipd.Audio(x+clicks_truth, rate=FS)

In [None]:
# Recording mixed with clicks at librosa's estimated beats only.
ipd.Audio(x+wrong_clicks, rate=FS)

In [None]:
# Recording mixed with both click tracks: librosa's estimates and annotated beats.
ipd.Audio(x+wrong_clicks+clicks_truth, rate=FS)