In [None]:
# ref: better beat tracking through robust onset aggregation

In [None]:
# HUGE refactor on this one. lesgo.

In [None]:
import glob
import os

import librosa
import numpy as np
import matplotlib.pyplot as plt
import mir_eval
import pandas as pd

In [None]:
# Default figure size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (15,10)

In [None]:
# dataset available in https://www.eumus.edu.uy/candombe/datasets/ISMIR2015/dataset.html
# Each entry is a recording path WITHOUT its extension, so that both
# "<entry>.wav" (audio) and "<entry>.csv" (beat annotations) can be derived from it.
# The list is sorted because glob returns matches in arbitrary, filesystem-dependent
# order; sorting keeps the processing order (and any "first file" example picked
# later) reproducible across runs and machines.
file_path = sorted(
    os.path.splitext(wav_path)[0]
    for wav_path in glob.glob('../../datasets/candombe/*.wav')
)

In [None]:
# experiments settings

# --- signal analysis ---
FS = 22050       # sample rate requested from librosa.load
N_FFT = 2048     # FFT size passed to the onset-strength computation
HOP_SIZE = 64    # hop (in samples) passed to the onset-strength computation
MEL_BANDS = 128  # not referenced by the cells visible in this notebook
CUTOFF = 8000    # not referenced by the cells visible in this notebook

# --- dataset location ---
DATASET_PATH = '../../datasets/candombe/'

# --- experiment outputs ---
OUTPUT_CSV_PATH = 'experiments_results/robust_onset_aggregation'

# One results folder per experiment ...
BASELINE_DEFAULT_PATH = os.path.join(OUTPUT_CSV_PATH, 'baseline_default')
BASELINE_PARAMETERS_PATH = os.path.join(OUTPUT_CSV_PATH, 'baseline_parameters')
MEDIAN_PATH = os.path.join(OUTPUT_CSV_PATH, 'median')
MEAN_PATH = os.path.join(OUTPUT_CSV_PATH, 'mean')

# ... and one CSV per experiment, named after its folder.
BASELINE_DEFAULT_CSV_PATH = BASELINE_DEFAULT_PATH + '.csv'
BASELINE_PARAMETERS_CSV_PATH = BASELINE_PARAMETERS_PATH + '.csv'
MEDIAN_CSV_PATH = MEDIAN_PATH + '.csv'
MEAN_CSV_PATH = MEAN_PATH + '.csv'


In [None]:
# Sanity check: print where the results of the first recording would be stored.
for file in file_path[:1]:
    print(os.path.join(BASELINE_DEFAULT_PATH, os.path.basename(file)))

In [None]:
def run(dataset_folder, output_folder, onset_parameters, beat_parameters, override):
    """Compute and cache the onset envelope and beat estimates of each recording.

    For every entry of ``dataset_folder`` the audio ``<entry>.wav`` and the
    annotations ``<entry>.csv`` are loaded, an onset envelope is computed with
    ``librosa.onset.onset_strength_multi``, beats are tracked on that envelope
    with ``librosa.beat.beat_track``, and the result is written to
    ``<output_folder>/<basename>.npz`` with the arrays ``onset``,
    ``reference`` (annotated timestamps) and ``estimated`` (tracked beat times).

    Parameters
    ----------
    dataset_folder : iterable of str
        Recording paths without file extension.
    output_folder : str
        Folder receiving the ``.npz`` archives; created when missing.
    onset_parameters : dict or None
        Extra keyword arguments for ``onset_strength_multi``. ``y`` and ``sr``
        are filled in here. The caller's dict is not modified.
    beat_parameters : dict or None
        Extra keyword arguments for ``beat_track``. ``onset_envelope`` and
        ``sr`` are filled in here; ``hop_length`` defaults to the one used for
        the onset envelope. The caller's dict is not modified.
    override : bool
        When true, recompute recordings whose archive already exists.

    Returns
    -------
    None
    """
    onset_parameters = dict(onset_parameters or {})
    beat_parameters = dict(beat_parameters or {})

    # Frame indices only map to the right times when the onset envelope, the
    # beat tracker and the frame->time conversion share one hop length
    # (512 is librosa's default for all three).
    hop_length = onset_parameters.get("hop_length", 512)

    os.makedirs(output_folder, exist_ok=True)

    for file in dataset_folder:
        file_npz = os.path.join(output_folder, os.path.basename(file)) + '.npz'

        # Cached result: nothing to do unless a recomputation was requested.
        if os.path.isfile(file_npz) and not override:
            continue

        print(f"processing {file}")
        x, _ = librosa.load(f"{file}.wav", mono=True, sr=FS)
        x_df = pd.read_csv(f"{file}.csv", names=["timestamp", "beat"])
        ground_truth = x_df["timestamp"].values

        # Build per-file keyword dicts instead of writing into the caller's
        # dicts, so the audio array never leaks into the shared configuration.
        # the standard method from librosa already uses 128 mel bands by default
        # so we can just skip this (:
        onset_kwargs = {**onset_parameters, "y": x, "sr": FS}
        onset_subbands = librosa.onset.onset_strength_multi(**onset_kwargs)

        beat_kwargs = {
            "hop_length": hop_length,
            **beat_parameters,
            "onset_envelope": onset_subbands[0],
            "sr": FS,
        }
        bpm, beat_frame = librosa.beat.beat_track(**beat_kwargs)

        # Beat frames index the onset envelope, so convert with ITS hop length.
        # `sr` is passed by keyword: it is keyword-only in recent librosa.
        beat_timestamps = librosa.frames_to_time(
            beat_frame, sr=FS, hop_length=hop_length
        )

        print(f"saving {file_npz}")
        np.savez(
            file_npz,
            onset=onset_subbands[0],
            reference=ground_truth,
            estimated=beat_timestamps
        )
    return


## test refactor

In [None]:
# Baseline experiment: no extra options for onset detection or beat tracking.
baseline_onset_configs = dict()
baseline_beat_configs = dict()
run(
    dataset_folder=file_path,
    output_folder=BASELINE_DEFAULT_PATH,
    onset_parameters=baseline_onset_configs,
    beat_parameters=baseline_beat_configs,
    override=False,
)

In [None]:
# Display the folder used as output_folder for the baseline run above.
BASELINE_DEFAULT_PATH

In [None]:
# List the cached result archives stored directly inside BASELINE_DEFAULT_PATH.
# Only the first tuple yielded by os.walk (the top-level folder) is used, so a
# nested sub-folder can no longer overwrite the listing, and a missing folder
# yields an empty list instead of leaving `baseline_files` undefined.
_, _, _top_level_names = next(os.walk(BASELINE_DEFAULT_PATH), (None, None, []))
baseline_files = sorted(
    name for name in _top_level_names if name.endswith('.npz')
)

In [None]:
# Turn the listed file names into paths inside BASELINE_DEFAULT_PATH.
# Taking the basename first keeps this cell idempotent: re-running it does not
# prefix the folder a second time onto entries that are already full paths.
baseline_files = [
    os.path.join(BASELINE_DEFAULT_PATH, os.path.basename(name))
    for name in baseline_files
]

In [None]:
# Open the first cached archive to inspect what a run stores.
# NOTE(review): the object returned by np.load for an .npz is never closed here;
# consider a `with np.load(...) as archive:` block and a more descriptive name than `tmp`.
tmp = np.load(baseline_files[0])

In [None]:
# Array stored under the 'onset' key (the onset envelope saved by the run).
tmp['onset']

In [None]:
# Names of the arrays contained in the loaded archive.
tmp.files

In [None]:
# NOTE(review): leftover interactive documentation lookup; can be dropped from the final notebook.
help(np.savez)

In [None]:
# NOTE(review): leftover debugging expression; os.walk is lazy, so this only
# displays a generator object and lists nothing.
os.walk(BASELINE_DEFAULT_PATH)

In [None]:
# Baseline with explicit analysis parameters (N_FFT, HOP_SIZE) and librosa's
# default aggregation across bands. Results are cached as one .npz per recording.
dataset_baseline_parameters = {}
if not os.path.isfile(BASELINE_PARAMETERS_CSV_PATH):
    os.makedirs(BASELINE_PARAMETERS_PATH, exist_ok=True)

    for file in file_path:
        print(f"processing {file}")
        x, fs = librosa.load(f"{file}.wav", mono=True, sr=FS)
        x_df = pd.read_csv(f"{file}.csv", names=["timestamp", "beat"])
        ground_truth = x_df['timestamp'].values

        # calculate the beats using librosa approach
        onset_subbands = librosa.onset.onset_strength_multi(
            y=x,
            sr=FS,
            n_fft=N_FFT,
            hop_length=HOP_SIZE,
        )
        # The envelope has one frame every HOP_SIZE samples, so the beat tracker
        # and the frame->time conversion must use the same hop. Leaving them at
        # librosa's default (512) misreads the tempo and scales every timestamp
        # by 512 / HOP_SIZE.
        bpm, beat_frame = librosa.beat.beat_track(
            onset_envelope=onset_subbands[0],
            sr=FS,
            hop_length=HOP_SIZE,
        )
        beat_timestamps = librosa.frames_to_time(
            beat_frame, sr=FS, hop_length=HOP_SIZE
        )

        print(f"saving {file}")
        np.savez(
            os.path.join(BASELINE_PARAMETERS_PATH, os.path.basename(file)),
            onset=onset_subbands[0],
            reference=ground_truth,
            estimated=beat_timestamps
        )

        # TODO: evaluation is still disabled after the refactor:
        # dataset_baseline_parameters[file] = mir_eval.beat.evaluate(ground_truth, beat_timestamps)
    # TODO: pd.DataFrame(dataset_baseline_parameters).to_csv(BASELINE_PARAMETERS_CSV_PATH)
else:
    print("loading file")
    # Read the CSV that the guard above tested for (not the results folder).
    candombe_csv = pd.read_csv(BASELINE_PARAMETERS_CSV_PATH, index_col=0)
    dataset_baseline_parameters = candombe_csv.to_dict()
    

In [None]:
# Median aggregation across bands (the robust onset aggregation under test).
# Results are cached as one .npz per recording.
dataset_median = {}

if not os.path.isfile(MEDIAN_CSV_PATH):
    os.makedirs(MEDIAN_PATH, exist_ok=True)

    for file in file_path:
        print(f"processing {file}")
        x, fs = librosa.load(f"{file}.wav", mono=True, sr=FS)
        x_df = pd.read_csv(f"{file}.csv", names=["timestamp", "beat"])
        ground_truth = x_df['timestamp'].values

        # the standard method from librosa already uses 128 mel bands by default
        # so we can just skip this (:
        onset_subbands = librosa.onset.onset_strength_multi(
            y=x,
            sr=FS,
            n_fft=N_FFT,
            hop_length=HOP_SIZE,
            aggregate=np.median
        )
        # The envelope has one frame every HOP_SIZE samples, so the beat tracker
        # and the frame->time conversion must use the same hop. Leaving them at
        # librosa's default (512) misreads the tempo and scales every timestamp
        # by 512 / HOP_SIZE.
        bpm, beat_frame = librosa.beat.beat_track(
            onset_envelope=onset_subbands[0],
            sr=FS,
            hop_length=HOP_SIZE,
        )
        beat_timestamps = librosa.frames_to_time(
            beat_frame, sr=FS, hop_length=HOP_SIZE
        )

        np.savez(
            os.path.join(MEDIAN_PATH, os.path.basename(file)),
            onset=onset_subbands[0],
            reference=ground_truth,
            estimated=beat_timestamps
        )

        # TODO: evaluation is still disabled after the refactor:
        # dataset_median[file] = mir_eval.beat.evaluate(ground_truth, beat_timestamps)

    # TODO: pd.DataFrame(dataset_median).to_csv(MEDIAN_CSV_PATH)
else:
    print("loading file")
    # Read the CSV that the guard above tested for (not the results folder).
    candombe_csv = pd.read_csv(MEDIAN_CSV_PATH, index_col=0)
    dataset_median = candombe_csv.to_dict()

In [None]:
# Mean aggregation across bands. Results are cached as one .npz per recording.
dataset_mean = {}

if not os.path.isfile(MEAN_CSV_PATH):
    os.makedirs(MEAN_PATH, exist_ok=True)

    for file in file_path:
        # np.savez appends ".npz" to the name it is given, so the cache check
        # has to look for that suffix. It runs before the audio is loaded so
        # cached recordings cost nothing.
        file_npz = os.path.join(MEAN_PATH, os.path.basename(file)) + '.npz'
        if os.path.isfile(file_npz):
            continue

        print(f"processing {file}")
        x, fs = librosa.load(f"{file}.wav", mono=True, sr=FS)
        x_df = pd.read_csv(f"{file}.csv", names=["timestamp", "beat"])
        ground_truth = x_df['timestamp'].values

        # the standard method from librosa already uses 128 mel bands by default
        # so we can just skip this (:
        onset_subbands = librosa.onset.onset_strength_multi(
            y=x,
            sr=FS,
            n_fft=N_FFT,
            hop_length=HOP_SIZE,
            aggregate=np.mean
        )
        # The envelope has one frame every HOP_SIZE samples, so the beat tracker
        # and the frame->time conversion must use the same hop. Leaving them at
        # librosa's default (512) misreads the tempo and scales every timestamp
        # by 512 / HOP_SIZE.
        bpm, beat_frame = librosa.beat.beat_track(
            onset_envelope=onset_subbands[0],
            sr=FS,
            hop_length=HOP_SIZE,
        )
        beat_timestamps = librosa.frames_to_time(
            beat_frame, sr=FS, hop_length=HOP_SIZE
        )

        np.savez(
            file_npz,
            onset=onset_subbands[0],
            reference=ground_truth,
            estimated=beat_timestamps
        )

        # TODO: evaluation is still disabled after the refactor:
        # dataset_mean[file] = mir_eval.beat.evaluate(ground_truth, beat_timestamps)

    # TODO: pd.DataFrame(dataset_mean).to_csv(MEAN_CSV_PATH)
else:
    print("loading file")
    # Load the MEAN results into dataset_mean; the previous copy-pasted branch
    # read the median path and overwrote dataset_median instead.
    candombe_csv = pd.read_csv(MEAN_CSV_PATH, index_col=0)
    dataset_mean = candombe_csv.to_dict()

In [None]:
# NOTE(review): SUM_PATH is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh kernel — presumably a leftover from an earlier
# "sum" aggregation experiment; confirm and either define it or drop the cell.
pd.read_csv(SUM_PATH).transpose().reset_index()

In [None]:
# NOTE(review): dataset_baseline_default is not assigned in any cell visible in
# this notebook; on a fresh kernel this raises NameError — confirm where it should come from.
dataset_baseline_default

In [None]:
# Tabular view of the baseline results.
# NOTE(review): dataset_baseline_default is not assigned in any visible cell — confirm its origin.
pd.DataFrame(dataset_baseline_default)

In [None]:
# Build one frame per experiment: transpose the result dict and move the former
# index into a regular 'index' column.
# NOTE(review): dataset_baseline_default and dataset_sum are not assigned in any
# visible cell, and dataset_mean is never turned into a frame — confirm.
df_baseline_default = pd.DataFrame(dataset_baseline_default).transpose().reset_index()
df_baseline_parameters = pd.DataFrame(dataset_baseline_parameters).transpose().reset_index()
df_median = pd.DataFrame(dataset_median).transpose().reset_index()
df_sum = pd.DataFrame(dataset_sum).transpose().reset_index()
# NOTE(review): df_max is built from dataset_sum, which looks like a copy-paste
# slip (presumably a "max" result dict was intended) — confirm.
df_max = pd.DataFrame(dataset_sum).transpose().reset_index()

In [None]:
# Column-wise mean over the baseline frame.
# NOTE(review): the frame carries an 'index' column from reset_index(); if it is
# non-numeric, recent pandas may require mean(numeric_only=True) — confirm.
df_baseline_default.mean()

In [None]:
# Column-wise mean over the median-aggregation frame.
# NOTE(review): the frame carries an 'index' column from reset_index(); if it is
# non-numeric, recent pandas may require mean(numeric_only=True) — confirm.
df_median.mean()

In [None]:
# Column-wise mean over the sum-aggregation frame.
# NOTE(review): the frame carries an 'index' column from reset_index(); if it is
# non-numeric, recent pandas may require mean(numeric_only=True) — confirm.
df_sum.mean()

In [None]:
# Column-wise mean over the frame named df_max.
# NOTE(review): the frame carries an 'index' column from reset_index(); if it is
# non-numeric, recent pandas may require mean(numeric_only=True) — confirm.
df_max.mean()

No caso desse dataset, o max, median e sum tiveram resultados bastante similares. Isso pode ser porque eles pegam os componentes mais fortes de onsets e na hora de agregar são "puxados" para os mesmos resultados? 

In [None]:
# Take the first row of each experiment's frame as the example to plot.
example_median = df_median.loc[0]
# `df_baseline` is never defined in this notebook; the baseline frame built
# above is `df_baseline_default`, so use that to survive Restart & Run All.
example_baseline = df_baseline_default.loc[0]
example_sum = df_sum.loc[0]
example_max = df_max.loc[0]

In [None]:
# Load the audio of the example recording; the row's 'index' field is used as
# the path without extension.
x, fs = librosa.load(f"{example_baseline['index']}.wav", mono=True, sr=FS)

In [None]:
# Excerpt to display, in seconds.
start = 25
end = 30

fig, ax = plt.subplots(nrows=1, sharex=True)
ax.plot(x[start*fs:end*fs], alpha=0.5)


def _excerpt_sample_positions(beats):
    """Keep the beats inside [start, end] and express them in samples from `start`."""
    inside = (beats >= start) & (beats <= end)
    return (beats[inside] - start) * FS


# (row, attribute holding the beat times, line style) for each overlay,
# drawn in this order.
_overlays = [
    (example_baseline, 'ground_truth_beats',
     dict(alpha=0.9, color='r', linestyle='-', label='groundtruth')),
    (example_baseline, 'librosa_beats',
     dict(alpha=0.5, color='g', linestyle='--', label='baseline')),
    (example_median, 'librosa_beats',
     dict(alpha=0.5, color='b', linestyle='--', label='median')),
    (example_sum, 'librosa_beats',
     dict(alpha=0.5, color='b', linestyle='-', label='sum')),
    (example_max, 'librosa_beats',
     dict(alpha=0.5, color='r', linestyle='--', label='max')),
]

for _row, _column, _style in _overlays:
    ax.vlines(_excerpt_sample_positions(getattr(_row, _column)), 0, 0.6, **_style)

ax.legend()

In [None]:
# what do I want to plot?
# 1. an excerpt of the original track
# 2. the beats of that excerpt
# 3. the beats detected with McFee's method
# 4. the beats detected by librosa without using the median

In [None]:
# it may also make sense to look at the onset function alone to understand
# why the results are SO awful