In [1]:
import numpy as np
import librosa
import pickle
import pandas as pd
import random
from math import floor

In [5]:
def load_pickle(file_path):
    """Read the pickle file at ``file_path`` and return the object it stores.

    The file is opened in binary mode and closed again before returning.
    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only use this helper on files from a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

def beat_tracking_dp(y, sr, hop_length=512, alpha=0.5):
    """Track beats with a dynamic program run over a few tempo hypotheses.

    A global tempo is estimated with ``librosa.beat.beat_track``. The tempo
    hypotheses are that estimate plus the integer BPM values at its floor and
    one BPM either side (non-positive values are discarded). For each
    hypothesis a cumulative score is built frame by frame: the onset strength
    of the frame plus the best predecessor score, where a predecessor is
    penalised by ``alpha * log(gap / beat_period) ** 2``. Predecessors are
    searched from two beat periods back up to roughly half a beat period back.
    The hypothesis with the highest score at the final frame is selected and
    its predecessor chain is followed backwards from the final frame.

    Parameters
    ----------
    y : audio signal passed on to librosa.
    sr : sampling rate of ``y``.
    hop_length : hop (in samples) used for the onset-strength envelope.
    alpha : weight of the tempo-deviation penalty.

    Returns
    -------
    best_bpm : float
        The winning tempo hypothesis.
    beat_frames : list of int
        Onset-envelope frame indices of the beats in ascending order. The
        chain always ends at the final frame; frame 0 acts as the chain
        terminator and is never reported.
    tempo_estimate : float
        The initial tempo estimate from ``librosa.beat.beat_track``.

    If no positive, finite tempo can be estimated (e.g. silent input) or the
    onset envelope is empty, ``(tempo_estimate, [], tempo_estimate)`` is
    returned instead of raising.
    """
    raw_tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa release the tempo comes back either as a scalar
    # or as a 1-element array; collapse both forms to a plain float so the
    # hypothesis list below stays a flat list of numbers.
    tempo_values = np.atleast_1d(np.asarray(raw_tempo, dtype=float)).ravel()
    tempo_estimate = float(tempo_values[0]) if tempo_values.size else 0.0

    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    n_frames = len(onset_env)

    # Nothing to track: a zero/invalid tempo would make the beat period
    # infinite, and an empty envelope has no final frame to start from.
    if n_frames == 0 or not np.isfinite(tempo_estimate) or tempo_estimate <= 0:
        return tempo_estimate, [], tempo_estimate

    floor_tempo = floor(tempo_estimate)
    candidates = np.unique([floor_tempo - 1, floor_tempo, tempo_estimate, floor_tempo + 1])
    # A hypothesis of 0 or negative BPM has no meaningful beat period.
    tempo_hypotheses = candidates[candidates > 0]

    time_step = hop_length / sr
    time_grid = np.arange(n_frames) * time_step
    C = np.zeros((n_frames, len(tempo_hypotheses)))  # cumulative scores
    P = np.zeros_like(C, dtype=int)                  # best predecessor per frame

    # Each hypothesis only reads its own column of C, so the columns can be
    # filled independently; this lets the per-tempo constants be computed once.
    for k, tempo in enumerate(tempo_hypotheses):
        tau_p = 60 / tempo  # beat period in seconds
        far_lag = int(2 * tau_p / time_step)
        near_lag = int(tau_p / (2 * time_step))

        for i in range(1, n_frames):
            start = max(0, i - far_lag)
            end = max(0, i - near_lag)
            if start < end:
                gaps = time_grid[i] - time_grid[start:end]
                transition_scores = C[start:end, k] - alpha * np.log(gaps / tau_p) ** 2
                offset = int(np.argmax(transition_scores))
                P[i, k] = start + offset
                C[i, k] = onset_env[i] + transition_scores[offset]
            else:
                # No admissible predecessor yet: start a new chain at this frame.
                P[i, k] = 0
                C[i, k] = onset_env[i]

    best_tempo_index = int(np.argmax(C[-1]))
    best_bpm = float(tempo_hypotheses[best_tempo_index])

    # Backtrace from the final frame; predecessors are always strictly earlier,
    # so the walk is guaranteed to reach frame 0.
    beat_frames = [n_frames - 1]
    current_frame = n_frames - 1
    while current_frame > 0:
        current_frame = int(P[current_frame, best_tempo_index])
        beat_frames.append(current_frame)

    beat_frames.pop()      # drop the terminating frame 0
    beat_frames.reverse()  # chronological order

    return best_bpm, beat_frames, tempo_estimate

In [6]:
# Load the pickled test data and the labeled-songs table.
X_test = load_pickle('../data/pkl/test_data.pkl')
df = pd.read_csv('../data/dataframes/clean_labeled.csv')

# Fixed seed so the same song is drawn on every run.
random.seed(42)
random_song_id = int(random.choice(list(X_test.keys())))

# Decode the chosen track (sr=None: no resampling requested) and run the tracker.
audio_file = f'../data/audio_files/processed/{random_song_id}.mp3'
y, sr = librosa.load(audio_file, sr=None)
tempo, beats, original_bpm = beat_tracking_dp(y, sr=sr)

print("Song ID:", random_song_id)
print("Best BPM estimate:", tempo)
print("Original BPM estimate:", original_bpm)
# Reference tempo stored for this song in the labeled table.
song_rows = df['SongID'] == random_song_id
print("Spotify BPM:", df.loc[song_rows, 'sp_tempo'].values[0])
print("Beat frames:", beats)

Song ID: 71
Best BPM estimate: 126.0
Original BPM estimate: 125.0
Spotify BPM: 125.99
Beat frames: [89, 160, 205, 272, 340, 410, 474, 541, 592, 652, 741, 830, 875, 920, 1008, 1097, 1142, 1187, 1276, 1365, 1455, 1544, 1633, 1722, 1768, 1835, 1901, 1990, 2080, 2169, 2258, 2347, 2392, 2437, 2482, 2549, 2616, 2683, 2749, 2794, 2839, 2884, 2929, 2974, 3060, 3106, 3151, 3218, 3284, 3329, 3418, 3463, 3508, 3575, 3642, 3687, 3775, 3820, 3865, 3910, 3998, 4043, 4132, 4177, 4222, 4267, 4312, 4357, 4445, 4490, 4535, 4580, 4625, 4712, 4757, 4802, 4891, 4936, 4981, 5026, 5071, 5159, 5204, 5249, 5294, 5339, 5427, 5472, 5517, 5605, 5650, 5695, 5784, 5829, 5918, 5963, 6008, 6053, 6141, 6186, 6231, 6276, 6321, 6409, 6454, 6499, 6544, 6589, 6677, 6722, 6767, 6855, 6900, 6989, 7034, 7079, 7124, 7169, 7214, 7302, 7347, 7392, 7437, 7482, 7569, 7614, 7659, 7748, 7793, 7838, 7883, 7928, 7973, 8061, 8106, 8151, 8196, 8284, 8329, 8374, 8463, 8508, 8553, 8598, 8643, 8688, 8755, 8820, 8865, 8910, 8978, 9044, 908

In [None]:
import numpy as np
from scipy.stats import rv_continuous
import librosa
from typing import Optional, Callable, Any

def robust_tempo(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    ac_size_min: float = 3.0,  # minimum window size in seconds
    ac_size_max: float = 8.0,  # maximum window size in seconds
    max_tempo: Optional[float] = 320.0,
    prior: Optional[rv_continuous] = None,
) -> float:
    """Estimate the global tempo (BPM) with a tempo-adaptive window, rounded to a whole number.

    A rough tempo from ``librosa.feature.tempo`` selects the autocorrelation
    window: the window shrinks linearly from ``ac_size_max`` seconds at 60 BPM
    to ``ac_size_min`` seconds at ``max_tempo`` and is clipped to that range.
    A tempogram computed with that window is averaged over time and handed to
    ``estimate_tempo_from_tempogram``; the result is rounded to the nearest
    integer BPM.

    Parameters
    ----------
    y : audio time series; only used when ``onset_envelope`` is not given.
    sr : sampling rate of ``y``.
    onset_envelope : pre-computed onset strength envelope.
    hop_length : hop length (in samples) of the onset envelope.
    ac_size_min, ac_size_max : bounds of the adaptive window, in seconds.
    max_tempo : upper tempo bound in BPM, forwarded to the estimator. When it
        is ``None`` or not above 60 there is no range to interpolate over and
        the window is fixed at ``ac_size_max``.
    prior : optional distribution forwarded to the estimator.

    Returns
    -------
    float
        The rounded tempo estimate.

    Raises
    ------
    librosa.ParameterError
        If neither ``y`` nor ``onset_envelope`` is provided.
    """

    if y is None and onset_envelope is None:
        raise librosa.ParameterError('Either y or onset_envelope must be provided')

    if onset_envelope is None:
        onset_envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

    # Get a rough estimate of the tempo. hop_length must be forwarded, otherwise
    # the lag-to-BPM conversion assumes the default hop and the estimate is
    # scaled incorrectly for any other hop length.
    rough_tempo = librosa.feature.tempo(
        onset_envelope=onset_envelope, sr=sr, hop_length=hop_length
    )[0]

    # Adjust window size based on the rough tempo estimate
    if max_tempo is None or max_tempo <= 60:
        # No usable tempo range to interpolate over: use the widest window.
        ac_size = ac_size_max
    else:
        tempo_fraction = (rough_tempo - 60) / (max_tempo - 60)
        ac_size = np.clip(
            ac_size_max - tempo_fraction * (ac_size_max - ac_size_min),
            ac_size_min,
            ac_size_max,
        )
    win_length = int(librosa.time_to_frames(ac_size, sr=sr, hop_length=hop_length))

    # Calculate the tempogram using the adjusted window size
    tg = librosa.feature.tempogram(
        onset_envelope=onset_envelope, sr=sr, hop_length=hop_length, win_length=win_length
    )
    # Collapse the time axis so that a single global tempo is estimated; without
    # this the estimator yields one value per frame and no scalar can be returned.
    tg = np.mean(tg, axis=-1, keepdims=True)

    # Estimate the tempo from the tempogram
    refined_tempo = estimate_tempo_from_tempogram(tg, sr, hop_length, win_length, max_tempo, prior)

    # Round the tempo to the nearest whole number and return it as a scalar
    return np.round(refined_tempo).item()

def estimate_tempo_from_tempogram(tg, sr, hop_length, win_length, max_tempo, prior):
    """Pick the tempo (BPM) whose tempogram value, weighted by a log-prior, is largest.

    Parameters
    ----------
    tg : array whose second-to-last axis has length ``win_length`` (one row per
        lag) and whose last axis indexes frames, i.e. shape
        ``(..., win_length, n_frames)``.
    sr, hop_length : sampling rate and hop length used to map lags to BPM.
    win_length : number of lag bins in ``tg``.
    max_tempo : tempos above this value (BPM) are excluded; ``None`` disables
        the limit.
    prior : optional object with a ``logpdf`` method evaluated on the BPM
        values. When ``None`` a log-normal-shaped prior centred on 120 BPM
        with a width of one octave is used.

    Returns
    -------
    np.ndarray
        The selected BPM for each frame, shape ``tg.shape[:-2] + tg.shape[-1:]``.
    """
    # BPM value associated with each lag bin.
    bpms = librosa.tempo_frequencies(win_length, hop_length=hop_length, sr=sr)

    if prior is None:
        start_bpm = 120
        std_bpm = 1.0
        logprior = -0.5 * ((np.log2(bpms) - np.log2(start_bpm)) / std_bpm) ** 2
    else:
        # Copy so the in-place masking below cannot touch the prior's output.
        logprior = np.array(prior.logpdf(bpms), dtype=float)

    if max_tempo is not None:
        # Mask by value rather than by a slice from an argmax: the lag-0 bin is
        # itself above any finite limit, so an index-based slice starting at the
        # first such bin would wipe out every candidate.
        logprior[bpms > max_tempo] = -np.inf

    # Put the prior on the lag axis (second-to-last axis of tg) so that it
    # broadcasts across frames: shape (win_length, 1).
    logprior = np.expand_dims(logprior, axis=-1)

    # Get the maximum, weighted by the prior
    best_period = np.argmax(tg + logprior, axis=-2)

    return bpms[best_period]