In [15]:
import librosa
import numpy as np
import pandas as pd
import pickle
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, LSTM, TimeDistributed, LayerNormalization
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard, CSVLogger
import matplotlib.pyplot as plt

In [6]:
def extract_scaled_melspec(y, sr, n_fft=2048, hop_length=512, n_mels=128):
    """Extract a Mel-spectrogram in dB and min-max scale it into [0, 1].

    Args:
        y: Audio signal handed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``y``.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of Mel bands.

    Returns:
        The dB-scaled Mel-spectrogram (same layout as librosa returns it),
        rescaled so its minimum maps to 0 and its maximum to 1. When the
        spectrogram is constant (for example silent audio) an all-zero array
        of the same shape is returned instead of NaNs.
    """
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    S_dB = librosa.power_to_db(S, ref=np.max)

    min_val = np.min(S_dB)
    value_range = np.max(S_dB) - min_val

    # A constant spectrogram has zero range; dividing would produce 0/0 = NaN
    # everywhere, and NMF rejects NaN input further down the pipeline.
    if value_range == 0:
        return np.zeros_like(S_dB)

    return (S_dB - min_val) / value_range


def decompose_spectrogram_with_nmf(scaled_spectrogram, n_components):
    """Decompose a scaled Mel-spectrogram with scikit-learn's NMF, without sorting the components.

    Args:
        scaled_spectrogram: Non-negative 2-D array. It is transposed before
            fitting, so its columns are treated as the samples
            (presumably laid out as (n_mels, n_frames) - confirm with the caller).
        n_components: Number of NMF components to learn.

    Returns:
        Tuple ``(H, W)``:
            H: ``model.components_`` - the dictionary matrix.
            W: result of ``fit_transform`` - the activation matrix, one row per
               column of ``scaled_spectrogram``.
    """
    # Imported here because the notebook's import cell never brings NMF into
    # scope; without this the function raises NameError on a fresh kernel.
    from sklearn.decomposition import NMF

    # Fixed init and random_state keep the factorization reproducible.
    model = NMF(n_components=n_components, max_iter=1500, init='nndsvd', random_state=0)

    # sklearn expects samples as rows and features as columns, hence the transpose.
    V = scaled_spectrogram.T
    W = model.fit_transform(V)  # activations (code)
    H = model.components_       # components (dictionary)

    # Only the components and the activations are returned (no reconstruction).
    return H, W

def find_anchor_frame(beats, tempo, sr):
    """Pick the beat that opens the run of beats whose spacing best matches the tempo.

    A window of three consecutive inter-beat intervals slides over ``beats``.
    Each window is scored as ``1 - |mean interval - expected| / expected``,
    where the expected interval is one beat at ``tempo`` converted to frames
    by ``librosa.time_to_frames`` (called without an explicit hop length).

    Args:
        beats: Sequence of beat positions in frames.
        tempo: Tempo in beats per minute.
        sr: Sampling rate used for the seconds-to-frames conversion.

    Returns:
        The first beat of the first window scoring above 0.95; otherwise the
        first beat of the best-scoring window; ``None`` when no window scores
        above 0 (including when there are too few beats to form a window).
    """
    window_size = 3  # number of consecutive intervals averaged per window

    # One beat at the given tempo, expressed in frames.
    expected_interval = int(librosa.time_to_frames(60.0 / tempo, sr=sr))
    beat_intervals = np.diff(beats)

    anchor = None
    top_quality = 0
    last_start = len(beat_intervals) - window_size
    for start in range(last_start + 1):
        mean_gap = np.mean(beat_intervals[start:start + window_size])
        quality = 1 - abs(mean_gap - expected_interval) / expected_interval

        # Skip windows that do not strictly improve on the best seen so far.
        if not (quality > top_quality):
            continue

        anchor, top_quality = beats[start], quality

        # Good enough: stop at the first window within 5% of the expected spacing.
        if quality > 0.95:
            return anchor

    return anchor


def create_beat_grid(beats, anchor_frame, sr, beat_interval_in_frames, time_signature, duration_in_frames):
    """Create a beat grid and a measure grid (both in frames) from an anchor beat.

    Args:
        beats: Detected beat frames; only ``beats[0]`` is used, as the fallback
            anchor when ``anchor_frame`` is ``None``.
        anchor_frame: Frame of a trusted beat, or ``None`` to fall back to
            ``beats[0]``. Frame 0 is a valid anchor.
        sr: Sampling rate (unused here; kept for interface compatibility).
        beat_interval_in_frames: Spacing between beats, in frames. Must be > 0.
        time_signature: Beats per measure. Must be > 0.
        duration_in_frames: Total number of frames in the song.

    Returns:
        Tuple ``(beat_grid, measure_grid)`` of numpy arrays:
            beat_grid: the anchor and every earlier beat obtained by stepping
                backwards by one beat interval while the frame stays >= 0.
            measure_grid: measure boundaries starting from the first entry of
                ``beat_grid``, always beginning with 0 and ending with
                ``duration_in_frames``.

    Raises:
        ValueError: if the beat interval or time signature is not positive
            (the loops below would never terminate), or the anchor is negative.
        IndexError: if ``anchor_frame`` is ``None`` and ``beats`` is empty.
    """
    if beat_interval_in_frames <= 0:
        raise ValueError("beat_interval_in_frames must be positive")
    if time_signature <= 0:
        raise ValueError("time_signature must be positive")

    # Compare against None explicitly: a truthiness test would discard a
    # perfectly valid anchor located at frame 0.
    if anchor_frame is None:
        anchor_frame = beats[0]
    if anchor_frame < 0:
        raise ValueError("anchor_frame must be non-negative")

    # Walk backwards from the anchor, keeping only non-negative frames.
    beat_grid = []
    current_frame = anchor_frame
    while current_frame >= 0:
        beat_grid.append(current_frame)
        current_frame -= beat_interval_in_frames
    beat_grid.reverse()

    # Step forward one measure at a time from the earliest beat.
    measure_step = beat_interval_in_frames * time_signature
    measure_grid = []
    current_frame = beat_grid[0]
    while current_frame <= duration_in_frames:
        measure_grid.append(current_frame)
        current_frame += measure_step

    # Make sure the grid spans the whole song: starts at 0, ends at the duration.
    if not measure_grid or measure_grid[0] != 0:
        measure_grid.insert(0, 0)
    if measure_grid[-1] != duration_in_frames:
        measure_grid.append(duration_in_frames)

    return np.array(beat_grid), np.array(measure_grid)


def generate_and_align_labels(df, n_frames, measure_grid_frames):
    """Build per-measure binary chorus labels (1 for 'chorus', 0 for 'other').

    A frame-level label sequence is first filled from the rows of ``df`` whose
    ``label`` equals ``'chorus'``; a measure is then labelled 1 when at least
    1/4 of its frames are chorus frames.

    Args:
        df: DataFrame with ``label``, ``start_frame`` and ``end_frame`` columns.
        n_frames: Total number of frames in the song.
        measure_grid_frames: Measure boundaries in frames; measure ``i`` spans
            ``[grid[i], grid[i + 1])``.

    Returns:
        Integer numpy array with one label per measure
        (``len(measure_grid_frames) - 1`` entries, or empty when the grid has
        fewer than two boundaries).
    """
    # Frame-level labels. Frame bounds are cast to int because the columns and
    # the grid may arrive as floats, which numpy refuses as slice indices.
    binary_label_sequence = np.zeros(n_frames, dtype=int)
    chorus_rows = df[df['label'] == 'chorus']
    for start_frame, end_frame in zip(chorus_rows['start_frame'], chorus_rows['end_frame']):
        binary_label_sequence[int(start_frame):int(end_frame)] = 1

    n_measures = max(len(measure_grid_frames) - 1, 0)
    aligned_labels = np.zeros(n_measures, dtype=int)

    for i in range(n_measures):
        start_frame = int(measure_grid_frames[i])
        end_frame = int(measure_grid_frames[i + 1])
        measure_labels = binary_label_sequence[start_frame:end_frame]

        # A zero-length measure has no frames to vote; leave it as 'other'
        # instead of dividing by zero.
        if measure_labels.size == 0:
            continue

        # At least a quarter of the measure must be chorus.
        if measure_labels.mean() >= 0.25:
            aligned_labels[i] = 1

    return aligned_labels


def segment_data_measures(data, measure_grid_frames):
    """Slice ``data`` along its first axis into consecutive measure segments.

    Segment ``i`` is ``data[grid[i]:grid[i + 1]]``, so the result holds one
    entry per pair of adjacent boundaries in ``measure_grid_frames``.
    """
    boundaries = zip(measure_grid_frames[:-1], measure_grid_frames[1:])
    return [data[lower:upper] for lower, upper in boundaries]


def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to encode.
        d_model: Encoding width (columns).

    Returns:
        Array of shape ``(1, position, d_model)``: sine on even column
        indices, cosine on odd column indices.
    """
    row_index = np.arange(position)[:, np.newaxis]
    col_index = np.arange(d_model)[np.newaxis, :]

    # Each even/odd column pair shares one frequency: 10000 ** (2 * (j // 2) / d_model).
    exponents = (2 * (col_index // 2)) / np.float32(d_model)
    table = row_index / np.power(10000, exponents)

    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns

    # Prepend a length-1 leading axis.
    return table[np.newaxis, ...]


def apply_hybrid_positional_encoding(segments):
    """Add frame-level and measure-level sinusoidal encodings to every segment.

    Args:
        segments: Non-empty list of 2-D arrays (frames x features); the feature
            width is read from the first segment and assumed equal for all.

    Returns:
        List with one encoded array per input segment. Because
        ``positional_encoding`` returns a table with a leading axis of
        length 1 and it is added without squeezing, each returned array has
        that extra leading axis as well (1 x frames x features).
    """
    n_features = segments[0].shape[1]

    # One encoding row per measure, indexed by the measure's position in the song.
    measure_table = positional_encoding(len(segments), n_features)

    encoded_segments = []
    for measure_idx, segment in enumerate(segments):
        # Frame-level table sized to this particular measure.
        frame_table = positional_encoding(segment.shape[0], n_features)

        # Same measure vector is broadcast across all frames of the measure.
        measure_vector = measure_table[0, measure_idx, :]

        encoded_segments.append((segment + frame_table) + measure_vector)

    return encoded_segments

In [None]:
# Preprocessing cell: for every song in the labeled DataFrame, compute NMF
# activations, build a measure grid, align chorus labels to measures, add
# positional encodings, and pickle the features and labels per song.
# Load the DataFrame with labeled data
df = pd.read_csv('../data/dataframes/clean_labeled.csv')

# Output directories; both names are reused by the later cell that reloads the pickles.
segment_dir = "../data/pkl/activations"
labels_dir = "../data/pkl/activation_labels"
os.makedirs(segment_dir, exist_ok=True)
os.makedirs(labels_dir, exist_ok=True)

# Iterate through the DataFrame and prepare data for each song
for _, group in tqdm(df.groupby('SongID'), desc='Processing'):
    # Per-song metadata is read from the first row of the group.
    song_id = group['SongID'].values[0]
    audio_path = group['FilePath'].values[0]
    # sr=None is passed so librosa does not resample to its default rate.
    y, sr = librosa.load(audio_path, sr=None)

    # Scaled Mel-spectrogram -> NMF; only the activation matrix is kept.
    # n_components=32 is the per-frame feature width assumed by the dataset/model cells.
    spectrogram = extract_scaled_melspec(y, sr, n_fft=2048, hop_length=512, n_mels=128)
    _, activations = decompose_spectrogram_with_nmf(spectrogram, n_components=32)

    # Onset envelope from the dB-scaled CQT magnitude, then tempo and beat frames
    C = np.abs(librosa.cqt(y=y, sr=sr))
    onset_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
    tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
    # Create a measure grid
    # Tempo: prefer the 'sp_tempo' column; fall back to the librosa estimate when it is NaN or 0
    bpm = group['sp_tempo'].values[0] if not pd.isna(group['sp_tempo'].values[0]) else tempo
    if bpm == 0:
        bpm = tempo
    # Fold the tempo: halve values above 140, double values at or below 70
    if bpm > 140:
        bpm /= 2
    if bpm <= 70:
        bpm *= 2
    # Time signature: use 'sp_time_signature', defaulting to 4 when it is NaN or 0
    time_signature = group['sp_time_signature'].values[0] if not pd.isna(group['sp_time_signature'].values[0]) else 4
    time_signature = int(time_signature) if time_signature != 0 else 4
    
    # activations has one row per spectrogram frame (NMF is fitted on the transposed spectrogram)
    duration_in_frames = len(activations)
    # One beat in frames; time_to_frames is called without hop_length here
    # NOTE(review): presumably its default matches the hop_length=512 used above - confirm
    beat_interval_in_frames = int(librosa.time_to_frames(60/bpm, sr=sr))
    
    anchor_frame = find_anchor_frame(beats, bpm, sr)
    beat_grid, measure_grid = create_beat_grid(beats, anchor_frame, sr, beat_interval_in_frames, time_signature, duration_in_frames)

    # Align and aggregate labels at the measure level
    # (group must provide 'label', 'start_frame' and 'end_frame' columns)
    aligned_labels = generate_and_align_labels(group, duration_in_frames, measure_grid)

    # Segment features into measures
    segmented_features = segment_data_measures(activations, measure_grid)

    # Position encode
    encoded_features = apply_hybrid_positional_encoding(segmented_features)
    
    # Save the position-encoded measure segments as a pickle file
    with open(os.path.join(segment_dir, f"{song_id}_encoded.pkl"), "wb") as f:
        pickle.dump(encoded_features, f)

    # Save aligned_labels as a pickle file
    with open(os.path.join(labels_dir, f"{song_id}_labels.pkl"), "wb") as f:
        pickle.dump(aligned_labels, f)

In [8]:
def load_pickles_from_directory(directory):
    """Load every ``.pkl`` file in ``directory``, in sorted filename order.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        List with one unpickled object per ``.pkl`` file, ordered by filename.
    """
    all_data = []
    # os.listdir returns entries in arbitrary order. Features and labels are
    # loaded from two separate directories and paired by position, so the
    # order must be deterministic for both to line up.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.pkl'):
            continue
        file_path = os.path.join(directory, filename)
        with open(file_path, 'rb') as file:
            # NOTE: pickle.load can execute arbitrary code - only point this
            # at directories whose contents you produced yourself.
            all_data.append(pickle.load(file))
    return all_data

# Load all segments and labels
# segment_dir / labels_dir are the globals assigned in the preprocessing cell.
# NOTE(review): all_segments[i] and all_labels[i] are paired by position further
# down, which presumes both directories are read in the same song order - confirm.
all_segments = load_pickles_from_directory(segment_dir)
all_labels = load_pickles_from_directory(labels_dir)

In [9]:
def find_max_frames_and_measures(all_segments):
    """Return ``(max_frames_per_measure, max_measures)`` over all songs.

    Every measure entry is expected to wrap its frame data in one extra outer
    layer, so the frame count is taken from ``entry[0]``.
    """
    longest_measure_per_song = [
        max(len(entry[0]) for entry in song)
        for song in all_segments
    ]
    max_frames_per_measure = max(longest_measure_per_song)
    max_measures = max(map(len, all_segments))
    return max_frames_per_measure, max_measures
    
def pad_measures(all_segments, max_frames_per_measure):
    """Zero-pad every measure of every song to ``max_frames_per_measure`` frames.

    Each measure entry wraps its frame data in one extra outer layer; the
    wrapper is dropped here, so the returned measures are plain 2-D arrays
    (frames x features) padded at the end of the frame axis.
    """
    def _pad_to_length(entry):
        frames = entry[0]  # unwrap the actual measure data
        deficit = max_frames_per_measure - frames.shape[0]
        return np.pad(frames, ((0, deficit), (0, 0)), mode='constant', constant_values=(0))

    return [[_pad_to_length(entry) for entry in song] for song in all_segments]

def pad_songs(padded_measures, max_measures, max_frames_per_measure, n_features=32):
    """Append all-zero measures so every song has ``max_measures`` measures.

    Songs that already have at least ``max_measures`` measures are passed
    through as the same list object; shorter songs get a new list with zero
    measures of shape ``(max_frames_per_measure, n_features)`` appended.
    """
    songs_out = []
    for song in padded_measures:
        shortfall = max_measures - len(song)
        if shortfall <= 0:
            songs_out.append(song)
            continue
        filler = [np.zeros((max_frames_per_measure, n_features)) for _ in range(shortfall)]
        songs_out.append(song + filler)
    return songs_out

def pad_labels(all_labels, max_measures):
    """Right-pad each song's label sequence with -1 up to ``max_measures`` entries.

    Labels are converted to numpy arrays first; -1 marks padded positions.
    """
    return [
        np.pad(
            np.asarray(labels),
            (0, max_measures - len(labels)),
            mode='constant',
            constant_values=(-1),
        )
        for labels in all_labels
    ]


# Padding driver: bring every song to a common (max_measures, max_frames_per_measure)
# layout so the songs can be stacked into fixed-size batches.
# Find the maximum frames per measure and maximum measures across all songs
max_frames_per_measure, max_measures = find_max_frames_and_measures(all_segments)

# First, pad measures within each song to have the same number of frames
padded_measures = pad_measures(all_segments, max_frames_per_measure)

# Then, pad all songs to have the same number of measures
# (n_features is left at pad_songs' default of 32)
padded_songs = pad_songs(padded_measures, max_measures, max_frames_per_measure)

# Pad labels to match the structure of padded_segments (-1 marks padded measures)
padded_labels = pad_labels(all_labels, max_measures)

In [10]:
# Song-level split: indices are split rather than frames or measures, so every
# song lands in exactly one of train / validation / test.
# Define the number of songs
num_songs = len(padded_songs)

# Create indices for the songs
indices = np.arange(num_songs)

# Split indices into training and temporary (validation + test) sets
# (70% train, 30% held out; fixed random_state for a repeatable split)
train_indices, temp_indices = train_test_split(indices, test_size=0.3, random_state=42)

# Further split the temporary set into validation and test sets
# (half of the held-out 30% each, i.e. roughly 15% / 15% of all songs)
val_indices, test_indices = train_test_split(temp_indices, test_size=0.5, random_state=42)

# Generate the actual train, validation, and test sets using the indices
# NOTE(review): assumes padded_songs[i] and padded_labels[i] belong to the same song - confirm
X_train = [padded_songs[i] for i in train_indices]
y_train = [padded_labels[i] for i in train_indices]

X_val = [padded_songs[i] for i in val_indices]
y_val = [padded_labels[i] for i in val_indices]

X_test = [padded_songs[i] for i in test_indices]
y_test = [padded_labels[i] for i in test_indices]

# Define the batch size as the number of songs per batch
batch_size = 32

def data_generator(X, y):
    """Yield one ``(features, labels)`` pair per song.

    Args:
        X: Iterable of songs, each a sequence of equally shaped per-measure arrays.
        y: Iterable of per-song label sequences (one label per measure).

    Yields:
        Tuple of the song's measures stacked into a single numpy array and
        the labels with a trailing axis of length 1 appended.
    """
    for song_features, song_labels in zip(X, y):
        stacked_features = np.array(song_features)
        column_labels = np.expand_dims(song_labels, axis=-1)
        yield stacked_features, column_labels

def _make_song_dataset(X, y):
    """Wrap one split in a batched ``tf.data.Dataset`` fed by ``data_generator``.

    Each element is a whole song: features of shape (measures, frames, 32)
    and labels of shape (measures, 1), both declared as float32. Songs are
    grouped into batches of ``batch_size``.
    """
    return tf.data.Dataset.from_generator(
        lambda: data_generator(X, y),
        output_signature=(
            tf.TensorSpec(shape=(None, None, 32), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
        ),
    ).batch(batch_size)


# Build each split exactly once (the training dataset used to be constructed
# twice by two identical copy-pasted statements).
train_dataset = _make_song_dataset(X_train, y_train)
val_dataset = _make_song_dataset(X_val, y_val)
test_dataset = _make_song_dataset(X_test, y_test)

In [18]:
def custom_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy that ignores positions labelled -1 (padding).

    Args:
        y_true: Target tensor holding 0/1 labels, with -1 marking padding.
        y_pred: Predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor: the mean cross-entropy over non-padded positions, or 0
        when there are no non-padded positions at all.
    """
    # Cast so the targets match the float predictions.
    y_true = tf.cast(y_true, tf.float32)

    # 1.0 where the label is real, 0.0 where it is padding.
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)

    # Element-wise loss, with padded positions zeroed out afterwards.
    bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    masked_loss = bce * mask

    # divide_no_nan returns 0 instead of NaN when the mask is empty, so a
    # fully padded batch cannot poison the weights with NaN gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(masked_loss), tf.reduce_sum(mask))


def custom_accuracy(y_true, y_pred):
    """Accuracy over non-padded positions (labels equal to -1 are ignored).

    Args:
        y_true: Target tensor holding 0/1 labels, with -1 marking padding.
        y_pred: Predicted probabilities; rounded to 0/1 before comparison.

    Returns:
        Scalar tensor: fraction of non-padded positions predicted correctly,
        or 0 when there are no non-padded positions.
    """
    # Cast like the loss does, so integer labels can be compared with the
    # float predictions (tf.equal requires matching dtypes).
    y_true = tf.cast(y_true, tf.float32)

    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)  # 1.0 for real labels
    rounded_predictions = tf.cast(tf.round(y_pred), tf.float32)
    hits = tf.cast(tf.equal(rounded_predictions, y_true), tf.float32) * mask

    # divide_no_nan avoids NaN when every position is padding.
    return tf.math.divide_no_nan(tf.reduce_sum(hits), tf.reduce_sum(mask))


def create_crnn_model(max_frames_per_measure, max_measures, feature_per_frame):
    """Build and compile the CRNN: a per-measure Conv1D encoder followed by a BiLSTM.

    Args:
        max_frames_per_measure: Number of frames in each (padded) measure.
        max_measures: Number of measures in each (padded) song.
        feature_per_frame: Feature width of a single frame.

    Returns:
        Compiled Keras model mapping an input of shape
        (max_measures, max_frames_per_measure, feature_per_frame) to one
        sigmoid output per measure, trained with ``custom_binary_crossentropy``
        and reporting ``custom_accuracy``.
    """
    layers = tf.keras.layers

    # Frame-level encoder: three Conv1D + MaxPooling1D stages, then flatten.
    frame_input = layers.Input(shape=(max_frames_per_measure, feature_per_frame))
    encoded = frame_input
    for n_filters in (128, 256, 256):
        encoded = layers.Conv1D(filters=n_filters, kernel_size=3, activation='relu', padding='same')(encoded)
        encoded = layers.MaxPooling1D(pool_size=2, padding='same')(encoded)
    frame_feature_model = Model(inputs=frame_input, outputs=layers.Flatten()(encoded))

    # Measure-level model: run the encoder on every measure, then a BiLSTM over measures.
    measure_input = layers.Input(shape=(max_measures, max_frames_per_measure, feature_per_frame))
    per_measure_features = layers.TimeDistributed(frame_feature_model)(measure_input)

    # NOTE(review): the mask is derived from the encoder OUTPUT, not the raw
    # input. A zero-padded measure presumably stops producing an all-zero
    # feature vector once the conv biases become non-zero, so this mask may
    # not fire after training starts - confirm, and consider masking on the input.
    masked_features = layers.Masking(mask_value=0.0)(per_measure_features)
    sequence_features = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(masked_features)

    # One sigmoid unit per measure.
    output = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'))(sequence_features)
    model = Model(inputs=measure_input, outputs=output)

    model.compile(optimizer='adam', loss=custom_binary_crossentropy, metrics=[custom_accuracy])
    return model

# Per-frame feature width; 32 matches the n_components=32 used for NMF in the
# preprocessing cell and the last dimension declared in the dataset TensorSpecs.
feature_per_frame = 32
# max_frames_per_measure and max_measures come from the padding cell.
model = create_crnn_model(max_frames_per_measure, max_measures, feature_per_frame)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 204, 320, 32)]    0         
                                                                 
 time_distributed_1 (TimeDis  (None, 204, 10240)       307840    
 tributed)                                                       
                                                                 
 masking_1 (Masking)         (None, 204, 10240)        0         
                                                                 
 bidirectional (Bidirectiona  (None, 204, 512)         21497856  
 l)                                                              
                                                                 
 time_distributed_2 (TimeDis  (None, 204, 1)           513       
 tributed)                                                       
                                                           

In [19]:
# Training cell: set up output directories and callbacks, then fit the model.
# Define the directories for checkpoints and models
checkpoint_dir = os.path.join('..', 'checkpoints', 'CRNN')
model_dir = os.path.join('..', 'models', 'CRNN')

# Ensure the checkpoint and model directories exist
# (checkpoint_dir is created here but nothing in this cell writes to it)
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Define the checkpoint path for the best model
best_model_filepath = os.path.join(model_dir, 'spectrogram_best_model.h5')

# NOTE(review): the checkpoint tracks validation accuracy while early stopping and
# the LR schedule track validation loss, so the saved "best" file and the weights
# restored by EarlyStopping can come from different epochs - confirm this is intended.
callbacks = [
    # Save the full model whenever validation accuracy reaches a new maximum
    ModelCheckpoint(
        filepath=best_model_filepath,
        save_weights_only=False,
        monitor='val_custom_accuracy',  # Use 'val_custom_accuracy' for validation custom accuracy
        mode='max',
        save_best_only=True,
        verbose=1
    ),
    # Stop after 3 epochs without val_loss improvement and restore the best weights
    EarlyStopping(
        monitor='val_loss',  
        patience=3,
        verbose=1,
        restore_best_weights=True
    ),
    # Cut the learning rate by 10x after 2 stagnant epochs, never below 1e-5
    ReduceLROnPlateau(
        monitor='val_loss',  
        factor=0.1,
        patience=2,
        verbose=1,
        min_delta=0.0001,
        min_lr=0.00001
    )
]

# Train the model with the simplified callbacks list
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=callbacks
)

Epoch 1/10
      8/Unknown - 67s 7s/step - loss: 1.6194 - custom_accuracy: 0.5181
Epoch 1: val_custom_accuracy improved from -inf to 0.41894, saving model to ..\models\CRNN\spectrogram_best_model.h5
Epoch 2/10
Epoch 2: val_custom_accuracy did not improve from 0.41894
Epoch 3/10
Epoch 3: val_custom_accuracy did not improve from 0.41894
Epoch 4/10
Epoch 4: val_custom_accuracy improved from 0.41894 to 0.59171, saving model to ..\models\CRNN\spectrogram_best_model.h5
Epoch 5/10
Epoch 5: val_custom_accuracy did not improve from 0.59171
Epoch 6/10
Epoch 6: val_custom_accuracy did not improve from 0.59171
Epoch 7/10
Epoch 7: val_custom_accuracy did not improve from 0.59171
Epoch 8/10
Epoch 8: val_custom_accuracy did not improve from 0.59171
Epoch 9/10
Epoch 9: val_custom_accuracy did not improve from 0.59171
Epoch 10/10
Epoch 10: val_custom_accuracy did not improve from 0.59171


In [47]:
# NOTE(review): neither dtw_change_detection nor features is defined in the visible
# part of this notebook, and the recorded output of this cell is a ValueError.
# Presumably leftover scratch work - confirm, then fix or delete the cell.
change_points = dtw_change_detection(features)
print("Change points detected at:", change_points) 

ValueError: Input vector should be 1-D.

In [44]:
# Scratch cell: rebuild the measure grid for a single song (SongID 2).
# NOTE(review): this reassigns the global df (and y, sr, tempo, ...) used by the
# preprocessing cell, so results depend on execution order.
audio_path = '../data/audio_files/processed/2.mp3'
df = pd.read_csv('../data/dataframes/clean_labeled.csv')
data = df.loc[df['SongID']==2]
sp_tempo = data['sp_tempo'].values[0]
# sr=None is passed so librosa does not resample to its default rate.
y, sr = librosa.load(audio_path, sr=None)
def extract_mel_spectrogram(y, sr, n_fft=2048, hop_length=512, n_mels=128):
    """Return the Mel-spectrogram librosa computes for the audio signal ``y``.

    No dB conversion or scaling is applied; the framing parameters are
    forwarded to ``librosa.feature.melspectrogram`` unchanged.
    """
    framing = dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    return librosa.feature.melspectrogram(y=y, sr=sr, **framing)
spectrogram = extract_mel_spectrogram(y, sr)
# Song length in seconds, derived from the number of spectrogram frames
duration = librosa.frames_to_time(len(spectrogram.T), sr=sr, hop_length=512)
# Onset envelope from the dB-scaled CQT magnitude
C = np.abs(librosa.cqt(y=y, sr=sr))
onset_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
# onset_frames is computed here but not used in the rest of this cell
onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, backtrack=True, units='frames')
tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
# Fall back to the librosa estimate when the stored tempo is 0; halve tempos above 150
# NOTE(review): the preprocessing cell folds at 140 and also doubles slow tempos - thresholds differ
if sp_tempo == 0:
    sp_tempo = tempo
if sp_tempo > 150:
    sp_tempo /= 2
# NOTE(review): round_tempo and create_measure_grid are not defined in the visible
# part of this notebook; this cell presumably relies on since-removed definitions - confirm.
tempo = round_tempo(sp_tempo)
# Time signature: default to 4 when the stored value is NaN or 0
time_signature = data['sp_time_signature'].values[0] if not pd.isna(data['sp_time_signature'].values[0]) else 4
time_signature = int(time_signature) if time_signature != 0 else 4
measure_grid = create_measure_grid(onset_env, sr, tempo, time_signature, duration)
measure_grid

array([    0,     3,   183,   363,   543,   723,   903,  1083,  1263,
        1443,  1623,  1803,  1983,  2163,  2343,  2523,  2703,  2883,
        3063,  3243,  3423,  3603,  3783,  3963,  4143,  4323,  4503,
        4683,  4863,  5043,  5223,  5403,  5583,  5763,  5943,  6123,
        6303,  6483,  6663,  6843,  7023,  7203,  7383,  7563,  7743,
        7923,  8103,  8283,  8463,  8643,  8823,  9003,  9183,  9363,
        9543,  9723,  9903, 10083, 10263, 10443, 10623, 10803, 10983,
       11163, 11343, 11523, 11703, 11883, 12063, 12243, 12423, 12603,
       12783, 12963, 13143, 13323, 13503, 13683, 13863, 14043, 14223,
       14403, 14583, 14763, 14943, 15123, 15303, 15483, 15663, 15843,
       16023, 16203, 16383, 16563, 16743, 16923, 17103, 17283, 17463,
       17643, 17823, 18003, 18183, 18363, 18543, 18723, 18903, 19083,
       19263, 19443, 19623, 19803, 19983, 20163, 20343, 20523, 20703,
       20883, 21063, 21243, 21423, 21603, 21783, 21963, 22143, 22323,
       22503, 22683,

In [14]:
# Scratch cell: another measure-grid experiment for SongID 2, this time with an anchor frame.
# NOTE(review): this reassigns the global df (and y, sr, tempo, ...) used by the
# preprocessing cell, so results depend on execution order.
audio_path = '../data/audio_files/processed/2.mp3'
df = pd.read_csv('../data/dataframes/clean_labeled.csv')
data = df.loc[df['SongID']==2]
sp_tempo = data['sp_tempo'].values[0]
# sr=None is passed so librosa does not resample to its default rate.
y, sr = librosa.load(audio_path, sr=None)
duration = librosa.get_duration(y=y, sr=sr, hop_length=512)
# Onset envelope from the dB-scaled CQT magnitude
C = np.abs(librosa.cqt(y=y, sr=sr))
onset_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
# Fall back to the librosa estimate when the stored tempo is 0; halve tempos above 150
if sp_tempo == 0:
    sp_tempo = tempo
if sp_tempo > 150:
    sp_tempo /= 2
# NOTE(review): round_tempo and create_measure_grid are not defined in the visible
# part of this notebook - confirm where they come from.
tempo = round_tempo(sp_tempo)
# Time signature: default to 4 when the stored value is NaN or 0
time_signature = data['sp_time_signature'].values[0] if not pd.isna(data['sp_time_signature'].values[0]) else 4
time_signature = int(time_signature) if time_signature != 0 else 4
# NOTE(review): find_anchor_frame is called with five arguments here, but the
# definition visible earlier in this notebook takes (beats, tempo, sr). This call
# presumably targets an older version of the function and would fail on a fresh run.
anchor_frame = find_anchor_frame(onset_env, sr, tempo, time_signature, duration)
measure_grid = create_measure_grid(sr, tempo, time_signature, duration, anchor_frame)
measure_grid

array([    0.,   180.,   360.,   540.,   720.,   900.,  1080.,  1260.,
        1440.,  1620.,  1800.,  1980.,  2160.,  2340.,  2520.,  2700.,
        2880.,  3060.,  3240.,  3420.,  3600.,  3780.,  3960.,  4140.,
        4320.,  4500.,  4680.,  4860.,  5040.,  5220.,  5400.,  5580.,
        5760.,  5940.,  6120.,  6300.,  6480.,  6660.,  6840.,  7020.,
        7200.,  7380.,  7560.,  7740.,  7920.,  8100.,  8280.,  8460.,
        8640.,  8820.,  9000.,  9180.,  9360.,  9540.,  9720.,  9900.,
       10080., 10260., 10440., 10620., 10800., 10980., 11160., 11340.,
       11520., 11700., 11880., 12060., 12240., 12420., 12600., 12780.,
       12960., 13140., 13320., 13500., 13680., 13860., 14040., 14220.,
       14400., 14580., 14760., 14940., 15120., 15300., 15480., 15660.,
       15840., 16020., 16200., 16380., 16560., 16740., 16920., 17100.,
       17280., 17460., 17640., 17820., 18000., 18180., 18360., 18540.,
       18720., 18900., 19080., 19260., 19440., 19620., 19800., 19980.,
      