In [1]:
# Required libraries
import os
import pandas as pd
import librosa
import numpy as np
import ruptures as rpt
from tqdm import tqdm
from scipy.signal import argrelextrema

In [2]:
# Function to create a tempogram from an audio signal
def compute_tempogram(onset_env, sr, hop_length):
    # Compute the tempogram
    tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
    return tempogram

# Function to create a chromagram from a harmonic audio signal
def compute_chromagram(y_harmonic, sr, hop_length):
    # Compute the chromagram
    chromagram = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr, hop_length=hop_length, bins_per_octave=24)
    return chromagram

# Function to detect key from a chromagram using Krumhansl-Schmuckler key-finding algorithm profiles
def detect_key_from_chromagram(chromagram, sr):
    pitches = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

    # Calculate the sum of each pitch class across all time frames
    chroma_vals = np.sum(chromagram, axis=1)

    # Krumhansl-Schmuckler key-finding algorithm profiles
    maj_profile = [6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88]
    min_profile = [6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17]

    # Correlation for major and minor keys
    maj_key_corrs = [np.corrcoef(maj_profile, np.roll(chroma_vals, i))[1, 0] for i in range(12)]
    min_key_corrs = [np.corrcoef(min_profile, np.roll(chroma_vals, i))[1, 0] for i in range(12)]

    # Combine correlations and keys
    key_corrs = maj_key_corrs + min_key_corrs
    keys = [p + ' major' for p in pitches] + [p + ' minor' for p in pitches]

    # Determine the best key
    best_idx = np.argmax(key_corrs)
    best_key = keys[best_idx]
    best_corr = key_corrs[best_idx]

    return best_key, best_corr

# Function to convert standard key into Camelot key notation
def get_camelot(key):
    # Mapping from musical key to Camelot code
    camelot_major = {
        'B': '1B', 'F#': '2B', 'C#': '3B', 'G#': '4B', 'D#': '5B',
        'A#': '6B', 'F': '7B', 'C': '8B', 'G': '9B', 'D': '10B', 'A': '11B', 'E': '12B'
    }

    camelot_minor = {
        'G#': '1A', 'D#': '2A', 'A#': '3A', 'F': '4A', 'C': '5A',
        'G': '6A', 'D': '7A', 'A': '8A', 'E': '9A', 'B': '10A', 'F#': '11A', 'C#': '12A'
    }

    # Split the detected key into pitch and mode
    pitch, mode = key.split(' ')

    # Return the corresponding Camelot code
    if mode == 'major':
        return camelot_major[pitch]
    elif mode == 'minor':
        return camelot_minor[pitch]
    else:
        raise ValueError("Invalid mode in key: should be 'major' or 'minor'.")

# Function to calculate the optimal number of segments using the elbow method
def segment_waveform_optimal_k(algo, sr, n_bkps_max=6):
    # Function to calculate the sum of costs for a given number of breakpoints
    def get_sum_of_cost(n_bkps):
        bkps = algo.predict(n_bkps=n_bkps)
        cost = algo.cost.sum_of_costs(bkps)
        # Explicitly delete the breakpoints to free up memory
        del bkps
        return cost

    # Use a generator expression for memory efficiency if only iterating once
    costs = (get_sum_of_cost(n_bkps) for n_bkps in range(1, n_bkps_max + 1))
    costs_list = list(costs)  # Convert to list if needed more than once

    # Calculate curvatures with minimal intermediate variables
    curvatures = np.abs(np.diff(costs_list, 2))
    curvatures = np.insert(curvatures, [0, len(curvatures)], 0)

    # Identify the optimal number of breakpoints based on curvature
    optimal_idx = argrelextrema(curvatures, np.greater)[0][0]
    n_bkps_optimal = optimal_idx + 1  # Account for the initial 0 insertion

    # Predict and return the optimal change points
    bkps_optimal_times = algo.predict(n_bkps=n_bkps_optimal)
    times = librosa.frames_to_time(bkps_optimal_times[:-1], sr=sr, hop_length=512) # Exclude the last breakpoint (end of signal)
    return times

# Function to segment tempogram or chromagram with a fixed number of segments
def segment_waveform_fixed_k(algo, sr, n_bkps=3):
    bkps = algo.predict(n_bkps=n_bkps)
    times = librosa.frames_to_time(bkps[:-1], sr=sr, hop_length=512) # Exclude the last breakpoint (end of signal)
    return times

# Function to combine segments and filter based on criteria
def combine_and_filter_segments(tempo_times, chroma_times):
    combined_segments = set()

    # Add all tempogram segments
    for seg in tempo_times:
        combined_segments.add(seg)

    # Add chromagram segments if they are more than 3 seconds away from any tempogram segment
    for c_seg in chroma_times:
        if all(abs(c_seg - t_seg) > 3 for t_seg in tempo_times):
            combined_segments.add(c_seg)

    # Filter segments that are at least 10 seconds apart
    combined_segments = sorted(list(combined_segments))
    filtered_segments = [combined_segments[0]]
    for seg in combined_segments[1:]:
        if seg - filtered_segments[-1] >= 10:
            filtered_segments.append(seg)

    return filtered_segments

In [6]:
reference_df = pd.read_csv('../data/dataframes/spotify_metadata.csv')
reference_df.columns
column_names = ['track_name', 'artist_names', 'duration', 'ks_key', 
                'key_corr', 'camelot_key', 'times_tempogram_kfixed',
                'times_chroma_kfixed', 'numseg_combined_kfixed', 'times_combined_kfixed'] 
segments_df = pd.DataFrame(columns=column_names)

In [3]:
# Main processing loop

# Load the DataFrame from CSV
reference_df = pd.read_csv('../data/dataframes/spotify_metadata.csv')
audio_files_dir = '../data/audio_files/processed_download'
hop_length = 512
new_columns = ['track_name', 'artist_names',
    'duration', 'ks_key', 'key_corr','overall_tempo', 'camelot_key', 'times_tempogram_kfixed',
    'times_chroma_kfixed', 'numseg_combined_kfixed', 'times_combined_kfixed'
] 

# Iterate over the audio files
for index, row in tqdm(reference_df.iterrows(), total=reference_df.shape[0]):
    file_path = os.path.join(audio_files_dir, row['filename'])
    if os.path.isfile(file_path):
        # Load the audio file
        signal, sr = librosa.load(file_path, sr=None)
        duration = librosa.get_duration(y=signal, sr=sr)

        # Separate the harmonic and percussive components
        y_harmonic, y_percussive = librosa.effects.hpss(signal)

        # Compute onset envelope
        onset_env = librosa.onset.onset_strength(y=y_percussive, sr=sr, hop_length=hop_length)

        # Compute the tempogram and chromagram
        tempogram = compute_tempogram(onset_env, sr, hop_length)
        chromagram = compute_chromagram(y_harmonic, sr, hop_length)

        # Compute the overall key
        key, key_corr = detect_key_from_chromagram(chromagram, sr)
        camelot = get_camelot(key)

        # Compute the overall tempo
        tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr)[0]

        # Segment using tempogram with optimal k and k=3
        algo_tempogram = rpt.KernelCPD(kernel="linear").fit(tempogram.T)
        k3_segments_tempo = segment_waveform_fixed_k(algo_tempogram, sr=sr, n_bkps=3)


        # Segment using chromagram with k=3
        algo_chroma = rpt.KernelCPD(kernel="linear").fit(chromagram.T)
        k3_segments_chroma = segment_waveform_fixed_k(algo_chroma, sr=sr, n_bkps=3)

        # Combine tempogram and chromagram segments for optimal k
        #combined_segments_kopt = combine_and_filter_segments(optimal_k_segments_tempogram, optimal_k_segments_chroma)
        # Combine tempogram and chromagram segments for fixed k
        combined_segments_kfixed = combine_and_filter_segments(k3_segments_tempo, k3_segments_chroma)

        # Store the results in the DataFrame
        reference_df.at[index, 'duration'] = duration
        reference_df.at[index, 'ks_key'] = str(key)
        reference_df.at[index, 'key_corr'] = float(key_corr)
        reference_df.at[index, 'camelot_key'] = str(camelot)
        reference_df.at[index, 'overall_tempo'] = float(tempo)
        #reference_df.at[index, 'numseg_tempogram_kopt'] = len(optimal_k_segments_tempogram)
        #reference_df.at[index, 'times_tempogram_kopt'] = ','.join(map(str, optimal_k_segments_tempogram))
        reference_df.at[index, 'times_tempogram_kfixed'] = ','.join(map(str, k3_segments_tempo))
        #reference_df.at[index, 'numseg_chroma_kopt'] = len(optimal_k_segments_chroma)
        #reference_df.at[index, 'times_chroma_kopt'] = ','.join(map(str, optimal_k_segments_chroma))
        reference_df.at[index, 'times_chroma_kfixed'] = ','.join(map(str, k3_segments_chroma))
        #reference_df.at[index, 'numseg_combined_kopt'] = len(combined_segments_kopt)
        #reference_df.at[index, 'times_combined_kopt'] = ','.join(map(str, combined_segments_kopt))
        reference_df.at[index, 'numseg_combined_kfixed'] = len(combined_segments_kfixed)
        reference_df.at[index, 'times_combined_kfixed'] = ','.join(map(str, combined_segments_kfixed))
        #print(reference_df.loc[index, new_columns])

# Save the updated DataFrame
reference_df.to_csv(r'../data/dataframes/segment_df.csv', index=False)

print("Segmentation complete. Combined segments have been added to the DataFrame and saved to 'segment_df.csv'.")

100%|██████████| 353/353 [6:44:40<00:00, 68.78s/it]   

Segmentation complete. Combined segments have been added to the DataFrame and saved to 'segment_df.csv'.





In [14]:
segment_df = reference_df.copy()
prefix_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
               'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
               'time_signature']

segment_df.rename(columns=lambda x: 'sp_' + x if x in prefix_cols else x, inplace=True)

# Splitting and converting 'times_tempogram_kfixed' column to numeric
tempo_split = segment_df['times_tempogram_kfixed'].str.split(',', expand=True)
segment_df['tempo_time_1'] = pd.to_numeric(tempo_split[0], errors='coerce')
segment_df['tempo_time_2'] = pd.to_numeric(tempo_split[1], errors='coerce')
segment_df['tempo_time_3'] = pd.to_numeric(tempo_split[2], errors='coerce')

# Splitting and converting 'times_chroma_kfixed' column to numeric
chroma_split = segment_df['times_chroma_kfixed'].str.split(',', expand=True)
segment_df['chroma_time_1'] = pd.to_numeric(chroma_split[0], errors='coerce')
segment_df['chroma_time_2'] = pd.to_numeric(chroma_split[1], errors='coerce')
segment_df['chroma_time_3'] = pd.to_numeric(chroma_split[2], errors='coerce')

# Dropping the specified columns
segment_df.drop(['times_tempogram_kfixed', 'times_chroma_kfixed',
                 'numseg_combined_kfixed', 'times_combined_kfixed'], axis=1, inplace=True)

segment_df.head()

Unnamed: 0,sp_danceability,sp_energy,sp_key,sp_loudness,sp_mode,sp_speechiness,sp_acousticness,sp_instrumentalness,sp_liveness,sp_valence,...,ks_key,key_corr,camelot_key,overall_tempo,tempo_time_1,tempo_time_2,tempo_time_3,chroma_time_1,chroma_time_2,chroma_time_3
0,0.784,0.521,1,-5.701,1,0.0322,0.062,5e-06,0.0995,0.817,...,B major,0.600193,1B,102.272727,3.658667,78.229333,127.456,104.576,116.064,163.125333
1,0.905,0.838,6,-6.838,1,0.0499,0.00112,0.839,0.608,0.464,...,G minor,0.532695,6A,125.0,60.757333,120.885333,165.92,31.082667,60.821333,227.210667
2,0.897,0.692,11,-4.985,0,0.0492,0.0187,0.725,0.0603,0.607,...,F# minor,0.489705,11A,125.0,43.861333,138.016,154.101333,44.053333,139.925333,183.072
3,0.724,0.792,1,-3.332,0,0.102,0.0114,8e-06,0.625,0.559,...,B minor,0.869059,10A,110.294118,19.029333,53.066667,141.930667,19.573333,123.306667,141.717333
4,0.71,0.729,2,-4.978,1,0.0539,0.0743,4e-06,0.0419,0.532,...,A# major,0.576438,6B,125.0,29.066667,236.256,292.362667,29.610667,230.293333,260.234667


In [15]:
segment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sp_danceability      353 non-null    float64
 1   sp_energy            353 non-null    float64
 2   sp_key               353 non-null    int64  
 3   sp_loudness          353 non-null    float64
 4   sp_mode              353 non-null    int64  
 5   sp_speechiness       353 non-null    float64
 6   sp_acousticness      353 non-null    float64
 7   sp_instrumentalness  353 non-null    float64
 8   sp_liveness          353 non-null    float64
 9   sp_valence           353 non-null    float64
 10  sp_tempo             353 non-null    float64
 11  sp_time_signature    353 non-null    int64  
 12  track_id             353 non-null    object 
 13  artist_ids           353 non-null    object 
 14  genre_list           353 non-null    object 
 15  track_name           353 non-null    obj

In [16]:
segment_df.to_csv(r'../data/dataframes/segment_df_cleaned.csv', index=False)