In [1]:
import numpy as np
import librosa
from sklearn.cluster import HDBSCAN
import umap
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean



### Check this paper

The track-segmentation approach in this notebook is based on the following paper:

"Deep Learning-based Audio Representations for the Analysis and Visualisation of Electronic Dance Music DJ Mixes" 

https://qmro.qmul.ac.uk/xmlui/handle/123456789/104084

In [None]:
def _find_cluster_transitions(cluster_labels, noise_label=-1):
    """Return the indices at which the label switches from one real cluster to another.

    Entries equal to ``noise_label`` are skipped: they neither open nor close
    a segment, so a run of noise between two occurrences of the same cluster
    does not produce a transition.
    """
    transitions = []
    current_cluster = noise_label
    for index, label in enumerate(cluster_labels):
        if label == noise_label:
            continue
        if current_cluster != noise_label and label != current_cluster:
            transitions.append(index)
        # Adopting the label here also covers a sequence that *starts* with
        # noise: the first real cluster becomes the reference instead of the
        # reference staying at noise forever (which suppressed every boundary).
        current_cluster = label
    return transitions


def _enforce_min_spacing(candidates, min_gap):
    """Greedily keep candidates lying at least ``min_gap`` after the last kept one (or after 0)."""
    kept = []
    previous = 0
    for candidate in candidates:
        if candidate - previous >= min_gap:
            kept.append(candidate)
            previous = candidate
    return kept


def _build_track(number, start, end):
    """Build the per-track record (raw seconds plus HH:MM:SS strings)."""
    return {
        'track_number': number,
        'start_time_seconds': start,
        'end_time_seconds': end,
        'duration_seconds': end - start,
        'start_time_formatted': format_time(start),
        'end_time_formatted': format_time(end)
    }


def analyze_dj_mix_improved(audio_path, min_track_duration_seconds=120, random_state=42):
    """
    Improved DJ mix analysis with constraints on track duration

    Pipeline: load audio -> MFCCs (first coefficient dropped) -> average the
    frames of each whole second -> 2-D UMAP embedding -> HDBSCAN clustering ->
    cluster changes become candidate boundaries -> candidates closer than
    ``min_track_duration_seconds`` to the previously kept boundary are dropped.

    Parameters:
    audio_path (str): Path to the DJ mix audio file
    min_track_duration_seconds (int): Minimum spacing, in seconds, between the
        start of the mix / the previous kept boundary and the next boundary.
        The final track is not checked against this minimum.
    random_state (int or None): Seed forwarded to ``umap.UMAP``. The default
        makes the embedding, and therefore the detected boundaries, repeatable
        between runs; pass ``None`` for UMAP's unseeded behaviour.

    Returns:
    dict: ``{'num_tracks': int, 'tracks': list}`` where each track is a dict
        with start/end times in seconds, the duration, and formatted times.

    Raises:
    ValueError: If the audio contains less than one full second of samples.
    """
    y, sr = librosa.load(audio_path)

    hop_length = sr // 10  # 100ms hop length as in the paper
    mfccs = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=20)
    print("Audio is split into frames of shape:", mfccs.shape)
    # remove the first MFCC coefficient as it is usually not used.
    mfccs = mfccs[1:, :]
    print("After removing the first MFCC coefficient, shape is:", mfccs.shape)

    # Time-average MFCCs to get one feature vector per whole second of audio.
    frames_per_second = sr // hop_length
    num_seconds = len(y) // sr
    num_frames = mfccs.shape[1]

    print(f"Number of seconds: {num_seconds}")
    per_second = []
    for second in range(num_seconds):
        start_frame = second * frames_per_second
        if start_frame >= num_frames:
            continue
        end_frame = min(start_frame + frames_per_second, num_frames)
        per_second.append(np.mean(mfccs[:, start_frame:end_frame], axis=1))

    if not per_second:
        raise ValueError(
            f"Audio at {audio_path!r} is shorter than one second; nothing to analyze"
        )
    features = np.array(per_second)

    # Dimensionality reduction with UMAP (used in the paper). Seeded so that
    # re-running the notebook yields the same embedding.
    reducer = umap.UMAP(n_components=2, random_state=random_state)
    embedding = reducer.fit_transform(features)

    # Adjust HDBSCAN parameters for more reasonable clustering
    clusterer = HDBSCAN(min_cluster_size=30, min_samples=5)
    cluster_labels = clusterer.fit_predict(embedding)

    # One label per second, so a transition index is also a time in seconds.
    potential_boundaries = _find_cluster_transitions(cluster_labels)
    track_boundaries = _enforce_min_spacing(
        potential_boundaries, min_track_duration_seconds
    )

    # Consecutive edges delimit the tracks; the last edge is the end of the mix.
    total_seconds = len(cluster_labels)
    edges = [0] + track_boundaries + [total_seconds]
    tracks = [
        _build_track(number, start, end)
        for number, (start, end) in enumerate(zip(edges[:-1], edges[1:]), start=1)
    ]

    return {
        'num_tracks': len(tracks),
        'tracks': tracks
    }

def format_time(seconds):
    """Convert a duration in seconds to a zero-padded HH:MM:SS string.

    Parameters:
    seconds (int or float): Duration in seconds. Fractional values are
        truncated to whole seconds, because the ``02d`` format code used
        below rejects floats.

    Returns:
    str: The duration as ``HH:MM:SS``. Hours are not wrapped at 24, so long
        durations yield e.g. ``100:00:00``.
    """
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

# Example of printing the track list
def print_track_list(results):
    """Print the analysis result as a fixed-width table, one row per track.

    Parameters:
    results (dict): Mapping with a 'num_tracks' entry and a 'tracks' list whose
        items provide 'track_number', 'start_time_formatted',
        'end_time_formatted' and 'duration_seconds'.
    """
    print(f"DJ Mix contains {results['num_tracks']} tracks:\n")
    print(f"{'Track':^6} | {'Start Time':^10} | {'End Time':^10} | {'Duration':^10}")
    print("-" * 45)

    for entry in results['tracks']:
        number = entry['track_number']
        starts_at = entry['start_time_formatted']
        ends_at = entry['end_time_formatted']
        # Durations are stored as raw seconds, so they are formatted here.
        length = format_time(entry['duration_seconds'])
        print(f"{number:^6} | {starts_at:^10} | {ends_at:^10} | {length:^10}")

In [26]:
# Example usage: segment one mix file and keep the result for the cells below.
audio_path = "breakbeat_128kbps.mp3"
# Trade-off: a longer minimum gives fewer spurious splits, but any change that
# occurs sooner than this after the previous boundary is discarded.
MINIMUM_TRACK_DURATION_SECONDS = 180
res = analyze_dj_mix_improved(audio_path, min_track_duration_seconds=MINIMUM_TRACK_DURATION_SECONDS)

Audio is split into frames of shape: (20, 37849)
After removing the first MFCC coefficient, shape is: (19, 37849)
Number of seconds: 3784


In [28]:
# Show the detected tracks as a table (start, end and duration per track).
print_track_list(res)

DJ Mix contains 20 tracks:

Track  | Start Time |  End Time  |  Duration 
---------------------------------------------
  1    |  00:00:00  |  00:03:47  |  00:03:47 
  2    |  00:03:47  |  00:06:47  |  00:03:00 
  3    |  00:06:47  |  00:09:48  |  00:03:01 
  4    |  00:09:48  |  00:12:49  |  00:03:01 
  5    |  00:12:49  |  00:15:51  |  00:03:02 
  6    |  00:15:51  |  00:18:57  |  00:03:06 
  7    |  00:18:57  |  00:22:01  |  00:03:04 
  8    |  00:22:01  |  00:25:01  |  00:03:00 
  9    |  00:25:01  |  00:28:10  |  00:03:09 
  10   |  00:28:10  |  00:31:51  |  00:03:41 
  11   |  00:31:51  |  00:34:51  |  00:03:00 
  12   |  00:34:51  |  00:38:28  |  00:03:37 
  13   |  00:38:28  |  00:41:42  |  00:03:14 
  14   |  00:41:42  |  00:44:52  |  00:03:10 
  15   |  00:44:52  |  00:48:06  |  00:03:14 
  16   |  00:48:06  |  00:51:07  |  00:03:01 
  17   |  00:51:07  |  00:54:08  |  00:03:01 
  18   |  00:54:08  |  00:57:08  |  00:03:00 
  19   |  00:57:08  |  01:00:17  |  00:03:09 
  20  