# Data-wrangling Notebook
This notebook is comprised of three parts:
    1) Audio pre-processing

## Part 1: Audio Pre-processing
- The following code iterates through all MP3 files in the `input_dir` and performs the following operations:
  - Strips the silence from the audio.
  - Parses the MP3 metadata.
  - Exports the processed audio to the `output_dir` with a new filename based on a song ID counter.
  - Appends song information to the `song_data_list`.
  - Converts `song_data_list` to a pandas DataFrame and exports to a CSV file for further use.
  - New songs can be added into `input_dir` and processed and appended into the existing CSV.

In [None]:
import os
import pandas as pd
from datetime import datetime
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from tqdm import tqdm
from functools import reduce
from mutagen.mp3 import MP3
from mutagen.easyid3 import EasyID3

In [None]:
def strip_silence(sound, min_silence_len=1000, silence_thresh=-50):
    """
    Strip silence from an audio segment.

    Args:
        sound: The audio segment to strip silence from.
        min_silence_len: The minimum length of silence (in milliseconds) to be removed.
        silence_thresh: The threshold (in dB) below which a segment is considered silent.

    Returns:
        An audio segment with the silence removed.
    """

    nonsilent_ranges = detect_nonsilent(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
    return reduce(lambda acc, val: acc + sound[val[0]:val[1]], nonsilent_ranges, AudioSegment.empty()) if nonsilent_ranges else sound


def parse_mp3_metadata(file_path):
    """
    Parse metadata from an MP3 file.

    Args:
        file_path: The path to the MP3 file.

    Returns:
        A tuple containing the track title, artists, and genre.
    """

    try:
        audiofile = MP3(file_path, ID3=EasyID3)
        track_title = audiofile.get('title', [None])[0]
        artists = audiofile.get('artist', [None])[0]
        genre = audiofile.get('genre', [None])[0]
        return track_title, artists, genre
    except Exception as e:
        print(f"Error reading metadata for {file_path}: {e}")
        return None, None, None

#### Run the cell below when processing songs for the ***first time***.

In [None]:
# Define directories
input_dir = r'..\data\audio_files\raw'
output_dir = r'..\data\audio_files\processed'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Initialize song data DataFrame
df = pd.DataFrame(columns=["SongID", "TrackName", "Artists", "Genre", "FilePath"])

# Initialize song ID counter and list for song data
song_id_counter = 1
song_data_list = []

# Process audio files
for filename in tqdm([f for f in os.listdir(input_dir) if f.endswith('.mp3')], desc="Processing audio files", unit="file"):
    file_path = os.path.join(input_dir, filename)

    # Generate new filename using song ID counter
    new_filename = f"{song_id_counter}.mp3"
    output_file_path = os.path.join(output_dir, new_filename)

    # Load and strip silence from audio file
    sound = AudioSegment.from_file(file_path)
    stripped_sound = strip_silence(sound)

    # Parse metadata
    track_title, artists, genre = parse_mp3_metadata(file_path)

    # Export stripped audio with updated metadata
    stripped_sound.export(output_file_path, format='mp3', tags={"title": track_title, "artist": artists, "genre": genre})

    # Create dictionary of song information and add to list
    song_data_list.append({
        "SongID": song_id_counter,
        "TrackName": track_title,
        "Artists": artists,
        "Genre": genre,
        "FilePath": output_file_path
    })

    # Increment song ID counter
    song_id_counter += 1

# Convert list of dictionaries to DataFrame and save to CSV
df = pd.DataFrame(song_data_list)
df.to_csv(r'..\data\dataframes\song_data.csv', index=False)

Processing audio files: 100%|██████████| 554/554 [3:16:50<00:00, 21.32s/file]  


#### Run the cell below if processing additional songs at a later time.

In [None]:
# Define directories
input_dir = r'..\data\audio_files\raw'
output_dir = r'..\data\audio_files\processed'
song_data_csv = r'..\data\dataframes\song_data.csv'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load existing song data
if os.path.exists(song_data_csv):
    existing_df = pd.read_csv(song_data_csv)
else:
    existing_df = pd.DataFrame(columns=["SongID", "TrackName", "Artists", "Genre", "FilePath"])

# Initialize song ID counter with the next available ID
song_id_counter = existing_df['SongID'].max() + 1 if not existing_df.empty else 1
song_data_list = []

# Process all new audio files in the input directory
for filename in tqdm([f for f in os.listdir(input_dir) if f.endswith('.mp3')], desc="Processing new audio files", unit="file"):
    file_path = os.path.join(input_dir, filename)

    # Parse metadata to compare with existing data
    track_title, artists, genre = parse_mp3_metadata(file_path)

    # Check if the song already exists in the DataFrame
    if not existing_df[
        (existing_df['TrackName'] == track_title) &
        (existing_df['Artists'] == artists)
    ].empty:
        continue  # Skip the file if it already exists

    new_filename = f"{song_id_counter}.mp3"
    output_file_path = os.path.join(output_dir, new_filename)

    sound = AudioSegment.from_file(file_path)
    stripped_sound = strip_silence(sound)
    stripped_sound.export(output_file_path, format='mp3', tags={"title": track_title, "artist": artists, "genre": genre})

    # Append new song data to the list
    song_data_list.append({
        "SongID": song_id_counter,
        "TrackName": track_title,
        "Artists": artists,
        "Genre": genre,
        "FilePath": output_file_path
    })

    # Increment the song ID counter
    song_id_counter += 1

# If we have new songs, append them to the existing dataframe and save to a new CSV with a unique identifier
if song_data_list:
    new_songs_df = pd.DataFrame(song_data_list)
    updated_df = pd.concat([existing_df, new_songs_df], ignore_index=True)

    # Get the current timestamp to use as a unique identifier
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    new_csv_filename = f'song_data_{timestamp}.csv'
    new_song_data_csv = os.path.join(os.path.dirname(song_data_csv), new_csv_filename)

    updated_df.to_csv(new_song_data_csv, index=False)
    print(f"Processed {len(song_data_list)} new songs and saved to {new_csv_filename}.")
else:
    print("No new songs to process.")

Processing new audio files: 100%|██████████| 869/869 [2:17:54<00:00,  9.52s/file]  

Processed 315 new songs and saved to song_data_20240109110632.csv.





## Part 2: Spotify Data Extraction

Using the DataFrame we just created, we'll search for each track on Spotify and extract various Spotify-generated audio features and metadata.

In [1]:
import pandas as pd
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
import time

The Spotify client is initialized with credentials stored in a JSON file. These credentials include the `client_id` and `client_secret` necessary for accessing the Spotify Web API. The `SpotifyClientCredentials` manager handles the OAuth 2.0 flow for server-to-server authentication.

Using any text editor, create a new text file containing the following information. Save the text file as "spotify_credentials.json" and place in '../data/reference' folder. 

```
{
  "client_id":"REPLACE WITH CLIENT ID",
  "client_secret":"REPLACE WITH SECRET",
  "user_id":"REPLACE WITH SPOTIFY USERNAME"
}
```
* Retrieve your client ID and secret here: https://developer.spotify.com/dashboard/
* Retrieve your username here: https://www.spotify.com/us/account/overview/

In [2]:
# Load Spotify credentials
credentials_path = r'../data/reference/spotify_credentials.json'  # Path to your spotify_credentials.json file

with open(credentials_path, 'r') as file:
    creds = json.load(file)

# Initialize Spotify client
auth_manager = SpotifyClientCredentials(client_id=creds['client_id'], client_secret=creds['client_secret'])
sp = spotipy.Spotify(auth_manager=auth_manager)

In [3]:
# In case rate-limited by Spotify
def make_spotify_request(sp, track_title, artists):
    retries = 3  # Maximum number of retries
    backoff_factor = 0.5  # Factor to determine the next sleep time
    for attempt in range(retries):
        try:
            # Format the search query
            query = f"track:{track_title} artist:{artists[0].strip()}"  # Using the first artist for simplicity
            # Search for tracks
            return sp.search(q=query, type='track', limit=1)
        except spotipy.exceptions.SpotifyException as e:
            if e.http_status == 429:
                sleep_time = int(e.headers.get('Retry-After', 1))
                time.sleep(sleep_time)
            elif 500 <= e.http_status < 600:
                sleep_time = backoff_factor * (2 ** attempt)
                time.sleep(sleep_time)
            else:
                raise
    return None  # If all retries failed

def extract_spotify_metadata_features(track_title, artists, sp):
    """
    Extracts Spotify metadata and audio features for a given track title and list of artists.

    This function performs a search on Spotify for the track using the provided title and artist names.
    If a match is found, it retrieves the track's ID, artist IDs, genres, and audio features.
    The results are returned in a pandas DataFrame.

    Parameters:
    - track_title (str): The title of the track to search for.
    - artists (list): A list of artist names associated with the track.
    - sp (Spotify client): An authenticated instance of the Spotify client to perform API calls.

    Returns:
    - DataFrame: A pandas DataFrame containing the Spotify metadata and audio features for the track.
                 If no match is found, an empty DataFrame with the specified columns is returned.
    """

    # Initialize variables
    track_id, artist_ids, unique_genres = None, None, set()

    # Search for track ID, artist IDs, and genres
    for artist in artists:
        query = f"track:{track_title} artist:{artist.strip()}"
        track_results = sp.search(q=query, type='track', limit=1)
        if track_results['tracks']['items']:
            track_item = track_results['tracks']['items'][0]
            track_id = track_item['id']
            artist_ids = [artist['id'] for artist in track_item['artists']]
            for artist_id in artist_ids:
                artist_genre = sp.artist(artist_id)['genres']
                unique_genres.update(artist_genre)
            break

    # Retrieve audio features if a match was found
    if track_id and artist_ids:
        audio_features_dict = sp.audio_features(track_id)[0]
        audio_features_dict['sp_genre'] = list(unique_genres)
        audio_features_df = pd.DataFrame([audio_features_dict])
        columns_to_drop = ['type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'mode']
        audio_features_df.drop(columns=columns_to_drop, inplace=True)
    else:
        audio_features_df = pd.DataFrame()

    return audio_features_df

In [4]:
# Load the DataFramge or CSV that we created in the step above
df = pd.read_csv(r'..\data\dataframes\song_data_20240109110632.csv')

# Initialize an empty DataFrame to store all tracks' info
spotify_df = pd.DataFrame()

# Loop through each row of the DataFrame
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Fetching track features"):    
    track_title = row['TrackName']
    artists = [artist.strip() for artist in row['Artists'].split('/')]  # Split artists by '/' and strip whitespace

    # Use the function to extract metadata and audio features
    track_features_df = extract_spotify_metadata_features(track_title, artists, sp)

    # If the DataFrame is not empty, merge the results with the song details
    if not track_features_df.empty:
        track_features_df['SongID'] = row['SongID']  # Assuming there's a 'SongID' column in the original DataFrame
        track_features_df['sp_genre'] = ', '.join(track_features_df.loc[0, 'sp_genre'])  # Join genres into a single string
        # Append this track's DataFrame to the main DataFrame
        spotify_df = pd.concat([spotify_df, track_features_df], ignore_index=True)

Fetching track features: 100%|██████████| 869/869 [07:19<00:00,  1.98it/s]


In [7]:
# Adding 'sp_' in front of columns to indicate they are Spotify-derived features
prefix_cols = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
               'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
               'time_signature']

spotify_df.rename(columns=lambda x: 'sp_' + x if x in prefix_cols else x, inplace=True)
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sp_danceability      840 non-null    float64
 1   sp_energy            840 non-null    float64
 2   sp_key               840 non-null    int64  
 3   sp_loudness          840 non-null    float64
 4   sp_speechiness       840 non-null    float64
 5   sp_acousticness      840 non-null    float64
 6   sp_instrumentalness  840 non-null    float64
 7   sp_liveness          840 non-null    float64
 8   sp_valence           840 non-null    float64
 9   sp_tempo             840 non-null    float64
 10  sp_time_signature    840 non-null    int64  
 11  sp_genre             840 non-null    object 
 12  SongID               840 non-null    int64  
dtypes: float64(9), int64(3), object(1)
memory usage: 85.4+ KB


In [8]:
# Join with DataFrame made in Part 1
merged_df = pd.merge(df, spotify_df, on='SongID', how='left')
merged_df.to_csv(r'../data/dataframes/sp_merged.csv', index=False)

# Part 3: Audio Features & Visualizations

## Adding rhythm/chroma features

### Function to create and plot a measure grid for each song

In [10]:
def quantize_beats(beats, onset_env, tempo, sr, hop_length, duration):
    """
    Adjusts beat times to the nearest detected onsets and creates beat and measure grids for the audio.

    This function assumes the beat times are evenly spaced within each measure and are in 4/4 time signature. It also backtracks the beats using the onset envelope to align them to the nearest detected onset.

    Parameters:
    beats : np.ndarray
        An array of beat times in frame units generated from librosa.beat.beat_track.
    onset_env : np.ndarray
        Onset envelope of the audio signal, used for backtracking beats.
    tempo : float
        Estimated tempo of the audio in beats per minute.
    sr : int
        Sampling rate of the audio signal.
    hop_length : int
        Hop length used in the onset detection and beat tracking.
    duration : float
        Duration of the audio signal in seconds.

    Returns:
    beat_grid : np.ndarray
        Array of quantized beat times.
    measure_grid : np.ndarray
        Array of quantized measure start times.
    beats_per_measure : int
        Number of beats per measure, which is set to 4 for a 4/4 time signature.

    Raises:
    ValueError: If the `beats` array is empty or not one-dimensional.
    """
    # Validate input
    if beats.ndim != 1:
        raise ValueError("The 'beats' array must be one-dimensional.")
    if beats.size == 0:
        raise ValueError("The 'beats' array must not be empty.")
    
    # Hardcoded assumption of 4/4 time signature
    beats_per_measure = 4

    # Track beats to align them to the nearest detected onset
    beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=hop_length)

    # Calculate the beat interval (seconds per beat)
    beat_interval = 60.0 / tempo

    # Backtrack from the first beat to align with time 0 if necessary
    first_beat_time = 0

    # Create beat grid from the first beat time to the end of the song
    beat_grid = np.arange(first_beat_time, duration, beat_interval)

    # Ensure beat grid does not go past the duration of the song
    beat_grid = beat_grid[beat_grid <= duration]

    # Create measure grid
    measure_indices = np.arange(0, len(beat_grid), beats_per_measure)
    measure_grid = beat_grid[measure_indices]

    # Ensure measure grid does not go past the duration of the song
    measure_grid = measure_grid[measure_grid <= duration]

    return beat_grid, measure_grid


def apply_measure_grid(ax, measure_grid):
    """
    This function takes an axis object and applies measure grid lines,
    sets x-ticks to measure start times for every fourth measure starting from measure 0,
    labels them with measure numbers, and applies sub-ticks for intermediate measure times.
    
    Parameters:
    ax (matplotlib.axes.Axes): The axis object to modify.
    measure_grid (list or array): The list or array of measure start times in seconds.
    """
    measure_numbers = np.arange(len(measure_grid))
    # Adjust the list to start from measure 0 and get every fourth measure
    major_measure_indices = [i for i, measure_num in enumerate(measure_numbers) if (measure_num) % 4 == 0]
    major_measures = [measure_grid[i] for i in major_measure_indices]
    major_labels = [measure_numbers[i] for i in major_measure_indices]
    
    # Set major x-axis ticks and labels (for measure 0 and every fourth measure after)
    ax.set_xticks(major_measures, minor=False)
    ax.set_xticklabels(major_labels, minor=False)

    # Set minor x-axis ticks (for intermediate measures)
    minor_measures = [measure for i, measure in enumerate(measure_grid) if i not in major_measure_indices]
    ax.set_xticks(minor_measures, minor=True)
    
    # Overlay the major measure grid lines on the plot (for measure 0 and every fourth measure after)
    for measure_time in major_measures:
        ax.axvline(x=measure_time, color='green', linestyle='--', linewidth=2)  # Adjusted linewidth for major ticks
    
    # Overlay the minor measure grid lines on the plot (for intermediate measures)
    for measure_time in minor_measures:
        ax.axvline(x=measure_time, color='grey', linestyle=':', linewidth=1, alpha=0.8)  # Adjusted linewidth for minor ticks
    
    ax.set_xlabel('Measure Number')


# Function to detect key from a chromagram using Krumhansl-Schmuckler key-finding algorithm profiles
def detect_key_from_chromagram(chromagram, sr):
    pitches = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

    # Calculate the sum of each pitch class across all time frames
    chroma_vals = np.sum(chromagram, axis=1)

    # Krumhansl-Schmuckler key-finding algorithm profiles
    maj_profile = [6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88]
    min_profile = [6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17]

    # Correlation for major and minor keys
    maj_key_corrs = [np.corrcoef(maj_profile, np.roll(chroma_vals, i))[1, 0] for i in range(12)]
    min_key_corrs = [np.corrcoef(min_profile, np.roll(chroma_vals, i))[1, 0] for i in range(12)]

    # Combine correlations and keys
    key_corrs = maj_key_corrs + min_key_corrs
    keys = [p + ' major' for p in pitches] + [p + ' minor' for p in pitches]

    # Determine the best key
    best_idx = np.argmax(key_corrs)
    best_key = keys[best_idx]
    best_corr = key_corrs[best_idx]

    return best_key, best_corr


# Function to convert standard key into Camelot key notation
def get_camelot(key):
    # Mapping from musical key to Camelot code
    camelot_major = {
        'B': '1B', 'F#': '2B', 'C#': '3B', 'G#': '4B', 'D#': '5B',
        'A#': '6B', 'F': '7B', 'C': '8B', 'G': '9B', 'D': '10B', 'A': '11B', 'E': '12B'
    }

    camelot_minor = {
        'G#': '1A', 'D#': '2A', 'A#': '3A', 'F': '4A', 'C': '5A',
        'G': '6A', 'D': '7A', 'A': '8A', 'E': '9A', 'B': '10A', 'F#': '11A', 'C#': '12A'
    }

    # Split the detected key into pitch and mode
    pitch, mode = key.split(' ')

    # Return the corresponding Camelot code
    if mode == 'major':
        return camelot_major[pitch]
    elif mode == 'minor':
        return camelot_minor[pitch]
    else:
        raise ValueError("Invalid mode in key: should be 'major' or 'minor'.")


def get_studio_bpm(beat_frames: np.ndarray, sr: int = 22050, hop_length: int = 512,
                   variance_threshold: float = 0.01, window_length: int = 4) -> Tuple[Optional[float], Optional[float], Optional[float], np.ndarray, np.ndarray]:
    """
    Analyze the provided beat frame indices to determine the studio BPM and the start frame of stable intervals.
    
    Parameters:
    - beat_frames (np.ndarray): Array of beat frame indices.
    - sr (int): The sample rate of the audio. Default is 22050 Hz.
    - hop_length (int): The number of samples per frame. Default is 512.
    - variance_threshold (float): The threshold for the variance to consider a window of beats as stable.
    - window_length (int): The number of beats to consider within each sliding window when calculating variance.

    Returns:
    Tuple[Optional[float], Optional[float], Optional[float], np.ndarray, np.ndarray]:
        - The mean studio BPM (float or None if not determined).
        - The median studio BPM (float or None if determined).
        - The BPM that occurs most frequently near a whole number (float or None if not determined).
        - The frame indices of the first beat of each stable interval.
        - An array of beat interval durations that are considered stable.
    """
    
    # Calculate the time in seconds for each beat frame index
    beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=hop_length)

    # Calculate beat intervals
    beat_intervals = np.diff(beat_times)
    total_intervals = len(beat_intervals)

    # Store stable intervals (low-variance windows)
    stable_intervals = []
    stable_beat_indices = []

    # Calculate variance in a sliding window
    for i in range(total_intervals - window_length + 1):
        window = beat_intervals[i:i + window_length]
        if np.var(window) < variance_threshold:
            # Extend the list with intervals from the current stable window
            stable_intervals.extend(window)
            # Record the frame index of the first beat in the stable window
            stable_beat_indices.append(beat_frames[i])

    # Initialize the BPM that occurs most frequently near a whole number to None
    mode_studio_bpm = None

    # If we found any stable intervals, calculate the BPMs
    if stable_intervals:
        # Calculate BPMs for each stable interval
        stable_bpms = 60.0 / np.array(stable_intervals)
        mean_studio_bpm = np.mean(stable_bpms)
        median_studio_bpm = np.median(stable_bpms)

        # Round BPMs to the nearest whole numbers and find the mode
        rounded_bpms = np.round(stable_bpms)
        mode_bpm, count = stats.mode(rounded_bpms)
        if count > 0:
            mode_studio_bpm = mode_bpm[0]
    else:
        # No stable intervals found; return None for mean and median BPM
        mean_studio_bpm = None
        median_studio_bpm = None

    # Convert the stable beat indices to frame indices
    stable_frames = beat_frames[stable_beat_indices]

    # Return the mean and median studio BPM if calculated, the mode BPM, the frame indices of the stable beats, and the stable interval durations
    return mean_studio_bpm, median_studio_bpm, mode_studio_bpm, stable_frames, stable_intervals

In [6]:
import pandas as pd
import os
import librosa
from tqdm import tqdm
import numpy as np
from typing import Tuple, List, Optional

# Define your directory and constants 
merged_df = pd.read_csv(r'..\data\dataframes\sp_merged2.csv')
mp3_directory = r"..\data\audio_files\processed"
export_directory = r"..\data\pkl"
hop_length = 512
sr = 22050
data_list = []

# Process each song
for index, row in tqdm(merged_df.iterrows(), desc="Processing audio profiles", total=merged_df.shape[0]):
    audio_file_path = row['FilePath']
    if os.path.exists(audio_file_path):
        # Load the audio file
        y, sr = librosa.load(audio_file_path, sr=sr)
        duration = librosa.get_duration(y=y, sr=sr)
        y_harm, y_perc = librosa.effects.hpss(y)

        # Chroma profile
        chroma_cq = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
        key, key_corr = detect_key_from_chromagram(chroma_cq, sr)
        camelot = get_camelot(key)

        # Tempo/rhythm profile
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
        studio_bpm, stable_intervals = get_studio_bpm(beats)
        tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
        tempogram_ratio = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)
        
        # Quantize beats and create measure grid
        beat_grid, measure_grid = quantize_beats(beats, onset_env, studio_bpm, sr, hop_length, duration)
        measure_numbers = np.arange(len(measure_grid))
        measure_dict = {measure_number: measure_time for measure_number, measure_time in zip(measure_numbers, measure_grid)}
        
        # Data dictionary to hold features
        data_dict = {
            'SongID': row['SongID'], 
            'duration': duration, 
            'tempo': tempo, 
            'studio_bpm': studio_bpm,
            'key': key,
            'key_corr': key_corr,
            'camelot_key': camelot
        }
        
        # Append the data dictionary to the data list
        data_list.append(data_dict)

# Convert the list of dictionaries to a dataframe
new_data_df = pd.DataFrame(data_list)

# Append this new dataframe to the original dataframe (if that's what you need)
merged_df = merged_df.merge(new_data_df, on='SongID', how='left')

Processing audio profiles: 100%|██████████| 554/554 [1:20:07<00:00,  8.68s/it]


In [9]:
merged_df

Unnamed: 0,SongID,TrackName,Artists,Genre,FilePath,sp_danceability,sp_energy,sp_key,sp_loudness,sp_speechiness,...,sp_valence,sp_tempo,sp_time_signature,sp_genre,duration,tempo,studio_bpm,key,key_corr,camelot_key
0,1,Bass Inside,AC Slater,Bass House,..\data\audio_files\processed\1.mp3,0.905,0.838,6.0,-6.838,0.0499,...,0.464,126.007,4.0,"electro house, brostep, bass house, fidget house",259.995011,123.046875,126.005254,G minor,0.437424,6A
1,2,Fly Kicks - Wax Motif Remix,AC Slater/Chris Lorenzo/Wax Motif,Bass House,..\data\audio_files\processed\2.mp3,0.897,0.692,11.0,-4.985,0.0492,...,0.607,125.023,4.0,"bass house, house, tech house, electro house, ...",276.425034,123.046875,125.006785,F# minor,0.573062,11A
2,3,Take Me Away,ACRAZE,Pop Dance,..\data\audio_files\processed\3.mp3,0.727,0.982,11.0,-4.011,0.0782,...,0.719,126.036,4.0,"pop dance, tech house",179.529025,123.046875,125.997223,E minor,0.333447,9A
3,4,Heard It Like This,ACRAZE/Joey Valence & Brae,Pop Dance,..\data\audio_files\processed\4.mp3,0.747,0.901,1.0,-5.906,0.0491,...,0.730,125.943,4.0,"pop dance, tech house",219.281043,123.046875,124.712860,G minor,0.417818,6A
4,5,Spring Girl - Vintage Culture Remix,Adam Ten/Maori/Vintage Culture,Israeli Techno,..\data\audio_files\processed\5.mp3,0.798,0.865,7.0,-5.296,0.0568,...,0.643,127.002,4.0,"israeli techno, brazilian edm",227.216009,129.199219,127.540242,F minor,0.472424,4A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,550,Clarity,Zedd/Foxes,Complextro,..\data\audio_files\processed\550.mp3,0.509,0.781,8.0,-3.480,0.0720,...,0.176,128.000,4.0,"edm, complextro, pop dance, uk pop, electropop...",266.043039,129.199219,128.228314,E major,0.829250,12B
550,551,One Three Nine,Zeds Dead/Scrufizzer,Brostep,..\data\audio_files\processed\551.mp3,0.810,0.864,11.0,-5.086,0.0374,...,0.768,124.992,4.0,"dubstep, edm, grime, progressive electro house...",165.090023,123.046875,124.995820,F# minor,0.605562,11A
551,552,Better Recognize,ZHU/Wax Motif,Edm,..\data\audio_files\processed\552.mp3,0.839,0.786,1.0,-6.217,0.0560,...,0.491,126.018,4.0,"electro house, bass house, tech house, edm",209.656009,123.046875,125.257548,F minor,0.678667,4A
552,553,Tunnel Vision - Don Diablo Edit,Zonderling/Don Diablo,Dutch House,..\data\audio_files\processed\553.mp3,0.762,0.913,5.0,-4.101,0.1110,...,0.365,126.001,4.0,"dutch house, edm, future house, pop dance, sky...",213.875011,123.046875,125.829299,G minor,0.727484,6A


## Saving the audio features as pkl files

In [1]:
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import librosa
import librosa.display
import numpy as np
import math
from tqdm import tqdm
import pandas as pd
import gc

In [None]:
# Define your directory and constants 
merged_df = pd.read_csv(r'..\data\dataframes\sp_merged2.csv')

mp3_directory = r"..\data\audio_files\processed"
export_directory = r"..\data\pkl"
hop_length = 512
sr = 22050


# Process each song
for index, row in tqdm(merged_df.iterrows(), desc="Processing audio profiles", total=merged_df.shape[0]):
    audio_file_path = row['FilePath']
    if os.path.exists(audio_file_path):
        # Load the audio file
        y, sr = librosa.load(audio_file_path, sr=sr)
        duration = librosa.get_duration(y=y, sr=sr)
        y_harm, y_perc = librosa.effects.hpss(y)

        # Chroma profile
        chroma_cq = librosa.feature.chroma_cqt(y=y_harm, sr=sr)
        key, key_corr = detect_key_from_chromagram(chroma_cq, sr)
        camelot = get_camelot(key)
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr, chroma=chroma_cq)

        # Spectrogram
        D = np.abs(librosa.stft(y))**2
        S_mel = librosa.feature.melspectrogram(S=D, sr=sr)
        S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
        # Centroid
        centroid_mel = librosa.feature.spectral_centroid(S=S_mel, sr=sr, hop_length=hop_length)
        # MFCC
        mfccs = librosa.feature.mfcc(S=S_mel_db)
        
        # Tempo/rhythm profile
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
        studio_bpm, stable_intervals = get_studio_bpm(beats)
        tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
        tempogram_ratio = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)
        
        # Quantize beats and create measure grid
        beat_grid, measure_grid = quantize_beats(beats, onset_env, studio_bpm, sr, hop_length, duration)
        measure_numbers = np.arange(len(measure_grid))
        measure_dict = {measure_number: measure_time for measure_number, measure_time in zip(measure_numbers, measure_grid)}

        # Data dictionary to hold features
        data = {
            'SongID': row['SongID'], 
            'duration': duration, 
            'tempo': tempo, 
            'studio_bpm': studio_bpm,
            'key': key,
            'key_corr': key_corr,
            'camelot_key': camelot,
            'stable_intervals': stable_intervals.tolist(),
            'y': y.tolist(),
            'chroma_cq': chroma_cq.tolist(),
            'tonnetz': tonnetz.tolist(),
            'S_mel_db': S_mel_db.tolist(),
            'centroid_mel': centroid_mel.tolist(),
            'mfccs': mfccs.tolist(),
            'tempogram': tempogram.tolist(),
            'tempogram_ratio': tempogram_ratio.tolist(),
            'MeasureDict': measure_dict
        }

        # Save to pickle file
        pickle_file_path = os.path.join(export_directory, f"{row['SongID']}.pkl")  # Ensure proper path joining
        pd.to_pickle(data, pickle_file_path)
    else:
        print(f"File not found: {audio_file_path}")

# Making visual plots for all songs using pkl audio features

In [3]:
def apply_measure_grid(ax, measure_grid, measure_numbers):
    """
    This function takes an axis object and applies measure grid lines,
    sets x-ticks to measure start times for every fourth measure starting from measure 0,
    labels them with measure numbers, and applies sub-ticks for intermediate measure times.
    
    Parameters:
    ax (matplotlib.axes.Axes): The axis object to modify.
    measure_grid (np.array): The array of measure start times in seconds.
    measure_numbers (np.array): The array of measure numbers corresponding to the start times.
    """

    # Filter the major (every fourth) measures
    major_indices = measure_numbers % 4 == 0

    # Set major x-axis ticks and labels (for measure 0 and every fourth measure after)
    ax.set_xticks(measure_grid[major_indices])
    ax.set_xticklabels(measure_numbers[major_indices])

    # Set minor x-axis ticks (for intermediate measures)
    ax.set_xticks(measure_grid[~major_indices], minor=True)

    # Overlay the major measure grid lines on the plot (for measure 0 and every fourth measure after)
    ax.vlines(measure_grid[major_indices], ax.get_ylim()[0], ax.get_ylim()[1], color='green', linestyle='--', linewidth=2)

    # Overlay the minor measure grid lines on the plot (for intermediate measures)
    ax.vlines(measure_grid[~major_indices], ax.get_ylim()[0], ax.get_ylim()[1], color='grey', linestyle=':', linewidth=1, alpha=0.7)
    
    ax.set_xlabel('Measure Number')


def load_pickle_data(pkl_path):
    with open(pkl_path, 'rb') as file:
        return pickle.load(file)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa.display
from tqdm import tqdm
import pickle
import gc

merged_df = pd.read_csv(r'../data/dataframes/sp_merged2.csv')
merged_df = merged_df['SongID']
export_directory = r"../figures/audio_plots"
pkl_directory = r"../data/pkl"
sr = 22050  
hop_length = 512

def apply_measure_grid(ax, measure_grid, measure_numbers):
    # Filter the major (every fourth) measures
    major_indices = measure_numbers % 4 == 0

    # Set major x-axis ticks and labels (for measure 0 and every fourth measure after)
    ax.set_xticks(measure_grid[major_indices])
    ax.set_xticklabels(measure_numbers[major_indices])

    # Set minor x-axis ticks (for intermediate measures)
    ax.set_xticks(measure_grid[~major_indices], minor=True)

    # Overlay the major measure grid lines on the plot (for measure 0 and every fourth measure after)
    ax.vlines(measure_grid[major_indices], ax.get_ylim()[0], ax.get_ylim()[1], color='green', linestyle='--', linewidth=2)

    # Overlay the minor measure grid lines on the plot (for intermediate measures)
    ax.vlines(measure_grid[~major_indices], ax.get_ylim()[0], ax.get_ylim()[1], color='grey', linestyle=':', linewidth=1, alpha=0.7)
    
    ax.set_xlabel('Measure Number')

    
def load_pickle_data(pkl_path):
    with open(pkl_path, 'rb') as file:
        return pickle.load(file)

        
for song_id in tqdm(merged_df, desc="Processing audio profiles"):
    export_fig_path = os.path.join(export_directory, f"{song_id}.png")
    pkl_path = os.path.join(pkl_directory, f"{song_id}.pkl")

    if not os.path.exists(export_fig_path):
        if os.path.exists(pkl_path):
            try:
                data = load_pickle_data(pkl_path)

                # Extracting the individual components from the data dictionary
                y_harm = np.asarray(data['y_harm'])
                y_perc = np.asarray(data['y_perc'])
                S_mel_db = np.asarray(data['S_mel_db'])
                tempogram = np.asarray(data['tempogram'])
                tempogram_ratio = np.asarray(data['tempogram_ratio'])
                chroma_cq = np.asarray(data['chroma_cq'])
                tonnetz = np.asarray(data['tonnetz'])
                duration = data['duration']
                measure_grid = np.array(list(data['MeasureDict'].values()))
                measure_numbers = np.array(list(data['MeasureDict'].keys()))
        
                # Create subplots
                fig, axs = plt.subplots(6, 1, figsize=(20, 30), dpi=125)

                # Harmonic/Percussive Waveform plot
                axs[0].plot(np.linspace(0, duration, len(y_harm)), y_harm, alpha=0.5, label='Harmonic', color='b')
                axs[0].plot(np.linspace(0, duration, len(y_perc)), y_perc, alpha=0.5, label='Percussive', color='r')
                apply_measure_grid(axs[0], measure_grid, measure_numbers)
                axs[0].set_title('Harmonic and Percussive Waveform')
                axs[0].set_xlim([0, duration])
        
                # Mel Spectrogram plot
                librosa.display.specshow(S_mel_db, sr=sr, x_axis='time', y_axis='mel', ax=axs[1], fmax=8000)
                apply_measure_grid(axs[1], measure_grid, measure_numbers)
                axs[1].set_title('Mel Spectrogram')
                # Set the y-axis limits
                axs[1].set_ylim(0, 8000)  # Assuming the fmax is 8000 Hz as specified in the specshow call
        
                # Tempogram plot
                librosa.display.specshow(tempogram, sr=sr, hop_length=512, x_axis='time', y_axis='tempo', cmap='magma', ax=axs[2])
                apply_measure_grid(axs[2], measure_grid, measure_numbers)
                axs[2].set_title('Tempogram')
        
                # Tempogram ratio
                # Define note labels for tempogram ratio
                note_labels = [
                    'Sixteenth note',
                    'Dotted sixteenth',
                    'Eighth triplet',
                    'Eighth note',
                    'Dotted eighth',
                    'Quarter triplet',
                    'Quarter note',
                    'Dotted quarter',
                    'Half triplet',
                    'Half note',
                    'Dotted half note',
                    'Whole triplet',
                    'Whole note'
                ]
                
                librosa.display.specshow(tempogram_ratio, x_axis='time', ax=axs[3], sr=sr)
                axs[3].set_xlim([0, duration])
                apply_measure_grid(axs[3], measure_grid, measure_numbers)
                axs[3].set_yticks(range(len(note_labels)))
                axs[3].set_yticklabels(note_labels)
                axs[3].set_title('Tempogram Ratio')
        
                # Chroma CQT plot
                librosa.display.specshow(chroma_cq, y_axis='chroma', x_axis='time', ax=axs[4])
                apply_measure_grid(axs[4], measure_grid, measure_numbers)
                axs[4].set_title('Chroma CQT')
        
                # Tonnetz plot
                librosa.display.specshow(tonnetz, sr=sr, hop_length=hop_length, y_axis='tonnetz', x_axis='time', ax=axs[5])
                apply_measure_grid(axs[5], measure_grid, measure_numbers)
                axs[5].set_title('Tonnetz')

                plt.tight_layout()
                plt.savefig(export_fig_path)
                plt.close(fig)
                del y_harm, y_perc, S_mel_db, tempogram, tempogram_ratio, chroma_cq, tonnetz, duration, measure_grid, measure_numbers
                gc.collect()

            except (EOFError, pickle.UnpicklingError) as e:
                print(f"Failed to load data for {song_id} due to {e} Skipping.")

Processing audio profiles: 100%|██████████| 554/554 [03:47<00:00,  2.44it/s]


In [17]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
import pickle

# Define paths
export_directory = r"../figures/audio_plots"
pkl_directory = r"../data/pkl"
sr = 22050
hop_length = 512

# Specific song_id to test
song_id = '1'  # Replace '1' with the actual ID if it's different
export_fig_path = os.path.join(export_directory, f"{song_id}.png")
pkl_path = os.path.join(pkl_directory, f"{song_id}.pkl")

# Check if the pickle file exists
if os.path.exists(pkl_path):
    with open(pkl_path, 'rb') as file:
        data = pickle.load(file)

    # Unpack data
    y_harm = np.asarray(data['y_harm'])
    y_perc =  np.asarray(data['y_perc'])
    S_mel_db =  np.asarray(data['S_mel_db'])
    tempogram =  np.asarray(data['tempogram'])
    tempogram_ratio =  np.asarray(data['tempogram_ratio'])
    chroma_cq =  np.asarray(data['chroma_cq'])
    tonnetz =  np.asarray(data['tonnetz'])
    duration =  data['duration']
    measure_grid = np.array(list(data['MeasureDict'].values()))
    measure_numbers = np.array(list(data['MeasureDict'].keys()))
    
    # Define the figure and axes
    fig, axs = plt.subplots(6, 1, figsize=(20, 30), dpi=300)

    # Harmonic/Percussive Waveform plot
    axs[0].plot(np.linspace(0, duration, len(y_harm)), y_harm, alpha=0.5, label='Harmonic', color='b')
    axs[0].plot(np.linspace(0, duration, len(y_perc)), y_perc, alpha=0.5, label='Percussive', color='r')
    axs[0].set_xlim([0, duration])
    apply_measure_grid(axs[0], measure_grid, measure_numbers)
    axs[0].set_title('Harmonic and Percussive Waveform')

    # Mel Spectrogram plot
    librosa.display.specshow(S_mel_db, sr=sr, x_axis='time', y_axis='mel', ax=axs[1], fmax=8000)
    apply_measure_grid(axs[1], measure_grid, measure_numbers)
    axs[1].set_title('Mel Spectrogram')
    # Set the y-axis limits
    axs[1].set_ylim(0, 8000)  # Assuming the fmax is 8000 Hz as specified in the specshow call

    # Tempogram plot
    librosa.display.specshow(tempogram, sr=sr, hop_length=512, x_axis='time', y_axis='tempo', cmap='magma', ax=axs[2])
    apply_measure_grid(axs[2], measure_grid, measure_numbers)
    axs[2].set_title('Tempogram')

    # Tempogram ratio
    # Define note labels for tempogram ratio
    note_labels = [
        'Sixteenth note',
        'Dotted sixteenth',
        'Eighth triplet',
        'Eighth note',
        'Dotted eighth',
        'Quarter triplet',
        'Quarter note',
        'Dotted quarter',
        'Half triplet',
        'Half note',
        'Dotted half note',
        'Whole triplet',
        'Whole note'
    ]
    librosa.display.specshow(tempogram_ratio, x_axis='time', ax=axs[3], sr=sr)
    axs[3].set_xlim([0, duration])
    apply_measure_grid(axs[3], measure_grid, measure_numbers)
    axs[3].set_yticks(range(len(note_labels)))
    axs[3].set_yticklabels(note_labels)
    axs[3].set_title('Tempogram Ratio')

    # Chroma CQT plot
    librosa.display.specshow(chroma_cq, y_axis='chroma', x_axis='time', ax=axs[4])
    apply_measure_grid(axs[4], measure_grid, measure_numbers)
    axs[4].set_title('Chroma CQT')

    # Tonnetz plot
    librosa.display.specshow(tonnetz, sr=sr, hop_length=hop_length, y_axis='tonnetz', x_axis='time', ax=axs[5])
    apply_measure_grid(axs[5], measure_grid, measure_numbers)
    axs[5].set_title('Tonnetz')

    # Layout and saving
    plt.tight_layout()
    #plt.show()  # Use plt.show() for testing instead of saving the figure
    plt.savefig(export_fig_path)  # Uncomment this to save the figure
    plt.close(fig)
else:
    print(f"The file {pkl_path} does not exist.")

In [10]:
# Define your directory and constants 
merged_df = pd.read_csv(r'../data/dataframes/sp_merged2.csv')

mp3_directory = r"..\data\audio_files\processed"
export_directory = r"..\figures\audio_plots"
pkl_directory = r"..\data\pkl"
hop_length = 512
sr = 22050

for index, row in tqdm(merged_df.iterrows(), desc="Processing audio profiles", total=merged_df.shape[0]):
    
    song_id = row['SongID']
    export_fig_path = os.path.join(export_directory, f"{song_id}.png")
    pkl_path = os.path.join(pkl_directory, f"{song_id}.pkl")
    audio_file_path = row['FilePath']
    
    if os.path.exists(audio_file_path) & os.path.exists(pkl_path):
        #df = pd.read_pickle(pkl_path)
        
        y, sr = librosa.load(audio_file_path, sr=22050)
        duration = librosa.get_duration(y=y, sr=sr)
        y_harm, y_perc = librosa.effects.hpss(y)
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
        #tempo = math.floor(tempo)
    
        # Quantize beats and create measure grid
        beat_grid, measure_grid = quantize_beats(beats, onset_env, tempo, sr, hop_length, duration)
        measure_numbers = np.arange(len(measure_grid))
        measure_dict = {measure_number: measure_time for measure_number, measure_time in zip(measure_numbers, measure_grid)}
    
        # Start a new figure
        fig = plt.figure(figsize=(20, 24), dpi=300)  # Adjust the size as needed
    
        # Create a GridSpec for the entire figure
        gs = gridspec.GridSpec(3, 2, figure=fig)
    
        # Harmonic/Percussive Waveform plot 
        ax_waveform = plt.subplot(gs[0, 0])  
        librosa.display.waveshow(y_harm, sr=sr, alpha=0.5, ax=ax_waveform, label='Harmonic', color = 'b')
        librosa.display.waveshow(y_perc, sr=sr, alpha=0.5, ax=ax_waveform, label='Percussive', color='r')
        apply_measure_grid(ax_waveform, measure_grid, measure_numbers)
        ax_waveform.set_title('Harmonic and Percussive Waveform')
        ax_waveform.set_ylabel('Amplitude')
        ax_waveform.set_xlim([0, duration])   
        ax_waveform.legend()

        # MelSpectrogram plot
        ax_melspec = plt.subplot(gs[0, 1])
        D = np.abs(librosa.stft(y))**2
        S = librosa.feature.melspectrogram(S=D, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)
        librosa.display.specshow(S_dB, x_axis='time',
                                 y_axis='mel', sr=sr,
                                 fmax=8000, ax=ax_melspec)
        apply_measure_grid(ax_melspec, measure_grid, measure_numbers)
        ax_melspec.set_title('MelSpectrogram')
    
        # Tempogram plot
        ax_tempogram = plt.subplot(gs[1, 0])
        tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr, hop_length=hop_length)
        librosa.display.specshow(tempogram, sr=sr, hop_length=hop_length, x_axis='time', y_axis='tempo', ax=ax_tempogram)
        ax_tempogram.set_yticks([tempo])
        ax_tempogram.set_yticklabels([str(tempo) + ' BPM'])
        apply_measure_grid(ax_tempogram, measure_grid, measure_numbers)
        ax_tempogram.set(title='Tempogram')
    
        # Tempogram ratio
        ax_tgr = plt.subplot(gs[1, 1])
        tgr = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)
        # Define note labels
        note_labels = [
            'Sixteenth note',
            'Dotted sixteenth',
            'Eighth triplet',
            'Eighth note',
            'Dotted eighth',
            'Quarter triplet',
            'Quarter note',
            'Dotted quarter',
            'Half triplet',
            'Half note',
            'Dotted half note',
            'Whole triplet',
            'Whole note'
        ]
        librosa.display.specshow(tgr, x_axis='time', ax=ax_tgr, sr=sr)
        ax_tgr.set_xlim([0, duration])
        apply_measure_grid(ax_tgr, measure_grid, measure_numbers)
        ax_tgr.set(title="Tempogram ratio")
        ax_tgr.set_yticks(range(len(note_labels)))
        ax_tgr.set_yticklabels(note_labels)
    
        # Chromagram plot
        ax_chromagram = plt.subplot(gs[2, 0])
        chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr)
        librosa.display.specshow(chroma_cq, sr=sr, hop_length=hop_length, y_axis='chroma', x_axis='time', ax=ax_chromagram)
        apply_measure_grid(ax_chromagram, measure_grid, measure_numbers)
        ax_chromagram.set_ylabel('Pitch Class')
        ax_chromagram.set_title('Chromagram')
        
        # Tonnetz Features plot
        ax_tonnetz = plt.subplot(gs[2, 1])
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr, chroma=chroma_cq)
        librosa.display.specshow(tonnetz, sr=sr, hop_length=hop_length, y_axis='tonnetz', x_axis='time', ax=ax_tonnetz)
        apply_measure_grid(ax_tonnetz, measure_grid, measure_numbers)
        ax_tonnetz.set_title('Tonnetz features')
        
       
        # Save the figure to a PNG file with the same base name as the MP3 file
        plt.tight_layout()
        plt.savefig(export_fig_path)
        plt.close(fig)  # Close the figure to free memory

        del y, sr  # Delete large variables that are no longer needed
        gc.collect()  # Call garbage collector manually

Processing audio profiles:  19%|█▉        | 107/554 [36:45<2:33:34, 20.61s/it]


MemoryError: Unable to allocate 56.7 MiB for an array with shape (1025, 14499) and data type float32

In [2]:
data = pd.read_pickle(r"..\data\pkl\1.pkl")

In [5]:
row['FilePath']

'..\\data\\audio_files\\processed\\554.mp3'