# YT Playlists in Bulk Using UDFs

# *NB* 2025-01-08: Reviving this notebook from its original 2023 run to check whether it still works

## This is intended to produce summary stats from various audio features such as MFCCs. v1 focuses only on MFCCs (NB: the Step 3 extraction below also pulls spectral centroid, chroma, and tempo)

### *NB: The playlist yields a lot of music videos rather than the raw music files, so the start/end of a track could be strange (e.g. video intros/outros)*

## Env. Config.

### Imports

In [1]:
import os
import subprocess
from datetime import date

import librosa
import numpy as np
import pandas as pd
import json
import glob

import youtube_dl

import gzip
import pyarrow as pa
import pyarrow.parquet as pq

import warnings
warnings.filterwarnings("ignore") # Suppress all warnings

import time

# Get UDFs
from yt_dl_fns_v2 import *

In [2]:
# How to reinstall youtube_dl after a patch:
# pip install --upgrade --force-reinstall "git+https://github.com/ytdl-org/youtube-dl.git"

# Best documentation for yt_dl in python: https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/__init__.py

### Set params

In [3]:
# --- YouTube source ---------------------------------------------------------
# Playlist to download.
# Alternative playlist: 'https://www.youtube.com/playlist?list=PLYV5tZUB5NRfLT4SJmU5gQ5-z5fLDwl5K'
playlist_url = 'https://www.youtube.com/playlist?list=PL9V-Dt4XA4CN3jQK1LclKv0zJR6lhuHNe'

# Audio format handed to the download helper (alternative: 'flac').
audio_file_type = 'mp3'

# --- Local folders ----------------------------------------------------------
# Landing folders passed to the download helper (Step 1) and to the cleanup helper (Step 5).
yt_audio_out_fp = r"C:\Users\Owner\Music\temp\music"
yt_metadata_out_fp = r"C:\Users\Owner\Music\temp\metadata"

# Base folder for the YouTube audio data (not referenced by the cells shown in this notebook).
# Alternative location: r"C:\Users\Owner\Documents\Data Projects\GitHub\music\Librosa\Embeddings\yt_music_files"
yt_audio_data_base_fp = r"C:\Users\Owner\Documents\Data Projects\GitHub\music\2024_12_27__attempt_2\Youtube\yt_music_files"

# --- Reference (currently disabled) -----------------------------------------
# master_ref_df_fp = r"C:\Users\Owner\Documents\Data Projects\GitHub\music\Librosa\Embeddings\Dataframes\Master\df_master.csv"

## Step 1: Grab Files

In [4]:
# Download the playlist. Per the helper's printed summary, audio files land in
# yt_audio_out_fp and json metadata files in yt_metadata_out_fp.
yt_dl_pl_bulk(
    playlist_url=playlist_url,
    yt_audio_out_fp=yt_audio_out_fp,
    yt_metadata_out_fp=yt_metadata_out_fp,
    audio_file_type=audio_file_type,
)

150 audio files have been placed in C:\Users\Owner\Music\temp\music and 150 json files have been placed in C:\Users\Owner\Music\temp\metadata


## Step 2: Create df

In [32]:
# Empty frame holding only the two identifying columns; the per-file feature
# columns are filled in by the Step 3 loop.
df_yt_pl = pd.DataFrame(
    columns=[
        'title',
        'yt_playlist',
    ]
)

In [33]:
# Folder whose entries are listed and fed to extract_features in Step 3.
# NOTE(review): this is the same literal as yt_audio_out_fp in the params cell;
# if one of them is changed the other must be updated too.
directory = r"C:\Users\Owner\Music\temp\music"

## Step 3: Audio Data Extraction

In [34]:
# Extract music features with librosa and return a numpy array
def extract_features(file_path):
    """Load one audio file and reduce it to a single flat feature vector.

    Parameters
    ----------
    file_path : path of the audio file; passed unchanged to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        ``np.hstack`` of, in this order: the axis-1 mean of the MFCC matrix,
        the overall mean of the spectral centroid, the axis-1 mean of the
        chroma matrix, and the tempo value returned by
        ``librosa.beat.beat_track``.
    """
    signal, sample_rate = librosa.load(file_path)

    # One averaged value per row of each feature matrix (mean over axis 1);
    # the spectral centroid is collapsed to a single overall mean.
    mfcc_avg = librosa.feature.mfcc(y=signal, sr=sample_rate).mean(axis=1)
    centroid_avg = librosa.feature.spectral_centroid(y=signal, sr=sample_rate).mean()
    chroma_avg = librosa.feature.chroma_stft(y=signal, sr=sample_rate).mean(axis=1)

    # beat_track returns a pair; only its first element (the tempo) is kept.
    tempo = librosa.beat.beat_track(y=signal, sr=sample_rate)[0]

    return np.hstack((mfcc_avg, centroid_avg, chroma_avg, tempo))

In [35]:
# Column names for the feature vector, in the order extract_features stacks it:
# 20 MFCC means, the spectral centroid mean, 12 chroma means, then tempo.
feature_names = [
    *(f"mfcc_{idx}" for idx in range(20)),
    'spectral_centroid',
    *(f"chroma_{idx}" for idx in range(12)),
    'tempo',
]

In [36]:
# Build one record per file in the directory, then create the DataFrame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
playlist_label = "Country Music Playlist 2024"
feature_rows = []

# sorted() gives a reproducible row order; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)

    # Skip sub-folders and other non-file entries, which cannot be loaded as audio.
    if not os.path.isfile(file_path):
        continue

    # Extract features
    features = extract_features(file_path)

    # Fail loudly if the extractor output and the column list have drifted apart,
    # rather than silently mislabelling or dropping columns.
    if len(features) != len(feature_names):
        raise ValueError(
            f"{filename}: got {len(features)} features but "
            f"{len(feature_names)} feature names are defined"
        )

    # Identifying columns first, then one column per named feature.
    new_row = {'title': filename, 'yt_playlist': playlist_label}
    new_row.update(zip(feature_names, features))
    feature_rows.append(new_row)

# Rebuilding the frame from the collected records keeps this cell idempotent:
# re-running it does not duplicate rows. Passing `columns` keeps the schema
# stable even when the directory is empty.
df_yt_pl = pd.DataFrame(feature_rows, columns=['title', 'yt_playlist'] + feature_names)

## Step 4: Save df

In [41]:
# Save as pickle to preserve all data types.
# The relative output folder is created first, so a missing folder cannot make
# the save fail before Step 5 empties the download folders.
pickle_out_fp = "extracted data/2025_01_08_Country_Music_Playlist_2024.pkl"
os.makedirs(os.path.dirname(pickle_out_fp), exist_ok=True)
df_yt_pl.to_pickle(pickle_out_fp)

## Step 5: Remove Files

In [42]:
# Clear the temporary audio and metadata folders (the helper prints a message
# confirming that both are empty).
yt_temp_file_cleanup(
    yt_audio_out_fp,
    yt_metadata_out_fp,
)

C:\Users\Owner\Music\temp\music and C:\Users\Owner\Music\temp\metadata are now empty
