# Initial Data Exploration and Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns

# `week_id` is stored as an ISO-style date string (e.g. 1965-07-17)
date_format = '%Y-%m-%d'

# both files share the same date column, so parse them with identical options
csv_options = {'parse_dates': ['week_id'], 'date_format': date_format}

# load the processed Billboard chart data and the audio-features table
billboard_cleaned = pd.read_csv('billboard_dataset/processed/billboard_cleaned.csv', **csv_options)
audio_features_merged = pd.read_csv('billboard_dataset/processed/audio_features_merged.csv', **csv_options)

In [2]:
# dimensions and column names are printed; the two-row preview is the cell output
for summary in (billboard_cleaned.shape, billboard_cleaned.columns):
    print(summary)
billboard_cleaned.head(2)

(327895, 9)
Index(['week_id', 'week_position', 'song', 'performer', 'song_id', 'instance',
       'previous_week_position', 'peak_position', 'weeks_on_chart'],
      dtype='object')


Unnamed: 0,week_id,week_position,song,performer,song_id,instance,previous_week_position,peak_position,weeks_on_chart
0,1965-07-17,34,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,45.0,34,4
1,1965-07-24,22,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,34.0,22,5


In [3]:
# dimensions and column names are printed; the two-row preview is the cell output
for summary in (audio_features_merged.shape, audio_features_merged.columns):
    print(summary)
audio_features_merged.head(2)

(29383, 23)
Index(['song_id', 'performer', 'song', 'spotify_genre',
       'spotify_track_duration_ms', 'spotify_track_explicit',
       'spotify_track_album', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'spotify_track_popularity',
       'peak_position', 'week_id', 'weeks_on_chart'],
      dtype='object')


Unnamed: 0,song_id,performer,song,spotify_genre,spotify_track_duration_ms,spotify_track_explicit,spotify_track_album,danceability,energy,key,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity,peak_position,week_id,weeks_on_chart
0,Solid RockGoanna,Goanna,Solid Rock,"[""australian children's music"", ""children's mu...",275226.0,False,Spirit Of Place (Remastered & Expanded),0.552,0.741,9.0,...,0.00699,0.00133,0.0317,0.508,148.996,4.0,57.0,71,1983-07-23,7
1,Burning HeartVandenberg,Vandenberg,Burning Heart,"[""australian children's music""]",250693.0,False,Vandenberg,0.438,0.546,11.0,...,0.127,0.0,0.182,0.486,125.221,4.0,44.0,39,1983-04-09,14


variables to run analyses:
- tunebat
- key % mode
- duration
- genre
- seasonality


### Billboard Ranking Analysis

In [4]:
# distribution of the peak chart position across all tracks
fig, ax = plt.subplots(figsize=(12, 6))
audio_features_merged['peak_position'].hist(ax=ax, bins=100, edgecolor='white')
ax.set(
    title='Histogram of Peak Billboard Ranks',
    xlabel='Rank',
    ylabel='Count',
    xlim=(0.8, 100),  # start slightly left of rank 1 so the first bar is fully visible
)
plt.show()

KeyboardInterrupt: 

Note: Rank 1 is the best possible position, so every track that reached the top of the chart falls into that single bin, resulting in the high count.

In [None]:
# distribution of how many weeks each track spent on the chart
weeks_on_chart = audio_features_merged['weeks_on_chart']

fig, ax = plt.subplots(figsize=(12, 6))
weeks_on_chart.hist(ax=ax, bins=100, edgecolor='white')
ax.set(
    title='Histogram of Billboard Duration',
    xlabel='Weeks',
    ylabel='Count',
    xlim=(0.8, weeks_on_chart.max()),  # stop the axis at the longest observed run
)
plt.show()

Note the outlier at week 20: the count of tracks with exactly 20 weeks on the chart stands out from the neighbouring values.

In [None]:
# five most common chart durations; value_counts() orders by frequency, most frequent first
audio_features_merged['weeks_on_chart'].value_counts().head(5)

In [None]:
# best peak position first (ascending rank); ties broken by the longest chart run
top_10_tracks = (
    audio_features_merged
    .sort_values(by=['peak_position', 'weeks_on_chart'], ascending=[True, False])
    .iloc[:10]
)
top_10_tracks

In [None]:
# Bar chart of total weeks on the chart for the ten rows in `top_10_tracks`
# (sorted by best peak position, then by longest chart run).
fig, ax = plt.subplots(figsize=(12, 6))

# two-line tick label: song title above performer
title_artist = top_10_tracks['song'] + '\n' + top_10_tracks['performer']

# place bars at numeric positions so rows with identical labels are not drawn on top of each other
bar_positions = np.arange(len(top_10_tracks))
ax.bar(bar_positions, top_10_tracks['weeks_on_chart'])
ax.set_xticks(bar_positions)
ax.set_xticklabels(title_artist, rotation=45, ha='right')

# `weeks_on_chart` is the total time on the chart, not the time spent at rank 1,
# so the title describes chart duration (assumes at least ten tracks peaked at rank 1).
ax.set_title('The 10 Rank 1 Songs With the Most Weeks on the Chart')
ax.set_xlabel('Songs')
ax.set_ylabel('Weeks')
plt.show()

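Note: `weeks_on_chart` is the total number of weeks a track spent on the chart, not the number of weeks it held rank 1, so the title of the chart above should describe total chart duration.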

In [None]:
# longest chart runs first, regardless of peak position
top_10_tracks_by_duration = (
    audio_features_merged
    .sort_values(by='weeks_on_chart', ascending=False)
    .iloc[:10]
)
top_10_tracks_by_duration

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# two-line tick label: song title above performer
title_artist = top_10_tracks_by_duration['song'] + '\n' + top_10_tracks_by_duration['performer']
ax.bar(title_artist, top_10_tracks_by_duration['weeks_on_chart'])

ax.set(title='The 10 Songs On the Hot100 the Longest', xlabel='Songs', ylabel='Weeks')
# slant the long labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

# Spotify Audio Features Analysis

In [None]:
# column data types of the merged audio-features table
audio_features_merged.dtypes

In [None]:
# number of missing values in each column
audio_features_merged.isna().sum()

5159 tracks don't have the audio features. Will drop for this section.

In [None]:
# Spotify audio feature columns analysed in this section (values between 0-1)
audio_features_list = [
    'danceability',
    'energy',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
]

In [None]:
# flag rows where every audio feature is missing; the total should match the 5159 seen above
missing_every_feature = audio_features_merged[audio_features_list].isna().all(axis=1)
missing_every_feature.sum()

In [None]:
# discard only the rows that have no audio feature at all; partially filled rows are kept
audio_features_subset = audio_features_merged.dropna(how='all', subset=audio_features_list)
print(audio_features_subset.shape)

# missing values that remain, per column
remaining_missing = audio_features_subset.isna().sum()
remaining_missing

### Bell Curves

In [None]:
# narrow the frame down to the audio feature columns only
spotify_audio_features = audio_features_subset.loc[:, audio_features_list]
for summary in (spotify_audio_features.shape, spotify_audio_features.columns):
    print(summary)

# summary statistics for every feature
spotify_audio_features.describe()

Description of each feature copied from the Spotify API documentation:

| Feature          | Description                                                                                                                                               |
|:-----------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------|
| Danceability     | Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable, and 1.0 is most danceable.|
| Energy           | Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.|
| Speechiness      | Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g., talk show, audiobook, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.|
| Acousticness     | Acousticness is a confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence that the track is acoustic.|
| Instrumentalness | Instrumentalness predicts whether a track contains no vocals. "Ooh" and "aah" sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly "vocal". The closer the instrumentalness value is to 1.0, the greater the likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.|
| Liveness         | Liveness detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides a strong likelihood that the track is live.|
| Valence          | Valence is a measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g., happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g., sad, depressed, angry).|


In [None]:
from scipy.stats import norm

# Normal ("bell") curve per audio feature, parameterised by the sample mean and
# standard deviation of that feature across all tracks with audio features.
fig, ax = plt.subplots(figsize=(12, 8))

for feature, values in spotify_audio_features.items():
    # evaluate the curve over the observed range of the feature; both ends must be scalars
    # (passing the whole column as the start would make linspace build one grid per row)
    x = np.linspace(values.min(), values.max(), 100)
    ax.plot(x, norm.pdf(x, values.mean(), values.std()), label=feature)

ax.set_title('Bell Curves of Audio Features')
ax.set_xlabel('Value')
ax.set_ylabel('Density')

ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# one kernel density estimate per feature; `clip` keeps the evaluation inside [0, 1]
for feature, values in spotify_audio_features.items():
    sns.kdeplot(values, label=feature, linewidth=2, clip=(0,1), ax=ax)

ax.set(title='Bell Curves of Audio Features', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

In [None]:
# keep only the tracks that peaked at rank 1, restricted to the audio feature columns
reached_rank_1 = audio_features_subset['peak_position'] == 1
top_tracks = audio_features_subset.loc[reached_rank_1, audio_features_list]
print(top_tracks.shape)
top_tracks.head(5)

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# normal curve per feature, using the sample mean and standard deviation of the rank 1 tracks
for feature, values in top_tracks.items():
    x = np.linspace(values.min(), values.max(), 100)
    density = norm.pdf(x, values.mean(), values.std())
    ax.plot(x, density, label=feature)

ax.set(title='Audio Features of Rank 1 Tracks', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# one kernel density estimate per feature for the rank 1 tracks
for feature, values in top_tracks.items():
    sns.kdeplot(values, label=feature, linewidth=2, ax=ax)

ax.set(title='Audio Features of Rank 1 Tracks', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

In [None]:
# Tracks that never climbed past rank 80 (peak_position > 80), audio feature columns only.
# Filter `audio_features_subset` rather than `audio_features_merged` so that rows without
# any audio features are excluded, consistent with how `top_tracks` is built; otherwise the
# reported shape counts tracks that contribute nothing to the plots.
peaked_below_80 = audio_features_subset['peak_position'] > 80
bottom_tracks_subset = audio_features_subset.loc[peaked_below_80, audio_features_list]
bottom_tracks_subset.shape

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# normal curve per feature, using the sample mean and standard deviation of the low-rank tracks
for feature, values in bottom_tracks_subset.items():
    x = np.linspace(values.min(), values.max(), 100)
    density = norm.pdf(x, values.mean(), values.std())
    ax.plot(x, density, label=feature)

ax.set(title='Audio Features of Low Rank Tracks', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# one kernel density estimate per feature for the low-rank tracks
for feature, values in bottom_tracks_subset.items():
    sns.kdeplot(values, label=feature, linewidth=2, ax=ax)

ax.set(title='Audio Features of Low Rank Tracks', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

What about using popularity instead of rank?

In [None]:
# Tracks that never climbed past rank 80 (peak_position > 80), audio feature columns only.
# Uses `audio_features_subset` so rows without any audio features are excluded, consistent
# with how `top_tracks` is built.
# NOTE(review): this repeats the rank-based filter; the preceding markdown asks about
# popularity, so a filter on `spotify_track_popularity` was presumably intended — TODO confirm.
peaked_below_80 = audio_features_subset['peak_position'] > 80
bottom_tracks_subset = audio_features_subset.loc[peaked_below_80, audio_features_list]
bottom_tracks_subset.shape

### Histograms

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))

# 1-D view of the 2x4 grid so each panel can be paired with a column
axes = axes.flatten()

# the audio features plus the popularity score, one panel each
columns_to_plot = audio_features_list + ['spotify_track_popularity']
for ax, feature in zip(axes, columns_to_plot):
    ax.hist(audio_features_subset[feature], bins=100, color='#6495ED')
    ax.set_title(feature)
    ax.grid(True)

plt.tight_layout()
# y > 1 lifts the figure title above the panel titles
plt.suptitle('Histograms of Audio Features and Popularity', fontsize=16, y=1.05)
plt.show()

In [None]:
# Tracks whose `week_id` falls before the year 2000, audio feature columns only.
# Filter `audio_features_subset` rather than `audio_features_merged` so rows without any
# audio features are excluded, consistent with how `post_2000_tracks` is built; otherwise
# the reported shape is not comparable between the two eras.
before_2000 = audio_features_subset['week_id'].dt.year < 2000
pre_2000_tracks = audio_features_subset.loc[before_2000, audio_features_list]
pre_2000_tracks.shape

In [None]:
import warnings
# suppress every FutureWarning for the rest of the kernel session
# NOTE(review): presumably added to quiet library deprecation notices from the plots below — confirm which call emits them
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# one kernel density estimate per feature for the pre-2000 tracks
for feature, values in pre_2000_tracks.items():
    sns.kdeplot(values, label=feature, linewidth=2, ax=ax)

ax.set(title='Audio Features of Pre-2000 Tracks', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

In [None]:
# Grid of histograms for pre-2000 tracks: one panel per audio feature plus a final
# panel for the Spotify popularity score.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))

# flatten for indexing
axes = axes.flatten()

# popularity is not a column of `pre_2000_tracks`, so select it from the full table
# with the same year filter
before_2000 = audio_features_merged['week_id'].dt.year < 2000
pre_2000_popularity = audio_features_merged.loc[before_2000, 'spotify_track_popularity']

# (panel title, values) pairs: every audio feature, then popularity
panels = [(feature, pre_2000_tracks[feature]) for feature in pre_2000_tracks.columns]
panels.append(('spotify_track_popularity', pre_2000_popularity))

# every panel, including popularity, gets the same title and grid treatment
for ax, (panel_title, values) in zip(axes, panels):
    ax.hist(values, bins=100, color='#6495ED')
    ax.set_title(panel_title)
    ax.grid(True)

plt.tight_layout()
# y > 1 lifts the figure title above the panel titles
plt.suptitle('Histograms of Audio Features and Popularity of Pre-2000 Tracks', fontsize=16, y=1.05)
plt.show()

In [None]:
# tracks whose `week_id` falls in 2000 or later, restricted to the audio feature columns
from_2000_onward = audio_features_subset['week_id'].dt.year >= 2000
post_2000_tracks = audio_features_subset.loc[from_2000_onward, audio_features_list]
post_2000_tracks.shape

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# one kernel density estimate per feature for the post-2000 tracks
for feature, values in post_2000_tracks.items():
    sns.kdeplot(values, label=feature, linewidth=2, ax=ax)

ax.set(title='Audio Features of Post-2000 Tracks', xlabel='Value', ylabel='Density')
ax.legend()
plt.show()

In [None]:
# Grid of histograms for tracks from 2000 onward: one panel per audio feature plus a
# final panel for the Spotify popularity score.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))

# flatten for indexing
axes = axes.flatten()

# popularity is not a column of `post_2000_tracks`, so select it from the full table
# with the same year filter
from_2000_onward = audio_features_merged['week_id'].dt.year >= 2000
post_2000_popularity = audio_features_merged.loc[from_2000_onward, 'spotify_track_popularity']

# (panel title, values) pairs: every audio feature, then popularity
panels = [(feature, post_2000_tracks[feature]) for feature in post_2000_tracks.columns]
panels.append(('spotify_track_popularity', post_2000_popularity))

# every panel, including popularity, gets the same title and grid treatment
for ax, (panel_title, values) in zip(axes, panels):
    ax.hist(values, bins=100, color='#6495ED')
    ax.set_title(panel_title)
    ax.grid(True)

plt.tight_layout()
# y > 1 lifts the figure title above the panel titles
plt.suptitle('Histograms of Audio Features and Popularity of Post-2000 Tracks', fontsize=16, y=1.05)
plt.show()

### Seaborn Histplot Heatmaps

In [None]:
# add year column (I should've done this earlier...)
# NOTE: this adds the column to `audio_features_merged` in place
audio_features_merged['year'] = audio_features_merged['week_id'].dt.year
# rebuild the subset from the updated frame so that it also carries the new `year` column.
# NOTE: re-running this cell after the manual popularity fixes further down discards them,
# because the subset is recreated from `audio_features_merged`.
audio_features_subset = audio_features_merged.dropna(subset=audio_features_list, how='all')

When plotting the heat map, a non-finite (`inf`) range error is raised.<br>
Ex. ValueError: autodetected range of [-inf, -0.005243055412371884] is not finite

In [None]:
# diagnose the ValueError: list any NaN / inf entries in each audio feature column
for feature in audio_features_list:
    column = audio_features_subset[feature]
    non_finite_values = column[~np.isfinite(column)]

    # an empty Series here means the column is entirely finite
    print("Non-finite values in {}: {}".format(feature, non_finite_values))

In [None]:
# same check for the popularity score: show the entries that are NaN or infinite
popularity = audio_features_subset['spotify_track_popularity']
non_finite_values = popularity[~np.isfinite(popularity)]
print(f"Non-finite values in spotify_track_popularity: {non_finite_values}")

In [None]:
# full rows of the tracks that have no popularity score
missing_popularity = audio_features_subset['spotify_track_popularity'].isna()
audio_features_subset[missing_popularity]

In [None]:
# Quick data clean: popularity scores looked up on tunebat.com for the rows that were
# missing one, keyed by the row's index label in `audio_features_subset`.
popularity_fixes = {20778: 1, 26933: 41, 27108: 17, 27364: 28}

# Work on an independent copy: the subset was derived from `audio_features_merged`, and
# assigning into a derived frame can trigger pandas' SettingWithCopyWarning.
audio_features_subset = audio_features_subset.copy()

# `.loc[label, column] = value` silently appends a new row when the label does not exist,
# so fail loudly if the hard-coded labels no longer match the data.
unknown_labels = [label for label in popularity_fixes if label not in audio_features_subset.index]
if unknown_labels:
    raise KeyError(f"Row labels not found in audio_features_subset: {unknown_labels}")

for label, popularity_score in popularity_fixes.items():
    audio_features_subset.loc[label, 'spotify_track_popularity'] = popularity_score

In [None]:
# re-run the check after the manual fixes; an empty Series means nothing is missing any more
popularity = audio_features_subset['spotify_track_popularity']
non_finite_values = popularity[~np.isfinite(popularity)]
print(f"Non-finite values in spotify_track_popularity: {non_finite_values}")

In [None]:
# per-column sanity report: value range plus counts of NaN and infinite entries
for feature in audio_features_list + ['spotify_track_popularity']:
    feature_data = audio_features_subset[feature]
    print(
        f"Column: {feature}",
        f"Min value: {feature_data.min()}",
        f"Max value: {feature_data.max()}",
        f"NaN values: {feature_data.isna().sum()}",
        f"Inf values: {np.isinf(feature_data).sum()}",
        "---",
        sep="\n",
    )

In [None]:
# second entry of the feature list
selected_feature = audio_features_list[1]

fig, ax = plt.subplots(figsize=(8, 5))

# bivariate histogram: discrete year bins on x, log-scaled feature values on y
sns.histplot(
    x="year",
    y=selected_feature,
    data=audio_features_subset,
    bins=50,
    discrete=(True, False),
    log_scale=(False, True),
    ax=ax,
)

ax.set_title(f'Histogram of {selected_feature.capitalize()} Over Time')
plt.show()

In [None]:
# fourth entry of the feature list
selected_feature = audio_features_list[3]

fig, ax = plt.subplots(figsize=(8, 5))

# bivariate histogram: discrete year bins on x, log-scaled feature values on y
sns.histplot(
    x="year",
    y=selected_feature,
    data=audio_features_subset,
    bins=50,
    discrete=(True, False),
    log_scale=(False, True),
    ax=ax,
)

ax.set_title(f'Histogram of {selected_feature.capitalize()} Over Time')
plt.show()

In [None]:
# sixth entry of the feature list
selected_feature = audio_features_list[5]

fig, ax = plt.subplots(figsize=(8, 5))

# bivariate histogram: discrete year bins on x, log-scaled feature values on y
sns.histplot(
    x="year",
    y=selected_feature,
    data=audio_features_subset,
    bins=50,
    discrete=(True, False),
    log_scale=(False, True),
    ax=ax,
)

ax.set_title(f'Histogram of {selected_feature.capitalize()} Over Time')
plt.show()

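Only 3 of the 7 audio features (energy, acousticness, liveness) plot successfully with these settings; popularity also fails. A likely cause is `log_scale=(False, True)`: a column that contains 0 maps to log10(0) = -inf on the log-scaled y axis, which matches the `-inf` in the error message above.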

In [None]:
# columns for which the log-scaled bivariate histogram above raised the ValueError
error_features = [
    'danceability',
    'speechiness',
    'instrumentalness',
    'valence',
    'spotify_track_popularity',
]

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))

# 1-D view of the 2x4 grid so each panel can be paired with a column
axes = axes.flatten()

# year-vs-value bivariate histogram for every audio feature plus the popularity score
columns_to_plot = audio_features_list + ['spotify_track_popularity']
for ax, feature in zip(axes, columns_to_plot):
    sns.histplot(data=audio_features_subset, x="year", y=feature, bins=30, ax=ax)
    ax.set_title(feature)

plt.tight_layout()
# y > 1 lifts the figure title above the panel titles
plt.suptitle('Bivariate Histograms of Audio Features Over Time', fontsize=16, y=1.05)
plt.show()

The error was fixed by removing the `discrete` and `log_scale` parameters, which had been taken from the example in the seaborn documentation.

In [None]:
'''Bell Curve of Audio Features Over the Decades'''

import matplotlib.cm as cm

# Columns to plot: the audio features plus the popularity score.
# NOTE: this rebinds the notebook-wide `audio_features_list`; earlier cells that are
# re-run after this one will see the eight-item list.
audio_features_list = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'spotify_track_popularity']

# One panel per column, laid out as 4 rows x 2 columns
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 2 * len(audio_features_list)))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Blue theme: darker shades for later decades
colormap = cm.Blues

# Start year of every decade covered by the data (e.g. years 1963-2004 -> 1960, ..., 2000).
# Computed once because it does not depend on the feature being plotted.
years = audio_features_subset['year']
decades = range(int(years.min()) // 10 * 10, int(years.max()) // 10 * 10 + 10, 10)

# One colour per decade, normalised by the actual number of decades. The scale starts at
# 0.3 rather than 0 because the lightest end of the colormap is nearly white and would
# make the first decade invisible on a white background.
decade_colors = [colormap(0.3 + 0.7 * j / max(len(decades) - 1, 1)) for j in range(len(decades))]

# Iterate through each audio feature
for i, feature in enumerate(audio_features_list):
    # Plot the bell curve for each decade
    for j, decade in enumerate(decades):
        in_decade = (years >= decade) & (years < decade + 10)

        # Missing values would break the normal fit, so drop them first
        values = audio_features_subset.loc[in_decade, feature].dropna()

        # A normal curve needs at least two observations to have a spread
        if len(values) < 2:
            continue

        # Fit a normal distribution to the data
        mu, std = norm.fit(values)

        # Evaluate the fitted PDF over the observed range of the feature
        x = np.linspace(values.min(), values.max(), 100)
        pdf = norm.pdf(x, mu, std)

        axes[i].plot(x, pdf, label=f'{decade}s', color=decade_colors[j])

    # Set plot labels and title
    axes[i].set_title(feature.capitalize())
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('Probability Density')
    axes[i].legend()

# Adjust layout
plt.suptitle('Bell Curve of Audio Features Over the Decades', fontsize=16, y=1)
plt.tight_layout()
plt.show()


In [None]:
'''KDE Plot of Audio Features Over the Decades'''

# Columns to plot: the audio features plus the popularity score.
# NOTE: this rebinds the notebook-wide `audio_features_list`; earlier cells that are
# re-run after this one will see the eight-item list.
audio_features_list = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'spotify_track_popularity']

# One panel per column, laid out as 4 rows x 2 columns
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 2 * len(audio_features_list)))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Blue theme: darker shades for later decades
colormap = cm.Blues

# Start year of every decade covered by the data (e.g. years 1963-2004 -> 1960, ..., 2000).
# Computed once because it does not depend on the feature being plotted.
years = audio_features_subset['year']
decades = range(int(years.min()) // 10 * 10, int(years.max()) // 10 * 10 + 10, 10)

# One colour per decade, normalised by the actual number of decades. The scale starts at
# 0.3 rather than 0 because the lightest end of the colormap is nearly white and would
# make the first decade invisible on a white background.
decade_colors = [colormap(0.3 + 0.7 * j / max(len(decades) - 1, 1)) for j in range(len(decades))]

# Iterate through each audio feature
for i, feature in enumerate(audio_features_list):
    # Plot the KDE curve for each decade
    for j, decade in enumerate(decades):
        in_decade = (years >= decade) & (years < decade + 10)
        values = audio_features_subset.loc[in_decade, feature].dropna()

        # A density estimate needs at least two observations
        if len(values) < 2:
            continue

        sns.kdeplot(data=values, ax=axes[i], label=f'{decade}s', color=decade_colors[j])

    # Set plot labels and title
    axes[i].set_title(feature.capitalize())
    axes[i].set_xlabel('Value')
    axes[i].set_ylabel('Probability Density')
    axes[i].legend()

# Adjust layout
plt.suptitle('KDE Plot of Audio Features Over the Decades', fontsize=16, y=1)
plt.tight_layout()
plt.show()


In [None]:
'''Bell Curve of Audio Features By Popularity'''

# Columns to plot: the audio features plus the popularity score.
# NOTE: this rebinds the notebook-wide `audio_features_list`; earlier cells that are
# re-run after this one will see the eight-item list.
audio_features_list = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'spotify_track_popularity']

# One panel per column, laid out as 4 rows x 2 columns
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 2 * len(audio_features_list)))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Blue theme: darker shades for more popular tracks
colormap = cm.Blues

# Popularity bins of width 10: 0-10, 10-20, ..., 90-100
popularity_ranges = [(i, i + 10) for i in range(0, 100, 10)]

# One colour per bin. The scale starts at 0.3 rather than 0 because the lightest end of
# the colormap is nearly white and would make the first bin invisible.
range_colors = [colormap(0.3 + 0.7 * j / max(len(popularity_ranges) - 1, 1)) for j in range(len(popularity_ranges))]

popularity = audio_features_subset['spotify_track_popularity']

# Iterate through each audio feature
for i, feature in enumerate(audio_features_list):
    # Plot the bell curve for each popularity range
    for j, (lower, upper) in enumerate(popularity_ranges):
        # Bins are half-open [lower, upper), except the last one, which also includes its
        # upper bound so that a popularity of exactly 100 is not dropped
        is_last_range = j == len(popularity_ranges) - 1
        below_upper = (popularity <= upper) if is_last_range else (popularity < upper)
        in_range = (popularity >= lower) & below_upper

        # Exclude NaN values from the data
        values = audio_features_subset.loc[in_range, feature].dropna()

        # A normal curve needs at least two observations to have a spread
        if len(values) < 2:
            continue

        # Fit a normal distribution to the data
        mu, std = norm.fit(values)

        # Evaluate the fitted PDF over the observed range of the feature
        x = np.linspace(values.min(), values.max(), 100)
        pdf = norm.pdf(x, mu, std)

        axes[i].plot(x, pdf, label=f'{lower}-{upper}', color=range_colors[j])

    # Set plot labels and title; the x axis is the feature value, the bins are in the legend
    axes[i].set_title(feature.capitalize())
    axes[i].set_xlabel('Value')
    axes[i].set_ylabel('Probability Density')
    axes[i].legend(title='Popularity')

# Adjust layout
plt.suptitle('Bell Curve of Audio Features By Popularity', fontsize=16, y=1)
plt.tight_layout()
plt.show()

In [None]:
'''KDE Plot of Audio Features By Popularity'''

# Columns to plot: the audio features plus the popularity score.
# NOTE: this rebinds the notebook-wide `audio_features_list`; earlier cells that are
# re-run after this one will see the eight-item list.
audio_features_list = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'spotify_track_popularity']

# One panel per column, laid out as 4 rows x 2 columns
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 2 * len(audio_features_list)))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Blue theme: darker shades for more popular tracks
colormap = cm.Blues

# Popularity bins of width 10: 0-10, 10-20, ..., 90-100
popularity_ranges = [(i, i + 10) for i in range(0, 100, 10)]

# One colour per bin. The scale starts at 0.3 rather than 0 because the lightest end of
# the colormap is nearly white and would make the first bin invisible.
range_colors = [colormap(0.3 + 0.7 * j / max(len(popularity_ranges) - 1, 1)) for j in range(len(popularity_ranges))]

popularity = audio_features_subset['spotify_track_popularity']

# Iterate through each audio feature
for i, feature in enumerate(audio_features_list):
    # Plot the KDE curve for each popularity range
    for j, (lower, upper) in enumerate(popularity_ranges):
        # Bins are half-open [lower, upper), except the last one, which also includes its
        # upper bound so that a popularity of exactly 100 is not dropped
        is_last_range = j == len(popularity_ranges) - 1
        below_upper = (popularity <= upper) if is_last_range else (popularity < upper)
        in_range = (popularity >= lower) & below_upper

        # Exclude NaN values from the data
        values = audio_features_subset.loc[in_range, feature].dropna()

        # A density estimate needs at least two observations
        if len(values) < 2:
            continue

        sns.kdeplot(data=values, ax=axes[i], label=f'{lower}-{upper}', color=range_colors[j])

    # Set plot labels and title; the x axis is the feature value, the bins are in the legend
    axes[i].set_title(feature.capitalize())
    axes[i].set_xlabel('Value')
    axes[i].set_ylabel('Density')
    axes[i].legend(title='Popularity')

# Adjust layout
plt.suptitle('KDE Plot of Audio Features By Popularity', fontsize=16, y=1)
plt.tight_layout()
plt.show()


In [None]:
# billboard
# genre - top 10

In [None]:
'''KDE Plot of Audio Features By Billboard Ranking'''

# Columns to plot: the audio features plus the popularity score.
# NOTE: this rebinds the notebook-wide `audio_features_list`; earlier cells that are
# re-run after this one will see the eight-item list.
audio_features_list = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'spotify_track_popularity']

# One panel per column, laid out as 4 rows x 2 columns
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 2 * len(audio_features_list)))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Blue theme: lighter shades for better (lower-numbered) ranks
colormap = cm.Blues

# Peak-rank bins of ten ranks each, both ends inclusive: 1-10, 11-20, ..., 91-100.
# Ranks start at 1, so zero-based half-open bins would put only nine ranks in the first
# bin and drop rank 100 entirely.
billboard_ranges = [(i, i + 9) for i in range(1, 100, 10)]

# One colour per bin, normalised by this cell's own bins. The scale starts at 0.3 rather
# than 0 because the lightest end of the colormap is nearly white and would make the
# first bin invisible.
range_colors = [colormap(0.3 + 0.7 * j / max(len(billboard_ranges) - 1, 1)) for j in range(len(billboard_ranges))]

peak_position = audio_features_subset['peak_position']

# Iterate through each audio feature
for i, feature in enumerate(audio_features_list):
    # Plot the KDE curve for each billboard ranking range
    for j, (lower, upper) in enumerate(billboard_ranges):
        in_range = (peak_position >= lower) & (peak_position <= upper)

        # Exclude NaN values from the data
        values = audio_features_subset.loc[in_range, feature].dropna()

        # A density estimate needs at least two observations
        if len(values) < 2:
            continue

        sns.kdeplot(data=values, ax=axes[i], label=f'{lower}-{upper}', color=range_colors[j])

    # Set plot labels and title; the x axis is the feature value, the bins are in the legend
    axes[i].set_title(feature.capitalize())
    axes[i].set_xlabel('Value')
    axes[i].set_ylabel('Density')
    axes[i].legend(title='Peak rank')

# Adjust layout
plt.suptitle('KDE Plot of Audio Features By Billboard Ranking', fontsize=16, y=1)
plt.tight_layout()
plt.show()
