# Dataset Exploration

In [None]:
# Move the working directory up one level, presumably to the project root so
# that the `src` package imported below resolves — TODO confirm.
# NOTE: not idempotent — re-running this cell moves up another directory.
%cd ../

Import libraries

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import powerlaw

from src.utils.config import get_dataset_path
from src.utils.styling import apply_styling

In [None]:
# Presumably applies the project's plot styling; the returned mapping is
# indexed by 'palette' here and by 'lines' in the power-law plot further down.
colors = apply_styling()
# Colours are picked from this by position (palette[0], palette[1]) in the plots.
palette = colors['palette']

Read the parquet file

In [None]:
# Resolve the location registered under 'master_spotify' and load it.
dataset_path = get_dataset_path('master_spotify')
df = pd.read_parquet(dataset_path)

# Report the row count, then preview the first three rows.
n_rows = len(df)
print('Rows: ', n_rows)
df.head(3)

What artists and songs are most popular?

In [None]:
# Count how many rows each value of the 'artist' and 'track' columns occurs in.
artist_counter = Counter(df['artist'].tolist())
song_counter = Counter(df['track'].tolist())

# Show the ten most frequent entries of each counter.
top_artists = artist_counter.most_common(10)
top_songs = song_counter.most_common(10)
print(f'Top artists: {top_artists}')
print(f'Top songs: {top_songs}')

Let's visualize the distribution of tracks and artists in our dataset.

In [None]:
def plot_distribution(artist_counter, song_counter, n_bins: int = 50):
    """Draw side-by-side histograms of artist and song occurrence counts.

    Both panels use a logarithmic y-axis and the same number of bins.

    Args:
        artist_counter: Mapping whose values are per-artist occurrence counts.
        song_counter: Mapping whose values are per-song occurrence counts.
        n_bins: Number of histogram bins used for each panel.

    Note:
        Bar colours are read from the notebook-level ``palette``.
    """
    fig, axs = plt.subplots(1, 2, tight_layout=True, figsize=(10, 3.5))

    # One entry per panel: (counter, title, y-axis label, x-axis label).
    panels = (
        (
            artist_counter,
            'Artists',
            'No. of artists',
            'No. of times artist is in a playlist',
        ),
        (
            song_counter,
            'Songs',
            'No. of songs',
            'No. of times song is in a playlist',
        ),
    )

    for idx, (counter, title, y_label, x_label) in enumerate(panels):
        panel = axs[idx]
        panel.hist(counter.values(), bins=n_bins, color=palette[idx])
        panel.set_title(title)
        panel.set_yscale('log')
        panel.set_ylabel(y_label)
        panel.set_xlabel(x_label)

    plt.show()

In [None]:
# Override the function's default of 50 bins with 40 for this figure.
plot_distribution(artist_counter, song_counter, n_bins=40)

Since our data looks very skewed, we can use the `powerlaw` library to formally compare the distribution of how often artists appear in playlists to a power law. Specifically, we use the package to plot the [probability density function](https://pythonhosted.org/powerlaw/#powerlaw.Fit.plot_pdf) of the observed data alongside that of the theoretical power law fitted to the number of times artists appear in playlists.

In [None]:
# Per-artist occurrence counts (the Counter's values) as a plain list.
data = list(artist_counter.values())
# discrete=True because the counts are integers. No xmin is passed, so the
# library is presumably left to choose the lower cutoff itself — TODO confirm.
fit = powerlaw.Fit(data, discrete=True)

In [None]:
# Compare the observed artist counts with the fitted power law on log-log axes.
fig, ax = plt.subplots(1, 1, figsize=(7, 4))

# Fit.plot_pdf draws the PDF of the observed data (not of the fitted model),
# so it is labelled as empirical rather than as the fit.
fit.plot_pdf(
    color=palette[0],
    linewidth=1.5,
    linestyle='-',
    ax=ax,
    label='Empirical PDF',
)
# fit.power_law is the fitted distribution; its plot_pdf draws the model curve.
fit.power_law.plot_pdf(
    color=palette[0],
    linewidth=1.5,
    linestyle='--',
    ax=ax,
    label='Power law fit',
)
# Histogram of every artist count on 50 log-spaced bin edges from 1 (10**0) to
# the largest count. It is labelled so that it appears in the legend.
# NOTE(review): this histogram is normalised over all of `data`, while the two
# curves above presumably cover only values at or above the fitted xmin, so
# their vertical levels need not coincide — confirm before reading offsets.
ax.hist(
    data,
    bins=np.logspace(0, np.log10(max(data)), 50),
    density=True,
    alpha=0.75,
    label='Artist distribution',
    color=colors['lines'],
)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title('Artist Playlist Distribution vs Power Law')
ax.set_ylabel('Density')
ax.set_xlabel('No. of times artist is in a playlist')
ax.legend(frameon=False)
plt.show()

In this notebook, we explored a dataset with millions of Spotify songs and their playlist groupings. We saw which artists and songs are most popular and observed that the number of times artists appear in playlists follows a power law.