In [15]:
import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation notices raised by the plotting libraries used below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np


def _coerce_numeric(column):
    """Return `column` converted to numbers; unparseable entries become NaN.

    Non-numeric columns have ',' removed from every value first, so a count
    written with a thousands separator (e.g. "1,021") parses as 1021 instead
    of being coerced to NaN. Columns that are already numeric are passed
    straight to `pd.to_numeric`.
    """
    if not pd.api.types.is_numeric_dtype(column):
        column = column.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(column, errors='coerce')


spotify_data = pd.read_csv('data/spotify_2023.csv', delimiter=',')

# These features were encoded as the incorrect data type in the original data set,
# so we are transforming them into numeric here before creating visuals.
# Presumably some large counts are stored as text with thousands separators
# (TODO confirm against the CSV); stripping them is a no-op when none exist.
for column_name in ('streams', 'in_shazam_charts', 'in_deezer_playlists'):
    spotify_data[column_name] = _coerce_numeric(spotify_data[column_name])

spotify_data.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703.0,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286.0,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974.0,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817.0,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322.0,84,...,144,A,Minor,65,23,80,14,63,11,6


Task: Correlate - Investigate if there is a correlation between the frequency of songs released (across months) and their overall streaming performance. Are certain release times associated with higher streaming numbers?

In [88]:
import altair as alt
from vega_datasets import data

# Only tracks released in 2022 feed both charts.
filtered_data = spotify_data.loc[spotify_data['released_year'].eq(2022)]

# Interval selection dragged on the scatter plot; it drives the heatmap filter.
brush = alt.selection_interval()

# Heatmap of total streams over binned release month (x) and release day (y).
rect = (
    alt.Chart(filtered_data)
    .mark_rect()
    .encode(
        x=alt.X('released_month').bin(),
        y=alt.Y('released_day').bin(),
        color=alt.Color('sum(streams)')
        .scale(scheme='greenblue')
        .title('Total streams'),
    )
)
rect

# Chart appearances vs. playlist appearances; points inside the brush are
# highlighted, everything else is greyed out.
scatter_plot = (
    alt.Chart(filtered_data)
    .mark_circle()
    .encode(
        x=alt.X('in_spotify_charts').scale(domain=[0, 140]),
        y=alt.Y('in_spotify_playlists').scale(domain=[0, 25000]),
        color=alt.condition(brush, alt.value('steelblue'), alt.value('lightgray')),
        tooltip=['track_name', 'artist(s)_name'],
    )
    .add_params(brush)
)

scatter_plot

# Side by side: the heatmap only aggregates the tracks currently brushed.
scatter_plot | rect.transform_filter(brush)


In [100]:
import altair as alt
from vega_datasets import data

# Only tracks released in 2022 feed both charts.
filtered_data = spotify_data[(spotify_data['released_year'] == 2022)]

# Interval selection dragged on the scatter plot; it filters the heatmap.
brush = alt.selection_interval()
# Click selection of a heatmap cell (projected over its x and y bins); it
# filters the scatter plot. `selection_point` replaces the deprecated
# `selection_single`; toggle=False keeps at most one cell selected at a time.
select = alt.selection_point(encodings=['x', 'y'], toggle=False)


# Heatmap of total streams over binned release month (x) and release day (y).
# `add_params` replaces the deprecated `add_selection` and matches the call
# already used for `brush` on the scatter plot.
rect = alt.Chart(filtered_data).mark_rect().encode(
    alt.X('released_month').bin(),
    alt.Y('released_day').bin(),
    alt.Color('sum(streams)').scale(scheme='greenblue').title('Total streams')
).add_params(
    select
)


# Chart appearances vs. playlist appearances, limited to the tracks that fall
# in the selected heatmap cell; brushed points are highlighted.
scatter_plot = alt.Chart(filtered_data).mark_circle().encode(
    alt.X('in_spotify_charts', scale=alt.Scale(domain=[0, 140])),
    alt.Y('in_spotify_playlists', scale=alt.Scale(domain=[0, 25000])),
    color=alt.condition(brush, alt.value('steelblue'), alt.value('lightgray')),
    tooltip=['track_name', 'artist(s)_name']
).transform_filter(
    select
).add_params(brush)

# Side by side: each chart filters the other (cell click -> scatter,
# brush -> heatmap).
scatter_plot | rect.transform_filter(brush)
