In [1]:
# Imports
import pandas as pd
import numpy as np
import altair as alt
import sympy as sp
import math

In [2]:
# Load the raw Spotify 2023 track table.
spotify_data = pd.read_csv('data/spotify_2023.csv', delimiter=',')

# These features were stored with the wrong data type in the original data set,
# so they are converted to numeric before any visuals are built. Values that
# cannot be parsed are coerced to NaN.
for numeric_column in ('streams', 'in_shazam_charts', 'in_deezer_playlists'):
    spotify_data[numeric_column] = pd.to_numeric(spotify_data[numeric_column], errors='coerce')

In [3]:
# Work on a copy so the raw frame is not mutated and this cell can be re-run
# without side effects on `spotify_data`.
wrangled_data = spotify_data.copy()

# Bin `in_spotify_charts` into labelled brackets.
# Intervals are closed on the right (and the lowest edge is included) so that
# the boundaries agree with the labels: 0..20 -> '0-20', 21..40 -> '21-40', ...,
# 81..100 -> '81-100', and only values above 100 -> '100+'.
bins = [0, 20, 40, 60, 80, 100, float('inf')]
labels = ['0-20', '21-40', '41-60', '61-80', '81-100', '100+']

# Create a new column 'rank_bin' based on the bins
wrangled_data['rank_bin'] = pd.cut(
    wrangled_data['in_spotify_charts'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Calendar month number -> season name.
month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}

# Create a new column 'season' based on the 'released_month' column
wrangled_data['season'] = wrangled_data['released_month'].map(month_to_season)

In [4]:
# Convert each percentage feature into an arc angle: the fraction (rounded to
# two decimals) multiplied by pi.
for feature in ('danceability', 'acousticness', 'energy', 'valence'):
    wrangled_data[f'arc_{feature}'] = wrangled_data[f'{feature}_%'].div(100).round(2) * math.pi


## Top of Dashboard

In [5]:
# Histogram of stream counts: bar height is the number of songs in each stream bin.
histogram_streams = (
    alt.Chart(wrangled_data)
    .mark_bar()
    .encode(
        x=alt.X('streams', bin=alt.Bin(maxbins=40), title='Number of streams'),
        y='count()',
    )
    .properties(
        title='Distribution of songs per stream bracket',
        width=600,
        height=250,
    )
)

histogram_streams

In [6]:
# Line chart of the number of songs released in each month.
time_series = (
    alt.Chart(wrangled_data)
    .mark_line()
    .encode(
        x=alt.X('released_month', title='Month of release'),
        y='count()',
    )
    .properties(
        title='Time series of release per month',
        width=600,
        height=250,
    )
)

time_series

## Scatter Plot


In [38]:
# Point selection keyed on `season`. `selection_point` is the Altair 5
# replacement for the deprecated `selection_multi`, matching the `alt.param` /
# `add_params` API already used in this cell.
selection = alt.selection_point(fields=['season'])

# Slider-bound parameter that drives the size of the scatter points.
size_slider = alt.binding_range(min=10, max=100, step=5, name='Point Size:')
op_var_size = alt.param(value=40, bind=size_slider)

# Legend for the chart-appearance bins, positioned manually inside the plot area.
rank_legend = alt.Legend(
    title = 'Appearances in Charts',
    orient='none',
    legendX=335, legendY=80,
    direction='vertical',
    titleAnchor='middle')

# Valence vs. BPM, coloured by `rank_bin`; only rows matching the season
# selection are drawn.
scatter_chart = alt.Chart(wrangled_data).mark_circle(size = op_var_size, opacity = 0.6).encode(
    alt.X('valence_%:Q', title = 'Valence Percentage'),
    alt.Y('bpm:Q', title = 'Beats per Minute (BPM)'),
    color = alt.Color('rank_bin:O', scale=alt.Scale(scheme='category10'), legend=rank_legend),
    detail=['season:O'],
    tooltip = ['track_name:N','danceability_%', 'energy_%', 'season', 'in_spotify_charts']
).transform_filter(
    selection  # Apply the filter based on the selected seasons
).add_params(op_var_size).properties(title = 'Relationship between Valence and BPM across Release Season')



# Clickable season key: one square per season, highlighted in magenta when it is
# part of `selection` and light gray otherwise. The selection is attached with
# `add_params`, the Altair 5 replacement for the deprecated `add_selection`.
season_key = alt.Chart(wrangled_data).mark_square(size=300).encode(
    x= alt.X('season:O'),
    detail='count()',
    color=alt.condition(selection, alt.value('magenta'), alt.value('lightgray')),
    tooltip = ['season', 'count()']
).properties(
    title = 'Release Season:'
).add_params(
    selection
)

# `step` is the height of each facet row; `overlap` scales how far the y range
# extends above a row (the y range is [step, -step * overlap]).
step = 20
overlap = 1

# Ridgeline plot of `in_spotify_charts` per season, adapted from
# https://altair-viz.github.io/gallery/ridgeline_plot.html
# Pipeline: keep the selected seasons -> bin the values -> count rows per
# (season, bin) -> fill bins missing for a season with 0.
# The selection is attached with `add_params`, the Altair 5 replacement for the
# deprecated `add_selection`.
ridge = alt.Chart(wrangled_data, height=step).transform_filter(
    selection  # selection filter for season
).transform_bin(
    'binned', field='in_spotify_charts', bin=alt.Bin(maxbins=20)
).transform_aggregate(
    value='count()', groupby=['season', 'binned']
).transform_impute(
    impute='value', groupby=['season'], key='binned', value=0
).mark_area(
    interpolate='monotone',
    fillOpacity=0.8,
    stroke='magenta',
    strokeWidth=0.5
).encode(
    alt.X('binned:Q', title='Rank in Spotify Charts'),
    alt.Y('value:Q', axis=None, scale=alt.Scale(range=[step, -step * overlap])),
    alt.ColorValue('lightgray')
).facet(
    row=alt.Row('season:N', title=None, header=alt.Header(labelAngle=0, labelAlign='left')),
).properties(
    bounds='flush'
).add_params(
    selection
)

# Layout: scatter and season key side by side, with the ridgeline plot below.
# Colour scales are resolved independently for each sub-chart.
scatter_with_key = alt.hconcat(scatter_chart, season_key)
scat_plot = alt.vconcat(scatter_with_key, ridge).resolve_scale(color='independent')


# Show the combined chart
scat_plot



## Radar Plot

In [8]:
# Maps an angle to the end point of a radar axis 100 units from the center.
def calculate_coordinates(angle):
    """Return ``(x, y) = (-100*cos(angle), -100*sin(angle))`` as Python floats.

    The trigonometric values are built with sympy and evaluated numerically
    before being converted to ``float``.
    """
    axis_length = 100
    x_expr = 0 - axis_length * sp.cos(angle)
    y_expr = 0 - axis_length * sp.sin(angle)
    return float(x_expr.evalf()), float(y_expr.evalf())

# Draws a single radar axis: a gray spoke from (0, 0) to (x, y) plus its label.
def create_axis_chart(x, y, label):
    """Return a layered chart with one axis spoke and a text label at its end.

    x, y  -- end point of the spoke (the start is fixed at the origin)
    label -- text drawn at (x, y)
    """
    spoke = (
        alt.Chart()
        .mark_rule(color='gray', opacity=0.8)
        .encode(
            x=alt.datum(0, type="quantitative", axis=None),
            y=alt.datum(0, type="quantitative", axis=None),
            x2=alt.datum(x),
            y2=alt.datum(y),
        )
    )

    caption = (
        alt.Chart()
        .mark_text(align='center', baseline='alphabetic')
        .encode(
            x=alt.datum(x),
            y=alt.datum(y),
            text=alt.value(label),
        )
    )

    return alt.layer(spoke, caption)

# Draws the coloured segment from the center to one characteristic's value point.
def create_line_chart(x, y, color_choice):
    """Return a half-transparent rule from the origin to ``(x, y)`` in ``color_choice``."""
    segment_mark = alt.Chart().mark_rule(color=color_choice, opacity=0.5)
    return segment_mark.encode(
        x=alt.datum(0, type="quantitative"),
        y=alt.datum(0, type="quantitative"),
        x2=alt.datum(x),
        y2=alt.datum(y),
    )

# Finds the point at distance z from the origin along the axis ending at end_point
# (z is the characteristic value).
def point_along_line(end_point, z):
    """Return the ``(x, y)`` point ``z`` units from the origin toward ``end_point``.

    end_point -- ``(x, y)`` pair giving the far end of the axis; the axis starts
                 at ``(0, 0)``
    z         -- distance to travel from the origin along that axis

    If ``end_point`` is the origin itself the axis has no direction, so the
    origin is returned instead of dividing by zero.
    """
    x1, y1 = 0, 0
    x2, y2 = end_point

    # Length of the full axis from the origin to end_point.
    dist = math.hypot(x2 - x1, y2 - y1)
    if dist == 0:
        return (0.0, 0.0)

    # Fraction of the axis covered by a distance of z.
    prop = z / dist

    return (x1 + prop * (x2 - x1), y1 + prop * (y2 - y1))

# Calculates average characteristic values for the top or bottom `num_songs` songs,
# ranked by `in_spotify_playlists`.
def average_10(df, rank, num_songs):
    """Return a one-row DataFrame of mean audio characteristics.

    df        -- DataFrame containing 'in_spotify_playlists' and the seven
                 characteristic percentage columns
    rank      -- 'top' (largest 'in_spotify_playlists') or 'bottom' (smallest)
    num_songs -- number of songs to average over

    Raises ValueError when ``rank`` is neither 'top' nor 'bottom'.
    """
    characteristic_columns = ['danceability_%', 'valence_%',
                              'energy_%', 'acousticness_%',
                              'instrumentalness_%', 'liveness_%',
                              'speechiness_%']

    if rank == 'top':
        selected_songs = df.nlargest(num_songs, 'in_spotify_playlists')
    elif rank == 'bottom':
        selected_songs = df.nsmallest(num_songs, 'in_spotify_playlists')
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return.
        raise ValueError(f"rank must be 'top' or 'bottom', got {rank!r}")

    # The column means form a Series; transpose it into a single-row frame whose
    # columns are the characteristics.
    average_values = pd.DataFrame(selected_songs[characteristic_columns].mean()).T
    return average_values

# Builds the filled radar outline that connects the characteristic points.
def create_polygon_chart(distance_points_list, color_choice, characteristic_labels, average_values, cat):
    """Return an area chart tracing the points in ``distance_points_list``.

    distance_points_list  -- sequence of (x, y) points, one per characteristic
    color_choice          -- colour for both the outline and the fill
    characteristic_labels -- characteristic names, aligned with the points
    average_values        -- one-row DataFrame; its first row supplies the
                             values shown in the tooltip
    cat                   -- prefix for the tooltip category ("<cat> Songs")
    """
    vertices = pd.DataFrame(distance_points_list, columns=['x', 'y']).assign(
        category=f"{cat} Songs",
        characteristic=characteristic_labels,
        characteristic_value=average_values.values.tolist()[0],
    )

    # Repeat the first vertex at the end so the outline closes on itself.
    closed_outline = pd.concat([vertices, vertices.iloc[[0]]], ignore_index=True)

    outline_style = {"color": color_choice, "strokeWidth": 5, 'opacity' : 0.5}
    area_mark = alt.Chart(closed_outline.reset_index()).mark_area(
        line=outline_style,
        fill=color_choice,
        interpolate='linear',
    )

    # `index` (from reset_index) fixes the order in which vertices are joined.
    return area_mark.encode(
        x=alt.X("x"),
        y=alt.Y("y"),
        order='index',
        tooltip=['category', 'characteristic', 'characteristic_value'],
    )

# Slider-bound parameter for the number of songs (it is defined here but not
# attached to any chart in the cells visible in this notebook).
slider = alt.binding_range(min=1, max=50, step=1, name='Number of Songs:')
op_var = alt.param(value=10, bind=slider)

# Seven evenly spaced angles around the circle and the matching axis end points.
angles = [i * (2 * sp.pi / 7) for i in range(7)]
hept_list = list(map(calculate_coordinates, angles))
labels = ['danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']

# One axis chart (spoke + label) per end point / characteristic pair.
axis_charts = [
    create_axis_chart(*end_xy, axis_label)
    for end_xy, axis_label in zip(hept_list, labels)
]

# Combine all axis plots into a single chart
combined_chart_axis = alt.layer(*axis_charts)


### Top 10

In [9]:
# Radar for the top songs (by Spotify playlist count).
# Number of songs averaged for both the top and the bottom radar.
Number_for_avg = 10
dist_top_10 = average_10(wrangled_data, 'top', Number_for_avg)

# Place each averaged characteristic on its own axis.
distance_points = [
    point_along_line(axis_end, axis_value)
    for axis_end, axis_value in zip(hept_list, dist_top_10.iloc[0])
]

# One green segment per characteristic, merged into a single layer.
line_charts = [create_line_chart(*value_point, 'green') for value_point in distance_points]
combined_chart_line = alt.layer(*line_charts)

# Axes plus characteristic segments.
combined_chart = alt.layer(combined_chart_axis, combined_chart_line)

# Filled outline connecting the characteristic points.
polygon_chart = create_polygon_chart(distance_points, 'green', labels, dist_top_10, 'Top')

top_combined_chart_all = alt.layer(combined_chart_axis, combined_chart_line, polygon_chart)

# Titled variant with the view border and axis decorations removed.
top_combined_chart_all_blank = (
    top_combined_chart_all
    .configure_view(stroke=None)
    .configure_axis(domain=False, labels=False, ticks=False, title=None)
    .properties(title = f"Average Top {Number_for_avg} Songs")
)

# The unconfigured layer is what this cell displays.
top_combined_chart_all

### Bottom 10

In [10]:
# Radar for the bottom songs (by Spotify playlist count).
dist_bottom_10 = average_10(wrangled_data, 'bottom', Number_for_avg)

# Place each averaged characteristic on its own axis.
distance_points = [
    point_along_line(axis_end, axis_value)
    for axis_end, axis_value in zip(hept_list, dist_bottom_10.iloc[0])
]

# One red segment per characteristic, merged into a single layer.
line_charts = [create_line_chart(*value_point, 'red') for value_point in distance_points]
combined_chart_line = alt.layer(*line_charts)

# Axes plus characteristic segments.
combined_chart = alt.layer(combined_chart_axis, combined_chart_line)

# Filled outline connecting the characteristic points.
polygon_chart = create_polygon_chart(distance_points, 'red', labels, dist_bottom_10, 'Bottom')

bottom_combined_chart_all = alt.layer(combined_chart_axis, combined_chart_line, polygon_chart)

# Titled variant with the view border and axis decorations removed.
bottom_combined_chart_all_blank = (
    bottom_combined_chart_all
    .configure_view(stroke=None)
    .configure_axis(domain=False, labels=False, ticks=False, title=None)
    .properties(title = f"Average Bottom {Number_for_avg} Songs")
)
bottom_combined_chart_all_blank

### Superimposed

In [11]:
# Overlay the top and bottom radars in a single view.
a = alt.layer(top_combined_chart_all, bottom_combined_chart_all)

# Hide the view border and every axis decoration, then add the title.
hidden_axis_options = dict(domain=False, labels=False, ticks=False, title=None)
combined_chart_bottom_top = (
    a.configure_view(stroke=None)
    .configure_axis(**hidden_axis_options)
    .properties(title = f"Average Top and Bottom {Number_for_avg} Songs")
)
combined_chart_bottom_top

## Mantle Plot

In [12]:
# Ten most-streamed songs, ordered from most to fewest streams. reset_index()
# renumbers the rows and keeps the previous index as an 'index' column.
filtered_df = (
    wrangled_data
    .nlargest(10, 'streams')
    .sort_values(by='streams', ascending=False)
    .reset_index()
)


### Danceability

In [13]:
# One arc per song: ring i is drawn 15 units further in than ring i - 1, and the
# arc's `theta` is the song's danceability angle.
dance_charts = []

for i in range(min(10, len(filtered_df))):
    theta_dance = filtered_df['arc_danceability'].iloc[i]
    ring_offset = i * 15

    # Single-row frame holding only this song.
    song_frame = pd.DataFrame(filtered_df.iloc[i]).T

    arc_mark = alt.Chart(song_frame).mark_arc(
        radius=165 - ring_offset,
        radius2=153 - ring_offset,
        theta=theta_dance,
        stroke="white",
        strokeWidth=2,
    )
    dance_chart = arc_mark.encode(
        color=alt.Color(field="key", type="nominal", scale=alt.Scale(scheme='Set2'), legend=None),
        tooltip=[
            alt.Tooltip("artist(s)_name:N", title="Artist"),
            alt.Tooltip("track_name:N", title="Song"),
            alt.Tooltip("streams:Q", title="Streams"),
            alt.Tooltip("energy_%:Q", title="Energy"),
            alt.Tooltip("danceability_%:Q", title="Danceability"),
        ],
    )

    dance_charts.append(dance_chart)

layered_dance = alt.layer(*dance_charts)
layered_dance

### Energy

In [14]:
# One arc per song on the same rings as the danceability arcs; here `theta2` is
# set to 2*pi minus the song's energy angle.
energy_charts = []

# Legend for the musical key, shown to the right of the chart.
key_legend = alt.Legend(
    title = 'Key',
    orient='right',
    legendX=335, legendY=80,
    direction='vertical',
    titleAnchor='middle')

for i in range(min(10, len(filtered_df))):
    theta_energy = filtered_df['arc_energy'].iloc[i]
    ring_offset = i * 15

    # Single-row frame holding only this song.
    song_frame = pd.DataFrame(filtered_df.iloc[i]).T

    arc_mark = alt.Chart(song_frame).mark_arc(
        radius=165 - ring_offset,
        radius2=153 - ring_offset,
        theta2=((2 * math.pi) - theta_energy),
        stroke="white",
        strokeWidth=2,
    )
    energy_chart = arc_mark.encode(
        color=alt.Color(field="key", type="nominal", scale=alt.Scale(scheme='Set2'), legend=key_legend),
        tooltip=[
            alt.Tooltip("artist(s)_name:N", title="Artist"),
            alt.Tooltip("track_name:N", title="Song"),
            alt.Tooltip("streams:Q", title="Streams"),
            alt.Tooltip("energy_%:Q", title="Energy"),
            alt.Tooltip("danceability_%:Q", title="Danceability"),
        ],
    )

    energy_charts.append(energy_chart)

layered_energy = alt.layer(*energy_charts)
layered_energy

### Combined

In [15]:
# Stack the energy arcs and the danceability arcs into one layered chart
# (energy is listed first, danceability second).
layered_all_arc = alt.layer(layered_energy, layered_dance)
layered_all_arc

## Heatmap

In [16]:
# Songs released in 2022 only.
filtered_data = spotify_data[(spotify_data['released_year'] == 2022)]

# `brush`: interval selection drawn on the scatter plot.
# `select`: point selection on a heatmap cell (x/y encodings). `selection_point`
# is the Altair 5 replacement for the deprecated `selection_single`.
brush = alt.selection_interval()
select = alt.selection_point(encodings=['x', 'y'])

# Heatmap of total streams by binned release month and release day.
rect = alt.Chart(filtered_data).mark_rect().encode(
    alt.X('released_month').bin(),
    alt.Y('released_day').bin(),
    alt.Color('sum(streams)').scale(scheme='lighttealblue').title('Total streams')
).add_params(
    select
)

# Chart appearances vs. playlist count; points inside the brush are highlighted
# and only rows matching the selected heatmap cell are shown.
scatter_plot = alt.Chart(filtered_data).mark_circle().encode(
    alt.X('in_spotify_charts', scale=alt.Scale(domain=[0, 140])),
    alt.Y('in_spotify_playlists', scale=alt.Scale( domain=[0, 25000])),
    color=alt.condition(brush, alt.ColorValue('#db8607'), alt.value('lightgray')),
    tooltip=['track_name', 'artist(s)_name']
).transform_filter(
    select
).add_params(brush)

# Scatter on the left, heatmap (filtered by the brush) on the right.
total_heatmap_all = scatter_plot | rect.transform_filter(brush)
total_heatmap_all



In [17]:
from IPython.display import display, HTML

In [18]:
# TODO: how to remove axis from only one in layer chart and not all

def _caption_chart(text, width):
    """Return a data-less text chart used as a caption beneath a dashboard panel.

    text  -- caption text
    width -- chart width in pixels
    """
    return alt.Chart().mark_text(
        align='center',     # center the text
        baseline='bottom',  # place the text at the bottom
        fontSize=12,        # set font size
        text=text,          # set the caption text
    ).properties(width=width)


hist_caption = _caption_chart('XXXX', 400)
time_series_caption = _caption_chart('XXXX', 400)
mantle_caption = _caption_chart('XXXX', 350)
scat_caption = _caption_chart(
    'Analyze whether there are distinct clusters of songs based on their BPM and valence percentages in relation to their positions on the Spotify charts. Identify the characteristics of songs in each cluster.',
    350,
)
heatmap_caption = _caption_chart('XXXX', 350)

radar_caption_text = (
    "Determine the average characteristics of the top and bottom 10 most saved songs on Spotify playlists, calculate the range of contributing artists or the song characteristics."
)
radar_caption = _caption_chart(radar_caption_text, 350)

# Pin the Vega slider form to a fixed position on the page.
display(HTML("""
<style>
form.vega-bindings {
  position: absolute;
  right: 600px;
  top: 875px;
}
</style>
"""))


# Row 1: stream histogram and release time series, each above its caption.
row1 = (histogram_streams.properties(height = 150, width = 400) & hist_caption)| (time_series.properties(height = 150, width = 400) & time_series_caption)

# Row 3: arc ("mantle") plot and the scatter/ridge group, each above its caption.
row3 = (layered_all_arc.resolve_scale(color='independent').properties(title = 'TITLE HERE',height = 150, width = 350) & mantle_caption) | (scat_plot & scat_caption)

# Row 4: linked scatter + heatmap above its caption.
row4 = (total_heatmap_all.resolve_scale(
    color='independent') & heatmap_caption)

# Row 5: both radar charts side by side with one caption underneath.
# `&` binds tighter than `|` in Python, so the side-by-side pair is
# parenthesised explicitly; otherwise the caption attaches to the bottom radar only.
row5 = (
    (
        top_combined_chart_all.properties(title = 'Characteristics Across the Top 10 Songs', height = 400, width = 400)
        | bottom_combined_chart_all.properties(title = 'Characteristics Across the Bottom 10 Songs', height = 400, width = 400)
    )
    & radar_caption
)

# Stack all rows into the final dashboard.
task_charts = (row1 & row3 & row4.properties(title={'text': 'TITLE HERE', 'anchor': 'middle'}) & row5).configure_facet(spacing=0)


task_charts