Starter file for Project 1 - need to load in Spotify CSV!


In [None]:
# Import Dependencies
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as st

In [None]:
# Read the Spotify 2023 dataset. The CSV is decoded as ISO-8859-1 rather than
# pandas' default UTF-8.
csv_path = Path("Resources/spotify-2023.csv")
spotify_df = pd.read_csv(filepath_or_buffer=csv_path, encoding="iso-8859-1")

# Preview the first rows of the full dataframe.
spotify_df.head()

In [None]:
# David!

In [None]:
#Austin!

In [None]:
# Dan

In [None]:
# Build a narrower dataframe holding only the identifying columns and the
# musical qualities that the rest of the notebook analyses.

quality_columns = [
    "track_name",
    "Artist Name 1",
    "streams",
    "bpm",
    "key",
    "mode",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]
qualities_df = spotify_df[quality_columns]

qualities_df

In [None]:
# General overview of values per column: the number of distinct values
# found in each column of qualities_df.

qualities_df.nunique()

In [None]:
# Plot every track's tempo against a running track number (1..N). The x-axis
# values carry no meaning of their own; they only mark individual tracks from
# the dataset.
#
# The resulting graph is cluttered and not very informative, which is why the
# following cells look at how many songs share each tempo instead.

x_values = list(range(1, len(qualities_df["track_name"]) + 1))
y_values = qualities_df["bpm"]

plt.scatter(x_values, y_values)
plt.title("BPM of Top Songs on Spotify in 2023")
plt.xlabel("Songs")
plt.ylabel("Beats Per Minute")

plt.show()

In [None]:
# Count how many songs use each tempo, then keep the distinct tempos
# (the index of that count) as a plain list.

song_speed = qualities_df["bpm"].value_counts()
tempos = song_speed.index.to_list()

print(tempos)

In [None]:
# Number of tracks recorded for each tempo, in the same order as `tempos`.

total_songs = song_speed.tolist()

total_songs

In [None]:
# This scatter plot shows how many songs in the dataset use each tempo.

plt.scatter(tempos, total_songs)
plt.xlabel("Tempo in BPM")
plt.ylabel("Number of Songs")
plt.title("Distribution of Chosen Tempos for Most Popular Songs on Spotify in 2023")

# The spread and the normality test are computed over every song's BPM.
# `tempos` lists each distinct BPM only once, so running the statistics on it
# would ignore how many songs share each tempo.
all_bpm = qualities_df["bpm"].dropna().to_numpy()

std_dev = np.std(all_bpm)
print(f'Standard Deviation of tempos: {std_dev}.')

# normaltest's null hypothesis is that the sample comes from a normal
# distribution, so a small p-value is evidence AGAINST normality.
print(st.normaltest(all_bpm))
plt.show()




In [None]:
# This code was an attempt to further categorize tempos using bins, in order to
# utilize a histogram with the standard deviation calculations. Unfortunately, I had
# issues with the plot alignment and had to abandon this for presentation. I would
# consider this a worthy pursuit for "next steps" if we had more time.
 
#bins = [0, 59, 69, 79, 89, 99, 109, 119, 129, 139, 149, 159, 169, 179, 189, 199, 250]
#group_names = ["Below 60", "60-69bpm", "70-79bpm", "80-89bpm", "90-99bpm", "100-109bpm", "110-119bpm",
                #"120-129bpm", "130-139bpm", "140-149bpm", "150-159bpm", "160-169bpm", "170-179bpm", 
                #"180-189bpm", "190-199bpm", "200+bpm"]

#qualities_df["BPM Range"] = pd.cut(qualities_df["bpm"],
                         #bins, labels=group_names,
                         #include_lowest=True)

#tempo_range = qualities_df["BPM Range"].sort_values()

In [None]:
# I was interested in comparing the most streamed artists in the dataset to the dataset as a whole, so
# the next few blocks create a secondary dataframe that focuses on the top 20 artists from the dataset
# with the most overall number of streams. I first looked at total number of songs...

# value_counts() already returns the counts sorted from most to fewest songs,
# so no separate sort is needed before taking the first 20.
popular_artists = qualities_df["Artist Name 1"].value_counts()

popular_artists.head(20)

In [None]:
# ...And then tried a sum of streams. There were some different artists that appeared!
# Despite its name, streams_df is a Series: one summed "streams" value per
# "Artist Name 1".
# NOTE(review): assumes the "streams" column was parsed as numeric - TODO confirm.

streams_df = qualities_df.groupby("Artist Name 1")["streams"].sum()

# Show the 20 artists with the largest stream totals.
streams_df.sort_values(ascending=False).head(20)

In [None]:
# Sample population: every track whose primary artist is one of the 20 artists
# with the most total streams.

top_artist_names = [
    "Bad Bunny",
    "The Weeknd",
    "Ed Sheeran",
    "Taylor Swift",
    "Harry Styles",
    "Eminem",
    "Justin Bieber",
    "Drake",
    "Imagine Dragons",
    "Olivia Rodrigo",
    "Bruno Mars",
    "Dua Lipa",
    "SZA",
    "Post Malone",
    "Arctic Monkeys",
    "Kendrick Lamar",
    "Doja Cat",
    "Billie Eilish",
    "Coldplay",
    "The Chainsmokers",
]
top_artists = qualities_df.loc[qualities_df["Artist Name 1"].isin(top_artist_names)]

top_artists.sort_values(["Artist Name 1"])
                       

In [None]:
# Same tempo tally as before, restricted to tracks by the top 20 artists:
# count the songs per tempo and keep the distinct tempos as a list.
top_tempos = top_artists["bpm"].value_counts()
popular_tempos = top_tempos.index.to_list()

print(popular_tempos)

In [None]:
# Number of top-20-artist tracks at each tempo, in the same order as `popular_tempos`.
most_streamed_songs = top_tempos.tolist()

most_streamed_songs

In [None]:
# Tempo distribution for songs by the 20 most streamed artists.
plt.scatter(popular_tempos, most_streamed_songs)
plt.xlabel("Tempo in BPM")
plt.ylabel("Number of Songs")
plt.title("Distribution of Chosen Tempos for 20 Most Streamed Artists on Spotify in 2023")

# The spread and the normality test are computed over the BPM of every track by
# these artists. `popular_tempos` lists each distinct BPM only once, so running
# the statistics on it would ignore how many songs share each tempo.
top_bpm = top_artists["bpm"].dropna().to_numpy()

std_dev = np.std(top_bpm)
print(f'Standard Deviation of tempos: {std_dev}.')

# normaltest's null hypothesis is that the sample comes from a normal
# distribution, so a small p-value is evidence AGAINST normality.
print(st.normaltest(top_bpm))
plt.show()

In [None]:
# The following set of calculations was to determine what share of songs 
# the most streamed artists accounted for, as well as what share of 
# total streams they captured. Interestingly, it seems that while 
# the most streamed artists comprised about 27% of the tracks in the 
# data, they accounted for 36% of the streams for 2023!
#
# Step 1: total streams across all tracks by the 20 most streamed artists.
# NOTE(review): assumes the "streams" column is numeric - TODO confirm.
top_artists["streams"].sum()

In [None]:
# Total streams across every track in the dataset.
qualities_df["streams"].sum()

In [None]:
# Share of all streams (as a percentage) that belongs to the 20 most streamed artists.
((top_artists["streams"].sum()) / (qualities_df["streams"].sum())) *100

In [None]:
# Number of tracks by the 20 most streamed artists.
len(top_artists["track_name"])

In [None]:
# Number of tracks in the whole dataset.
len(qualities_df["track_name"])

In [None]:
# Share of all tracks (as a percentage) that belongs to the 20 most streamed artists.
(len(top_artists["track_name"]) / len(qualities_df["track_name"])) * 100

In [None]:
# Earlier attempt, based on my original idea and made before calculating the
# tempo distribution: plot each top-20-artist track's BPM against a running
# track number (1..N). This scatter plot is also messy and not helpful.

pop_x = list(range(1, len(top_artists["track_name"]) + 1))
pop_y = top_artists["bpm"]

plt.scatter(pop_x, pop_y)
plt.title("BPM of the 20 Most Streamed Artists on Spotify in 2023")
plt.xlabel("Songs")
plt.ylabel("Beats Per Minute")

plt.show()

In [None]:
# Overlay of the two per-track BPM scatters built earlier: every song in the
# dataset versus songs by the 20 most streamed artists. The two clouds look
# similar, but the chart is hard to analyse any further.

fig, ax = plt.subplots(figsize=(12, 8))

scatter_layers = [
    (x_values, y_values, 'b', 'o', "All Artists"),
    (pop_x, pop_y, 'm', "s", "20 Most Streamed Artists"),
]
for xs, ys, colour, marker_shape, legend_label in scatter_layers:
    ax.scatter(xs, ys, c=colour, marker=marker_shape, label=legend_label)

ax.set(
    xlabel="Songs",
    ylabel="Beats Per Minute",
    title="BPM Comparison of Most Streamed Songs vs 20 Most Streamed Artists in 2023",
)
ax.legend(loc='upper right')

fig.savefig("Outputs/bpm_comparison.png")
plt.show()


In [None]:
# Next, I printed the mean, median, and mode of BPMs for both dataframes.

# Series.mode() returns a Series (a column can have several modes). Embedding
# it directly in an f-string prints the Series repr, index and dtype included,
# so the mode values are joined into plain text first.
all_bpm_modes = ", ".join(str(bpm) for bpm in qualities_df["bpm"].mode())
top_bpm_modes = ", ".join(str(bpm) for bpm in top_artists["bpm"].mode())

print(f'The average BPM of all top songs from 2023 is {qualities_df["bpm"].mean()}.')
print(f'The average BPM of songs by the 20 most streamed artists in 2023 is {top_artists["bpm"].mean()}.')
print(f'--------------------------')
print(f'The median BPM of all top songs from 2023 is {qualities_df["bpm"].median()}.')
print(f'The median BPM of songs by the 20 most streamed artists in 2023 is {top_artists["bpm"].median()}.')
print(f'--------------------------')
print(f'The mode BPM of all top songs from 2023 is {all_bpm_modes}.')
print(f'The mode BPM of songs by the 20 most streamed artists in 2023 is {top_bpm_modes}.')

In [None]:

# Tracks by the top 20 artists whose BPM is exactly 90; DataFrame.value_counts()
# then counts how often each distinct row occurs.
# NOTE(review): 90 is presumably the mode BPM printed above - confirm.
top_artists.loc[top_artists["bpm"] == 90].value_counts()


In [None]:
# Tracks in the full dataset whose BPM is exactly 120; DataFrame.value_counts()
# then counts how often each distinct row occurs.
# NOTE(review): 120 is presumably the mode BPM printed above - confirm.
qualities_df.loc[qualities_df["bpm"] == 120].value_counts()

In [None]:

# For every artist, count how many of their tracks fall in each mode, then
# show those counts as a dataframe ordered from the largest count down.
all_mode = qualities_df.groupby(["Artist Name 1"])["mode"].value_counts()
all_modes = all_mode.sort_values(ascending=False).to_frame()

all_modes

In [None]:
# Same per-artist mode counts, restricted to the 20 most streamed artists and
# ordered from the largest count down.
top_mode = top_artists.groupby(["Artist Name 1"])["mode"].value_counts()
top_modes = top_mode.sort_values(ascending=False).to_frame()

top_modes

In [None]:
# Tally how many tracks in the full dataset are labelled "Major" and how many
# "Minor"; each total is wrapped in a one-element list before printing.
mode_column = qualities_df["mode"]

major = sum(1 for entry in mode_column if entry == "Major")
minor = sum(1 for entry in mode_column if entry == "Minor")

total_major = [major]
total_minor = [minor]
print(total_major)
print(total_minor)

In [None]:
# Tally how many tracks by the 20 most streamed artists are labelled "Major"
# and how many "Minor"; each total is wrapped in a one-element list before printing.
top_mode_column = top_artists["mode"]

t_major = sum(1 for entry in top_mode_column if entry == "Major")
t_minor = sum(1 for entry in top_mode_column if entry == "Minor")

top_major = [t_major]
top_minor = [t_minor]
print(top_major)
print(top_minor)

In [None]:
# Pie chart of Major vs. Minor tracks across the whole dataset.
modes = ["Major", "Minor"]
# Count each mode straight from the data instead of hard-coding totals copied
# from an earlier output, so the chart stays correct if the dataset changes.
total_modes = [int((qualities_df["mode"] == mode_name).sum()) for mode_name in modes]
explode = (0.1, 0)  # pull the "Major" wedge slightly out of the pie
colors = ["yellowgreen", "blue"]

plt.pie(total_modes, explode=explode, labels=modes, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=120)
plt.title("Mode Breakdown for Top Songs in Spotify for 2023")

plt.savefig("Outputs/all_songs_modes.png")

plt.show()


In [None]:
# Pie chart of Major vs. Minor tracks by the 20 most streamed artists.
modes = ["Major", "Minor"]
# Count each mode straight from the data instead of hard-coding totals copied
# from an earlier output, so the chart stays correct if the artist list changes.
top_modes = [int((top_artists["mode"] == mode_name).sum()) for mode_name in modes]
explode = (0.1, 0)  # pull the "Major" wedge slightly out of the pie
colors = ["lightskyblue", "purple"]

plt.pie(top_modes, explode=explode, labels=modes, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=120)
plt.title("Mode Breakdown for 20 Most Streamed Artists in Spotify for 2023")

plt.savefig("Outputs/top20_artists_modes.png")

plt.show()


In [None]:
# Print the mean, median, and mode of every numeric quality for all songs.
# Each entry pairs the label used in the printed sentence with its column name.
summary_columns = [
    ("danceability %", "danceability_%"),
    ("valence %", "valence_%"),
    ("energy %", "energy_%"),
    ("acousticness %", "acousticness_%"),
    ("instrumentalness %", "instrumentalness_%"),
    ("liveness %", "liveness_%"),
    ("speechiness %", "speechiness_%"),
    ("bpm", "bpm"),
]

for position, (label, column_name) in enumerate(summary_columns):
    if position:
        # Separator between consecutive groups, not before the first one.
        print("--------------------------")
    column_values = qualities_df[column_name]
    # Series.mode() returns a Series (there may be several modes); join its
    # values so the sentence shows plain numbers instead of a Series repr.
    mode_text = ", ".join(str(value) for value in column_values.mode().round(2))
    print(f'The mean {label} is {round(column_values.mean(), 2)}.')
    print(f'The median {label} is {round(column_values.median(), 2)}.')
    print(f'The mode {label} is {mode_text}.')

In [None]:
# Print the mean, median, and mode of every numeric quality for songs by the
# 20 most streamed artists.
# Each entry pairs the label used in the printed sentence with its column name.
top_summary_columns = [
    ("danceability %", "danceability_%"),
    ("valence %", "valence_%"),
    ("energy %", "energy_%"),
    ("acousticness %", "acousticness_%"),
    ("instrumentalness %", "instrumentalness_%"),
    ("liveness %", "liveness_%"),
    ("speechiness %", "speechiness_%"),
    ("bpm", "bpm"),
]
population = "of songs by the 20 most streamed artists"

for position, (label, column_name) in enumerate(top_summary_columns):
    if position:
        # Separator between consecutive groups, not before the first one.
        print("--------------------------")
    column_values = top_artists[column_name]
    # Series.mode() returns a Series (there may be several modes); join its
    # values so the sentence shows plain numbers instead of a Series repr.
    mode_text = ", ".join(str(value) for value in column_values.mode().round(2))
    print(f'The mean {label} {population} is {round(column_values.mean(), 2)}.')
    print(f'The median {label} {population} is {round(column_values.median(), 2)}.')
    print(f'The mode {label} {population} is {mode_text}.')

In [None]:
# Median of each percentage characteristic across all songs, rounded to two
# decimals, in the order the bar chart below expects.
median_columns = [
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]
all_chars = [round(qualities_df[column_name].median(), 2) for column_name in median_columns]

all_chars

In [None]:
# Median of each percentage characteristic for songs by the 20 most streamed
# artists, rounded to two decimals, in the order the bar chart below expects.
top_median_columns = [
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]
top20_chars = [round(top_artists[column_name].median(), 2) for column_name in top_median_columns]

top20_chars

In [None]:
# Grouped bar chart: median of each characteristic for all songs versus songs
# by the 20 most streamed artists.
characteristics = ["Danceability", "Valence (Positive Vibes)","Energy","Acousticness",
                   "Instrumentalness","Liveness","Speechiness"]
bar_width = 0.35
# Numeric x positions for the groups; the second series sits one bar width to the right.
positions = np.arange(len(characteristics))

fig, ax = plt.subplots(figsize=(8, 12))
bar1 = ax.bar(positions, all_chars, bar_width, label="All Songs", color='b')
bar2 = ax.bar(positions + bar_width, top20_chars,
              bar_width, label="Top 20 Artists", color='purple')
ax.set_title("Median Presence of Characteristics in Top Streamed Songs in Spotify for 2023")
ax.legend()
ax.set_xlabel("Song Characteristic")
ax.set_ylabel("Percentage of Characteristic")
# Fix the tick positions before labelling them, and centre each label under
# its pair of bars.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(characteristics, rotation=45, ha="right")
fig.savefig("Outputs/characteristics_medians_barchart.png")
# plt.show() renders the figure in the notebook; Figure.show() is meant for GUI backends.
plt.show()

In [None]:
# Most frequent value (first mode) of each percentage characteristic across all
# songs, in the order the bar chart below expects.
mode_columns = [
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]
all_modes = [qualities_df[column_name].mode().iloc[0] for column_name in mode_columns]

all_modes

In [None]:
# Most frequent value (first mode) of each percentage characteristic for songs
# by the 20 most streamed artists, in the order the bar chart below expects.
top_mode_columns = [
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]
top20_modes = [top_artists[column_name].mode().iloc[0] for column_name in top_mode_columns]

top20_modes

In [None]:
# Grouped bar chart: mode of each characteristic for all songs versus songs
# by the 20 most streamed artists.
characteristics = ["Danceability", "Valence (Positive Vibes)","Energy","Acousticness",
                   "Instrumentalness","Liveness","Speechiness"]
bar_width = 0.35
# Numeric x positions for the groups; the second series sits one bar width to the right.
positions = np.arange(len(characteristics))

fig, ax = plt.subplots(figsize=(8, 12))
bar1 = ax.bar(positions, all_modes, bar_width, label="All Songs", color='b')
bar2 = ax.bar(positions + bar_width, top20_modes,
              bar_width, label="Top 20 Artists", color='purple')
ax.set_title("Mode Presence of Characteristics in Top Streamed Songs in Spotify for 2023")
ax.legend()
ax.set_xlabel("Song Characteristic")
ax.set_ylabel("Percentage of Characteristic")
# Fix the tick positions before labelling them, and centre each label under
# its pair of bars.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(characteristics, rotation=45, ha="right")
fig.savefig("Outputs/characteristics_modes_barchart.png")
# plt.show() renders the figure in the notebook; Figure.show() is meant for GUI backends.
plt.show()