# Simon Spotify Data

## Exploration

In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib
from datetime import date
from matplotlib.backends.backend_pdf import PdfPages

## Settings

In [126]:
# Reporting window. A later cell clamps it to the dates present in the data.
start_date = date(year=2020, month=1, day=1)
end_date = date(year=2030, month=1, day=1)

# Number of entries shown in each "top" ranking.
nrof_top_songs = 10
nrof_top_artists = 10

# Window sizes for the rolling averages; the plots apply them to daily samples.
top_artists_rolling_window = 7
top_songs_rolling_window = 7
listening_time_per_day_rolling_window = 7

## Setup

In [127]:
# Output PDF; each plotting cell appends one page via pdf_pages.savefig().
pdf_pages = PdfPages('Wrapped.pdf')

# Streaming-history export files to load.
history_files = [
    "StreamingHistory_music_0.json",
    "StreamingHistory_music_1.json",
    "StreamingHistory_music_2.json",
    "StreamingHistory_music_3.json",
]

# Read every file once and concatenate in a single call. ignore_index gives the
# combined frame unique row labels instead of repeating each file's own labels.
df = pd.concat([pd.read_json(path) for path in history_files], ignore_index=True)

In [128]:
#df

In [None]:
# Display the column dtypes as parsed by read_json.
df.dtypes

In [130]:
# Coerce the timestamp column to datetimes and the play duration to a numeric
# dtype, one column at a time.
for column, convert in (("endTime", pd.to_datetime), ("msPlayed", pd.to_numeric)):
    df[column] = convert(df[column])

In [None]:
# Display the dtypes again to check the result of the conversions.
df.dtypes

In [132]:
# Clamp the configured window to the dates actually covered by the data.
first_day = df["endTime"].min().date()
last_day = df["endTime"].max().date()
start_date = max(start_date, first_day)
end_date = min(end_date, last_day)

# Keep plays with start_date <= endTime < end_date (the end bound is exclusive).
not_too_early = df["endTime"] >= pd.Timestamp(start_date)
not_too_late = df["endTime"] < pd.Timestamp(end_date)
timeFrameDf = df[(not_too_early & not_too_late).to_numpy()]
#timeFrameDf


In [133]:
# From here on `df` refers to the date-filtered plays only.
df = timeFrameDf

## Front Page

In [None]:
plt.figure(figsize=(16, 9))  # 16:9 landscape page
plt.text(0.5, 0.95, 'Spotify Wrapped', fontsize=30, ha='center', va='top')

period = end_date - start_date
# A zero-length window (start_date == end_date) would divide by zero and print
# inf/nan, so it is treated as a single day for the daily average.
days_in_period = max(period.days, 1)

# msPlayed is in milliseconds; 3600000 ms make one hour.
total_hours = df["msPlayed"].sum() / 3600000
average_hours_per_day = df["msPlayed"].sum() / days_in_period / 3600000

# Each entry is followed by an empty line on the page.
info_string = (
    f"Stats for period {start_date} to {end_date}\n\n"
    f"Total listening time: {total_hours.round(2)}h\n\n"
    f"Average listening time per day: {average_hours_per_day.round(2)}h\n"
)

plt.text(0.5, 0.85, info_string, fontsize=20, ha='center', va='top')
plt.axis('off')  # Text-only page: hide the axes

# Save the text page to the PDF
pdf_pages.savefig()
plt.show()
plt.close()

## Top Songs

In [None]:
# Count plays per (artist, track) pair and keep the most played ones.
plays_per_song = df.groupby(["artistName", "trackName"])["msPlayed"].count()
ranked_songs = plays_per_song.sort_values(ascending=False)
top_songs = ranked_songs.rename("playCount").head(nrof_top_songs)

# Display the ranking as a table.
top_songs.to_frame()

In [None]:

plt.figure(figsize=(16,9))

# Horizontal bar chart of the top songs, sorted by ascending play count.
# (The leftover debug print of the frame's columns has been removed.)
top_songs_df = pd.DataFrame(top_songs)
top_songs_df = top_songs_df.sort_values(by='playCount', ascending=True)

# The index holds (artistName, trackName) pairs; label bars as "track - artist".
song_labels = [f"{track} - {artist}" for artist, track in top_songs_df.index]
plt.barh(song_labels, top_songs_df['playCount'], color='skyblue')

# Write each play count in the middle of its bar.
for bar_position, play_count in enumerate(top_songs_df['playCount']):
    plt.text(play_count / 2, bar_position, f'{play_count}', ha='center', va='center', fontsize=15)


plt.title(f"Top {nrof_top_songs} songs", fontsize=20)
plt.ylabel("Song Title", fontsize=15)
plt.xlabel("Play Count", fontsize=15)

plt.yticks(fontsize=15)
plt.xticks(fontsize=15)

plt.tight_layout()
pdf_pages.savefig()
plt.show()

In [137]:
# Keep only the plays whose (artist, track) pair is one of the top songs.
song_keys = df.set_index(["artistName", "trackName"]).index
is_top_song = song_keys.isin(top_songs.index)
top_songs_full = df[is_top_song]
#top10_full

In [138]:
# Daily play count per top song, indexed by (artistName, trackName, endTime).
# Counting the selected msPlayed column (the same column the top-song ranking
# counts) avoids relying on the grouping columns being present in the
# resampled output, which pandas has deprecated. The "D" alias matches the
# other resample calls in this notebook.
top_songs_daily = (
    top_songs_full.groupby(["artistName", "trackName"])
    .resample("D", on="endTime")["msPlayed"]
    .count()
    .rename("playCount")
    .to_frame()
)
#top10_daily

In [None]:
plt.figure(figsize=(16,9))

# One line per top song: rolling mean of its daily play count.
for artist, track in top_songs.index:
    # Pass the names as @variables instead of pasting them into the query
    # text; a name containing a quote character (e.g. "Don't ...") would
    # otherwise break the expression.
    song_data = top_songs_daily.query("artistName == @artist & trackName == @track").reset_index()
    song_data['rolling_playCount'] = song_data['playCount'].rolling(window=top_songs_rolling_window).mean()
    plt.plot(song_data["endTime"], song_data["rolling_playCount"], label=f"{track}")

plt.legend(fontsize=15)
plt.title(f"Top {nrof_top_songs} Songs listening time rolling {top_songs_rolling_window} day average", fontsize=20)
plt.xlabel("Date", fontsize=15)
plt.ylabel("Play Count", fontsize=15)
plt.grid()

plt.yticks(fontsize=15)
plt.xticks(fontsize=15)

plt.tight_layout()
pdf_pages.savefig()
plt.show()

## Top artists

In [None]:
# Count plays per artist and keep the most played ones.
plays_per_artist = df.groupby(["artistName"])["msPlayed"].count()
ranked_artists = plays_per_artist.sort_values(ascending=False)
top_artists = ranked_artists.rename("playCount").head(nrof_top_artists)

# Display the ranking as a table.
top_artists.to_frame()

In [None]:
plt.figure(figsize=(16,9))

# Horizontal bar chart of the top artists, sorted by ascending play count.
top_artist_df = top_artists.to_frame().sort_values(by='playCount', ascending=True)
artist_play_counts = top_artist_df['playCount']

plt.barh(top_artist_df.index, artist_play_counts, color='skyblue')

# Write each play count in the middle of its bar.
for bar_position, play_count in enumerate(artist_play_counts):
    plt.text(play_count / 2, bar_position, f'{play_count}', ha='center', va='center', fontsize=15)

plt.title(f"Top {nrof_top_artists} artists", fontsize=20)
plt.xlabel("Play Count", fontsize=15)
plt.ylabel("Artist name", fontsize=15)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.tight_layout()
pdf_pages.savefig()
plt.show()

In [142]:
# Keep only the plays by one of the top artists.
is_top_artist = df["artistName"].isin(top_artists.index).to_numpy()
top_artists_full = df[is_top_artist]
#top5_artists_full

In [143]:
# Daily play count per top artist, indexed by (artistName, endTime).
# Counting the selected msPlayed column (the same column the top-artist
# ranking counts) avoids relying on the grouping column being present in the
# resampled output, which pandas has deprecated.
top_artists_daily = (
    top_artists_full.groupby(["artistName"])
    .resample("D", on="endTime")["msPlayed"]
    .count()
    .rename("playCount")
    .to_frame()
)
#top5_artists_daily

In [None]:
plt.figure(figsize=(16,9))

# One line per top artist: rolling mean of the artist's daily play count.
for artist in top_artists.index:
    # Pass the name as an @variable instead of pasting it into the query text;
    # a name containing a quote character (e.g. "Guns N' Roses") would
    # otherwise break the expression.
    song_data = top_artists_daily.query("artistName == @artist").reset_index()
    song_data['rolling_playcount'] = song_data["playCount"].rolling(window=top_artists_rolling_window).mean()
    plt.plot(song_data["endTime"], song_data["rolling_playcount"], label=f"{artist}")


plt.legend(fontsize=15)

plt.title(f"Top {nrof_top_artists} artists listening time rolling {top_artists_rolling_window} day average", fontsize=20)
plt.xlabel("Date", fontsize=15)
plt.ylabel("Play Count", fontsize=15)
plt.grid()

plt.yticks(fontsize=15)
plt.xticks(fontsize=15)

plt.tight_layout()
pdf_pages.savefig()
plt.show()

## Playtime

In [None]:
# Total milliseconds played per calendar day.
daily_ms_played = df.resample("D", on="endTime")["msPlayed"].sum()

# Convert to hours (3600000 ms per hour), then smooth with a rolling mean.
daily_hours_played = daily_ms_played / 3600000
playtime = daily_hours_played.rolling(window=listening_time_per_day_rolling_window).mean()

plt.figure(figsize=(16,9))
plt.plot(playtime)

plt.title(f"Total playtime rolling {listening_time_per_day_rolling_window} day average", fontsize=20)
plt.ylabel("Play Time (h)", fontsize=15)
plt.xlabel("Date", fontsize=15)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.grid()

plt.tight_layout()
pdf_pages.savefig()
plt.show()

In [None]:
# Sum the listening time per hour of the day. Grouping by a derived Series
# avoids adding a helper column to the shared `df`, which is a filtered slice
# and would raise a SettingWithCopyWarning on assignment.
hour_of_day = df['endTime'].dt.hour.rename('hours')
ms_per_hour = df.groupby(hour_of_day)['msPlayed'].sum()
# Make sure all 24 hours are present, including hours without any plays.
ms_per_hour = ms_per_hour.reindex(range(24), fill_value=0)
total_listening_time = ms_per_hour.sum()
percent_occurrences_per_hour = (ms_per_hour / total_listening_time) * 100

plt.figure(figsize=(16, 9))
plt.bar(range(24), percent_occurrences_per_hour, color='skyblue')
for hour, percent in enumerate(percent_occurrences_per_hour):
    # Bars of 1% or less get no label.
    bar_label = f'{round(percent, 2)}%' if percent > 1 else ''
    plt.text(hour, percent / 2, bar_label, ha='center', va='center', fontsize=15, rotation=90)

plt.title('Listening spread per hour of the day', fontsize=20)
plt.xlabel('Hour of the Day', fontsize=15)
plt.ylabel('Percent of listening time', fontsize=15)

plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.xticks(rotation=0)

plt.tight_layout()
pdf_pages.savefig()
plt.show()


In [None]:
# Sum the listening time per weekday (0 = Monday ... 6 = Sunday). Grouping by
# a derived Series avoids adding a helper column to the shared `df`, which is
# a filtered slice and would raise a SettingWithCopyWarning on assignment.
weekday_of_play = df['endTime'].dt.weekday.rename('weekday')
listen_time_weekday = df.groupby(weekday_of_play)['msPlayed'].sum()
# Make sure all seven weekdays are present, including days without any plays.
listen_time_weekday = listen_time_weekday.reindex(range(7), fill_value=0)

total_listening_time = listen_time_weekday.sum()
percent_per_weekday = (listen_time_weekday / total_listening_time) * 100

plt.figure(figsize=(16, 9))
plt.bar(range(7), percent_per_weekday, color='skyblue')
for weekday, percent in enumerate(percent_per_weekday):
    # Bars of 1% or less get no label.
    bar_label = f'{round(percent, 2)}%' if percent > 1 else ''
    plt.text(weekday, percent / 2, bar_label, ha='center', va='center', fontsize=15, rotation=90)


plt.title('Percent of listening time per weekday', fontsize=20)
plt.xlabel('Days', fontsize=15)
plt.ylabel('Percent of total listening time', fontsize=15)

plt.xticks(fontsize=15, ticks=[0,1,2,3,4,5,6], rotation=45, labels=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])
plt.yticks(fontsize=15)

plt.tight_layout()
pdf_pages.savefig()
# Last page: close the PDF so the file is finalized on disk.
pdf_pages.close()
plt.show()