Sources
* https://jhrcook.github.io/spotify-data-analysis/10_005_analyzing-my-streaming-history.html
* https://strftime.org/

In [None]:
# %pip install pandas numpy matplotlib seaborn rapidfuzz statsmodels

In [None]:
import os
from typing import cast, Literal
from pprint import pp
from datetime import datetime

import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from matplotlib.ticker import MultipleLocator, StrMethodFormatter

import seaborn as sns
# sns.set_theme(style="white", context="notebook")



# Turn off pandas' "display.expand_frame_repr" option for printed DataFrames.
pd.set_option("display.expand_frame_repr", False)

from preprocessing import read_file, extract_filtered, SongAttributes
from utils.fuzzy_searchers import *
from utils.generic_plots import Plots
from utils.filters import Filters
from utils.extract_from_timestamp import * 
from utils.smoothen import *

In [None]:
# Extract "Spotify Extended Streaming History.zip" into a folder named "data".
# The folder is resolved against the current working directory
# (os.path.abspath('')), which is normally the folder this notebook lives in.
data_folder = os.path.join(os.path.abspath(''), "data")
print(data_folder)

# Only the audio history files are read. os.listdir() returns names in
# arbitrary order, so sort them to keep the row order of `songs` reproducible.
files = sorted(
    os.path.join(data_folder, file)
    for file in os.listdir(data_folder)
    if file.startswith("Streaming_History_Audio")
)
if not files:
    # Fail here with a clear message instead of later with an obscure KeyError
    # on an empty DataFrame.
    raise FileNotFoundError(
        f"No 'Streaming_History_Audio*' files found in {data_folder}"
    )
songs = [song for file in files for song in read_file(file)]

In [None]:
# Attributes kept from every raw streaming record.
filters: list[SongAttributes] = [
    "ts",
    "ms_played",
    "master_metadata_track_name",
    "master_metadata_album_artist_name",
    "master_metadata_album_album_name",
    "spotify_track_uri",
]
# One reduced record per entry in `songs`.
filtered = [extract_filtered(record, filters) for record in songs]

In [None]:
# Build the working frame and parse the "ts" column into pandas datetimes.
df = pd.DataFrame(filtered)
df = df.assign(ts=pd.to_datetime(df["ts"]))
# Persist a snapshot of the frame, one JSON object per row.
df.to_json("df.json", orient="records")

In [None]:
# Top artists by playtime
copy = df.copy(True)
# Grouped by artist name; reused by the following cells.
grouped_by_artist_name = copy.groupby(copy["master_metadata_album_artist_name"])

# Total milliseconds played per artist, without artists that sum to zero,
# ordered from most to least listened.
ms_per_artist = grouped_by_artist_name["ms_played"].sum()
time_listend = ms_per_artist[ms_per_artist != 0].sort_values(ascending=False)

# Milliseconds -> minutes (60 000 ms per minute).
time_listend_mins = time_listend.div(6e4)
# NOTE(review): presumably keeps only artists with more than 1 minute of
# playtime — confirm against utils.filters.Filters.rows_gt.
time_listend_mins = cast(pd.Series, Filters.rows_gt(1, time_listend_mins))

Plots.plot_1(time_listend_mins, 25, "Top 25 played artists by listening minutes", "Minutes Listened (scaled by natural log)", "Artist")

In [None]:
# Artists ranked by number of plays (repeat plays of a track all count).
played_track_count = (
    grouped_by_artist_name["master_metadata_album_artist_name"]
    .count()
    .sort_values(ascending=False)
)
Plots.plot_1(
    played_track_count,
    100,
    "Top 100 artists by played track count (non-unique)",
    "Track count (scaled by natural log)",
    "Artist",
)

In [None]:
# Number of distinct track names played per artist, highest first.
# Roughly: SELECT artist, COUNT(DISTINCT track_name) ... GROUP BY artist
unique_tracks = (
    grouped_by_artist_name["master_metadata_track_name"]
    .nunique()
    .sort_values(ascending=False)
)
Plots.plot_1(unique_tracks, 100, "Top 100 artists by unique played tracks", "Unique tracks played (scaled by natural log)", "Artist")

In [None]:
# Most played tracks
# Plays are counted per (track name, artist) pair. Grouping on the track name
# alone would merge different songs that happen to share a title into one bar.
# Rows missing either the track name or the artist are left out by groupby.
track_key_columns = ["master_metadata_track_name", "master_metadata_album_artist_name"]
most_played_tracks = (
    df.groupby(track_key_columns)
    .size()
    .sort_values(ascending=False)
)
# Flatten the (track, artist) MultiIndex into one readable label per entry.
most_played_tracks.index = pd.Index(
    [f"{track} - {artist}" for track, artist in most_played_tracks.index],
    name="master_metadata_track_name",
)
Plots.plot_1(most_played_tracks, 150, "150 Most played tracks", "Play count (scaled by natural log)", "Track name")


In [None]:
# Yearwise listening minutes
with_dates = df.copy(True)
# Replace each timestamp by its four-digit year (as a string) using the
# vectorized .dt accessor instead of a per-row Python lambda.
with_dates["ts"] = with_dates["ts"].dt.strftime("%Y")
grouped_by_ts = with_dates.groupby("ts")
# Sum of ms_played per year, converted from milliseconds to minutes.
total_playtime = grouped_by_ts["ms_played"].sum().div(6e4)
Plots.plot_2(total_playtime, "Yearwise playtime in minutes", "Year", "Minutes Played")

In [None]:
def analysis_per_artist(frame: pd.DataFrame, artist: str) -> None:
    """Plot the most played tracks and the per-year play count of one artist.

    Args:
        frame: Streaming history containing the columns
            ``master_metadata_album_artist_name``,
            ``master_metadata_track_name`` and a datetime ``ts`` column.
        artist: Artist name, matched exactly against
            ``master_metadata_album_artist_name``.

    Raises:
        ValueError: If no row of ``frame`` belongs to ``artist``.
    """
    # Compute the row mask once; it serves both the existence check and the
    # selection below.
    is_artist = frame["master_metadata_album_artist_name"] == artist
    if not is_artist.any():
        raise ValueError(f"Artist {artist} is not in data")

    artist_frame = frame[is_artist].copy(True)

    # Four-digit year (as a string) of every play, via the vectorized accessor.
    artist_frame["year"] = artist_frame["ts"].dt.strftime("%Y")

    # Top 50 played tracks
    played_track_count = (artist_frame
                          .groupby("master_metadata_track_name")
                          .size()
                          .sort_values(ascending=False))
    Plots.plot_1(played_track_count, 50, f"Fifty most played tracks by {artist}", "Play count", "Track name")

    # Year wise play count (number of plays, not listening time)
    yearwise_playtime = (artist_frame
                         .groupby("year")
                         .size())
    Plots.plot_2(yearwise_playtime, f"Tracks played in a particular year by {artist}", "Year", "Play count")
    
    
    
# Run the per-artist report for a few hand-picked artists, in this order.
for artist_name in ("Nirvana", "Eminem", "Michael Jackson", "The Beatles"):
    analysis_per_artist(df, artist_name)

In [None]:
# Look up tracks in df for a (misspelled) query string.
# NOTE(review): search_tracks presumably comes from utils.fuzzy_searchers, and
# 70 / 10 are presumably a score cutoff and a result limit — confirm there.
search_tracks("rroxxaneesa", df, 70, 10)

In [None]:
# Look up artists in df for a partial query string.
# NOTE(review): search_artists presumably comes from utils.fuzzy_searchers, and
# 70 / 10 are presumably a score cutoff and a result limit — confirm there.
search_artists("beat", df, 70, 10)

In [None]:
# Number of records per hour of day.
hour_df = df.copy(True)
# Integer hour (0-23) of each timestamp, via the vectorized .dt accessor.
hour_df["hour"] = hour_df["ts"].dt.hour
grouped = hour_df.groupby("hour")

# Reindex onto all 24 hours so an hour without any play shows up as 0.
# Overwriting the index with range(24) would raise a length mismatch whenever
# an hour is missing from the data.
hour_wise_count = grouped.size().reindex(range(24), fill_value=0)
total_tracks = hour_wise_count.sum()

# Dayparts used when the curve is split for plotting:
# late-nights = 01-04
# morning 05-11
# afternoon 12-16
# evening 17-20
# night 21-00

# NOTE(review): smoothen comes from utils.smoothen; 1000 is presumably the
# number of points of the smoothed curve — confirm there.
smoothened = smoothen(hour_wise_count, 1000)

# Split the smoothed hourly curve into dayparts. Each slice is half-open on the
# left, so consecutive dayparts never share a point.
late_nights = smoothened[(smoothened.index > 0) & (smoothened.index <= 4)]
mornings = smoothened[(smoothened.index > 4) & (smoothened.index <= 11)]
afternoons = smoothened[(smoothened.index > 11) & (smoothened.index <= 16)]
evenings = smoothened[(smoothened.index > 16) & (smoothened.index <= 20)]
nights = smoothened[(smoothened.index > 20) & (smoothened.index < 24)]

daytimes = [late_nights, mornings, afternoons, evenings, nights]
daytime_names = ["Late Nights", "Mornings", "Afternoons", "Evenings", "Nights"]
daytime_colors = ["#2c3e50", "#f1c40f", "#3498db", "#e67e22", "#8e44ad"]
fig, ax = plt.subplots(1, 1, figsize=(12, 5))

# One filled, labelled curve segment per daypart.
for daytime_ser, daytime_name, daytime_color in zip(daytimes, daytime_names, daytime_colors):
    ax.fill_between(daytime_ser.index, 0, daytime_ser.values, alpha=0.25, color=daytime_color)
    sns.lineplot(x=daytime_ser.index, y=daytime_ser.values, ax=ax, label=daytime_name, color=daytime_color)
    # Faint vertical rule marking where this daypart ends.
    ax.vlines(daytime_ser.index.max(), 0, daytime_ser.iat[-1], linestyle='solid', color="000", alpha=0.125)

ax.set_xticks(range(24))

ax.set_xlabel("Hour")
# The curve is built from per-hour record counts, not from listening minutes.
ax.set_ylabel("Tracks played")
ax.set_title("Listening activity throughout the day")
ax.set_ylim(0, None)
sns.despine()
plt.show()

# Trends in track playtime

In [None]:
# Duration of every play in minutes (ms_played / 60 000), rounded to three
# decimals. Vectorized division replaces the per-row Python lambda.
track_playtime_ser = df["ms_played"].div(6e4).round(3)

# NOTE(review): the Filters helpers presumably keep rows <= / >= the given
# threshold — confirm against utils.filters.
under_min = cast(pd.Series, Filters.rows_lteq(1, track_playtime_ser))
track_playtime_ser = cast(pd.Series, Filters.rows_gteq(1, track_playtime_ser))
track_playtime_ser = cast(pd.Series, Filters.rows_lteq(10, track_playtime_ser))

fig, ax = plt.subplots(2, 1, figsize=(10, 7))
main_ax, short_ax = ax

# Plotting the KDE distribution of track playtimes.
# To prevent skewing of data, only playtimes between 1 and 10 minutes are
# considered: many plays under a minute could be skipped tracks, and plays
# over 10 minutes might be scarce but widely spread.
sns.kdeplot(x=track_playtime_ser.values, alpha=0.25, fill=True, color="red", linestyle="", ax=main_ax)
main_ax.set_title("Distribution (KDE) of track playtimes")
main_ax.set_xlabel("Minutes")
main_ax.axvline(track_playtime_ser.mean(), 0, label="Mean", linestyle=":", color="000", alpha=0.5)
main_ax.legend()

# KDE distribution of playtimes under a minute
sns.kdeplot(x=under_min.values, alpha=0.25, fill=True, color="blue", linestyle="", ax=short_ax)
short_ax.set_title("Distribution (KDE) of playtimes under a minute")
short_ax.set_xlabel("Minutes")
short_ax.axvline(under_min.mean(), 0, label="Mean", linestyle=":", color="000", alpha=0.5)
short_ax.legend()
sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# Daily number of songs listened to
daywise_plays_df = df[["ts", "ms_played"]].copy(True)

# Filtering out skipped tracks: plays shorter than 30 s (3e4 ms) are dropped.
daywise_plays_df = daywise_plays_df[daywise_plays_df["ms_played"] >= 3e4].sort_values("ts")

# Calendar day of every remaining play as "YYYY-MM-DD" (vectorized).
dates = daywise_plays_df["ts"].dt.strftime("%Y-%m-%d")
if dates.empty:
    # An explicit error instead of an assert, which is stripped under -O.
    raise ValueError("No plays of at least 30 seconds found; nothing to plot")
datewise_track_count_ser = dates.groupby(dates).count()

# ISO-formatted date strings sort chronologically, so min/max give the first
# and last day on record.
min_date = dates.min()
max_date = dates.max()

# Days between first and last track record
total_days = (pd.Timestamp(max_date) - pd.Timestamp(min_date)).days


fig, ax = plt.subplots(1, 1, figsize=(10, 5))

__x = pd.to_datetime(datewise_track_count_ser.index)
__y = datewise_track_count_ser.values

sns.scatterplot(
    x=__x,
    y=__y,
    ax=ax,
    hue=__y,
    palette="coolwarm",
    marker='.'
)

# Seven evenly spaced ticks across the recorded period, labelled "Mon YYYY".
# Tick positions are passed as datetimes (truncated to midnight) instead of
# strings, so they do not depend on implicit string-to-date conversion.
date_ticks = pd.date_range(start=min_date, end=max_date, periods=7, inclusive="both")
ax.set_xticks(
    date_ticks.normalize(),
    labels=list(date_ticks.strftime("%b %Y")),
)
ax.set_xlabel("Date")
ax.set_ylabel("Tracks played per day")
sns.despine()
plt.show()

In [None]:
# Bare expression: shows the streaming-history frame as the cell's output.
df

In [None]:
# Distinct artists and distinct tracks listened to in each calendar month.
copy = (
    df[["ts", "master_metadata_album_artist_name", "master_metadata_track_name"]]
    .copy(True)
    .assign(month=lambda frame: frame["ts"].dt.to_period("M"))
    .sort_values("ts", ascending=True)
)

plays_by_month = copy.groupby("month")
monthwise_unique_artists_count = plays_by_month["master_metadata_album_artist_name"].nunique()
monthwise_unique_track_count = plays_by_month["master_metadata_track_name"].nunique()

# Both counts side by side, one row per month, with readable column names.
aggr = pd.concat(
    [monthwise_unique_artists_count, monthwise_unique_track_count],
    axis=1,
).rename(
    columns={
        "master_metadata_album_artist_name": "Artist Count",
        "master_metadata_track_name": "Track Count",
    }
)
# Human-readable month label, e.g. "Jan 2021".
aggr["month-year"] = aggr.index.strftime("%b %Y")


In [None]:
# Monthly number of distinct artists and distinct tracks, as two lines.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))

y1 = aggr["Artist Count"]
y2 = aggr["Track Count"]

# Dashed guides at the maximum of each series.
ax.axhline(y1.max(), color="#5EABD6", alpha=0.5, linestyle='--')
ax.axhline(y2.max(), color="#EF5A6F", alpha=0.5, linestyle='--')

# Labels are plain strings: a one-element list is not a valid legend label on
# every matplotlib version.
sns.lineplot(
    x=aggr["month-year"],
    y=y1,
    label="Artist Count",
    color="#5EABD6",
    marker=".",
    ax=ax
)
sns.lineplot(
    x=aggr["month-year"],
    y=y2,
    label="Track Count",
    color="#EF5A6F",
    marker=".",
    ax=ax
)

# At most seven evenly spaced month labels, taken from aggr itself so that
# every tick is a month that is actually on the axis and this cell does not
# depend on variables from other cells. The string x values sit at positions
# 0..len(aggr)-1 in row order.
n_ticks = min(7, len(aggr))
tick_positions = np.unique(np.linspace(0, len(aggr) - 1, n_ticks).round().astype(int))
ax.set_xticks(tick_positions, labels=aggr["month-year"].iloc[tick_positions].tolist())

# Both lines share the axis, so use a neutral y-label instead of the name of
# the first series.
ax.set_xlabel("Month")
ax.set_ylabel("Unique count")
ax.legend(
    loc="upper left",
)
sns.despine()

ax.set_title("Variability in artists and tracks")
plt.tight_layout()
plt.show()