# Spotify Listening History processing

## Install dependencies

In [None]:
# Install the third-party packages used by this notebook into the kernel's
# environment: pandas is imported below, and matplotlib is the plotting backend
# behind the DataFrame.plot call further down.
%pip install pandas
%pip install matplotlib

## Import dependencies and read records

In [None]:
import glob, json
import pandas as pd

# Glob pattern matching every audio streaming-history file of the export.
FILE_PATTERN = 'raw-data/Streaming_History_Audio*.json'

# Flat list holding the records of all matched files, in file-name order.
all_streams = []
# glob.glob() returns matches in arbitrary order; sorting keeps the record
# order (and therefore the DataFrame row order) reproducible between runs.
for file_path in sorted(glob.glob(FILE_PATTERN)):
    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of the
    # platform default, which would garble non-ASCII names on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        all_streams.extend(json.load(file))

# Sanity check: total number of records loaded.
print(len(all_streams))

## Create DataFrame

In [3]:
# Turn the list of loaded records into a single table.
all_streams_df = pd.DataFrame(data=all_streams)

In [None]:
# Preview the first five rows to inspect the columns that were loaded.
all_streams_df.head(n=5)

### Select columns of interest

In [4]:
# Keep only the fields the analysis below relies on; all other columns are
# dropped from the table.
columns_of_interest = [
    'ts',
    'ms_played',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
]
all_streams_df = all_streams_df[columns_of_interest]

### Rename columns

In [5]:
# Replace the verbose export field names with short ones; 'ms_played' is not
# listed and therefore keeps its name.
column_renames = {
    "ts": "timestamp",
    "master_metadata_track_name": "track_name",
    "master_metadata_album_artist_name": "artist_name",
    "master_metadata_album_album_name": "album_name",
}
all_streams_df = all_streams_df.rename(columns=column_renames)

### Convert timestamp to datetime format

In [6]:
# Parse the raw timestamp values into pandas datetimes so that the `.dt`
# accessor (year, day of week, ...) can be used in the cells below.
parsed_timestamps = pd.to_datetime(all_streams_df['timestamp'])
all_streams_df['timestamp'] = parsed_timestamps

### Filter by year

In [None]:
# Year analysed by the cells below; change it here to study another year.
YEAR_OF_INTEREST = 2024

# NOTE: despite its name, `twenty_twentytwo_streams` holds the streams of
# YEAR_OF_INTEREST (currently 2024, not 2022). The name is kept unchanged
# because later cells refer to it.
is_year_of_interest = all_streams_df["timestamp"].dt.year == YEAR_OF_INTEREST
# .copy() makes the subset an independent frame, so it can be modified later
# without touching all_streams_df or triggering chained-assignment warnings.
twenty_twentytwo_streams = all_streams_df.loc[is_year_of_interest].copy()
twenty_twentytwo_streams

## My 5 most-listened-to artists in a given year, by total listening time

In [39]:
# Total playback time (in milliseconds) per artist for the selected year.
artist_totals = (
    twenty_twentytwo_streams
    .groupby(["artist_name"])["ms_played"]
    .sum()
    .reset_index()
)
# Rank artists from most to least listened and keep the first five.
most_listened_artists = artist_totals.sort_values(by="ms_played", ascending=False)
top_five = most_listened_artists.head(5)

### Draw a chart of time spent on the top 5 artists versus all the rest

In [None]:
# Collapse every artist ranked below the top five into one "Others" row.
others_ms_played = most_listened_artists['ms_played'].iloc[5:].sum()
rest_of_artists = pd.DataFrame(
    {'artist_name': ['Others'], 'ms_played': [others_ms_played]}
)

# Six rows in total: the five top artists followed by "Others".
result = pd.concat([top_five, rest_of_artists], ignore_index=True)
# Index by artist name (keeping the column) so the pie slices are labelled
# with the names instead of row numbers.
result = result.set_index('artist_name', drop=False)
# One colour per slice, in row order.
colors = ["#6AF70C", "#787442", "#577842", "#644278", "#A10CF7", "#F7E70C"]
result.plot.pie(
    y="ms_played",
    title="Top 5 artists per time listened",
    figsize=(7, 7),
    autopct='%1.0f%%',
    colors=colors,
)

### Get the top 5 songs of each of the top 5 artists

In [None]:
MS_PER_HOUR = 1000 * 60 * 60

top_five_artists_series = top_five['artist_name']

# Build one small frame per artist and concatenate them once at the end:
# growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration and starts from an empty frame with no columns.
top_songs_per_artist = []
for artist in top_five_artists_series:
    songs_by_top_artist = twenty_twentytwo_streams[twenty_twentytwo_streams['artist_name'] == artist]
    top_songs = (
        songs_by_top_artist
        .groupby(["track_name"])['ms_played']
        .sum()
        .reset_index()
        .sort_values(by=["ms_played"], ascending=False)
        .head(5)
        # assign() returns a new frame, so no slice of another frame is mutated.
        .assign(
            hours_played=lambda songs: songs['ms_played'] / MS_PER_HOUR,
            artist_name=artist,
        )
    )
    top_songs_per_artist.append(top_songs[['artist_name', 'track_name', 'hours_played']])

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no artists (e.g. no streams in the selected year).
top_songs_df = pd.concat(top_songs_per_artist) if top_songs_per_artist else pd.DataFrame()

# Rows are ordered by artist rank, then by listening time within each artist.
display(top_songs_df)

### Get the average listening time per day of the week

In [None]:
MS_PER_HOUR = 1000 * 60 * 60

timestamps = all_streams_df['timestamp']
# Calendar attributes of every stream; pandas numbers weekdays 0 (Monday) to
# 6 (Sunday). NOTE(review): days are taken in whatever timezone the parsed
# timestamps carry - presumably UTC, not local time; confirm.
all_streams_df['day_of_week'] = timestamps.dt.day_of_week
all_streams_df['week_of_year'] = timestamps.dt.isocalendar().week
all_streams_df['year'] = timestamps.dt.year

# Total listening time per calendar day. The date itself must be part of the
# key: (day_of_week, ISO week, calendar year) is NOT unique per day, because
# ISO weeks straddle year boundaries. For example Mon 2024-01-01 and
# Mon 2024-12-30 are both ISO week 1 within calendar year 2024 and would
# otherwise be merged into a single inflated "daily" total.
calendar_date = timestamps.dt.normalize().rename('date')
daily_sum = (
    all_streams_df
    .groupby([calendar_date, "day_of_week", "week_of_year", "year"])['ms_played']
    .sum()
    .reset_index()
)

# Mean daily total per weekday and year. Only days with at least one stream
# are present in daily_sum, so days without any listening are not counted.
daily_avg = daily_sum.groupby(['day_of_week', 'year'])['ms_played'].mean().reset_index()
daily_avg['hours_played'] = daily_avg['ms_played'] / MS_PER_HOUR
daily_avg.sort_values(by=["hours_played"], ascending=False)