In [1]:
import pandas as pd
import os
import json

def load_music(name):
    """Load every music streaming-history JSON file for one dataset.

    Reads all files in ``data/{name}_Data`` (relative to the current working
    directory) whose names start with ``StreamingHistory_music`` and end with
    ``.json``, concatenates their records and sorts them by ``endTime``.

    Parameters
    ----------
    name : str
        Dataset prefix; ``"Dash"`` resolves to the folder ``data/Dash_Data``.

    Returns
    -------
    pandas.DataFrame
        One row per record from the JSON files, sorted ascending by the
        ``endTime`` column. The original row labels are kept.

    Raises
    ------
    FileNotFoundError
        If the folder does not exist (raised by ``os.listdir``) or if it
        contains no matching streaming-history files.
    """
    data_path = f'data/{name}_Data'
    # os.listdir returns entries in arbitrary order; sort so the load order
    # (and therefore the order of rows sharing an endTime) is reproducible.
    music_files = sorted(
        entry for entry in os.listdir(data_path)
        if entry.startswith('StreamingHistory_music') and entry.endswith('.json')
    )
    if not music_files:
        # Without this guard the empty frame fails below with an opaque
        # KeyError: 'endTime'.
        raise FileNotFoundError(
            f"No StreamingHistory_music*.json files found in '{data_path}'"
        )

    all_data = []
    for file in music_files:
        file_path = os.path.join(data_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            all_data.extend(json.load(f))

    music = pd.DataFrame(all_data)
    # A stable sort keeps rows with identical endTime in load order.
    return music.sort_values(by='endTime', ascending=True, kind='stable')

# Get song-related data metrics
def get_song_data(name, forgotten_after_days=30):
    """Compute summary listening metrics for one dataset.

    Parameters
    ----------
    name : str
        Dataset name, forwarded to ``load_music``.
    forgotten_after_days : int, optional
        A song counts as "forgotten" when its most recent play is more than
        this many days older than the newest play in the data. Default 30.

    Returns
    -------
    dict
        - ``most_played_song``: ``(track, artist, total msPlayed)``
        - ``most_popular_song``: ``(track, artist, number of plays)``
        - ``forgotten_songs``: up to three ``(track, artist)`` pairs, the
          most-played songs among the forgotten ones
        - ``gaining_songs``: ``(track, artist)`` with the largest
          month-over-month increase in play count
        - ``losing_songs``: ``(track, artist)`` with the largest
          month-over-month decrease in play count
        - ``most_streamed_artist``: ``(artist, total msPlayed)``
        - ``average_playtime``: mean ``msPlayed`` over all rows
        - ``longest_playtime_song``: ``(track, artist, mean msPlayed)``

        ``gaining_songs`` and ``losing_songs`` are ``(None, None)`` when the
        data covers fewer than two distinct months, because no
        month-over-month change exists.
    """
    music = load_music(name)
    music['endTime'] = pd.to_datetime(music['endTime'])

    # One grouping reused by every per-song metric below.
    by_song = music.groupby(['trackName', 'artistName'])

    # Most played song by total listening time.
    playtime_per_song = by_song['msPlayed'].sum()
    most_played_song = playtime_per_song.idxmax()
    total_playtime = playtime_per_song.max()

    # Most popular song by number of plays.
    plays_per_song = by_song.size()
    most_popular_song = plays_per_song.idxmax()
    total_plays = plays_per_song.max()

    # Forgotten songs: last play is older than the cutoff; keep the three
    # with the most plays. Both series come from the same grouping, so the
    # boolean mask lines up with plays_per_song.
    last_play_per_song = by_song['endTime'].max()
    cutoff = music['endTime'].max() - pd.Timedelta(days=forgotten_after_days)
    forgotten_songs = (
        plays_per_song[last_play_per_song < cutoff]
        .sort_values(ascending=False, kind='stable')  # reproducible order on ties
        .head(3)
    )

    # Plays per song per month: rows are (track, artist), columns are months.
    # The month level must be named explicitly; a bare unstack() would pivot
    # artistName and the diff below would compare artists instead of months.
    music['month'] = music['endTime'].dt.to_period('M')
    monthly_plays = (
        music.groupby(['trackName', 'artistName', 'month'])
        .size()
        .unstack('month', fill_value=0)
    )
    if monthly_plays.shape[1] < 2:
        # A single month gives an all-NaN diff; idxmax/idxmin would fail.
        gaining_songs = (None, None)
        losing_songs = (None, None)
    else:
        monthly_change = monthly_plays.diff(axis=1)
        gaining_songs = monthly_change.max(axis=1).idxmax()
        losing_songs = monthly_change.min(axis=1).idxmin()

    # Artist with the most total listening time.
    artist_streams = music.groupby(['artistName'])['msPlayed'].sum()
    most_streamed_artist = artist_streams.idxmax()
    total_streams = artist_streams.max()

    # Average playtime over every row.
    average_playtime = music['msPlayed'].mean()

    # Song with the longest average playtime per play.
    avg_playtime = by_song['msPlayed'].mean()
    longest_songs = avg_playtime.idxmax()
    longest_song_time = avg_playtime.max()

    song_data = {
        'most_played_song': (most_played_song[0], most_played_song[1], total_playtime),
        'most_popular_song': (most_popular_song[0], most_popular_song[1], total_plays),
        'forgotten_songs': [(track, artist) for track, artist in forgotten_songs.index],
        'gaining_songs': (gaining_songs[0], gaining_songs[1]),
        'losing_songs': (losing_songs[0], losing_songs[1]),
        'most_streamed_artist': (most_streamed_artist, total_streams),
        'average_playtime': average_playtime,
        'longest_playtime_song': (longest_songs[0], longest_songs[1], longest_song_time)
    }

    return song_data

# Compute the metrics dictionary for the "Dash" dataset and print it.
print(get_song_data("Dash"))


In [2]:
# Keep the "Dash" metrics in a variable for later cells.
# NOTE(review): this reloads and recomputes what the first cell already printed.
song_data = get_song_data('Dash')

In [3]:
# Show the metrics dictionary stored in `song_data`.
print(song_data)

