# Dataset: network of tracks and artists

## Description

This is a dataset consisting of a network of music tracks, their albums, and their artists. The main hierarchy is:

- tracks go under albums
- albums go under artists

However, there are many nuances to deal with: for instance, multiple artists collaborating on the same track, EPs and singles, and so forth.

## Developer notes

Spotify Web API reference for the audio-features endpoint used below: https://developer.spotify.com/documentation/web-api/reference/get-audio-features

In [1]:
from spotification import constants
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
import json
import pandas as pd
from datetime import datetime
# strftime pattern for the scrape timestamp (calendar date plus time of day).
YMD_FMT = "%Y-%m-%d %H:%M:%S"
# Captured once when this cell runs and printed for the record.
CURR_TIME = format(datetime.now(), YMD_FMT)
print(CURR_TIME)

# Raise pandas' display limits (rows, columns, total width).
for _opt_name, _opt_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_opt_name, _opt_value)

# Reference:
# https://spotipy.readthedocs.io/en/2.22.1/#module-spotipy.client
# Client-credentials auth manager; the id and secret are read from the
# project's `constants` module rather than being written in the notebook.
_client_credentials = SpotifyClientCredentials(
    client_id=constants.SPOTIFY_CLIENT_ID,
    client_secret=constants.SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=_client_credentials)

2024-01-21 12:30:24


In [2]:
query = "The 1975"

# Accumulators. The dicts are keyed by Spotify id so each entity is stored once.
tracks_data = {}   # track id  -> row dict for df_tracks
artists_data = {}  # artist id -> row dict for df_artists
network_data = [] # edges

# Fields copied verbatim into each track row from the audio-analysis payload
# (its 'track' section) and from the audio-features payload.
audio_analysis_fields = ['loudness','tempo','tempo_confidence',
                         'time_signature','time_signature_confidence',
                         'key','key_confidence', 'mode','mode_confidence']
audio_feature_fields = ['danceability','energy','speechiness','acousticness','liveness','valence']

### We iterate over tracks as the fundamental unit of interest.
results = sp.search(q=query, limit=3)
for idx, track in enumerate(results['tracks']['items']):
    ### For each track

    print(idx, track['id'], track['name'], '|||', ' ; '.join([t['name'] for t in track['artists']]))
    track_id = track['id']
    track_name = track['name']
    track_date = track['album']['release_date'] # always has associated album

    ### Let track be the primitive unit of interest.
    ### When we look up a track, populate:
    ### 1. data for track
    ### 2. data for its artist(s)
    ### 3. edges for track-artist.
    ### (ignore albums for now, noting a track-artist edge can have multiple release dates)

    ### ----------------------------------------------------------------------------------------
    ### 1. data for track

    if track_id not in tracks_data:
        rowdata = {}

        # get more album info (looked up via the album's Spotify URL)
        album_info = sp.album(track['album']["external_urls"]["spotify"])
        # can add albums as a third entity later if needed.

        # construct dataframe
        rowdata['date'] = track_date
        rowdata['id'] = track_id
        rowdata['name'] = track_name
        rowdata['album'] = track['album']['name']
        rowdata['album_genres'] = ';'.join(sorted(album_info['genres'])) # NULLABLE
        rowdata['popularity'] = track['popularity']
        rowdata['duration_seconds'] = track['duration_ms']/1000.0
        rowdata['url'] = track['external_urls']['spotify']

        # audio analysis for track
        # https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis
        # NOTE(review): Spotify has announced access restrictions on the
        # audio-analysis / audio-features endpoints for newer apps -- confirm
        # these two calls still succeed for this client.
        audio_analysis = sp.audio_analysis(track_id)
        for field in audio_analysis_fields:
            rowdata[field] = audio_analysis['track'][field]

        # audio features of a track
        # list of tracks --> list of results. A list entry can be None when no
        # features are available for the track, so fall back to an empty dict
        # and store missing values instead of failing on None[field].
        audio_features = (sp.audio_features(track_id) or [None])[0] or {}
        for field in audio_feature_fields:
            rowdata[field] = audio_features.get(field)
        tracks_data[track_id] = rowdata

    ### ----------------------------------------------------------------------------------------
    ### 2. data for its artist(s)
    ### 3. edges for track-artist.
    ### Both are handled in a single pass over the track's artists.

    for artist in track['artists']:
        artist_id = artist['id']
        artist_name = artist['name']

        if artist_id not in artists_data:
            # Fetch only artists that are not stored yet. A single lookup
            # returns genres, popularity, followers and the URL together.
            # https://stackoverflow.com/questions/61624487/extract-artist-genre-and-song-release-date-using-spotipy
            artist_info = sp.artist(artist_id)

            rowdata = {}
            rowdata['id'] = artist_id
            rowdata['name'] = artist_name
            rowdata['genres'] = ';'.join(sorted(artist_info["genres"]))
            rowdata['popularity'] = artist_info['popularity']
            rowdata['followers'] = artist_info['followers']['total']
            rowdata['url'] = artist_info['external_urls']['spotify']
            artists_data[artist_id] = rowdata

        # One edge per (track, artist) pair, appended in artist order.
        network_data.append({
            'date': track_date, # release date
            'track_id': track_id,
            'artist_id': artist_id
        })

### Combine all results.
# Each frame is built from a list of row dicts so pandas infers a dtype per
# column. The explicit column lists fix the column order and keep the frames
# well-formed (empty, but with the right columns) when the search returns
# no tracks.
index_cols = ['id','date','name','url']
track_cols = (index_cols
              + ['album','album_genres','popularity','duration_seconds']
              + audio_analysis_fields
              + audio_feature_fields)
artist_cols = ['id','name','genres','popularity','followers','url']
network_cols = ['date','track_id','artist_id']

df_tracks = pd.DataFrame(list(tracks_data.values()), columns=track_cols)
df_artists = pd.DataFrame(list(artists_data.values()), columns=artist_cols)
df_network = pd.DataFrame(network_data, columns=network_cols)
# Stamp every table with the session timestamp as its first column.
for df in [df_tracks, df_artists, df_network]:
    df.insert(0, 'scrape_time', CURR_TIME)

0 1fDFHXcykq4iw8Gg7s5hG9 About You ||| The 1975
1 5hc71nKsUgtwQ3z52KEKQk Somebody Else ||| The 1975
2 12g9IeQzX7xECLNxz8dpb5 I'm In Love With You ||| The 1975


In [3]:
# info on the fetched tracks
# One row per unique track id: search metadata plus the selected
# audio-analysis and audio-features fields.
df_tracks

Unnamed: 0,scrape_time,id,date,name,url,album,album_genres,popularity,duration_seconds,loudness,tempo,tempo_confidence,time_signature,time_signature_confidence,key,key_confidence,mode,mode_confidence,danceability,energy,speechiness,acousticness,liveness,valence
0,2024-01-21 12:30:24,1fDFHXcykq4iw8Gg7s5hG9,2022-10-14,About You,https://open.spotify.com/track/1fDFHXcykq4iw8G...,Being Funny In A Foreign Language,,83,326.49,-7.385,95.967,0.438,4,0.931,2,0.476,1,0.549,0.416,0.762,0.0279,0.312,0.0559,0.461
1,2024-01-21 12:30:24,5hc71nKsUgtwQ3z52KEKQk,2016-02-26,Somebody Else,https://open.spotify.com/track/5hc71nKsUgtwQ3z...,"I like it when you sleep, for you are so beaut...",,74,347.52,-5.724,101.045,0.529,4,0.943,0,0.738,1,0.696,0.61,0.788,0.0585,0.195,0.153,0.472
2,2024-01-21 12:30:24,12g9IeQzX7xECLNxz8dpb5,2022-10-14,I'm In Love With You,https://open.spotify.com/track/12g9IeQzX7xECLN...,Being Funny In A Foreign Language,,73,262.623,-3.924,119.008,0.827,4,0.988,2,0.684,1,0.918,0.668,0.902,0.0386,0.169,0.198,0.913


In [4]:
# info on the artists
# One row per unique artist id credited on the fetched tracks
# (name, genres, popularity, followers, url).
df_artists

Unnamed: 0,scrape_time,id,name,genres,popularity,followers,url
0,2024-01-21 12:30:24,3mIj9lX2MWuHmhNCA7LSCW,The 1975,modern alternative rock;modern rock;pop;pov: i...,76,7040979,https://open.spotify.com/artist/3mIj9lX2MWuHmh...


In [5]:
# info on the network
# track --> artist_id
# Edge list: one row per (track, artist) pair; 'date' is the release date of
# the track's album.
df_network

Unnamed: 0,scrape_time,date,track_id,artist_id
0,2024-01-21 12:30:24,2022-10-14,1fDFHXcykq4iw8Gg7s5hG9,3mIj9lX2MWuHmhNCA7LSCW
1,2024-01-21 12:30:24,2016-02-26,5hc71nKsUgtwQ3z52KEKQk,3mIj9lX2MWuHmhNCA7LSCW
2,2024-01-21 12:30:24,2022-10-14,12g9IeQzX7xECLNxz8dpb5,3mIj9lX2MWuHmhNCA7LSCW


In [None]:
# TODO: 
# - find a cohort of artists to track (because we can't pull all artists, of course)
# - deal with tracks belonging to multiple artists