# Dataset: network of tracks and artists

Notes:

https://developer.spotify.com/documentation/web-api/reference/get-audio-features

In [1]:
from spotification import constants
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
import json
import pandas as pd
from datetime import datetime
# Timestamp captured once when this cell runs; format is date plus wall-clock time.
YMD_FMT = "%Y-%m-%d %H:%M:%S"
CURR_TIME = datetime.now().strftime(YMD_FMT)
print(CURR_TIME)

# Widen the pandas display limits so large frames are shown without truncation.
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

# Reference:
# https://spotipy.readthedocs.io/en/2.22.1/#module-spotipy.client
# Client-credentials auth: the id/secret pair is read from the local constants module.
_credentials = SpotifyClientCredentials(
    client_id=constants.SPOTIFY_CLIENT_ID,
    client_secret=constants.SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=_credentials)

2024-01-21 12:28:54


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
query = "The 1975"

tracks_data = {}   # track id -> dict of track attributes (one row per track)
artists_data = {}  # artist id -> dict of artist attributes (one row per artist)
network_data = [] # edges

### We iterate over tracks as the fundamental unit of interest.
results = sp.search(q=query, limit=3)
for idx, track in enumerate(results['tracks']['items']):
    ### For each track

    print(idx, track['id'], track['name'], '|||', ' ; '.join([t['name'] for t in track['artists']]))
    track_id = track['id']
    track_name = track['name']
    track_date = track['album']['release_date'] # always has associated album

    ### Let track be the primitive unit of interest.
    ### When we look up a track, populate:
    ### 1. data for track
    ### 2. data for its artist(s)
    ### 3. edges for track-artist.
    ### (ignore albums for now, noting a track-artist edge can have multiple release dates)

    ### ----------------------------------------------------------------------------------------
    ### 1. data for track

    if track_id not in tracks_data:
        rowdata = {}

        # get more album info
        album_info = sp.album(track['album']["external_urls"]["spotify"])
        # can add albums as a third entity later if needed.

        # construct dataframe
        rowdata['date'] = track_date
        rowdata['id'] = track_id
        rowdata['name'] = track_name
        rowdata['album'] = track['album']['name']
        rowdata['album_genres'] = ';'.join(sorted(album_info['genres'])) # NULLABLE
        rowdata['popularity'] = track['popularity']
        rowdata['duration_seconds'] = track['duration_ms']/1000.0
        rowdata['url'] = track['external_urls']['spotify']

        # audio analysis for track
        # https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis
        audio_analysis = sp.audio_analysis(track_id)
        for field in ['loudness','tempo','tempo_confidence',
                      'time_signature','time_signature_confidence',
                      'key','key_confidence', 'mode','mode_confidence']:
            rowdata[field] = audio_analysis['track'][field]

        # audio features of a track
        features_list = sp.audio_features(track_id) # list of tracks --> list of results
        # NOTE(review): the API documents a null entry for tracks it has no features for;
        # record None for each feature instead of failing on subscripting None.
        audio_features = features_list[0] if features_list else None
        for field in ['danceability','energy','speechiness','acousticness','liveness','valence']:
            rowdata[field] = audio_features[field] if audio_features else None
        tracks_data[track_id] = rowdata

    ### ----------------------------------------------------------------------------------------
    ### 2. data for its artist(s)

    for artist in track['artists']:
        artist_id = artist['id']
        artist_name = artist['name']

        # Artists shared by several tracks are looked up only once: skip the API
        # call entirely when this artist already has a row.
        if artist_id in artists_data:
            continue

        # artist info (genres, popularity, followers, etc.)
        # https://stackoverflow.com/questions/61624487/extract-artist-genre-and-song-release-date-using-spotipy
        # A single lookup by id is enough; looking the artist up again by its
        # Spotify URL returns the same object.
        artist_info = sp.artist(artist_id)

        artists_data[artist_id] = {
            'id': artist_id,
            'name': artist_name,
            'genres': ';'.join(sorted(artist_info['genres'])),
            'popularity': artist_info['popularity'],
            'followers': artist_info['followers']['total'],
            'url': artist_info['external_urls']['spotify'],
        }

    ### ----------------------------------------------------------------------------------------
    ### 3. edges for track-artist.
    # One edge per (track, artist) pair, appended for every search hit.
    network_data.extend(
        {
            'date': track_date, # release date
            'track_id': track_id,
            'artist_id': artist['id'],
        }
        for artist in track['artists']
    )
    
### Combine all results.
# Each value in tracks_data / artists_data is already a complete row (including
# its 'id'), so the frames are built directly from the row dicts. This keeps
# per-column dtypes (transposing a mixed-type frame turns every column into
# object dtype) and does not fail when the search returned no items.
index_cols = ['id','date','name','url']
df_tracks = pd.DataFrame(list(tracks_data.values()))
# Put the identifying columns first, followed by the rest in insertion order.
# reindex (rather than label selection) also works on an empty frame.
df_tracks = df_tracks.reindex(
    columns=index_cols + [c for c in df_tracks.columns if c not in index_cols]
)
df_artists = pd.DataFrame(list(artists_data.values()))
df_network = pd.DataFrame(network_data)
# Stamp every table with the time captured in CURR_TIME.
for df in [df_tracks, df_artists, df_network]:
    df.insert(0, 'scrape_time', CURR_TIME)