## Code to build the artist graph dataframe

In [100]:
import numpy as np
import pandas as pd
import sqlite3
from os import getcwd, environ
import json

In [11]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
environ["SPOTIPY_CLIENT_ID"]='35eb9288550941109f8e0582bcd6bdf3'
environ["SPOTIPY_CLIENT_SECRET"]='dc00c8b7e55140518d8f62954af8876f'

In [12]:
# Download a table of artists with at least MIN_PLAYS song streams from MIN_LISTENERS unique listeners
MIN_PLAYS = 1000
MIN_LISTENERS = 50
query_string = "SELECT artist_id, artist_name, SUM(song_plays) AS total_plays, num_listeners FROM \
    (SELECT subset_songs.artist_id, subset_songs.artist_name, subset_triples.track_id, SUM(subset_triples.num_plays) AS song_plays, COUNT(*) as num_listeners\
    FROM subset_triples LEFT JOIN subset_songs ON subset_triples.track_id=subset_songs.track_id GROUP BY subset_triples.track_id) \
    GROUP BY artist_id HAVING total_plays >= {} AND num_listeners>={}".format(MIN_PLAYS,MIN_LISTENERS)
con = sqlite3.connect('track_metadata.db')
artist_base_table = pd.read_sql_query(query_string, con)
con.close()

In [13]:
artist_base_table.set_index(['artist_id'],inplace=True) # make artist_id row key
# new column of mean plays per listener, measure of fandom
artist_base_table['play_ratio'] = artist_base_table['total_plays']/artist_base_table['num_listeners'] 
print(artist_base_table.shape)
print(artist_base_table.head())

(207, 4)
                           artist_name  total_plays  num_listeners  play_ratio
artist_id                                                                     
AR02YGA1187B9B8AC4  Bersuit Vergarabat         3627           1269    2.858156
AR035N21187FB3938E                  BT         1391             63   22.079365
AR03BDP1187FB5B324      Britney Spears         9419             86  109.523256
AR040QX1187FB4CFE1        Alexisonfire         1528            474    3.223629
AR048JZ1187B9AEB85          Yellowcard         2872           1136    2.528169


In [17]:
con = sqlite3.connect('track_metadata.db')
query_string = "SELECT subset_songs.artist_id, subset_triples.track_id \
    FROM subset_triples LEFT JOIN subset_songs \
    ON subset_triples.track_id=subset_songs.track_id GROUP BY subset_triples.track_id"
artist_track_table = pd.read_sql_query(query_string, con)
con.close()
artist_track_table.set_index(['track_id'],inplace=True) # make track_id row key

In [41]:
# Table to map track_id to number of total plays
con = sqlite3.connect('track_metadata.db')
query_string = "SELECT track_id, SUM(num_plays) as total_plays FROM subset_triples GROUP BY track_id"
track_information = pd.read_sql_query(query_string, con)
con.close()
track_information.set_index(['track_id'],inplace=True) # make track_id row key

In [40]:
# Build a dictionary mapping tracks to number of plays for a specified artist
def get_freq_dict(artist_id):
    track_list = artist_track_table.index[artist_track_table['artist_id'] == artist_id].tolist() 
    artist_freq_dict = {}
    for track in track_list:
        num_plays = track_information.loc[track]['total_plays']
        artist_freq_dict[track] = num_plays
    return artist_freq_dict

In [46]:
artist_base_table['track_freq'] = artist_base_table.index.map(get_freq_dict)

In [49]:
# Table of triples (user_id, artist_id, num_streams)
con = sqlite3.connect('track_metadata.db')
query_string = "SELECT user_id, SUM(num_plays) as user_plays, artist_id FROM \
    (SELECT *  FROM (subset_triples LEFT JOIN subset_songs ON subset_triples.track_id=subset_songs.track_id))\
    GROUP BY user_id, artist_id"
user_artist_df = pd.read_sql_query(query_string, con)
con.close()

In [90]:
# Build a dictionary mapping artists to the user_id of top k streamers
def get_user_dict(artist_id, k=1):
    artist_user_dict = {}
    artist_subtable = user_artist_df[user_artist_df['artist_id'] == artist_id] # get all users who stream this artist
    artist_top_users = artist_subtable.nlargest(k, 'user_plays') # filter to top k (ties are cut off)
    
    total_plays_topk = 0 
    for index,user in artist_top_users.iterrows(): #build dictionary
        total_plays_topk += user['user_plays']
        artist_user_dict[user['user_id']] = user['user_plays']
        
    artist_total_plays = artist_base_table.loc[artist_id]['total_plays']
    top_k_ratio = total_plays_topk/artist_total_plays # What % of streams come from these top k listeners
    artist_user_dict['top_k_ratio'] = top_k_ratio
    return artist_user_dict

In [91]:
artist_base_table['top_listeners'] = artist_base_table.index.map(get_user_dict)

In [93]:
artist_id = 'AR03BDP1187FB5B324'

In [95]:
con = sqlite3.connect('track_metadata.db')
track_map = pd.read_sql_query("SELECT * FROM track_map", con)
con.close()
track_map.set_index('track_id',inplace=True)

In [104]:
def get_spotify_artist_id(artist_id):
    # Spotify genre tags for an artist
    # this information is stored in the track_map .json files
    track_dictionary = artist_base_table.loc[artist_id]['track_freq']
    top_played_song = max(track_dictionary, key=track_dictionary.get) # get the id of their top song
    song_code = track_map.loc[top_played_song]['song_id']
    prefix_code = song_code[2:4]
    json_file_path = getcwd()+'/millionsongdataset_echonest/'+prefix_code+'/'+song_code+'.json'
    
    spotify_artist_id = None
    with open(json_file_path,'r') as open_file:
        data = json.load(open_file)['response']['songs']
        if len(data) != 0:
            artist_maps = data[0]['artist_foreign_ids']
            for mapping in artist_maps:
                if mapping['catalog']=='spotify':
                    spotify_artist_id = mapping['foreign_id']
                else: pass
    return spotify_artist_id
print(get_spotify_artist_id(artist_id))

spotify:artist:26dSoYclwsYLMAKD3tpOr4


In [115]:
def get_genre_tags(artist_id):
    spotify_id = get_spotify_artist_id(artist_id)
    if spotify_id is None: return []
    spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())
    artist_data = spotify.artist(spotify_id)
    genre_list = artist_data['genres']
    return genre_list

In [119]:
artist_base_table['genres'] = artist_base_table.index.map(get_genre_tags)
filtered_artists = artist_base_table[artist_base_table['genres'] != []] # drop artists with no genre tags

ValueError: ('Lengths must match to compare', (207,), (0,))

In [None]:
print(filtered_artists.head())

In [None]:
filtered_artists.to_csv(getcwd()+'/artist_df.csv')