In [1]:
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import keyring
import time

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Setup Spotipy credentials and query wrapper

In [2]:
client_credentials_manager = SpotifyClientCredentials(client_id=keyring.get_password('spotify', 'cid'),
                                                      client_secret=keyring.get_password('spotify', 'secret') )
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

## Get sample artists data

In [3]:
artist_id = '2YZyLoL8N0Wb9xBt1NhZWg'

In [4]:
# View sp.track output
sp.artist(artist_id)

{&#39;external_urls&#39;: {&#39;spotify&#39;: &#39;https://open.spotify.com/artist/2YZyLoL8N0Wb9xBt1NhZWg&#39;},
 &#39;followers&#39;: {&#39;href&#39;: None, &#39;total&#39;: 15554576},
 &#39;genres&#39;: [&#39;conscious hip hop&#39;, &#39;hip hop&#39;, &#39;rap&#39;, &#39;west coast rap&#39;],
 &#39;href&#39;: &#39;https://api.spotify.com/v1/artists/2YZyLoL8N0Wb9xBt1NhZWg&#39;,
 &#39;id&#39;: &#39;2YZyLoL8N0Wb9xBt1NhZWg&#39;,
 &#39;images&#39;: [{&#39;height&#39;: 640,
   &#39;url&#39;: &#39;https://i.scdn.co/image/3a836196bfb341f736c7fe2704fb75de53f8dfbb&#39;,
   &#39;width&#39;: 640},
  {&#39;height&#39;: 320,
   &#39;url&#39;: &#39;https://i.scdn.co/image/5259c0496329b3f608a1ae0edb799cd2f8451acc&#39;,
   &#39;width&#39;: 320},
  {&#39;height&#39;: 160,
   &#39;url&#39;: &#39;https://i.scdn.co/image/b772a78d4cb192268d6f601a78f21044c17d6dda&#39;,
   &#39;width&#39;: 160}],
 &#39;name&#39;: &#39;Kendrick Lamar&#39;,
 &#39;popularity&#39;: 90,
 &#39;type&#39;: &#39;artist&#39;,
 &#39;u

## Get sample track data

In [5]:
track_id = "74tLlkN3rgVzRqQJgPfink"

In [6]:
# View sp.track output
sp.track(track_id)

{&#39;album&#39;: {&#39;album_type&#39;: &#39;album&#39;,
  &#39;artists&#39;: [{&#39;external_urls&#39;: {&#39;spotify&#39;: &#39;https://open.spotify.com/artist/2YZyLoL8N0Wb9xBt1NhZWg&#39;},
    &#39;href&#39;: &#39;https://api.spotify.com/v1/artists/2YZyLoL8N0Wb9xBt1NhZWg&#39;,
    &#39;id&#39;: &#39;2YZyLoL8N0Wb9xBt1NhZWg&#39;,
    &#39;name&#39;: &#39;Kendrick Lamar&#39;,
    &#39;type&#39;: &#39;artist&#39;,
    &#39;uri&#39;: &#39;spotify:artist:2YZyLoL8N0Wb9xBt1NhZWg&#39;}],
  &#39;available_markets&#39;: [&#39;AD&#39;,
   &#39;AE&#39;,
   &#39;AL&#39;,
   &#39;AR&#39;,
   &#39;AT&#39;,
   &#39;AU&#39;,
   &#39;BA&#39;,
   &#39;BE&#39;,
   &#39;BG&#39;,
   &#39;BH&#39;,
   &#39;BO&#39;,
   &#39;BR&#39;,
   &#39;BY&#39;,
   &#39;CH&#39;,
   &#39;CL&#39;,
   &#39;CO&#39;,
   &#39;CR&#39;,
   &#39;CY&#39;,
   &#39;CZ&#39;,
   &#39;DE&#39;,
   &#39;DK&#39;,
   &#39;DO&#39;,
   &#39;DZ&#39;,
   &#39;EC&#39;,
   &#39;EE&#39;,
   &#39;EG&#39;,
   &#39;ES&#39;,
   &#39;FI&#39;,
   &#39

In [7]:
# View sp.audio_featrues output
sp.audio_features(track_id)

[{&#39;danceability&#39;: 0.716,
  &#39;energy&#39;: 0.531,
  &#39;key&#39;: 7,
  &#39;loudness&#39;: -7.355,
  &#39;mode&#39;: 1,
  &#39;speechiness&#39;: 0.122,
  &#39;acousticness&#39;: 0.0703,
  &#39;instrumentalness&#39;: 0,
  &#39;liveness&#39;: 0.224,
  &#39;valence&#39;: 0.344,
  &#39;tempo&#39;: 71.994,
  &#39;type&#39;: &#39;audio_features&#39;,
  &#39;id&#39;: &#39;74tLlkN3rgVzRqQJgPfink&#39;,
  &#39;uri&#39;: &#39;spotify:track:74tLlkN3rgVzRqQJgPfink&#39;,
  &#39;track_href&#39;: &#39;https://api.spotify.com/v1/tracks/74tLlkN3rgVzRqQJgPfink&#39;,
  &#39;analysis_url&#39;: &#39;https://api.spotify.com/v1/audio-analysis/74tLlkN3rgVzRqQJgPfink&#39;,
  &#39;duration_ms&#39;: 386907,
  &#39;time_signature&#39;: 4}]

## Read consolidated spotify daily charts

In [8]:
df = pd.read_csv('data/spotify_daily_charts.csv')
df.head()

Unnamed: 0,date,position,track_id,track_name,artist,streams
0,2018-01-01,1,0ofbQMrRDsUaVKq2mGLEAb,Havana,Camila Cabello,155633
1,2018-01-01,2,0tgVpDi06FyKpA1z0VMD4v,Perfect,Ed Sheeran,134756
2,2018-01-01,3,3hBBKuWJfxlIlnd9QFoC8k,What Lovers Do (feat. SZA),Maroon 5,130898
3,2018-01-01,4,1mXVgsBdtIVeCLJnSnmtdV,Too Good At Goodbyes,Sam Smith,130798
4,2018-01-01,5,2ekn2ttSfGqwhhate0LSR0,New Rules,Dua Lipa,125472


## Get data of unique tracks in charts 

In [9]:
def get_track_data(t_id):                    
    track_data = sp.track(t_id)
    track_features = sp.audio_features(t_id)
    
    #get only main(first) artist
    td_list = [t_id,\
               track_data['name'],\
               track_data['artists'][0]['id'],\
               track_data['artists'][0]['name'],\
               track_data['album']['uri'].split(":")[2],\
               track_data['duration_ms'],\
               track_data['album']['release_date'],\
               track_data['popularity']]
    data = pd.DataFrame([td_list], columns = ['track_id','track_name','artist_id','artist_name','album_id','duration','release_date','popularity'])

    relevant_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',\
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']  
    
    tf_data = pd.DataFrame(track_features)
    tf_data = tf_data[relevant_cols]
    
    data = pd.concat([data, tf_data], axis=1)
    return data


In [10]:
get_track_data(track_id)

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,74tLlkN3rgVzRqQJgPfink,Money Trees,2YZyLoL8N0Wb9xBt1NhZWg,Kendrick Lamar,0Oq3mWfexhsjUh0aNNBB5u,386906,2012,73,0.716,0.531,7,-7.355,1,0.122,0.0703,0,0.224,0.344,71.994


In [11]:
track_df = df[['track_id','track_name']].drop_duplicates()
track_df

Unnamed: 0,track_id,track_name
0,0ofbQMrRDsUaVKq2mGLEAb,Havana
1,0tgVpDi06FyKpA1z0VMD4v,Perfect
2,3hBBKuWJfxlIlnd9QFoC8k,What Lovers Do (feat. SZA)
3,1mXVgsBdtIVeCLJnSnmtdV,Too Good At Goodbyes
4,2ekn2ttSfGqwhhate0LSR0,New Rules
...,...,...
196985,0zzVTGyRrWpQu8Fr28NRAv,OK Not To Be OK
197177,4G7uzhxgKFBCJApBpuuBgU,Kabet
197705,3KUGAgxo3b81X5bWoOp0U8,Back Door
197788,0rbKrBvZUYY9GN9l057BuY,"always, i'll care"


In [12]:
len(pd.unique(track_df['track_id'].values)),len(pd.unique(track_df['track_name'].values))

(2292, 1826)

> Q: Why is it that we have fewer unique track names than unique track ids? Is this expected or does it indicate a data processing error?

In [13]:
track_list = track_df['track_id'].values
df_list=[]

for i,track_id in enumerate(track_list):
    print('[%d/%d] Fetching track data for %s... ' % 
          (i+1,len(track_list),track_df[track_df['track_id']==track_id]['track_name'].values[0]), end = " ") 
    track_data = get_track_data(track_id) 
    df_list.append(track_data)
    print('done!')
    
    #sleep for 100 secs per 100 requests to avoid being blocked
    if (i % 100 == 0)&(i > 0):
        time.sleep(5)

[1973/2292] Fetching track data for 시작...  done!
[1974/2292] Fetching track data for Binibini...  done!
[1975/2292] Fetching track data for FANCY...  done!
[1976/2292] Fetching track data for THE SCOTTS...  done!
[1977/2292] Fetching track data for If You’re Too Shy (Let Me Know)...  done!
[1978/2292] Fetching track data for If You’re Too Shy (Let Me Know) - Edit...  done!
[1979/2292] Fetching track data for Righteous...  done!
[1980/2292] Fetching track data for Passenger Seat (Acoustic)...  done!
[1981/2292] Fetching track data for Nandito Na...  done!
[1982/2292] Fetching track data for I Choose - From The Netflix Original Film The Willoughbys...  done!
[1983/2292] Fetching track data for Fight Song...  done!
[1984/2292] Fetching track data for Savage Remix (feat. Beyoncé)...  done!
[1985/2292] Fetching track data for Moonlight...  done!
[1986/2292] Fetching track data for Play Date...  done!
[1987/2292] Fetching track data for Passenger Seat...  done!
[1988/2292] Fetching track dat

In [14]:
tracks_data_df = pd.concat(df_list)
tracks_data_df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,album_id,duration,release_date,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0ofbQMrRDsUaVKq2mGLEAb,Havana,4nDoRrQiYLoBzwC5BhVJzF,Camila Cabello,5chBPOVY2I0bG5V3igb5QL,216896,2017-08-03,4,0.768,0.517,7,-4.323,0,0.0312,0.186,3.8e-05,0.104,0.418,104.992
0,0tgVpDi06FyKpA1z0VMD4v,Perfect,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,3T4tUhGYeRNVUGevb0wThu,263400,2017-03-03,86,0.599,0.448,8,-6.312,1,0.0232,0.163,0.0,0.106,0.168,95.05
0,3hBBKuWJfxlIlnd9QFoC8k,What Lovers Do (feat. SZA),04gDigrS5kc9YWfZHwBETP,Maroon 5,1Jmq5HEJeA9kNi2SgQul4U,199849,2017-11-03,4,0.795,0.615,5,-5.211,0,0.0671,0.0786,3e-06,0.0855,0.393,110.009
0,1mXVgsBdtIVeCLJnSnmtdV,Too Good At Goodbyes,2wY79sveU1sp5g7SokKOiI,Sam Smith,3TJz2UBNYJtlEly0sPeNrQ,201000,2017-11-03,81,0.681,0.372,5,-8.237,1,0.0432,0.64,0.0,0.169,0.476,91.873
0,2ekn2ttSfGqwhhate0LSR0,New Rules,6M2wZ9GZgrQXHCFfjv46we,Dua Lipa,01sfgrNbnnPUEyz6GZYlt9,209320,2017-06-02,80,0.762,0.7,9,-6.021,0,0.0694,0.00261,1.6e-05,0.153,0.608,116.073


In [15]:
tracks_data_df.to_csv('data/spotify_daily_charts_tracks.csv', index=False, encoding='utf-8')

In [16]:
tracks_data_df.describe()

Unnamed: 0,duration,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0
mean,215783.227312,53.103839,0.635396,0.591876,5.123037,-6.876896,0.688045,0.083546,0.286942,0.009645,0.168288,0.455961,118.496587
std,46745.918466,28.050582,0.143064,0.190003,3.593686,2.860197,0.463393,0.08328,0.277537,0.06981,0.126592,0.213066,28.125389
min,37640.0,0.0,0.184,0.0541,0.0,-24.25,0.0,0.0232,2e-06,0.0,0.0215,0.0398,52.572
25%,189132.75,44.0,0.547,0.461,2.0,-8.291,0.0,0.0358,0.05315,0.0,0.0929,0.28875,96.97775
50%,209273.0,63.0,0.653,0.602,5.0,-6.4285,1.0,0.05095,0.186,0.0,0.118,0.4465,116.004
75%,238995.5,73.0,0.736,0.738,8.0,-4.87625,1.0,0.09055,0.468,1.7e-05,0.20025,0.608,136.0565
max,536217.0,100.0,0.953,0.969,11.0,0.175,1.0,0.733,0.979,0.908,0.955,0.973,207.476


## Get data of unique artists in charts 

In [3]:
#Get unique artists id
artist_df = tracks_data_df[['artist_id','artist_name']].drop_duplicates()
artist_df

NameError: name &#39;tracks_data_df&#39; is not defined

In [4]:
len(artist_df)

NameError: name &#39;artist_df&#39; is not defined

> Q: What does the ratio of unique artists to unique tracks tell you about the nature of the Spotify top-streamed market?

In [19]:
def get_artist_data(a_id):
       
    artist_data = sp.artist(a_id)

    ad_list = [a_id,\
               artist_data['name'],\
               artist_data['followers']['total'],\
               artist_data['genres'],\
               artist_data['popularity']]
    data = pd.DataFrame([ad_list], columns = ['artist_id','artist_name','total_followers','genres','popularity'])

    return data


In [20]:
get_artist_data(artist_id)

Unnamed: 0,artist_id,artist_name,total_followers,genres,popularity
0,2YZyLoL8N0Wb9xBt1NhZWg,Kendrick Lamar,15554576,"[conscious hip hop, hip hop, rap, west coast rap]",90


In [21]:
artist_list = artist_df['artist_id'].values
df_list=[]

for i,artist_id in enumerate(artist_list):
    print('[%d/%d] Fetching artist data for %s... ' % 
          (i+1,len(artist_list),artist_df[artist_df['artist_id']==artist_id]['artist_name'].values[0]), end = " ") 
    artist_data = get_artist_data(artist_id) 
    df_list.append(artist_data)
    print('done!')
    
    #sleep for 100 secs per 100 requests to avoid being blocked
    if (i % 100 == 0)& (i > 0):
        time.sleep(5)   

tching artist data for Andrea Babierra...  done!
[253/606] Fetching artist data for Anna Kendrick...  done!
[254/606] Fetching artist data for 88rising...  done!
[255/606] Fetching artist data for Why Don&#39;t We...  done!
[256/606] Fetching artist data for Tyga...  done!
[257/606] Fetching artist data for Daddy Yankee...  done!
[258/606] Fetching artist data for gnash...  done!
[259/606] Fetching artist data for Panic! At The Disco...  done!
[260/606] Fetching artist data for The Carters...  done!
[261/606] Fetching artist data for Ella Mai...  done!
[262/606] Fetching artist data for Jeremy Zucker...  done!
[263/606] Fetching artist data for Boyce Avenue...  done!
[264/606] Fetching artist data for Joseph Vincent...  done!
[265/606] Fetching artist data for Amber Leigh Irish...  done!
[266/606] Fetching artist data for Matt Johnson...  done!
[267/606] Fetching artist data for Chelsea Cutler...  done!
[268/606] Fetching artist data for Loud Luxury...  done!
[269/606] Fetching artist 

In [22]:
artist_data_df = pd.concat(df_list)
artist_data_df 

Unnamed: 0,artist_id,artist_name,total_followers,genres,popularity
0,4nDoRrQiYLoBzwC5BhVJzF,Camila Cabello,18225132,"[dance pop, pop, post-teen pop]",87
0,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,70222645,"[pop, uk pop]",93
0,04gDigrS5kc9YWfZHwBETP,Maroon 5,27071746,"[pop, pop rock]",90
0,2wY79sveU1sp5g7SokKOiI,Sam Smith,13923279,"[pop, post-teen pop, uk pop]",90
0,6M2wZ9GZgrQXHCFfjv46we,Dua Lipa,19765369,"[dance pop, pop, uk pop]",93
...,...,...,...,...,...
0,3QJUFtGBGL05vo0kCJZsmT,salem ilese,45658,[modern indie pop],76
0,3wVXTWabe3viT0jF7DfjOL,Vedo,156789,"[pop r&b, urban contemporary]",75
0,3hB5x7E5sQzBHuG3xiqmUC,Logan Harris,25279,[],64
0,5R4HkjoZdxW3ZoVGC0e0qD,Julian Sean,5757,[opm],52


In [23]:
artist_data_df.to_csv('data/spotify_daily_charts_artists.csv', index=False, encoding='utf-8')

## Resources
- Spotify API reference manual https://developer.spotify.com/documentation/web-api/reference/search/search/