In [135]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.notebook import tqdm
import pickle
import re
from datetime import datetime
from pprint import PrettyPrinter
pp = PrettyPrinter(indent = 2)


In [3]:
redirect_uri = "http://localhost/"

#request your own Spotify API token
#save info in local text file with username on first line, client_id on second, and client_secret on third
with open("token.txt") as f:
    username, client_id, client_secret = (x.strip("\n") for x in f)

In [4]:
sp = spotipy.client.Spotify(client_credentials_manager = SpotifyClientCredentials(client_id = client_id, client_secret = client_secret))

In [5]:
dfi = pd.read_csv("TopCharts_incomplete.csv").dropna()

In [7]:
def _get_uri(query, sp = sp):
    """
    [Private function]
    Retrieves uri for given query
    
    input: query = target artist and song title in "track:{title} artist:{artist}" format
           sp = Spotify client
    output: uri (string)
    """
    r = sp.search(q = query)
    if len(r["tracks"]["items"]) == 0: #error
        return None
    else:
        uri = r["tracks"]["items"][0]["uri"]
    return uri

In [8]:
def getAudioFeatures(df, sp = sp):
    """
    [Public function]
    Retrieves audio features for songs in df
    **Note: for current dataset with 239,000 rows and 5,125 unique songs, this takes ~1 hour to run**
    
    input: df = DataFrame with artist names and titles
           sp = Spotify client
    output: df1 = DataFrame with audio features
            errors = list of failed queries
    """
    unique_df = df.drop_duplicates(subset = ['artist', 'title'])
    df1 = df.copy()
    errors = []
    bar = tqdm(total = len(unique_df))
    start_time = datetime.now()

    for _,row in unique_df.iterrows():
        artist,title = row["artist"], row["title"]
        qtitle = (" ").join(list(filter(lambda x: len(x)!=0, re.split("\W", title))))
        query = f"track:{qtitle} artist:{artist}"
        uri = _get_uri(query, sp)
        
        if uri is None:
            errors.append(query)
        else:
            data = sp.audio_features(uri)[0]
            del data["type"], data["id"], data["track_href"], data["analysis_url"]
            for col in data.keys():
                df1.loc[(df1.artist == artist) & (df1.title == title), col] = data[col]
        bar.update(1)
    bar.close()
    end_time = datetime.now()
    print(f"Finished in {str(end_time - start_time)}")
    return df1, errors

In [9]:
dfc, e = getAudioFeatures(dfi)

HBox(children=(FloatProgress(value=0.0, max=5121.0), HTML(value='')))


Finished in 0:51:11.118993


In [59]:
dfc.head()

Unnamed: 0,rank,date,artist,title,streams,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,duration_ms,time_signature
0,1,2017-01-01,Migos,Bad and Boujee (feat. Lil Uzi Vert),1371493,0.927,0.665,11.0,-5.313,1.0,0.244,0.061,0.0,0.123,0.175,127.076,spotify:track:4Km5HrUvYTaSUfiSGPJeQR,343150.0,4.0
1,2,2017-01-01,Drake,Fake Love,1180074,0.928,0.481,9.0,-9.35,0.0,0.287,0.105,0.0,0.176,0.613,134.007,spotify:track:343YBumqHu19cGoGARUTsd,210937.0,4.0
2,3,2017-01-01,The Weeknd,Starboy,1064351,0.679,0.587,7.0,-7.015,1.0,0.276,0.141,6e-06,0.137,0.486,186.003,spotify:track:7MXVkk9YMctZqd1Srtv4MB,230453.0,4.0
3,4,2017-01-01,The Chainsmokers,Closer,1010492,0.748,0.524,8.0,-5.599,1.0,0.0338,0.414,0.0,0.111,0.661,95.01,spotify:track:7BKLCZ1jbUBVqRi2FVlTVw,244960.0,4.0
4,5,2017-01-01,Rae Sremmurd,Black Beatles,874289,0.794,0.632,0.0,-6.163,1.0,0.0649,0.142,0.0,0.128,0.355,145.926,spotify:track:6fujklziTHa8uoM5OQSfIo,291893.0,4.0


In [25]:
dfc.to_csv("TopCharts_missing.csv", index = False)
with open("missing_queries.data", "wb") as f:
    pickle.dump(e, f)

In [54]:
for d in dfc.date.unique():
    n = len(dfc.loc[dfc.date == d, 'rank'])
    if n != 200:
        print(f"Date: {d}\n\tMissing: {200 - n}")

Date: 2017-07-20
	Missing: 4
Date: 2017-07-21
	Missing: 4
Date: 2017-07-22
	Missing: 1
Date: 2017-07-23
	Missing: 1
Date: 2017-11-09
	Missing: 1
Date: 2017-11-10
	Missing: 1
Date: 2017-11-11
	Missing: 1
Date: 2017-11-12
	Missing: 1
Date: 2017-11-13
	Missing: 1
Date: 2017-11-14
	Missing: 1


In [119]:
def fixErrors(e, df1, sp):
    df = df1.copy()
    missing_df = df.loc[df.uri.isna(), :].drop_duplicates(subset = ['artist', 'title'])
    errors = []
    bar = tqdm(total = len(missing_df))
    start_time = datetime.now()
    
    
    for _,row in missing_df.iterrows():
        artist,title = row["artist"].replace("'", ""), row["title"].replace("'", "")
        if "-" in title:
            title = title[:title.find("-")+1]
        if "(" in title:
            title = title[:title.find("(")] + title[title.find(")")+1:]
        
        query = f"track:{title} artist:{artist}"
        uri = _get_uri(query, sp)
        
        if uri is None:
            errors.append(query)
        else:
            data = sp.audio_features(uri)[0]
            del data["type"], data["id"], data["track_href"], data["analysis_url"]
            for col in data.keys():
                df.loc[(df.artist == artist) & (df.title == title), col] = data[col]
        bar.update(1)
    bar.close()
    end_time = datetime.now()
    print(f"Finished in {str(end_time - start_time)}")
    return df, errors
    

In [120]:
fixed_df, e1 = fixErrors(e, dfc, sp)

HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


Finished in 0:00:56.230983


In [136]:
pp.pprint(e1)
pp.pprint(len(e1))

[ 'track:Dont Worry Be Happy artist:BobMcFerrin',
  'track:Yeah Yeah  artist:Travis Scott',
  'track:美女と野獣 artist:Ariana Grande',
  'track:The Heart Part 4 artist:Kendrick Lamar',
  'track:Courtesy Of The Red, White And Blue  artist:ToKeith',
  'track:Ok artist:Lil Pump',
  'track:Get to the Money artist:Chad Focus',
  'track:Tu Sabes Que Te Quiero artist:Chucho Flash',
  'track:Get to the Money  artist:Chad Focus',
  'track:The Race artist:22 Savage',
  'track:Jingle Bell Rock artist:MC Ty',
  'track:Rudolph the Rednose Reindeer artist:DMX',
  'track:Jingle Bell Rock artist:BobHelms',
  'track:La Modelo artist:Lean Trap',
  'track:Hurt To Look  artist:Swae Lee',
  'track:Bounce Out With That FT. YBN Nahmir artist:DJ A1',
  'track:Who Run It  [Feat. Lil Uzi Vert] artist:G Herbo',
  'track:They Afraid Of You artist:Trippie Redd',
  'track:ORANGE SODA artist:BaKeem',
  'track:Matrix artist:NLE Choppa',
  'track:Blueberry faygo artist:Yung Anime',
  'track:Blueberry Fergo artist:Lil Monet

In [137]:
assert len(dfc) == len(fixed_df) #make sure we didnt accidentally drop rows

In [138]:
fixed_df.to_csv("TopCharts_complete.csv", index = False)