In [2]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.notebook import tqdm
import pickle

In [3]:
redirect_uri = "http://localhost/"

# Request your own Spotify API credentials before running this notebook.
# Save them in a local text file ("token.txt") with the username on the first line,
# the client_id on the second, and the client_secret on the third.
with open("token.txt") as f:
    # Use only the first three lines and strip all surrounding whitespace, so that
    # Windows line endings ("\r\n") or a trailing blank line do not corrupt the
    # credentials or break the unpacking. Fewer than three lines still raises ValueError.
    username, client_id, client_secret = (line.strip() for line in f.read().splitlines()[:3])

In [4]:
# Build the Spotify client from the app credentials loaded from token.txt
sp = spotipy.client.Spotify(
    client_credentials_manager = SpotifyClientCredentials(
        client_id = client_id,
        client_secret = client_secret,
    )
)

## Setup: get audio features for the songs in TopCharts (skip this section if the complete TopCharts CSV file, `TopCharts_complete.csv`, already exists)

In [None]:
# Load the chart data that will be enriched with audio features
df = pd.read_csv(filepath_or_buffer = "TopCharts.csv")

In [62]:
# Load the previously pickled collection of unique songs
# (presumably {artist: set of titles}, as consumed by getAudioFeatures -- TODO confirm)
# NOTE(review): pickle.load can execute arbitrary code; only load a file you created yourself
with open('unique.data', mode = 'rb') as handle:
    unique = pickle.load(handle)

In [60]:
def _get_uri(query, sp = sp):
    """
    [Private function]
    Retrieves uri of the first search result for given query
    
    input: query = target artist and song title in "track:{title} artist:{artist}" format
           sp = Spotify client
    output: uri (string), or None if the search returns no tracks
    """
    # Only the first hit is used, so request a single track instead of a full page of results
    r = sp.search(q = query, type = "track", limit = 1)
    items = r["tracks"]["items"]
    if not items: #error: nothing matched the query
        return None
    return items[0]["uri"]

In [61]:
def getAudioFeatures(df, unique, sp = sp):
    """
    [Public function]
    Retrieves audio features for songs in df
    **Note: for current dataset with 239,000 rows and 5,125 unique songs, this takes ~1 hour to run**
    
    input: df = DataFrame with artist names and titles
           unique = dict of unique songs {artist 1: {song 1, ..., song m}, ..., artist n: {song 1, ..., song m}}
           sp = Spotify client
    output: df1 = copy of df with the audio feature columns filled in for every song that was found
            errors = list of failed queries (no search result, or no audio features returned)
    """
    unwanted = ("type", "id", "track_href", "analysis_url") #response fields that are not stored in the DataFrame
    df1 = df.copy()
    errors = []
    
    #progress bar; the with-block closes it even if a request raises
    with tqdm(total = sum(len(titles) for titles in unique.values())) as bar:
        for artist, titles in unique.items():
            for title in titles:
                query = f"track:{title} artist:{artist}"
                
                uri = _get_uri(query, sp)
                features = sp.audio_features(uri) if uri is not None else None
                #the response (or its only entry) may be None when no audio features are available
                data = features[0] if features else None
                if data is None:
                    errors.append(query)
                else:
                    #build the row mask once per song instead of once per column
                    rows = (df.artist == artist) & (df.title == title)
                    for col, value in data.items():
                        if col not in unwanted:
                            df1.loc[rows, col] = value
                bar.update(1)
    
    return df1, errors

In [None]:
# Long-running step: look up audio features for every unique song
df1, errors = getAudioFeatures(df = df, unique = unique)

In [None]:
# Persist the (possibly incomplete) result so it can be reloaded later
df1.to_csv(path_or_buf = "TopCharts_incomplete.csv")

## NOTE: Hit a MemoryError, so the incomplete df was saved to a CSV file. The following functions fix the incomplete df without access to the *errors* variable above -- don't use them if the *errors* variable is available

In [5]:
# Reload the saved intermediate result; the first CSV column holds the original index
df1 = pd.read_csv(filepath_or_buffer = "TopCharts_incomplete.csv", index_col = 0)

In [6]:
# DataFrame of songs with missing audio features (resulting from failed queries in getAudioFeatures())
missing_df = df1.loc[df1["uri"].isna()]

In [7]:
def create_unique(df):
    """
    Creates dict of unique artists and songs
    *Only use when 'errors' variable from getAudioFeatures() inaccessible
    
    input: DataFrame with "artist" and "title" columns
    output: dict of sets {artist 1: {song 1, ... ,song m}, ...., artist n: {song 1, ..., song m}}
            note: total number of songs in output dict is number of unique songs in df
            note: artists and titles are converted to str
    """
    d = dict()
    #walk both columns in parallel: unlike df.loc[i, col] this yields one scalar per row
    #even when the index contains duplicate labels, and avoids a label lookup per cell
    for artist, title in tqdm(zip(df["artist"], df["title"]), total = len(df)):
        d.setdefault(str(artist), set()).add(str(title))
    return d

In [8]:
# Group the songs that still lack audio features by artist
missing_songs = create_unique(df = missing_df)

HBox(children=(FloatProgress(value=0.0, max=6470.0), HTML(value='')))




In [48]:
def _process_errors(missing_dict):
    """
    [Private function]
    
    Converts from {artist 1: {song 1, ... ,song m}, ...., artist n: {song 1, ..., song m}} format to valid query format
    
    input: dict of unique artists and songs
    output: list of tuples(song title, artist, formatted query)
    """
    q_list = []
    for artist in missing_dict:
        for title in missing_dict[artist]:
            title = str(title)
            if "'" in title:
                title = title.replace("'", "")
            if "-" in title:
                title = title[:title.find("-")]
            if "(" in title:
                s = title.find("(")
                e = title.find(")")
                title = title[:s] + title[e+1:]
            if "(" in title: #songs in dataset have at most two sets of parentheses
                s = title.find("(")
                e = title.find(")")
                title = title[:s] + title[e+1:]
            title = title.strip()

            q = f"track:{title} artist:{artist}"
            q_list.append(tuple([title, artist, q]))
    return q_list

In [51]:
def fixErrors(missing_dict, df, sp = sp):
    """
    [Public function]
    Creates a copy of the DataFrame with missing values and fills in the missing values
    
    input: missing_dict = dict of sets of missing songs {artist 1: {song 1, ... ,song m}, ...., artist n: {song 1, ..., song m}}
           df = DataFrame with missing values
           sp = Spotify client
    output: df1 = copy of df with audio features filled in for every song that was found
            new_errors = list of failed queries (no search result, or no audio features returned)
    """
    unwanted = ("type", "id", "track_href", "analysis_url") #response fields that are not stored in the DataFrame
    df1 = df.copy()
    new_errors = []
    
    #create_unique() stores artists and titles as str, so compare against the str form of the columns
    artists = df.artist.astype(str)
    titles = df.title.astype(str)
    
    #keep the ORIGINAL title of every song: the rows in df still carry it, whereas the
    #simplified title returned by _process_errors() would match no row once it was altered
    songs = [(artist, title) for artist in missing_dict for title in missing_dict[artist]]
    
    for artist, title in tqdm(songs):
        q = _process_errors({artist: [title]})[0][2] #simplified search query for this song
        uri = _get_uri(q, sp)

        features = sp.audio_features(uri) if uri is not None else None
        #the response (or its only entry) may be None when no audio features are available
        data = features[0] if features else None
        if data is None:
            new_errors.append(q)
        else:
            #build the row mask once per song instead of once per column
            rows = (artists == artist) & (titles == str(title))
            for col, value in data.items():
                if col not in unwanted:
                    df1.loc[rows, col] = value

    return df1, new_errors  

In [52]:
# Retry the failed songs with simplified titles
df2, new_errors = fixErrors(missing_dict = missing_songs, df = df1)

HBox(children=(FloatProgress(value=0.0, max=162.0), HTML(value='')))




In [53]:
new_errors

['track:Dont Worry Be Happy artist:BobMcFerrin',
 "track:How Far Ill Go artist:Auli'i Cravalho",
 'track:Yeah Yeah artist:Travis Scott',
 'track:美女と野獣 artist:Ariana Grande',
 'track:The Heart Part 4 artist:Kendrick Lamar',
 'track:Courtesy Of The Red, White And Blue artist:ToKeith',
 'track:Ok artist:Lil Pump',
 'track:Get to the Money artist:Chad Focus',
 'track:Get to the Money artist:Chad Focus',
 'track:Tu Sabes Que Te Quiero artist:Chucho Flash',
 'track:The Race artist:22 Savage',
 'track:Jingle Bell Rock artist:MC Ty',
 'track:Rudolph the Rednose Reindeer artist:DMX',
 'track:Jingle Bell Rock artist:BobHelms',
 'track:La Modelo artist:Lean Trap',
 'track:Hurt To Look artist:Swae Lee',
 'track:Bounce Out With That FT. YBN Nahmir artist:DJ A1',
 'track:Who Run It  [Feat. Lil Uzi Vert] artist:G Herbo',
 'track:They Afraid Of You artist:Trippie Redd',
 'track:ORANGE SODA artist:BaKeem',
 'track:Matrix artist:NLE Choppa',
 'track:Blueberry faygo artist:Yung Anime',
 'track:Blueberry 

In [54]:
# Tracks in new_errors can't be found on Spotify, so just drop the rows that still have no uri
df3 = df2[df2.uri.notna()]

In [59]:
# Write the final dataset
df3.to_csv(path_or_buf = "TopCharts_complete.csv")