In [2]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.notebook import tqdm
import pickle

In [3]:
redirect_uri = "http://localhost/"
# token.txt must hold exactly three lines, in this order:
# username, client id, client secret (the unpacking fails otherwise).
with open("token.txt") as f:
    username, client_id, client_secret = [line.strip("\n") for line in f]

In [4]:
# Build the Spotify client from the client id / secret read from token.txt.
sp = spotipy.client.Spotify(
    client_credentials_manager = SpotifyClientCredentials(
        client_id = client_id,
        client_secret = client_secret,
    )
)

## Setup: get audio features for the songs in TopCharts (skip this section if `TopCharts_complete.csv` already exists)

In [None]:
# Chart data; getAudioFeatures below selects rows through its `artist` and `title` columns.
df = pd.read_csv("TopCharts.csv")

In [None]:
# `unique` is consumed by getAudioFeatures as a mapping of artist -> collection of titles.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you created yourself.
with open('unique.data', 'rb') as f:
    unique = pickle.load(f)

In [11]:
def _get_uri(query, sp = sp):
    """Search Spotify for ``query`` and return the URI of the first track hit.

    Parameters
    ----------
    query : str
        Search string passed to ``sp.search`` as ``q``.
    sp : spotipy client, optional
        Client used for the search; defaults to the notebook-level ``sp``.

    Returns
    -------
    str or None
        The ``"uri"`` of the first item in the track results, or ``None``
        when the search returned no tracks.
    """
    tracks = sp.search(q = query)["tracks"]["items"]
    if not tracks:  # nothing matched the query
        return None
    return tracks[0]["uri"]

In [None]:
def getAudioFeatures(df, unique, sp = sp):
    """Look up Spotify audio features for every (artist, title) pair in ``unique``.

    Parameters
    ----------
    df : pandas.DataFrame
        Chart data with ``artist`` and ``title`` columns. It is not modified.
    unique : dict
        Mapping of artist -> collection of titles to look up.
    sp : spotipy client, optional
        Client used for the requests; defaults to the notebook-level ``sp``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df`` with one column per audio feature filled in for the
        matching rows, and the list of queries for which no track or no
        audio features could be found.
    """
    df1 = df.copy()
    errors = []
    # Keys of the audio-features payload that are not stored in the frame.
    dropped = ("type", "id", "track_href", "analysis_url")
    total = sum(len(titles) for titles in unique.values())

    # The context manager closes the progress bar even if a request raises.
    with tqdm(total = total) as bar:
        for artist, titles in unique.items():
            for title in titles:
                query = f"track:{title} artist:{artist}"

                uri = _get_uri(query, sp)
                data = None
                if uri is not None:
                    # The lookup may yield an empty result or a None entry for
                    # a track; treat both as an error instead of crashing.
                    found = sp.audio_features(uri)
                    data = found[0] if found else None

                if data is None:
                    errors.append(query)
                else:
                    # The row mask does not depend on the column: build it once.
                    rows = (df.artist == artist) & (df.title == title)
                    for col, value in data.items():
                        if col not in dropped:
                            df1.loc[rows, col] = value
                bar.update(1)

    return df1, errors

In [None]:
# Long-running: one search (plus one audio-features request on a hit) per (artist, title) pair.
df1, errors = getAudioFeatures(df, unique)

In [None]:
# Checkpoint the partially filled frame; the index is written too and is read back with index_col = 0.
df1.to_csv("TopCharts_incomplete.csv")

## NOTE: The run above hit a MemoryError, so the incomplete DataFrame was saved to a CSV file. The following functions repair the incomplete DataFrame without access to the *errors* variable from above -- don't use them if the *errors* variable is still available

In [5]:
# Reload the checkpoint; the first CSV column is the saved index.
df1 = pd.read_csv("TopCharts_incomplete.csv", index_col = 0)

In [6]:
# Rows whose `uri` is NaN -- presumably the ones that never received audio features (verify against the checkpoint).
missing_df = df1[df1.uri.isna()]

In [7]:
def create_unique(df):
    """Group the titles in ``df`` by artist.

    Rows are read positionally from the ``artist`` and ``title`` columns, so
    the result does not depend on the index (repeated index labels are fine).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``artist`` and ``title`` columns.

    Returns
    -------
    dict
        Mapping of ``str(artist)`` -> set of ``str(title)``.
    """
    d = dict()
    pairs = zip(df["artist"], df["title"])
    for artist, title in tqdm(pairs, total = len(df)):
        # str() keeps the original behaviour for non-string cells (e.g. NaN -> "nan").
        d.setdefault(str(artist), set()).add(str(title))
    return d

In [8]:
# artist -> set of titles for the rows that still lack a uri.
missing_songs = create_unique(missing_df)

HBox(children=(FloatProgress(value=0.0, max=6470.0), HTML(value='')))




In [48]:
def _process_errors(missing_dict):
    q_list = []
    for artist in missing_dict:
        for title in missing_dict[artist]:
            title = str(title)
            if "'" in title:
                title = title.replace("'", "")
            if "-" in title:
                title = title[:title.find("-")]
            if "(" in title:
                s = title.find("(")
                e = title.find(")")
                title = title[:s] + title[e+1:]
            if "(" in title:
                s = title.find("(")
                e = title.find(")")
                title = title[:s] + title[e+1:]
            title = title.strip()

            q = f"track:{title} artist:{artist}"
            q_list.append(tuple([title, artist, q]))
    return q_list

In [49]:
# Preview the (cleaned_title, artist, query) tuples that fixErrors will search for.
e = _process_errors(missing_songs)

In [50]:
# Display the cleaned queries for a manual check.
e

[('Dont Wanna Know', 'Maroon 5', 'track:Dont Wanna Know artist:Maroon 5'),
 ('Dont Let Me Down',
  'The Chainsmokers',
  'track:Dont Let Me Down artist:The Chainsmokers'),
 ('Dont Say', 'The Chainsmokers', 'track:Dont Say artist:The Chainsmokers'),
 ('Takeaway', 'The Chainsmokers', 'track:Takeaway artist:The Chainsmokers'),
 ('Side Effects',
  'The Chainsmokers',
  'track:Side Effects artist:The Chainsmokers'),
 ('CANT STOP THE FEELING!',
  'Justin Timberlake',
  'track:CANT STOP THE FEELING! artist:Justin Timberlake'),
 ('Cant Take It From Me',
  'Major Lazer',
  'track:Cant Take It From Me artist:Major Lazer'),
 ('Know No Better', 'Major Lazer', 'track:Know No Better artist:Major Lazer'),
 ('Cold Water', 'Major Lazer', 'track:Cold Water artist:Major Lazer'),
 ('Run Up', 'Major Lazer', 'track:Run Up artist:Major Lazer'),
 ('Light It Up', 'Major Lazer', 'track:Light It Up artist:Major Lazer'),
 ('Call On Me', 'Starley', 'track:Call On Me artist:Starley'),
 ('How Far Ill Go',
  'Alessia

In [51]:
def fixErrors(missing_dict, df, sp = sp):
    """Retry the audio-feature lookup for ``missing_dict`` using cleaned-up queries.

    Each title is searched in the simplified form produced by
    ``_process_errors``, but the rows to fill are selected with the original
    title, so songs whose title was changed by the clean-up are still updated.

    Parameters
    ----------
    missing_dict : dict
        Mapping of artist -> collection of titles still lacking features.
    df : pandas.DataFrame
        Frame with ``artist`` and ``title`` columns. It is not modified.
    sp : spotipy client, optional
        Client used for the requests; defaults to the notebook-level ``sp``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df`` with the recovered features filled in, and the
        queries that still found no track or no audio features.
    """
    df1 = df.copy()
    new_errors = []
    # Keys of the audio-features payload that are not stored in the frame.
    dropped = ("type", "id", "track_href", "analysis_url")

    songs = [
        (artist, title)
        for artist, titles in missing_dict.items()
        for title in titles
    ]

    for artist, title in tqdm(songs):
        # Clean one song at a time so the original title stays at hand.
        _, _, q = _process_errors({artist: [title]})[0]

        uri = _get_uri(q, sp)
        data = None
        if uri is not None:
            # The lookup may yield an empty result or a None entry for a
            # track; treat both as an error instead of crashing.
            found = sp.audio_features(uri)
            data = found[0] if found else None

        if data is None:
            new_errors.append(q)
            continue

        # Match on the original title, not on the cleaned search text.
        rows = (df.artist == artist) & (df.title == title)
        for col, value in data.items():
            if col not in dropped:
                df1.loc[rows, col] = value

    return df1, new_errors

In [52]:
# Retry the songs that are still missing, using the cleaned-up queries.
df2, new_errors = fixErrors(missing_songs, df1)

HBox(children=(FloatProgress(value=0.0, max=162.0), HTML(value='')))




In [53]:
# Queries that still could not be resolved after the clean-up.
new_errors

['track:Dont Worry Be Happy artist:BobMcFerrin',
 "track:How Far Ill Go artist:Auli'i Cravalho",
 'track:Yeah Yeah artist:Travis Scott',
 'track:美女と野獣 artist:Ariana Grande',
 'track:The Heart Part 4 artist:Kendrick Lamar',
 'track:Courtesy Of The Red, White And Blue artist:ToKeith',
 'track:Ok artist:Lil Pump',
 'track:Get to the Money artist:Chad Focus',
 'track:Get to the Money artist:Chad Focus',
 'track:Tu Sabes Que Te Quiero artist:Chucho Flash',
 'track:The Race artist:22 Savage',
 'track:Jingle Bell Rock artist:MC Ty',
 'track:Rudolph the Rednose Reindeer artist:DMX',
 'track:Jingle Bell Rock artist:BobHelms',
 'track:La Modelo artist:Lean Trap',
 'track:Hurt To Look artist:Swae Lee',
 'track:Bounce Out With That FT. YBN Nahmir artist:DJ A1',
 'track:Who Run It  [Feat. Lil Uzi Vert] artist:G Herbo',
 'track:They Afraid Of You artist:Trippie Redd',
 'track:ORANGE SODA artist:BaKeem',
 'track:Matrix artist:NLE Choppa',
 'track:Blueberry faygo artist:Yung Anime',
 'track:Blueberry 

In [54]:
# Keep only the rows that have a uri; rows that are still unresolved are dropped.
df3 = df2[~df2.uri.isna()]

In [59]:
# Final output of the notebook (the index is written as the first CSV column).
df3.to_csv("TopCharts_complete.csv")