In [2]:
import pandas as pd

import requests
import base64
import time

import artist_data

## Gathering Data

By using the Spotify API, here I will get the artist Spotify popularity and followers and match them to their corresponding songs. By using the `CLIENT ID ` and `CLIENT SECRET`, that they provide to registered applications in their *developer* website.

In [3]:
CLIENT_ID = 'x'
CLIENT_SECRET = 'x'
REDIRECT_URL = 'http://localhost:5000'

token = artist_data.get_access_token(CLIENT_ID, CLIENT_SECRET)

In [4]:
# Run only one time to initialize the new columns and save the dataset to a file
# data = pd.read_csv("data/filled_spotify.csv", index_col=0)
# data["artist_popularity"] = None
# data["artist_followers"] = None
# data.to_csv("data/filled_artists_info.csv")

data = pd.read_csv("data/filled_artists_info.csv", index_col=0)

In [7]:
# Clean main artists that still have their features left, because of 'X' separating the names
# Or have features in brackets (e.g. The Poppy Family (Featuring Susan Jacks))

# data['main_artist'] = data['main_artist'].str.replace(r'\s[xX]\s.*', '', regex=True) \
#                             .str.replace(r'\(.*', '', regex=True)

# data.to_csv("data/filled_artists_info.csv")

After loading the dataset, I get only the rows with missing values for `artist_popularity` and `artist_followers` and shuffle the data.

In [10]:
missing_data = data[data["artist_popularity"].isnull()]
missing_data = missing_data.sample(frac=1) #Shuffling to avoid going over the first rows which could not be filled every time

In [7]:
# missing_data['main_artist'] = missing_data['main_artist'].str.replace(r'\s[xX]\s.*', '', regex=True) \
#                             .str.replace(r'\(.*', '', regex=True)

The following code block was ran multiple times in order to fill missing values for artist popularity and followers every time. It fills missing artist popularity and follower data by querying an API, updating the dataset in batches of 25 rows, and saving the results to a CSV file.

In [16]:
for _ in range(20):
    # Iterate through the DataFrame rows
    for idx, row in missing_data.head(25).iterrows():
        artist_name = row["main_artist"]
        info = artist_data.get_artist_followers_and_popularity(artist_name, token)
        time.sleep(0.1)
        if info:
            # Populate the DataFrame with the cached or retrieved artist information
            data.loc[data["main_artist"] == artist_name, ["artist_popularity"]] = info["popularity"]
            data.loc[data["main_artist"] == artist_name, ["artist_followers"]] = info["followers"]
            print(f"{artist_name}: f: {info['popularity']} p: {info['followers']}")
        else:
            print("Not found " + artist_name)


    missing_data = missing_data.reset_index(drop = True).drop(range(25))

data.to_csv("data/filled_artists_info.csv")

Changing Faces : f: 30 p: 5822
MoneyBagg Yo: f: 77 p: 4749750
Das EFX : f: 58 p: 390055
Not found Cymarron
Not found John W. Anderson presents KaSandra
The Kings: f: 32 p: 11605
Mickey Lee Lane: f: 11 p: 450
nan: f: 64 p: 3078486
Anita Cochran : f: 15 p: 10585
Wyatt : f: 69 p: 316739
nan: f: 64 p: 3078486
Not found Cymarron
El Alfa: f: 79 p: 3750772
Tom MacDonald: f: 67 p: 1713345
The Verve: f: 69 p: 2444237
Jennifer Hanson: f: 61 p: 2001876
