Take one look in the mirror

Implication so clear

I live life with no fear

Except for the idea

That one day you won't be here

'-' **Tyler, The Creator**  

In [2]:
# Importing necessary libraries

import spotipy
import pandas as pd
import requests
import numpy as np
import time
from requests.exceptions import ReadTimeout
from spotipy.exceptions import SpotifyException
from timeit import default_timer as timer
from datetime import timedelta
import configparser
from spotipy.oauth2 import SpotifyClientCredentials
from pandas.api.types import CategoricalDtype

In [4]:
# Create a ConfigParser object
config = configparser.ConfigParser()

# Read the config file
config.read('config.ini')

client_id = config.get('credentials', 'Client_ID')
client_secret = config.get('credentials', 'Client_Secret')

client_credentials_manager = SpotifyClientCredentials(
                                client_id = client_id, client_secret = client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) 

In [5]:
# function to know how long it'll take to scrape the data
def format_time(seconds):
    minutes = seconds // 60
    seconds = seconds % 60
    if minutes > 0:
        return f"{minutes} minutes, {seconds} seconds"
    else:
        return f"{seconds} seconds"

In [8]:
def collect_spotify_data():
    start_time = timer()
    data = []  # Using a list of dictionaries to collect data

    for i in range(0, 1000, 50):
        try:
            track_results = sp.search(q='artist: Tyler, The Creator', type='track', limit=50, offset=i)
            for item in track_results['tracks']['items']:
                track_info = {
                    'artist_name': item['artists'][0]['name'],
                    'track_name': item['name'],
                    'track_id': item['id'],
                    'album_name': item['album']['name'],
                    'album_id': item['album']['id'],
                    'release_date': item['album']['release_date'],
                    'popularity': item['popularity'],
                    'explicit': item['explicit']
                }
                
                # Get audio features for track
                audio_features = sp.audio_features(item['id'])[0]
                if audio_features is not None:
                    audio_info = {
                        'danceability': audio_features.get('danceability', float('nan')),
                        'duration_ms': audio_features.get('duration_ms', float('nan')),
                        'energy': audio_features.get('energy', float('nan')),
                        'key': audio_features.get('key', float('nan')),
                        'loudness': audio_features.get('loudness', float('nan')),
                        'mode': audio_features.get('mode', float('nan')),
                        'speechiness': audio_features.get('speechiness', float('nan')),
                        'acousticness': audio_features.get('acousticness', float('nan')),
                        'instrumentalness': audio_features.get('instrumentalness', float('nan')),
                        'liveness': audio_features.get('liveness', float('nan')),
                        'valence': audio_features.get('valence', float('nan')),
                        'tempo': audio_features.get('tempo', float('nan')),
                        'time_signature': audio_features.get('time_signature', float('nan'))
                    }
                    track_info.update(audio_info)
                else:
                    # Handle case where audio features are not available
                    audio_info = {
                        'danceability': float('nan'),
                        'duration_ms': float('nan'),
                        'energy': float('nan'),
                        'key': float('nan'),
                        'loudness': float('nan'),
                        'mode': float('nan'),
                        'speechiness': float('nan'),
                        'acousticness': float('nan'),
                        'instrumentalness': float('nan'),
                        'liveness': float('nan'),
                        'valence': float('nan'),
                        'tempo': float('nan'),
                        'time_signature': float('nan')
                    }
                    track_info.update(audio_info)

                # Get featured artists
                if len(item['artists']) > 1:
                    featured_artists = [artist['name'] for artist in item['artists'][1:]]
                    track_info['featured_artists'] = featured_artists
                else:
                    track_info['featured_artists'] = []

                data.append(track_info)

        except requests.exceptions.ReadTimeout as e:
            print(f"Timeout error: {e}. Retrying in 5 seconds...")
            time.sleep(5)  # Retry after a short delay
        except requests.exceptions.RequestException as e:
            print(f"API error: {e}")
            break  # Break the loop if there's a persistent error

    df_raw = pd.DataFrame(data)
    end_time = timer()
    elapsed_time = int(end_time - start_time)
    print(f"Elapsed time: {elapsed_time} seconds")
    return df_raw

# Collecting data from Spotify API 
df_spotify = collect_spotify_data()


Elapsed time: 296 seconds


In [9]:
# keeping records with only 'Tyler, The Creator' as the artist name and saving as csv
df_tyler = df_spotify[df_spotify['artist_name']=='Tyler, The Creator']
df_tyler.to_csv('raw_food.csv')

In [10]:
# creating a new column to categorize songs to either Singles, Albums, EPs, Compilations

# Do this separately for the main_track df and the ft df, call it features for the ft df, before concat

studio_albums = ['Goblin',
          'Wolf',
          'Cherry Bomb',
          'Flower Boy',
          'IGOR',
          'CALL ME IF YOU GET LOST'
          ]

deluxe_editions = ['CALL ME IF YOU GET LOST: The Estate Sale']

live_albums = ['Live At Splash!']

instrumental_albums = ['Wolf + Instrumentals',
                       'Cherry Bomb + Instrumentals']

ep = ["Music Inspired by Illumination & Dr. Seuss' The Grinch"]


# Define conditions for each type

albums_condition = df_tyler['album_name'].isin(studio_albums)
deluxe_editions_condition = df_tyler['album_name'].isin(deluxe_editions)
live_albums_condition = df_tyler['album_name'].isin(live_albums)
instrumental_albums_condition = df_tyler['album_name'].isin(instrumental_albums)
ep_condition = df_tyler['album_name'].isin(ep)

In [11]:
# Using numpy.select to assign new column 'type'

conditions = [albums_condition, deluxe_editions_condition, live_albums_condition, instrumental_albums_condition, ep_condition]
choices = ['Studio Album', 'Album Deluxe', 'Live Album', 'Instrumental Album', 'EP']
df_tyler['type'] = np.select(conditions, choices, default='Single')

df_tyler.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tyler['type'] = np.select(conditions, choices, default='Single')


Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,popularity,explicit,danceability,duration_ms,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists,type
282,"Tyler, The Creator",Parking Lot - Instrumental,6UJ0uSZsAsT2sm4929OVEJ,Wolf + Instrumentals,4jEKedq9rWPV9OXuMkrZCa,2013-04-01,23,False,0.656,216347,...,1,0.0577,0.139,0.617,0.0797,0.191,80.026,4,[],Instrumental Album
283,"Tyler, The Creator",KEEP DA O'S - Instrumental,7Il5uRy0f42P1hXMIfa6iG,Cherry Bomb + Instrumentals,5V3Chnpno9oTI7JSPXKUf3,2015-04-13,23,False,0.56,250707,...,1,0.0355,0.00776,0.731,0.0614,0.106,139.948,3,[],Instrumental Album
284,"Tyler, The Creator",THE BROWN STAINS OF DARKEESE LATIFAH PART 6-12...,3MNMl544W55Fphu1do513b,Cherry Bomb + Instrumentals,5V3Chnpno9oTI7JSPXKUf3,2015-04-13,24,False,0.468,278240,...,1,0.251,0.00061,0.279,0.113,0.0881,162.275,3,[],Instrumental Album
292,"Tyler, The Creator",Yonkers,2YaT2YiSFVkOSo69v2ZfZL,PAY CLOSE ATTENTION: XL Recordings,4YepNDgdhc8RZvGFQ1dTU1,2014-08-25,14,True,0.712,249333,...,1,0.334,0.0161,0.0808,0.217,0.738,79.102,4,[],Single
304,"Tyler, The Creator",Yonkers,2YaT2YiSFVkOSo69v2ZfZL,PAY CLOSE ATTENTION: XL Recordings,4YepNDgdhc8RZvGFQ1dTU1,2014-08-25,14,True,0.712,249333,...,1,0.334,0.0161,0.0808,0.217,0.738,79.102,4,[],Single


In [15]:
# scraping a playlist that contains all Tyler's features 

# start time
start_time = timer()

# Get playlist URI
playlist_uri = "https://open.spotify.com/playlist/2LAatT3vCQMunKnTtQcmqe?si=ps3XvY7YSj6AEGT_k7NJWg"

# Get playlist tracks
playlist = sp.playlist(playlist_uri)
tracks = playlist['tracks']['items']

# Initialize data structures
song_features = []

# Define offset for pagination
offset = 0
limit = 100

# Iterate through playlist tracks, making additional requests as needed
while True:
    # Get playlist tracks with pagination
    results = sp.user_playlist_tracks(user="Mikelly", playlist_id=playlist_uri, offset=offset, limit=limit)

    # Check if there are more tracks to retrieve
    if not results['items']:
        break

    # Extract track information and audio features
    for track in results['items']:
        track_uri = track['track']['uri']
        track_name = track['track']['name']

        # Extract track information
        track_info = sp.track(track_uri)
        artist_name = track_info['artists'][0]['name']
        track_name = track_info['name']
        track_id = track_info['id']
        album_name = track_info['album']['name']
        album_id = track_info['album']['id']
        release_date = track_info['album']['release_date']
        duration_ms = track_info['duration_ms']
        popularity = track_info['popularity']
        explicit = track_info['explicit']
        
        
        # Extract audio features
        track_features = sp.audio_features(track_uri)

        # Check if audio features are available
        if track_features:
            track_features = track_features[0]

            # Extract audio features
            danceability = track_features['danceability']
            energy = track_features['energy']
            key = track_features['key']
            loudness = track_features['loudness']
            mode = track_features['mode']
            speechiness = track_features['speechiness']
            acousticness = track_features['acousticness']
            instrumentalness = track_features['instrumentalness']
            liveness = track_features['liveness']
            valence = track_features['valence']            
            tempo = track_features['tempo']
            time_signature = track_features['time_signature']
            
            # get featured artists
            featured_artists = []
            if len(track_info['artists']) > 1:
                feat_artists = []
                for j in range(1, len(track_info['artists'])):
                    feat_artists.append(track_info['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
   

            # Store data
            song_features.append([
                artist_name, track_name, track_id, album_name,
                album_id, release_date, duration_ms, popularity,
                explicit, danceability, energy, key, loudness,
                mode, speechiness, acousticness, instrumentalness,
                liveness, valence, tempo, time_signature, featured_artists
            ])
        else:
            print("Audio features not available for track:", track_name)

    # Update offset for next request
    offset += limit

# Save data to CSV file
ft = pd.DataFrame(
    song_features, 
    columns=[ 'artist_name', 'track_name', 'track_id', 'album_name', 'album_id',
                'release_date', 'duration_ms', 'popularity', 'explicit',
                'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness',
                'valence', 'tempo', 'time_signature', 'featured_artists'])

# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {(elapsed_time)} seconds")

Elapsed time: 43 seconds


In [6]:
ft.to_csv('tyler_features.csv')

In [16]:
# creating a new column that categorizes songs from this playlist as Features
ft['type']='Feature' 

# joining both dataframes
df = pd.concat([df_tyler, ft], ignore_index= True, sort= False)

In [None]:
# modifying the 'mode' column by replacing 0 with Minor and 1 with Major
df['mode'].replace({0:'Minor', 1:'Major'}, inplace= True)

In [18]:
# modifying the duration_ms column to appear as minutes and seconds 

df['duration_ms'] = pd.to_timedelta(df['duration_ms'], unit='ms')

# extract the minutes and seconds components of the duration as strings
df['duration'] = df['duration_ms'].dt.components['minutes'].astype(str).str.zfill(2) + ':' + df['duration_ms'].dt.components['seconds'].astype(str).str.zfill(2)
df.drop(columns=['duration_ms'], inplace= True)
df[['track_name','duration']].head()

Unnamed: 0,track_name,duration
0,EARFQUAKE,03:10
1,See You Again (feat. Kali Uchis),03:00
2,BEST INTEREST,02:07
3,ARE WE STILL FRIENDS?,04:25
4,NEW MAGIC WAND,03:15


In [19]:
# creating three new fields: year, month and day a song was released from release date column
df['release_date'] = pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month_name()
df['day_of_the_week'] = df['release_date'].dt.day_name()

df[['track_name', 'year', 'month', 'day_of_the_week']].head()

Unnamed: 0,track_name,year,month,day_of_the_week
0,EARFQUAKE,2019,May,Friday
1,See You Again (feat. Kali Uchis),2017,July,Friday
2,BEST INTEREST,2020,January,Saturday
3,ARE WE STILL FRIENDS?,2019,May,Friday
4,NEW MAGIC WAND,2019,May,Friday


In [20]:
# Time to remove the square brackets from the featured artist column 

# They have square brackets in the first place because it's a group of smaller lists in a larger list that was embbedded in the DataFrame

# We convert the data type to string no matter what, then remove the square brackets and apostrophe around it 

df['featured_artists'] = df['featured_artists'].astype(str)
df['featured_artists'] = df['featured_artists'].replace({"'": '', '"': '', r'\[':'', r'\]':''}, regex=True)
df[['track_name', 'featured_artists']].head()

Unnamed: 0,track_name,featured_artists
0,EARFQUAKE,
1,See You Again (feat. Kali Uchis),Kali Uchis
2,BEST INTEREST,
3,ARE WE STILL FRIENDS?,
4,NEW MAGIC WAND,


In [21]:
# Normalizing the keys and assigning each key number to a key name

key_dict = {
    0: "C",
    1: "C#/Db",
    2: "D",
    3: "D#/Eb",
    4: "E",
    5: "F",
    6: "F#/Gb",
    7: "G",
    8: "G#/Ab",
    9: "A",
    10: "A#/Bb",
    11: "B",
    -1: "NaN"
}

df['key'] = df['key'].map(key_dict).fillna("NaN")
df[['track_name', 'key']].head()

Unnamed: 0,track_name,key
0,EARFQUAKE,A
1,See You Again (feat. Kali Uchis),F#/Gb
2,BEST INTEREST,B
3,ARE WE STILL FRIENDS?,A#/Bb
4,NEW MAGIC WAND,F


In [23]:
# Converting the DataFrame to CSV

df.to_csv('Tyler, The Creator Dataset.csv', index= False)
df

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,popularity,explicit,danceability,energy,...,liveness,valence,tempo,time_signature,featured_artists,type,duration,year,month,day_of_the_week
0,"Tyler, The Creator",EARFQUAKE,5hVghJ4KaYES3BFUATCYn0,IGOR,5zi7WsKlIiUXv09tbGLKsE,2019-05-17,80,True,0.554,0.498,...,0.795,0.413,79.635,4,,Studio Album,03:10,2019,May,Friday
1,"Tyler, The Creator",See You Again (feat. Kali Uchis),7KA4W4McWYRpgf0fWsJZWB,Flower Boy,2nkto6YNI4rUYTLqEwWJ3o,2017-07-21,89,True,0.558,0.559,...,0.109,0.620,78.558,4,Kali Uchis,Studio Album,03:00,2017,July,Friday
2,"Tyler, The Creator",BEST INTEREST,3jHdKaLCkuNEkWcLVmQPCX,BEST INTEREST,5iUwaD3wFVwfaAfs9Z0eCh,2020-01-25,81,True,0.596,0.575,...,0.334,0.340,98.265,3,,Single,02:07,2020,January,Saturday
3,"Tyler, The Creator",ARE WE STILL FRIENDS?,5TxRUOsGeWeRl3xOML59Ai,IGOR,5zi7WsKlIiUXv09tbGLKsE,2019-05-17,79,True,0.216,0.497,...,0.111,0.313,187.308,3,,Studio Album,04:25,2019,May,Friday
4,"Tyler, The Creator",NEW MAGIC WAND,0fv2KH6hac06J86hBUTcSf,IGOR,5zi7WsKlIiUXv09tbGLKsE,2019-05-17,79,True,0.621,0.730,...,0.673,0.464,139.566,4,,Studio Album,03:15,2019,May,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,Casey Veggies,DTA,6SfqAuzvIHbERSO5c613P0,Sleeping in Class,5VQnHe5Wcji6wNbiAnMQ21,2011-09-20,42,False,0.366,0.946,...,0.178,0.706,160.815,4,"Tyler, The Creator",Feature,04:00,2011,September,Tuesday
245,MellowHype,F666 the Police,0MfIT9RpAUGHjol2iQzXiq,Blackenedwhite,7KLG5Qk1Y9gLzSjdNdR96l,2011-07-12,20,True,0.520,0.663,...,0.439,0.276,125.315,4,,Feature,03:12,2011,July,Tuesday
246,The Game,Martians Vs Goblins,1UIcqXAA24eg76EFWViwr5,The R.E.D. Album,6PvZnd9Q2ymDo3gpoM5I37,2011-01-01,51,True,0.476,0.810,...,0.568,0.642,83.309,4,"Lil Wayne, Tyler, The Creator",Feature,03:48,2011,January,Saturday
247,Mike G,Timeless,5GYJvoJUMVyikb2D5sUwVg,Ali,5cVudF06zoMgAbXIRkk6aR,2010-04-11,18,True,0.606,0.767,...,0.119,0.689,79.920,4,,Feature,03:11,2010,April,Sunday
