## Get Albums and Artist Info from Spotify

In [3]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
import re
import pandas as pd
import numpy as np

# Spotify API credentials and the shared client object. The functions in this
# file reach the Spotify Web API through the module-level `sp` created here.
# NOTE: the two assignments below are fill-in placeholders; they are not valid
# Python until a value is written before the trailing comment.
clientId = # TODO: insert your own Spotify client ID (a string)
clientSecret = # TODO: insert your own Spotify client secret (a string); do not commit it
clientCredentialsManager = SpotifyClientCredentials(client_id=clientId, client_secret=clientSecret)
sp = spotipy.Spotify(client_credentials_manager=clientCredentialsManager) # shared client used for every API call in this file

def spotifyArtistInfo(artist):
    """Return ``(followers, genres)`` for the first Spotify hit on *artist*.

    Runs an artist-type search through the module-level ``sp`` client and
    reads the first item of the result.

    artist -- search string passed to ``sp.search`` as-is.

    Returns a tuple of the hit's ``followers['total']`` value and its
    ``genres`` value.

    Raises IndexError when the search returns no items.
    """
    hits = sp.search(artist, type='artist')['artists']['items']
    topHit = hits[0]  # only the first search hit is considered
    return (topHit['followers']['total'], topHit['genres'])

def spotifyArtistAblums(artist):
    """Return an artist's filtered album list and the year of their earliest release.

    The artist is resolved with a ``artist:<name>`` search on the module-level
    ``sp`` client (first hit wins). Every page of ``sp.artist_albums`` for
    ``album_type='album'`` is fetched, then the releases are filtered:

    * only entries whose ``album_group`` is ``'album'`` are kept;
    * a release is skipped when its lower-cased name contains any fragment in
      ``exclude`` (live recordings, regional/special editions, ...);
    * only the first release with a given exact name is kept;
    * when a "deluxe" release exists, a non-deluxe release whose name equals
      the deluxe name with the deluxe wording removed is left out, so the
      deluxe version is the one selected.

    artist -- artist name to search for.

    Returns ``(studioAlbums, firstAlbYear)``. ``studioAlbums`` is a list of
    the album dicts returned by the API, deluxe releases first, each group in
    case-insensitive name order. ``firstAlbYear`` is the smallest release year
    over *all* fetched releases (including ones filtered out), or 3000 when
    nothing was fetched.

    Raises IndexError when the search returns no artist.
    """
    search = sp.search(q='artist:' + artist, type='artist')
    matches = search['artists']['items']
    if not matches:
        # Same exception type the unchecked [0] lookup raised, now with a message.
        raise IndexError("No Spotify artist found for {!r}".format(artist))
    artistId = matches[0]['id']

    # Walk every result page; the API reports the next page via 'next'.
    page = sp.artist_albums(artistId, album_type='album')
    albums = list(page['items'])
    while page['next']:
        page = sp.next(page)
        albums.extend(page['items'])
    albums.sort(key=lambda album: album['name'].lower())

    # Lower-case name fragments that disqualify a release. Built once here
    # rather than once per album.
    exclude = (
        "spotify sessions", " live in ", " live from ", "live at ", "(live)",
        " live version", "live lounge", "tour live", "stadium tour",
        "radio release", "karaoke", "extended edition", "commentary",
        " tour edition", "remix album", "standard version",
        "international edition", "international version",
        "international deluxe", "anniversary edition", "special edition",
        " case version", "bonus version", "asia deluxe", "japan deluxe",
        "china version", "japan version", "uk version", "canada version",
        "int'l version", "(international", "international)",
        "live north america", 'north america live', "live at the bbc",
        "soundtrack",
    )
    # Deluxe wording stripped from a name, in this order: the bare "deluxe"
    # must stay last so the parenthesised forms are removed whole first.
    terms = ("(deluxe)", "(deluxe edition)", "(deluxe version)",
             " deluxe edition", " deluxe version", "deluxe")

    firstAlbYear = 3000  # Arbitrarily large sentinel
    albumNames = set()
    allAlbums = []
    for album in albums:
        name = album['name']
        group = album['album_group']
        # The year is tracked for every release, before any filtering.
        firstAlbYear = min(firstAlbYear, int(album['release_date'][0:4]))
        if group != 'album' or name in albumNames:
            continue
        lowered = name.lower()
        if any(fragment in lowered for fragment in exclude):
            continue
        albumNames.add(name)
        allAlbums.append((name, album))

    # If there is an album that has a deluxe version, the deluxe one is selected.
    deluxeAlbums = []
    regularAlbums = []
    deluxeBaseNames = set()  # deluxe names with the deluxe wording removed
    for name, album in allAlbums:
        lowered = name.lower()
        if "deluxe" in lowered:
            deluxeAlbums.append(album)
            for term in terms:
                lowered = lowered.replace(term, "")
            deluxeBaseNames.add(lowered.strip())
        else:
            regularAlbums.append((name, album))

    studioAlbums = deluxeAlbums + [
        album for name, album in regularAlbums
        if name.strip().lower() not in deluxeBaseNames
    ]
    return (studioAlbums, firstAlbYear)
        

In [5]:
# Fetch the filtered album list and earliest release year for Maroon 5.
# `albs` is reused by the trackInfo(...) call further down in this file.
albs, firstYear = spotifyArtistAblums("Maroon 5")

Hands All Over (Deluxe)
Overexposed (Deluxe)
Red Pill Blues (Deluxe)
Red Pill Blues + (Deluxe)
V (Deluxe)
1.22.03 Acoustic
It Won't Be Soon Before Long
It Won't Be Soon Before Long.
Live Friday The 13th
Overexposed Track By Track
Songs About Jane


## Run Qualifier Check for All Billboard Artists

An artist qualifies for consideration in the study if:
- they have content which has appeared on the Billboard Hot 100 in the last 20 years
- they have at least 3 albums released
- their first album was released at least 7 years ago

The Billboard scraper goes back 1043 weeks (20 years) to collect data; this script then uses Spotify to check whether each artist qualifies.

In [16]:
#Currently doesn't catch destiny's child
# Qualifier pass over the weekly chart data: every artist on every row is
# looked up on Spotify once, qualifying artists are collected into artistData,
# and any row that features a non-qualifying artist is dropped from the chart
# data before both tables are written to CSV.

# Qualification thresholds: at least this many albums ...
minAlbums = 3
# ... and a first album released strictly before this year. This encodes the
# "first album at least 7 years ago" rule; presumably relative to 2019 - TODO
# confirm/update when re-running in a later year.
firstAlbumCutoffYear = 2012

# Chart spellings that fail on Spotify, mapped to the name Spotify knows.
#Add to this as we find
transferArtists = {"DU DDU":"BLACKPINK", "NBA YoungBoy":"YoungBoy Never Broke Again","Maejor Ali":"Maejor", "ROCKMAFIA":"rock mafia", "Soulja Boy Tell'em":"Soulja Boy", "Ludacris Co":"Ludacris", "Purple Ribbon All":"Purple Ribbon All-Stars", "Fat Man Scoop":"Fatman Scoop", "The Clark Family Experience":"Clark Family Experience", "SoulDecision":"Soul Decision", "Travi$ Scott": "Travis Scott" }

dataPre = pd.read_csv('FrostData/Weekly_data_clean.csv')
artistColumns = ["Artist", 'Followers', "Genres", "NumAlbums", "YearFirstAlbum"]
artistRows = []          # one dict per qualifying artist; becomes artistData
knownArtists = set()     # names already stored in artistRows
drops = []               # row labels of dataPre to remove (duplicates are harmless)
outArtists = []          # artists that failed the qualifier, in discovery order
rejectedArtists = set()  # same names as outArtists, for O(1) membership tests

for index, row in dataPre.iterrows():
    artists = row["Artists"].split(",")
    ##Some special cases
    if artists == ['Machine Gun Kelly', '', ' Ambassadors', ' Bebe Rexha']:
        artists = ['Machine Gun Kelly', 'X Ambassadors', ' Bebe Rexha']
    elif artists == ['Pusha T Kanye West']:
        artists = ["Pusha T", "Kanye West"]
    elif artists == ['Jay Z Kanye West']:
        artists = ['Jay Z', "Kanye West"]
    elif 'Tyler' in artists and ' The Creator' in artists:
        artists.remove("Tyler")
        artists.remove(" The Creator")
        artists.append("Tyler The Creator")
    elif " Swae Lee Or Rae Sremmurd" in artists:
        artists = ["Swae Lee", "Rae Sremmurd"]
    elif "Plies Or OJ Da Juiceman" in artists:
        artists = ['Plies', 'OJ Da Juiceman']

    # Fast path: every name on this row has already been classified.
    names = [a.strip() for a in artists]
    if all(n in knownArtists or n in rejectedArtists for n in names):
        drops.extend(index for n in names if n in rejectedArtists)
        continue

    # Explicit work queue: combined credits ("A + B", "A with B") are split
    # and their parts appended for processing, instead of growing the list
    # that is being iterated.
    pending = list(artists)
    while pending:
        artist = pending.pop(0).strip().replace(" &", "")
        if " + " in artist and artist != "Dan + Shay":
            pending.extend(artist.split("+"))
            continue
        if " with " in artist:
            pending.extend(artist.split(' with '))
            continue
        if " With " in artist:
            pending.extend(artist.split(' With '))
            continue

        # Resolve the alias BEFORE the already-seen checks. Previously the
        # alias was applied only after a failed lookup, so an aliased artist
        # was never recognised as seen and was re-queried and re-appended on
        # every row it appeared in.
        artist = transferArtists.get(artist, artist)
        if artist in rejectedArtists:
            drops.append(index)
            continue
        if artist in knownArtists:
            continue

        print(str(index) + " ARTIST : " + artist)
        try:
            albums, firstYear = spotifyArtistAblums(artist)
        except Exception:
            # Not a bare except: Ctrl-C (KeyboardInterrupt) must still stop the run.
            print("Not actually a spotify artist!")
            continue

        if len(albums) >= minAlbums and firstYear < firstAlbumCutoffYear: #then they qualify, and collect the artist data
            followers, genres = spotifyArtistInfo(artist)
            artistRows.append({"Artist": artist, 'Followers': followers, "Genres": ",".join(genres), "NumAlbums": len(albums), "YearFirstAlbum": firstYear})
            knownArtists.add(artist)
        else:
            outArtists.append(artist)
            rejectedArtists.add(artist)
            drops.append(index)

# Build the table once; DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
artistData = pd.DataFrame(artistRows, columns=artistColumns)
dataPost = dataPre.drop(drops)
dataPost.to_csv("FrostData/billboardRankings.csv")
artistData.to_csv("FrostData/artistInfo.csv")
        
        

0 ARTIST : Lil Nas
0 ARTIST : 
Not actually a spotify artist!
1 ARTIST : Shawn Mendes
1 ARTIST : Camila Cabello
2 ARTIST : Billie Eilish
3 ARTIST : Khalid
4 ARTIST : Ed Sheeran
4 ARTIST : Justin Bieber
5 ARTIST : Jonas Brothers
6 ARTIST : DaBaby
7 ARTIST : Drake
8 ARTIST : Chris Brown
9 ARTIST : Post Malone
10 ARTIST : Lizzo
11 ARTIST : Swae Lee
12 ARTIST : Taylor Swift
14 ARTIST : Sam Smith
14 ARTIST : Normani
15 ARTIST : Lil Nas X
16 ARTIST : Halsey
17 ARTIST : Blake Shelton
18 ARTIST : Polo G
19 ARTIST : Nicki Minaj
20 ARTIST : Ava Max
21 ARTIST : Cardi B
22 ARTIST : Ariana Grande
23 ARTIST : Panic! At The Disco
24 ARTIST : Marshmello
24 ARTIST : Bastille
25 ARTIST : Lee Brice
26 ARTIST : Katy Perry
27 ARTIST : Morgan Wallen
28 ARTIST : Blanco Brown
29 ARTIST : Young Thug
29 ARTIST : J. Cole
29 ARTIST : Travis Scott
32 ARTIST : Luke Combs
33 ARTIST : City Girls
34 ARTIST : benny blanco
36 ARTIST : Daddy Yankee
37 ARTIST : Mustard
37 ARTIST : Migos
39 ARTIST : Meek Mill
41 ARTIST : L

# Obtaining Spotify's Audio Features

In [17]:
def trackInfo(albums, artist):
    """Build a per-track table of popularity and audio features for *albums*.

    albums -- iterable of album dicts; each must provide 'id' and 'name'.
    artist -- value written to the 'Artist' column of every row.

    For each album the track list is read with ``sp.album_tracks`` (first
    page only, as before). Audio features and track details (popularity,
    explicit flag) are then requested in batches instead of two requests per
    track.

    Returns a DataFrame with one row per track and the columns listed in
    ``columns`` below; it has no rows when *albums* is empty.

    NOTE(review): a track for which the API returns no audio features (a
    ``None`` entry) raises TypeError here, as it did with the per-track
    lookups - confirm whether such tracks should be skipped instead.
    """
    columns = ['Artist', 'Album', 'Name', 'Popularity', 'Duration', 'Explicit',
               'Danceability', 'Energy', 'Loudness', 'Mode', 'Speechiness',
               'Acousticness', 'Instrumentalness', 'Liveness', 'Valence',
               'Tempo', 'TimeSignature']
    batchSize = 50  # stays within the per-request ID limit of both batch endpoints
    rows = []
    for album in albums:
        albumName = album['name']
        tracks = sp.album_tracks(album['id'])['items']
        for start in range(0, len(tracks), batchSize):
            batch = tracks[start:start + batchSize]
            uris = [track['uri'] for track in batch]
            featureList = sp.audio_features(uris)
            #to get popularity and explicit
            detailList = sp.tracks(uris)['tracks']
            # Both responses come back in request order, one entry per URI.
            for track, features, details in zip(batch, featureList, detailList):
                ###Maybe try to change duration to be in seconds
                rows.append({
                    'Artist': artist,
                    'Album': albumName,
                    'Name': track['name'],
                    'Popularity': details['popularity'],
                    'Duration': features['duration_ms'],
                    'Explicit': details['explicit'],
                    'Danceability': features['danceability'],
                    'Energy': features['energy'],
                    'Loudness': features['loudness'],
                    'Mode': features['mode'],
                    'Speechiness': features['speechiness'],
                    'Acousticness': features['acousticness'],
                    'Instrumentalness': features['instrumentalness'],
                    'Liveness': features['liveness'],
                    'Valence': features['valence'],
                    'Tempo': features['tempo'],
                    'TimeSignature': features['time_signature'],
                })
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame for every track.
    return pd.DataFrame(rows, columns=columns)

In [18]:
# NOTE(review): in the cells visible in this file `albs` was fetched for
# "Maroon 5", yet the rows are labelled 'twenty one pilots' - confirm which
# artist is intended before relying on the 'Artist' column.
trackInfo(albs, 'twenty one pilots')

KeyboardInterrupt: 

## Write the Files For Archetype Artists

In [None]:
def archetypeContentInfo(artistList):
    """Write per-track data for every artist in *artistList* to one CSV.

    artistList -- iterable of artist names; each is resolved with
    ``spotifyArtistAblums`` and its albums are expanded with ``trackInfo``.

    The combined table is written to ``FrostData/songAttributes.csv``.
    Nothing is returned. As before, the most recently processed artist's rows
    come first and each artist's rows keep their own 0-based index.

    Changes from the earlier version:
    * the ``spotifyArtistInfo`` lookup was removed - its result was never
      used, so it only cost a network request per artist;
    * the output no longer carries an always-empty 'Track' column; the track
      title is in 'Name', which is the column ``trackInfo`` fills.
    """
    frames = []
    for artist in artistList:
        albums, _firstYear = spotifyArtistAblums(artist)
        frames.append(trackInfo(albums, artist))

    if frames:
        # Concatenate once (not once per artist); reversed to keep the
        # historical "latest artist first" row order.
        df = pd.concat(frames[::-1])
    else:
        # No artists: still write a header-only file with the track columns.
        df = pd.DataFrame(columns=['Artist', 'Album', 'Name', 'Popularity', 'Duration', 'Explicit', 'Danceability', 'Energy', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'TimeSignature'])
    df.to_csv("FrostData/songAttributes.csv")
    

In [None]:
# Archetype artists whose full track data is exported to the song-attributes CSV.
artistList = [
    "Maroon 5",
    "Taylor Swift",
    "Foo Fighters",
    "J Cole",
    "Justin Bieber",
    "Britney Spears",
    "Justin Timberlake",
    "Twenty One Pilots",
    "Elton John",
    "Beatles",
    "Khalid",
    "Billie Eilish",
]
archetypeContentInfo(artistList)