In [None]:
# import dependencies
#  data packages
import pandas as pd
import numpy as np
#  web-scraping packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import wikipedia
#  Spotify packages
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
#  lyrics package
import lyricsgenius as genius
#  misc packages
import time
import re
import itertools
#  config features
from billboard_music_analysis import config

In [None]:
# Authenticate with the client id/secret stored in the project config module
# and build the Spotify client (`sp`) that the search cells below call.
client_credentials_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Grab Wikipedia Data
We will use the Beautiful Soup library to scrape each year's top-100 tracks. Wikipedia lists each year's top-100 chart as a table in which the first column is the song's rank, the second column is the track name, and the last column is the artist(s) involved.

In [None]:
# create function that grabs top 100 for each year
def wiki_lists(url_prefix, min_yr, max_yr):
    """Scrape the first ``wikitable`` of one page per year into a single frame.

    Parameters
    ----------
    url_prefix : str
        URL that, with the year appended, addresses that year's chart page.
    min_yr, max_yr : int
        Inclusive range of years to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns ``rank``, ``track_nm``, ``artist_nms`` and ``year``, with a
        fresh 0..n-1 index. Empty when the range contains no years.

    Raises
    ------
    ValueError
        If a page contains no table with class ``wikitable``.
    """
    # local import so the cell does not depend on editing the import cell
    from io import StringIO

    yearly = []
    # for each year open the url and extract the table as a df
    for yr in range(int(min_yr), int(max_yr) + 1):
        url = f'{url_prefix}{yr}'
        # BeautifulSoup reads the response while parsing, so the connection
        # can be closed as soon as the soup exists
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html.parser')
        wikitab = soup.find('table', {'class': "wikitable"})
        if wikitab is None:
            raise ValueError(f'no wikitable found at {url}')
        # read_html wants a file-like object; passing literal HTML text is deprecated
        df_yr = pd.read_html(StringIO(str(wikitab)))[0]
        # align column names
        df_yr.columns = ['rank', 'track_nm', 'artist_nms']
        df_yr['year'] = yr
        yearly.append(df_yr)
    if not yearly:
        return pd.DataFrame()
    # concatenate once instead of growing a frame inside the loop
    return pd.concat(yearly, ignore_index = True)

# scrape every year in the configured range; for a faster run on a smaller
#  subset, pass a later start year in place of config.min_yr
df = wiki_lists(config.wiki_url_prefix, config.min_yr, config.max_yr)

In [None]:
# display the combined table of scraped chart rows
df

Unnamed: 0,rank,track_nm,artist_nms,year
0,1,"""Joy to the World""",Three Dog Night,1971
1,2,"""Maggie May""/""Reason to Believe""",Rod Stewart,1971
2,3,"""It's Too Late""/""I Feel the Earth Move""",Carole King,1971
3,4,"""One Bad Apple""",The Osmonds,1971
4,5,"""How Can You Mend a Broken Heart""",Bee Gees,1971
...,...,...,...,...
5095,96,"""Things a Man Oughta Know""",Lainey Wilson,2021
5096,97,"""Throat Baby (Go Baby)""",BRS Kash,2021
5097,98,"""Tombstone""",Rod Wave,2021
5098,99,"""Drinkin' Beer. Talkin' God. Amen.""",Chase Rice featuring Florida Georgia Line,2021


### Grab Spotify Data
We will use the data scraped from Wikipedia to search Spotify for as many Billboard year-end top-100 tracks as we can find. Using regex and some text formatting, we will try to capture as many songs as possible. After finding these songs and retrieving their Spotify track IDs, we can use those IDs to look up each track's audio features.

In [None]:
def search_term(df, artist_col_nm, track_col_nm):
    """Build simplified artist and track strings to use as search terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw artist and track text columns.
    artist_col_nm : str
        Name of the column with the credited artist text.
    track_col_nm : str
        Name of the column with the track title text.

    Returns
    -------
    tuple of pandas.Series
        ``(artist_search, track_search)``, aligned with ``df``'s index.
        Artists are lowercased, stripped of quotes, and cut at the first
        collaborator marker; tracks lose quotes and periods and keep only the
        text before the first ``/``.
    """
    # Cut the credit at the first collaborator marker. The \b anchors stop the
    # word markers from firing inside a name ("grand funk" must not be cut at
    # "and "), and the final strip removes the space left before the marker.
    artist_search = (
        df[artist_col_nm]
        .str.lower()
        .str.replace('["\']', '', regex = True)
        .str.replace(r'(\bfeat\W|\bfeaturing\b|\band\s|\bwith\s|&\s|\(|,).*', '', regex = True)
        .str.strip()
    )
    # define track terms: drop quotes/periods, keep the first title of an "A/B" entry
    track_search = (
        df[track_col_nm]
        .str.replace('["\'.]', '', regex = True)
        .str.split(pat = "/")
        .str[0]
    )
    return artist_search, track_search

# add the cleaned artist/track search strings as new columns, then display the frame
df['artist_search'], df['track_search'] = search_term(df, 'artist_nms', 'track_nm')
df

Unnamed: 0,rank,track_nm,artist_nms,year,artist_search,track_search
0,1,"""Joy to the World""",Three Dog Night,1971,three dog night,Joy to the World
1,2,"""Maggie May""/""Reason to Believe""",Rod Stewart,1971,rod stewart,Maggie May
2,3,"""It's Too Late""/""I Feel the Earth Move""",Carole King,1971,carole king,Its Too Late
3,4,"""One Bad Apple""",The Osmonds,1971,the osmonds,One Bad Apple
4,5,"""How Can You Mend a Broken Heart""",Bee Gees,1971,bee gees,How Can You Mend a Broken Heart
...,...,...,...,...,...,...
5095,96,"""Things a Man Oughta Know""",Lainey Wilson,2021,lainey wilson,Things a Man Oughta Know
5096,97,"""Throat Baby (Go Baby)""",BRS Kash,2021,brs kash,Throat Baby (Go Baby)
5097,98,"""Tombstone""",Rod Wave,2021,rod wave,Tombstone
5098,99,"""Drinkin' Beer. Talkin' God. Amen.""",Chase Rice featuring Florida Georgia Line,2021,chase rice,Drinkin Beer Talkin God Amen


In [None]:
# create function that iterates through artists for matching
def artist_searcher(output, artist):
    """Return ids for the first result whose artist names share a word with ``artist``.

    Parameters
    ----------
    output : list of dict
        Track items; each needs an ``'id'`` and an ``'artists'`` list of dicts
        carrying ``'id'`` and ``'name'``.
    artist : str
        Artist search string; it is split on whitespace and compared word by word.

    Returns
    -------
    tuple
        ``(track_id, first_artist_id)`` of the first matching item, or
        ``(None, None)`` when nothing matches or the inputs are empty.
    """
    if not output or not isinstance(artist, str):
        return None, None
    wanted = set(artist.split())
    for item in output:
        # compare whole words (lowercased, quotes removed), not single characters
        credited = {
            word
            for a in item['artists']
            for word in re.sub('[\"\']', '', a['name'].lower()).split()
        }
        if not wanted.isdisjoint(credited):
            return item['id'], item['artists'][0]['id']
    # only give up after every candidate has been checked
    return None, None

In [None]:
# create search rules to optimize accuracy of spotify search selection
def search_rule(query, artist, track):
    """Resolve one chart entry to ids, widening the search step by step.

    The steps, in order: (1) the full ``query``, taking the first hit;
    (2) the whole title, keeping a hit accepted by ``artist_searcher``;
    (3) each word of the title in turn, with the same artist check.

    Parameters
    ----------
    query : str
        Full search query for the entry.
    artist : str
        Artist search string passed to ``artist_searcher``.
    track : str
        Track title search string.

    Returns
    -------
    tuple
        ``(track_id, artist_id)``, or ``(None, None)`` when every step fails.
    """
    output = sp.search(q = query, type = 'track')['tracks']['items']
    # if condition is met we skip all other searches
    if len(output) >= 1:
        return output[0]['id'], output[0]['artists'][0]['id']
    # if first condition is not met we just search on song title
    output = sp.search(q = 'track:' + track, type = 'track')['tracks']['items']
    # use artist name to select appropriate title
    track_id, artist_id = artist_searcher(output, artist)
    if track_id is not None:
        return track_id, artist_id
    # last resort: try every title word, not just the first one
    for word in track.split():
        output = sp.search(q = 'track:' + word, type = 'track')['tracks']['items']
        track_id, artist_id = artist_searcher(output, artist)
        if track_id is not None:
            return track_id, artist_id
    # always hand back a pair so callers can unpack the result
    return None, None

In [None]:
# for each song in the wiki df search spotify for track id's
def track_id_search(df, sp, artist_col, track_col):
    """Look up ids for every row of ``df`` via ``search_rule``.

    Side effect: writes a ``'query'`` column into ``df`` holding the
    ``artist:... track:...`` string searched for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the artist and track search columns; modified in place.
    sp : object
        Kept for interface compatibility; not used by this function itself.
    artist_col, track_col : str
        Names of the artist and track search-string columns.

    Returns
    -------
    tuple of tuple
        ``(track_ids, artist_ids)`` in row order; two empty tuples when
        ``df`` has no rows.
    """
    df['query'] = 'artist:' + df[artist_col] + ' track:' + df[track_col]
    pairs = []
    for query, artist, track in zip(df['query'], df[artist_col], df[track_col]):
        hit = search_rule(query, artist, track)
        # treat a missing result as "not found" so unpacking below cannot fail
        pairs.append(hit if hit is not None else (None, None))
    if not pairs:
        # zip(*[]) yields nothing to unpack, so answer the empty case directly
        return (), ()
    track_ids, artist_ids = zip(*pairs)
    return track_ids, artist_ids
    
# look up Spotify track/artist ids for every row, then drop (in place) any row
#  that has a missing value in any column
df['track_ids'], df['artist_ids'] = track_id_search(df, sp, 'artist_search', 'track_search')
df.dropna(inplace = True)

In [None]:
# create function that uses track_ids to find audio features
def audio_feature(df, id_col, batch_size = 100):
    """Fetch audio features for the ids in ``df[id_col]``, one batch at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the track ids.
    id_col : str
        Name of the id column.
    batch_size : int, optional
        Number of ids sent per request. The default of 100 is the chunk size
        this notebook has always used (presumably the API's per-request cap;
        verify before raising it).

    Returns
    -------
    pandas.DataFrame
        One row per feature record returned, in request order; ids for which
        the API returned ``None`` are skipped. Empty when ``df`` has no rows.
    """
    ids = list(df[id_col])
    records = []
    for start in range(0, len(ids), batch_size):
        # a missing response is treated as "no features for this batch"
        output = sp.audio_features(ids[start:start + batch_size]) or []
        records.extend(feat for feat in output if feat is not None)
    # build the frame once rather than concatenating inside the loop
    return pd.DataFrame(records)

# attach audio features to each chart row by track id; the inner join keeps only
#  rows whose id came back with features, then exact duplicate rows and rows
#  with any missing value are removed
df2 = df.merge(audio_feature(df, 'track_ids'), how = 'inner', left_on = 'track_ids',
               right_on = 'id').drop_duplicates().dropna()
df2

Unnamed: 0,rank,track_nm,artist_nms,year,artist_search,track_search,query,track_ids,artist_ids,danceability,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,1,"""Joy to the World""",Three Dog Night,1971,three dog night,Joy to the World,artist:three dog night track:Joy to the World,2ymeOsYijJz09LfKw3yM2x,4FAEZeJcsYYBkNq2D3KGTV,0.649,...,0.3390,0.971,126.867,audio_features,2ymeOsYijJz09LfKw3yM2x,spotify:track:2ymeOsYijJz09LfKw3yM2x,https://api.spotify.com/v1/tracks/2ymeOsYijJz0...,https://api.spotify.com/v1/audio-analysis/2yme...,220573,4
1,2,"""Maggie May""/""Reason to Believe""",Rod Stewart,1971,rod stewart,Maggie May,artist:rod stewart track:Maggie May,6rovOdp3HgK1DeAMYDzoA7,2y8Jo9CKhJvtfeKOsYzRdT,0.605,...,0.0684,0.543,129.449,audio_features,6rovOdp3HgK1DeAMYDzoA7,spotify:track:6rovOdp3HgK1DeAMYDzoA7,https://api.spotify.com/v1/tracks/6rovOdp3HgK1...,https://api.spotify.com/v1/audio-analysis/6rov...,350267,4
2,3,"""It's Too Late""/""I Feel the Earth Move""",Carole King,1971,carole king,Its Too Late,artist:carole king track:Its Too Late,12q3V8ShACq2PSWINMc2rC,319yZVtYM9MBGqmSQnMyY6,0.450,...,0.1340,0.812,208.282,audio_features,12q3V8ShACq2PSWINMc2rC,spotify:track:12q3V8ShACq2PSWINMc2rC,https://api.spotify.com/v1/tracks/12q3V8ShACq2...,https://api.spotify.com/v1/audio-analysis/12q3...,233173,4
3,4,"""One Bad Apple""",The Osmonds,1971,the osmonds,One Bad Apple,artist:the osmonds track:One Bad Apple,50UoJUrslRVqG0cg6uboyn,5fU6lODhpw3GEGGJuaDprR,0.600,...,0.2130,0.920,101.041,audio_features,50UoJUrslRVqG0cg6uboyn,spotify:track:50UoJUrslRVqG0cg6uboyn,https://api.spotify.com/v1/tracks/50UoJUrslRVq...,https://api.spotify.com/v1/audio-analysis/50Uo...,164973,4
4,5,"""How Can You Mend a Broken Heart""",Bee Gees,1971,bee gees,How Can You Mend a Broken Heart,artist:bee gees track:How Can You Mend a Broke...,3ZP18us6p6LHOZMG1LLUjF,1LZEQNv7sE11VDY3SdxQeN,0.594,...,0.1840,0.537,104.996,audio_features,3ZP18us6p6LHOZMG1LLUjF,spotify:track:3ZP18us6p6LHOZMG1LLUjF,https://api.spotify.com/v1/tracks/3ZP18us6p6LH...,https://api.spotify.com/v1/audio-analysis/3ZP1...,241093,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5541,96,"""Things a Man Oughta Know""",Lainey Wilson,2021,lainey wilson,Things a Man Oughta Know,artist:lainey wilson track:Things a Man Oughta...,5QS8PNEWbqTEZyQ6e9ZbJf,6tPHARSq45lQ8BSALCfkFC,0.659,...,0.1330,0.397,139.931,audio_features,5QS8PNEWbqTEZyQ6e9ZbJf,spotify:track:5QS8PNEWbqTEZyQ6e9ZbJf,https://api.spotify.com/v1/tracks/5QS8PNEWbqTE...,https://api.spotify.com/v1/audio-analysis/5QS8...,203373,4
5542,97,"""Throat Baby (Go Baby)""",BRS Kash,2021,brs kash,Throat Baby (Go Baby),artist:brs kash track:Throat Baby (Go Baby),15C4TnrrVdym7UykxQIOTZ,5jJjvmEwRr8epuGZq4eUUa,0.878,...,0.1670,0.397,131.988,audio_features,15C4TnrrVdym7UykxQIOTZ,spotify:track:15C4TnrrVdym7UykxQIOTZ,https://api.spotify.com/v1/tracks/15C4TnrrVdym...,https://api.spotify.com/v1/audio-analysis/15C4...,211610,4
5543,98,"""Tombstone""",Rod Wave,2021,rod wave,Tombstone,artist:rod wave track:Tombstone,3zc8VZEpM1onYV4FWGdFvm,45TgXXqMDdF8BkjA83OM7z,0.550,...,0.1290,0.535,84.448,audio_features,3zc8VZEpM1onYV4FWGdFvm,spotify:track:3zc8VZEpM1onYV4FWGdFvm,https://api.spotify.com/v1/tracks/3zc8VZEpM1on...,https://api.spotify.com/v1/audio-analysis/3zc8...,160078,4
5544,99,"""Drinkin' Beer. Talkin' God. Amen.""",Chase Rice featuring Florida Georgia Line,2021,chase rice,Drinkin Beer Talkin God Amen,artist:chase rice track:Drinkin Beer Talkin G...,1UYfAU2bwgjaM5rIIPQleC,6pBNfggcZZDCmb0p92OnGn,0.627,...,0.3740,0.724,100.032,audio_features,1UYfAU2bwgjaM5rIIPQleC,spotify:track:1UYfAU2bwgjaM5rIIPQleC,https://api.spotify.com/v1/tracks/1UYfAU2bwgja...,https://api.spotify.com/v1/audio-analysis/1UYf...,160839,4


In [None]:
# save the merged chart + audio-feature table without the index column
#  (the path is relative to the current working directory)
df2.to_csv("billboard_music_analysis/bb_100_feat.csv", index = False)

##### Stale Code
Code I am holding onto in case I need it later.

In [None]:
# scratch check: for each search hit, print its artist names lowercased,
#  stripped of quotes, and split into words
for i in sp.search(q = 'artist:n ii u', type = 'track')['tracks']['items']:
    print([re.sub('[\"\']', '', artist['name'].lower()).split() for artist in i['artists']])

[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['ninja', 'sex', 'party']]
[['ramon', 'ayala', 'y', 'sus', 'bravos', 'del', 'norte']]
[['guns', 'n', 'roses']]
[['ultra', 'naté']]
[['ninja', 'sex', 'party']]


In [None]:
# scratch: list every album for one artist, following the paginated results
birdy_uri = 'spotify:artist:2WX2uTcsvV5OnS0inACecP'
results = sp.artist_albums(birdy_uri, album_type='album')
albums = results['items']
while results['next']:
    # the client in this notebook is `sp`; the name `spotify` is never defined,
    # so the old call would fail as soon as a second page existed
    results = sp.next(results)
    albums.extend(results['items'])
for album in albums:
    print(album['name'])

Young Heart
Beautiful Lies
Beautiful Lies
Beautiful Lies (Deluxe)
Beautiful Lies (Deluxe)
Fire Within
Fire Within
Fire Within (Deluxe)
Fire Within (Deluxe)
Fire Within (Deluxe)
Live in London
Birdy
Birdy
Birdy
Birdy
Birdy (Deluxe Version)


In [None]:
# scratch: fetch one artist's Wikipedia page, locate the biography infobox
#  table, and display its first "infobox-data" cell
html = urlopen('https://en.wikipedia.org/wiki/Dua_Lipa')
soup = BeautifulSoup(html, 'html.parser')
wikitab_temp = soup.find('table',{'class':"infobox biography vcard"})
wikitab_temp.find_all('td', {'class':"infobox-data"})[0]#.find({'class':'bday'})

<td class="infobox-data"><span style="display:none"> (<span class="bday">1995-08-22</span>) </span>22 August 1995<span class="noprint ForceAgeToShow"> (age 27)</span><br/><div class="birthplace" style="display:inline"><a href="/wiki/London" title="London">London</a>, England</div></td>

In [None]:
%time
# change to list comprehension
temp = wikitab.find_all('td')
for i in np.arange(2, 300, 3):
    art_lst = temp[i].find_all('a')
    if len(art_lst) > 0:
        main_art = art_lst[0].text
        main_art_ref = art_lst[0].get('href')
    else: main_art = None
    feat_art = []
    if len(art_lst) > 1:
        for j in range(len(art_lst))[1:]:
            feat_art.append(art_lst[j].text)
    print(main_art, main_art_ref, feat_art)


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.53 µs
Dua Lipa /wiki/Dua_Lipa []
The Weeknd /wiki/The_Weeknd ['Ariana Grande']
None /wiki/The_Weeknd []
24kGoldn /wiki/24kGoldn ['Iann Dior']
Olivia Rodrigo /wiki/Olivia_Rodrigo []
Doja Cat /wiki/Doja_Cat ['SZA']
Silk Sonic /wiki/Silk_Sonic ['Bruno Mars', 'Anderson .Paak']
None /wiki/Silk_Sonic []
Lil Nas X /wiki/Lil_Nas_X []
Justin Bieber /wiki/Justin_Bieber ['Daniel Caesar', 'Giveon']
BTS /wiki/BTS []
The Kid Laroi /wiki/The_Kid_Laroi []
None /wiki/The_Kid_Laroi []
None /wiki/The_Kid_Laroi []
Ed Sheeran /wiki/Ed_Sheeran []
Glass Animals /wiki/Glass_Animals []
None /wiki/Glass_Animals []
Luke Combs /wiki/Luke_Combs []
Chris Brown /wiki/Chris_Brown ['Young Thug']
Masked Wolf /wiki/Masked_Wolf []
Megan Thee Stallion /wiki/Megan_Thee_Stallion []
Pop Smoke /wiki/Pop_Smoke []
Machine Gun Kelly /wiki/Machine_Gun_Kelly_(musician) ['Blackbear']
Jack Harlow /wiki/Jack_Harlow []
Billie Eilish /wiki/Billie_Eilish []
Cardi B /wiki/Cardi_B 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3f060e00-a163-4cce-a4c0-34e77cfdc670' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>