In [2]:
# import dependencies
#  data packages
import pandas as pd
import numpy as np
#  web-scraping packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
#  Spotify packages
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
#  misc processing packages
import re
import itertools
from pyspark.sql import SparkSession
#  config features
import config

In [3]:
# initialize spotify auth
# The client id/secret are read from the local `config` module rather than
# being written into the notebook.
client_credentials_manager = SpotifyClientCredentials(
    client_id = config.client_id, client_secret = config.client_secret)
# `sp` is the notebook-level Spotify client used by the search and
# audio-feature helpers defined later.
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [4]:
# setup deepnote spark integration
# The `!` lines run as shell commands: refresh the apt index, create the man1
# directory, install the OpenJDK 11 JDK package, then pip-install pyspark.
# NOTE(review): `SparkSession` is imported in the first cell, before this pip
# install runs — presumably pyspark is already present on a fresh kernel; confirm.
! sudo apt-get update
! sudo mkdir -p /usr/share/man/man1
! sudo apt-get install -y openjdk-11-jdk
! pip install pyspark

# initialize spark session
# Session is built against a local[*] master with the application name 'sp_search'.
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('sp_search') \
    .getOrCreate() 
# keep a handle on the session's SparkContext
sc = spark.sparkContext

Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian-security buster/updates InRelease [34.8 kB]
Get:3 http://deb.debian.org/debian buster-updates InRelease [56.6 kB]
Get:4 http://deb.debian.org/debian buster/main amd64 Packages [7,909 kB]
Get:5 http://deb.debian.org/debian-security buster/updates/main amd64 Packages [369 kB]
Get:6 http://deb.debian.org/debian buster-updates/main amd64 Packages [8,788 B]
Fetched 8,500 kB in 4s (2,385 kB/s)




The following additional packages will be installed:
  at-spi2-core ca-certificates-java dbus dbus-user-session
  dconf-gsettings-backend dconf-service dmsetup fonts-dejavu-extra
  glib-networking glib-networking-common glib-networking-services
  gsettings-desktop-schemas java-common libapparmor1 libargon2-1 libasound2
  libasound2-data libatk-bridge2.0-0 libatk-wrapper-java
  libatk-wrapper-java-jni libatspi2.0-0 libcap2 libcolord2 libcryptsetup12
  libdbus-1-3 libdconf1 libdevmapper1.02.1 libdrm-amdgpu

### Grab Wikipedia Data
We will use the Beautiful Soup library to scrape each year's top 100 tracks. Wikipedia presents each year's top-100 list as a table in which the first column is the song's rank, the second column is the track name, and the last column lists the artists involved.

In [5]:
# grab first wikitab from wikipedia page
def grab_wikitab(url, cls):
    """Download a page and return its first table with the given CSS class.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    cls : str
        Value matched against the ``class`` attribute of ``<table>`` elements.

    Returns
    -------
    The first matching table element, or None when the page has no such table.
    """
    # The context manager closes the HTTP response as soon as the page has been
    # parsed; BeautifulSoup reads the whole document while it is constructed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')
    return soup.find('table', {'class': cls})

In [6]:
# create function that uses artist href's to grab birth info
def birth_info(href):
    """Look up an artist's birthday and origin from their Wikipedia infobox.

    Parameters
    ----------
    href : str
        Site-relative link to the artist page (appended to
        ``https://en.wikipedia.org``).

    Returns
    -------
    tuple
        ``(bday, origin)`` taken from the first of infobox cells 1-4 that has
        either value. ``bday`` is the matched ``<span class="bday">`` element
        (not its text) and ``origin`` is the ``title`` of the first link in
        that cell. ``(None, None)`` when the page or infobox cannot be read or
        no cell carries either value.
    """
    wiki_home = 'https://en.wikipedia.org'
    # `except Exception` instead of a bare except: a failed lookup is still
    # treated as "no data", but a kernel interrupt (Ctrl-C) is no longer
    # swallowed by this best-effort scrape.
    try:
        infobox = grab_wikitab(wiki_home + href, 'infobox').select('td')[1:5]
    except Exception:
        return None, None
    for cell in infobox:
        bday = cell.find('span', {'class': 'bday'})
        # find() returns None when the cell has no link, so guard before .get()
        link = cell.find('a')
        origin = link.get('title') if link is not None else None
        if bday is not None or origin is not None:
            return bday, origin
    return None, None

In [7]:
# create wikipedia scrape function that grabs artists' hrefs
def wiki_refs(wikitab):
    """Collect ``[title, href]`` for the first link of every third cell.

    `wikitab` is an indexable sequence of table cells; cells at positions
    2, 5, 8, ... are inspected and cells without a link are skipped.
    """
    found = []
    for pos in range(2, len(wikitab), 3):
        anchor = wikitab[pos].find('a')
        if anchor is None:
            continue
        target = anchor.get('href')
        ### the birthday / origin lookup below stays disabled:
        ### it issues one extra page request per artist, which was too many
        ### url calls and ended in a keyboard interruption
        ### to be revisited if time allows
        # bday, origin = birth_info(href)
        found.append([anchor.get('title'), target])
    return found

In [10]:
# create function that grabs top 100 for each year
def wiki_lists(url_prefix, min_yr, max_yr):
    """Scrape the year-end chart table for every year in ``[min_yr, max_yr]``.

    Parameters
    ----------
    url_prefix : str
        Page address up to (not including) the year; the year is appended.
    min_yr, max_yr : int
        First and last year to fetch, both inclusive.

    Returns
    -------
    pandas.DataFrame
        One row per chart entry with columns ``rank``, ``track_nm``,
        ``artist_nms`` and ``year``. Empty when the year range is empty.

    Raises
    ------
    ValueError
        If a year's page has no ``wikitable`` (or its first table does not
        have exactly three columns).
    """
    # local import: read_html is handed a file-like object because recent
    # pandas versions deprecate passing a literal HTML string
    from io import StringIO

    ### we were going to create a url list so we could capture
    ### artist origin and DOB info
    ### due to time constraints to solution a better way to grab
    ### these features we have limited the scope on this
    # urls = []
    yearly_frames = []
    # for each year open the url and extract the table as a df
    for yr in np.arange(min_yr, max_yr + 1):
        # grab wikitab
        url = f'{url_prefix}{str(yr)}'
        wikitab = grab_wikitab(url, 'wikitable')
        if wikitab is None:
            raise ValueError(f'no wikitable found at {url}')
        # create pd dataframe out of table contents
        df_yr = pd.read_html(StringIO(str(wikitab)))[0]
        # align columns
        df_yr.columns = ['rank', 'track_nm', 'artist_nms']
        # create year column
        df_yr['year'] = yr
        yearly_frames.append(df_yr)
        ### as stated above we no longer need the url list
        # append to list of artist hrefs
        # urls.extend(wiki_refs(wikitab.select('td')))
    # concatenate once at the end instead of growing a frame inside the loop
    if not yearly_frames:
        return pd.DataFrame()
    return pd.concat(yearly_frames, ignore_index = True)  # , urls

# build the combined chart table; URL prefix and year range come from config
df = wiki_lists(config.wiki_url_prefix, config.min_yr, config.max_yr)

### Grab Spotify Data
We will use the data scraped from Wikipedia to search Spotify for as many Billboard year-end top-100 tracks as we can find. Using regex and some text formatting, we will try to capture as many songs as possible. After finding these songs and retrieving their Spotify track IDs, we can use those IDs to request each track's audio attributes.

In [11]:
def search_term(df, artist_col_nm, track_col_nm):
    """Derive simplified artist and track strings for Spotify searching.

    Parameters
    ----------
    df : pandas.DataFrame
        Chart table holding the raw artist and track text.
    artist_col_nm, track_col_nm : str
        Names of the artist and track columns in `df`.

    Returns
    -------
    tuple of pandas.Series
        ``(artist_search, track_search)``. The artist term is lowercased, has
        quote characters removed and is cut at the first collaborator marker
        (``feat``, ``featuring``, ``and``, ``with``, ``&``, ``,`` or ``(``).
        The track term has quotes and periods removed and keeps only the part
        before the first ``/``. Both are stripped of surrounding whitespace.
    """
    # Word markers are anchored with \b so they only match whole words:
    # without it "england dan" was cut to "engl" and "grand funk" to "gr".
    # Raw strings avoid invalid-escape warnings for \W, \s and \(.
    collaborator_tail = r'(?:\b(?:feat\W|featuring|and\s|with\s)|\(|,|&\s).*'
    # lowercase and split using vector functions
    artist_search = (
        df[artist_col_nm].str.lower()
        .str.replace(r'["\']', '', regex = True)
        .str.replace(collaborator_tail, '', regex = True)
        # cutting at a marker leaves the space that preceded it
        .str.strip()
    )
    # define track terms
    track_search = (
        df[track_col_nm]
        .str.replace(r'["\'.]', '', regex = True)
        .str.split(pat = "/").str[0]
        .str.strip()
    )
    return artist_search, track_search

# store the simplified artist / track search terms as two new columns on df
df['artist_search'], df['track_search'] = search_term(df, 'artist_nms', 'track_nm')

In [12]:
# create function that iterates through artists for matching
def artist_searcher(output, artist):
    """Pick the first search result credited to an artist sharing a word with `artist`.

    Parameters
    ----------
    output : list of dict
        Track items from a Spotify search; each has an ``'id'`` and an
        ``'artists'`` list of dicts with ``'id'`` and ``'name'``.
    artist : str
        Artist search term; compared word by word, case-insensitively.

    Returns
    -------
    tuple
        ``(track_id, artist_id)`` of the first matching item, where
        ``artist_id`` is the id of that item's first credited artist, or
        ``(None, None)`` when nothing matches or the inputs are empty/invalid.
    """
    if not output or not isinstance(artist, str):
        return None, None
    wanted = set(artist.lower().split())
    for item in output:
        # Compare whole words: quotes are removed and each credited name is
        # split on whitespace (chaining over the raw strings produced single
        # characters, so real names never matched).
        credited = set()
        for credit in item['artists']:
            credited.update(re.sub(r'["\']', '', credit['name'].lower()).split())
        if not wanted.isdisjoint(credited):
            # `item` is the track dict itself, so index it by key
            return item['id'], item['artists'][0]['id']
        # no overlap: keep scanning the remaining results
    return None, None

In [13]:
# create search rules to optimize accuracy of spotify search selection
def search_rule(query, artist, track, year):
    """Find a Spotify track id / artist id for one chart entry.

    The lookup falls back through progressively looser searches:

    1. `query` restricted to ``year`` .. ``year + 2`` (retried without the
       year filter if that request raises);
    2. the full track title, filtered by artist name via `artist_searcher`;
    3. each word of the track title in turn, filtered the same way.

    Parameters
    ----------
    query : str
        Combined ``artist:... track:...`` search string.
    artist : str
        Artist search term used to validate title-only results.
    track : str
        Track search term.
    year : int
        Chart year.

    Returns
    -------
    tuple
        ``(track_id, artist_id)``, or ``(None, None)`` when every rule fails.
        Always a 2-tuple so callers can unpack the result.
    """
    # `except Exception` keeps the best-effort retry without also trapping
    # kernel interrupts the way a bare except does
    try:
        output = sp.search(q = query + ' year:' + str(year) + '-' + str(year + 2),
                           limit = 2, type = 'track')['tracks']['items']
    except Exception:
        output = sp.search(q = query, limit = 2, type = 'track')['tracks']['items']
    # if condition is met we skip all other rules
    if len(output) >= 1:
        return output[0]['id'], output[0]['artists'][0]['id']

    # if first condition is not met we just search on song title
    output = sp.search(q = 'track:' + track, type = 'track')['tracks']['items']
    # use artist name to select appropriate title
    track_id, artist_id = artist_searcher(output, artist) or (None, None)
    if track_id is not None:
        return track_id, artist_id

    # last resort: try every word of the title, stopping at the first match
    # (returning inside the loop unconditionally only ever tried the first word)
    for word in track.split():
        output = sp.search(q = 'track:' + word, type = 'track')['tracks']['items']
        track_id, artist_id = artist_searcher(output, artist) or (None, None)
        if track_id is not None:
            return track_id, artist_id
    return None, None

In [14]:
# for each song in the wiki df search spotify for track id's
# this method leverages pyspark and RDD's
# we did this to assess data ingestion time difference
def track_id_search(df, sp, artist_col, track_col, year):
    """Spark/RDD variant: run `search_rule` over every row of `df`.

    Adds a ``query`` column to `df` in place, ships the search columns to
    Spark, maps `search_rule` across the rows and returns the id pairs as a
    pandas DataFrame with columns ``track_ids`` and ``artist_ids``.
    A later cell defines another function with this same name.
    """
    df['query'] = 'artist:' + df[artist_col] + ' track:' + df[track_col] 
    wanted_cols = ['query', artist_col, track_col, year]
    spark_rows = spark.createDataFrame(df[wanted_cols]).rdd
    id_pairs = spark_rows.map(
        lambda row: search_rule(row.query, row[artist_col], row[track_col], row[year]))
    return id_pairs.toDF(['track_ids', 'artist_ids']).toPandas()

In [None]:
# for each song in the wiki df search spotify for track id's
# this method is only slightly less efficient than the pyspark + RDD method
# so we have decided to use this as it is somewhat more digestible code to read
def track_id_search(df, sp, artist_col, track_col, year):
    """Look up Spotify ids for every row of `df` with `search_rule`.

    Parameters
    ----------
    df : pandas.DataFrame
        Chart table; a ``query`` column is added to it in place.
    sp : object
        Accepted for interface compatibility; it is not used here, and
        `search_rule` reads the notebook-level ``sp`` client itself.
    artist_col, track_col, year : str
        Names of the artist-term, track-term and year columns.

    Returns
    -------
    tuple of tuples
        ``(track_ids, artist_ids)``, each with one entry per row of `df`
        (``None`` where no match was found). Two empty tuples for an empty
        frame.
    """
    df['query'] = 'artist:' + df[artist_col] + ' track:' + df[track_col]
    track_ids, artist_ids = [], []
    # iterate through search rules until we find a match or give up
    rows = zip(df['query'], df[artist_col], df[track_col], df[year])
    for query, artist, track, yr in rows:
        # search_rule can fall through without a result; treat that as no match
        found = search_rule(query, artist, track, yr) or (None, None)
        track_ids.append(found[0])
        artist_ids.append(found[1])
    # explicit accumulation instead of zip(*...) so an empty frame still
    # yields two (empty) sequences that the caller can unpack
    return tuple(track_ids), tuple(artist_ids)
    
# look up Spotify track / artist ids for every chart row and store them on df
df['track_ids'], df['artist_ids'] = track_id_search(df, sp, 'artist_search', 'track_search', 'year')
# drop nulls as decided during the project proposal stage
# (dropna() with no arguments removes a row when any of its columns is null)
df2 = df.copy().dropna()

ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:raiders track:Indian Reservation (The Lament of the Cherokee Reservation Indian) year:1971-1973', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:the hillside singers track:Id Like to Teach the World to Sing (In Perfect Harmony) year:1972-1974', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:b.j. thomas track:(Hey Wont You Play) Another Somebody Done Somebody Wrong Song year:1975-1977', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:the four seasons

In [None]:
# df = pd.read_csv('/work/billboard_music_analysis/streamlit/data/bb_100_feat.csv')

In [None]:
# create function that uses track_ids to find audio features
# and uses artist_ids to grab genres
def audio_feature(df, af_id_col, genre_id_col, batch_size = 50):
    """Fetch Spotify audio features per track and a genre per artist.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the id columns.
    af_id_col : str
        Column of track ids sent to ``sp.audio_features``.
    genre_id_col : str
        Column of artist ids sent to ``sp.artists``.
    batch_size : int, default 50
        Number of ids sent per API call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(af_df, genre_df)``. ``af_df`` has one row per track for which
        features were returned. ``genre_df`` has columns ``artist_ids`` and
        ``genre`` (the first genre listed) for artists with at least one genre.
    """
    # Null ids are dropped and repeats removed (first-seen order kept) so that
    # None is never sent to the API and no id is requested twice.
    track_ids = list(dict.fromkeys(df[af_id_col].dropna()))
    artist_ids = list(dict.fromkeys(df[genre_id_col].dropna()))

    # collect plain records and build each frame once, instead of concatenating
    # a growing DataFrame on every batch
    af_rows = []
    for start in range(0, len(track_ids), batch_size):
        batch = sp.audio_features(track_ids[start:start + batch_size])
        # entries can be None for tracks without features
        af_rows.extend(feat for feat in (batch or []) if feat is not None)

    genre_rows = []
    for start in range(0, len(artist_ids), batch_size):
        batch = sp.artists(artist_ids[start:start + batch_size])['artists']
        genre_rows.extend([art['id'], art['genres'][0]]
                          for art in batch if art and art.get('genres'))

    # dedup df's to avoid duplication when joining
    af_df = pd.DataFrame(af_rows).drop_duplicates()
    # naming the columns at construction also works when no genre came back
    genre_df = pd.DataFrame(genre_rows, columns = ['artist_ids', 'genre']).drop_duplicates()
    return af_df, genre_df

# audio_feature needs both id column names and returns (af_df, genre_df);
# only the audio-feature frame (index 0) is merged onto the charts here
df3 = df2.merge(audio_feature(df2, 'track_ids', 'artist_ids')[0], how = 'inner',
                left_on = 'track_ids', right_on = 'id').drop_duplicates().dropna()

In [None]:
# fetch audio features (by track id) and genres (by artist id)
# NOTE(review): this passes `df`, whose id columns can still hold nulls; `df2`
# is the null-dropped copy — confirm which frame is intended.
af_df, genre_df = audio_feature(df, 'track_ids', 'artist_ids')
# merge new df's
# NOTE(review): the cell that builds df3 already merges audio features on
# track_ids / id, so merging af_df onto df3 again looks redundant — confirm.
df4 = df3.merge(af_df, how = 'inner', left_on = 'track_ids', right_on = 'id').dropna()
# attach the genre of each row's artist; rows without a genre drop out of the inner join
df4 = df4.merge(genre_df, how = 'inner', on = 'artist_ids').dropna()

In [None]:
# preview the first rows of df
df.head()

Unnamed: 0,rank,track_nm,artist_nms,year,artist_search,track_search,query,track_ids,artist_ids,danceability,...,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
0,1,"""Joy to the World""",Three Dog Night,1971,three dog night,Joy to the World,artist:three dog night track:Joy to the World,2ymeOsYijJz09LfKw3yM2x,4FAEZeJcsYYBkNq2D3KGTV,0.649,...,0.971,126.867,audio_features,2ymeOsYijJz09LfKw3yM2x,spotify:track:2ymeOsYijJz09LfKw3yM2x,https://api.spotify.com/v1/tracks/2ymeOsYijJz0...,https://api.spotify.com/v1/audio-analysis/2yme...,220573,4,album rock
1,69,"""Liar""",Three Dog Night,1971,three dog night,Liar,artist:three dog night track:Liar,4NLKJfFJl19CnJaEGMGjhl,4FAEZeJcsYYBkNq2D3KGTV,0.52,...,0.674,107.616,audio_features,4NLKJfFJl19CnJaEGMGjhl,spotify:track:4NLKJfFJl19CnJaEGMGjhl,https://api.spotify.com/v1/tracks/4NLKJfFJl19C...,https://api.spotify.com/v1/audio-analysis/4NLK...,235800,4,album rock
2,63,"""Black and White""",Three Dog Night,1972,three dog night,Black and White,artist:three dog night track:Black and White,4XcUADpOth9Wroq5EVMFJq,4FAEZeJcsYYBkNq2D3KGTV,0.688,...,0.961,109.077,audio_features,4XcUADpOth9Wroq5EVMFJq,spotify:track:4XcUADpOth9Wroq5EVMFJq,https://api.spotify.com/v1/tracks/4XcUADpOth9W...,https://api.spotify.com/v1/audio-analysis/4XcU...,206080,4,album rock
3,73,"""Never Been to Spain""",Three Dog Night,1972,three dog night,Never Been to Spain,artist:three dog night track:Never Been to Spain,61OQuSzsom6geFCA18JwO6,4FAEZeJcsYYBkNq2D3KGTV,0.517,...,0.727,88.762,audio_features,61OQuSzsom6geFCA18JwO6,spotify:track:61OQuSzsom6geFCA18JwO6,https://api.spotify.com/v1/tracks/61OQuSzsom6g...,https://api.spotify.com/v1/audio-analysis/61OQ...,226067,4,album rock
4,31,"""Shambala""",Three Dog Night,1973,three dog night,Shambala,artist:three dog night track:Shambala,0P6fEgTn3cxLLyYTJYoYGj,4FAEZeJcsYYBkNq2D3KGTV,0.551,...,0.834,127.495,audio_features,0P6fEgTn3cxLLyYTJYoYGj,spotify:track:0P6fEgTn3cxLLyYTJYoYGj,https://api.spotify.com/v1/tracks/0P6fEgTn3cxL...,https://api.spotify.com/v1/audio-analysis/0P6f...,204107,4,album rock


In [None]:
# persist the merged chart / audio-feature / genre table, without the index column
df4.to_csv("/work/billboard_music_analysis/streamlit/data/bb_100_feat.csv", index = False)

In [None]:
# average by year function
def avger(df, col_list, id_col):
    """Return the mean of every column in `col_list` for each `id_col` group.

    The grouping key is kept as an ordinary column of the result.
    """
    grouped = df.groupby(id_col, as_index = False)
    return grouped[col_list].mean()

In [None]:
# create melt function...drop unnecessary id columns
def melter(df, id_list, col_list, drop_list):
    """Reshape `df` from wide to long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table; it is not modified.
    id_list : list of str
        Columns kept as identifiers on every output row.
    col_list : list of str
        Columns unpivoted into ``audio_feature`` / ``value`` pairs.
    drop_list : list-like of str or None
        Columns removed before melting; ``None`` skips the drop.

    Returns
    -------
    pandas.DataFrame
        Columns `id_list` + ``audio_feature`` + ``value``.
    """
    # Identity test rather than `!= None`: on an Index or array the inequality
    # is evaluated element-wise and cannot be used as an `if` condition.
    trimmed = df if drop_list is None else df.drop(labels = drop_list, axis = 1)
    return trimmed.melt(id_vars = id_list, value_vars = col_list,
                        var_name = 'audio_feature', value_name = 'value')

In [None]:
# audio-feature columns that get melted and averaged
col_list = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
# columns removed before melting
drop_list = ['type', 'id', 'uri', 'track_href', 'analysis_url', 'query']
# identifier columns kept on every melted row
id_list = ['rank', 'track_nm', 'artist_nms', 'year', 'genre', 'artist_search', 'track_search']
# NOTE(review): these calls read `df`, while the audio-feature and genre columns
# are merged into `df4` above — presumably `df` was reloaded from the saved CSV
# (see the commented-out read_csv cell); confirm which frame is intended.
long_melt = melter(df, id_list, col_list, drop_list)
# per-year mean of each audio feature
avg_df = avger(df, col_list, 'year')
# long-format version of the yearly averages (nothing to drop)
avg_melt = melter(avg_df, ['year'], col_list, None)

In [None]:
# write the long-format table, the yearly averages and the melted yearly
# averages to the streamlit data folder, each without the index column
long_melt.to_csv("/work/billboard_music_analysis/streamlit/data/bb_100_feat_melt.csv", index = False)
avg_df.to_csv("/work/billboard_music_analysis/streamlit/data/avg_feat.csv", index = False)
avg_melt.to_csv("/work/billboard_music_analysis/streamlit/data/avg_feat_melt.csv", index = False)

##### Stale Code
Code I am holding onto in case I need it later.

In [None]:
# for each track returned by an artist search, print every credited artist name
# lowercased, with quote characters removed and split into words
for i in sp.search(q = 'artist:n ii u', type = 'track')['tracks']['items']:
    print([re.sub('[\"\']', '', artist['name'].lower()).split() for artist in i['artists']])

[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['ninja', 'sex', 'party']]
[['ramon', 'ayala', 'y', 'sus', 'bravos', 'del', 'norte']]
[['guns', 'n', 'roses']]
[['ultra', 'naté']]
[['ninja', 'sex', 'party']]


In [None]:
# list the album names of one artist, following the paginated results
birdy_uri = 'spotify:artist:2WX2uTcsvV5OnS0inACecP'
results = sp.artist_albums(birdy_uri, album_type='album')
albums = results['items']
while results['next']:
    # the client in this notebook is named `sp`; `spotify` is never defined,
    # so fetching a second page used to fail with a NameError
    results = sp.next(results)
    albums.extend(results['items'])
for album in albums:
    print(album['name'])

Young Heart
Beautiful Lies
Beautiful Lies
Beautiful Lies (Deluxe)
Beautiful Lies (Deluxe)
Fire Within
Fire Within
Fire Within (Deluxe)
Fire Within (Deluxe)
Fire Within (Deluxe)
Live in London
Birdy
Birdy
Birdy
Birdy
Birdy (Deluxe Version)


In [None]:
# fetch and parse a single artist page
html = urlopen('https://en.wikipedia.org/wiki/Dua_Lipa')
soup = BeautifulSoup(html, 'html.parser')
# the infobox table, matched on its full class string
wikitab_temp = soup.find('table',{'class':"infobox biography vcard"})
# display the first infobox data cell; in the saved output it holds the
# `bday` span and the birthplace link
wikitab_temp.find_all('td', {'class':"infobox-data"})[0]#.find({'class':'bday'})

<td class="infobox-data"><span style="display:none"> (<span class="bday">1995-08-22</span>) </span>22 August 1995<span class="noprint ForceAgeToShow"> (age 27)</span><br/><div class="birthplace" style="display:inline"><a href="/wiki/London" title="London">London</a>, England</div></td>

In [None]:
%time
# change to list comprehension
temp = wikitab.find_all('td')
for i in np.arange(2, 300, 3):
    art_lst = temp[i].find_all('a')
    if len(art_lst) > 0:
        main_art = art_lst[0].text
        main_art_ref = art_lst[0].get('href')
    else: main_art = None
    feat_art = []
    if len(art_lst) > 1:
        for j in range(len(art_lst))[1:]:
            feat_art.append(art_lst[j].text)
    print(main_art, main_art_ref, feat_art)


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.53 µs
Dua Lipa /wiki/Dua_Lipa []
The Weeknd /wiki/The_Weeknd ['Ariana Grande']
None /wiki/The_Weeknd []
24kGoldn /wiki/24kGoldn ['Iann Dior']
Olivia Rodrigo /wiki/Olivia_Rodrigo []
Doja Cat /wiki/Doja_Cat ['SZA']
Silk Sonic /wiki/Silk_Sonic ['Bruno Mars', 'Anderson .Paak']
None /wiki/Silk_Sonic []
Lil Nas X /wiki/Lil_Nas_X []
Justin Bieber /wiki/Justin_Bieber ['Daniel Caesar', 'Giveon']
BTS /wiki/BTS []
The Kid Laroi /wiki/The_Kid_Laroi []
None /wiki/The_Kid_Laroi []
None /wiki/The_Kid_Laroi []
Ed Sheeran /wiki/Ed_Sheeran []
Glass Animals /wiki/Glass_Animals []
None /wiki/Glass_Animals []
Luke Combs /wiki/Luke_Combs []
Chris Brown /wiki/Chris_Brown ['Young Thug']
Masked Wolf /wiki/Masked_Wolf []
Megan Thee Stallion /wiki/Megan_Thee_Stallion []
Pop Smoke /wiki/Pop_Smoke []
Machine Gun Kelly /wiki/Machine_Gun_Kelly_(musician) ['Blackbear']
Jack Harlow /wiki/Jack_Harlow []
Billie Eilish /wiki/Billie_Eilish []
Cardi B /wiki/Cardi_B 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3f060e00-a163-4cce-a4c0-34e77cfdc670' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>