In [None]:
# import dependencies
#  data packages
import pandas as pd
import numpy as np
#  web-scraping packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import wikipedia
#  Spotify packages
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
#  lyrics package
import lyricsgenius as genius
#  misc processing packages
import re
import itertools
from pyspark.sql import SparkSession
#  config features
import config

In [None]:
# initialize spotify auth
client_credentials_manager = SpotifyClientCredentials(
    client_id = config.client_id, client_secret = config.client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [None]:
! sudo apt-get update
! sudo mkdir -p /usr/share/man/man1
! sudo apt-get install -y openjdk-11-jdk
! pip install pyspark

# initialize spark session
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('sp_search') \
    .getOrCreate() 
sc = spark.sparkContext

Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian-security buster/updates InRelease [34.8 kB]
Get:3 http://deb.debian.org/debian buster-updates InRelease [56.6 kB]
Get:4 http://deb.debian.org/debian buster/main amd64 Packages [7,909 kB]
Get:5 http://deb.debian.org/debian-security buster/updates/main amd64 Packages [369 kB]
Get:6 http://deb.debian.org/debian buster-updates/main amd64 Packages [8,788 B]
Fetched 8,500 kB in 2s (3,507 kB/s)




The following additional packages will be installed:
  at-spi2-core ca-certificates-java dbus dbus-user-session
  dconf-gsettings-backend dconf-service dmsetup fonts-dejavu-extra
  glib-networking glib-networking-common glib-networking-services
  gsettings-desktop-schemas java-common libapparmor1 libargon2-1 libasound2
  libasound2-data libatk-bridge2.0-0 libatk-wrapper-java
  libatk-wrapper-java-jni libatspi2.0-0 libcap2 libcolord2 libcryptsetup12
  libdbus-1-3 libdconf1 libdevmapper1.02.1 libdrm-amdgpu

### Grab Wikipedia Data
We will be using the beautiful soup library to scrape each year's top 100 tracks. Wikipedia lists each year's top-100 list in a tabulated format with the first column being the song's rank, the second column being the track name, and the last column being the artists involved.

In [None]:
# create function that grabs top 100 for each year
def wiki_lists(url_prefix, min_yr, max_yr):
    # initialize empty df to append to
    df = pd.DataFrame()
    # for each year open the url and extract the table as a df
    for yr in np.arange(min_yr, max_yr + 1):
        url = f'{url_prefix}{str(yr)}'
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')
        wikitab = soup.find('table',{'class':"wikitable"})
        df_yr = pd.read_html(str(wikitab))
        df_yr = pd.DataFrame(df_yr[0])
        # align column names
        df_yr.columns = ['rank','track_nm','artist_nms']
        df_yr['year'] = yr
        # append to general df
        df = pd.concat([df, df_yr], ignore_index = True)
    return df

# set min_yr back to config.min_yr when doing full analysis
#  doing a smaller subset for a faster run
df = wiki_lists(config.wiki_url_prefix, config.min_yr, config.max_yr)

In [1]:
# url = f'{config.wiki_url_prefix}2021'
# html = urlopen(url)
# soup = BeautifulSoup(html, 'html.parser')
# wikitab = soup.find('table',{'class':"wikitable"})
# wikitab.select('td')[11].find_all('a')#.get('title')

def wiki_refs(url_prefix, min_yr, max_yr):
    # initialize empty list to append to
    lst = []
    # for each year open the url and extract artist names and wp links
    for yr in np.arange(min_yr, max_yr + 1):
        url = f'{url_prefix}{str(yr)}'
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')
        wikitab = soup.find('table',{'class':"wikitable"}).select('td')
        # iterate through wikitab and grab artist names and href
        # lst.extend([[x.get('title'), x.get('href')] for x := i.find('a') in wikitab])
    return [[x.get('title'), x.get('href')] for x := i.find('a') in wikitab]

wiki_refs(config.wiki_url_prefix, 2015, 2021)

SyntaxError: invalid syntax (549088003.py, line 18)

### Grab Spotify Data
We will use the data scraped from Wikipedia to search Spotify for as many billboard year-end top 100 tracks as we can find. Using regex and some text formatting we will try to capture as many songs as possible. After finding these songs and retrieving their Spotify track id's, we can use those id's to call their track attributes.

In [None]:
def search_term(df, artist_col_nm, track_col_nm):
    # lowercase and split using vector functions
    artist_search = df[artist_col_nm].str.lower() \
        .str.replace('[\"\']', '', regex = True) \
        .str.replace('((feat\W|\(|and\s|featuring|,|with\s|&\s).*)', '', regex = True)
    # define track terms
    track_search = df[track_col_nm].str.replace('[\"\'\.]', '', regex = True).str.split(pat = "/").str[0]
    return artist_search, track_search

df['artist_search'], df['track_search'] = search_term(df, 'artist_nms', 'track_nm')

In [None]:
# create function that iterates through artists for matching
def artist_searcher(output, artist):
    if len(output) == 0: return None, None
    else:
        for i in output:
            artists = [re.sub('[\"\']', '', a['name'].lower()) for a in i['artists']]
            artists = set(itertools.chain.from_iterable(artists))
            if set(artist.split()).isdisjoint(artists) == False:
                try: return i[0]['id'], i[0]['artists'][0]['id']
                except: return None, None
            else: return None, None

In [None]:
# create search rules to optimize accuracy of spotify search selection
def search_rule(query, artist, track, year):
    try: output = sp.search(q = query + ' year:' + str(year) + '-' + str(year + 2),
                            limit = 2, type = 'track')['tracks']['items']
    except: output = sp.search(q = query, limit = 2, type = 'track')['tracks']['items']
    # if condition is met we skip all other loops
    if len(output) >= 1:
        return output[0]['id'], output[0]['artists'][0]['id']
    # if first condition is not met we just search on song title
    else:
        output = sp.search(q = 'track:' + track, type = 'track')['tracks']['items']
        # use artist name to select appropriate title
        track_id, artist_id = artist_searcher(output, artist)
        if track_id is not None: return track_id, artist_id
        else:
            for word in track.split():
                output = sp.search(q = 'track:' + word, type = 'track')['tracks']['items']
                # use artist name to select appropriate title
                return artist_searcher(output, artist)

In [None]:
# for each song in the wiki df search spotify for track id's
def track_id_search(df, sp, artist_col, track_col, year):
    df['query'] = 'artist:' + df[artist_col] + ' track:' + df[track_col] 
    sp_df = spark.createDataFrame(df[['query', artist_col, track_col, year]])
    rdd = sp_df.rdd.map(lambda x: search_rule(x.query, x[artist_col], x[track_col], x[year]))
    df2 = rdd.toDF(['track_ids', 'artist_ids'])
    return df2.show()
    
track_id_search(df, sp, 'artist_search', 'track_search', 'year')
# df['track_ids'], df['artist_ids'] = track_id_search(df, sp, 'artist_search', 'track_search', 'year')
# df2 = df.dropna()

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:raiders track:Indian Reservation (The Lament of the Cherokee Reservation Indian) year:1971-1973', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:raiders track:Indian Reservation (The Lament of the Cherokee Reservation Indian) year:1971-1973', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:the hillside singers track:Id Like to Teach the World to Sing (In Perfect Harmony) year:1972-1974', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:b.j. thomas track:(Hey Wont You Play) Another Somebody Done Somebody Wrong Song year:1975-1977', 'l

In [None]:
# for each song in the wiki df search spotify for track id's
def track_id_search(df, sp, artist_col, track_col, year):
    df['query'] = 'artist:' + df[artist_col] + ' track:' + df[track_col] 
    track_ids, artist_ids = zip(*df[['query', artist_col, track_col, year]].apply(
        lambda x: search_rule(x.query, x[artist_col], x[track_col], x[year]), axis = 1))
    return track_ids, artist_ids
    
df['track_ids'], df['artist_ids'] = track_id_search(df, sp, 'artist_search', 'track_search', 'year')
df2 = df.dropna()

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:raiders track:Indian Reservation (The Lament of the Cherokee Reservation Indian) year:1971-1973', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:the hillside singers track:Id Like to Teach the World to Sing (In Perfect Harmony) year:1972-1974', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:b.j. thomas track:(Hey Wont You Play) Another Somebody Done Somebody Wrong Song year:1975-1977', 'limit': 2, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Bad request.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:the four seasons track:December, 1963 (Oh, What a Night) (DJ Ben Liebrand Remix) year:1994-1996', 'l

In [None]:
# create function that uses track_ids to find audio features
def audio_feature(df, id_col):
    ser = df[id_col]
    af_df = pd.DataFrame()
    for i in np.arange(0, df.shape[0], 100):
        output = sp.audio_features(list(ser[i:(i + 100)]))
        output = [i for i in output if i is not None]
        extended = pd.DataFrame(output)
        af_df = pd.concat([af_df, extended], ignore_index = True)
    return af_df

df3 = df2.merge(audio_feature(df2, 'track_ids'), how = 'inner', left_on = 'track_ids',
               right_on = 'id').drop_duplicates().dropna()

NameError: name 'df2' is not defined

In [None]:
df3.to_csv("billboard_music_analysis/streamlit/data/bb_100_feat.csv", index = False)

##### Stale Code
Code I am holding onto in case I need later

In [None]:
for i in sp.search(q = 'artist:n ii u', type = 'track')['tracks']['items']:
    print([re.sub('[\"\']', '', artist['name'].lower()).split() for artist in i['artists']])

[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['guns', 'n', 'roses']]
[['ninja', 'sex', 'party']]
[['ramon', 'ayala', 'y', 'sus', 'bravos', 'del', 'norte']]
[['guns', 'n', 'roses']]
[['ultra', 'naté']]
[['ninja', 'sex', 'party']]


In [None]:
birdy_uri = 'spotify:artist:2WX2uTcsvV5OnS0inACecP'
results = sp.artist_albums(birdy_uri, album_type='album')
albums = results['items']
while results['next']:
    results = spotify.next(results)
    albums.extend(results['items'])
for album in albums:
    print(album['name'])

Young Heart
Beautiful Lies
Beautiful Lies
Beautiful Lies (Deluxe)
Beautiful Lies (Deluxe)
Fire Within
Fire Within
Fire Within (Deluxe)
Fire Within (Deluxe)
Fire Within (Deluxe)
Live in London
Birdy
Birdy
Birdy
Birdy
Birdy (Deluxe Version)


In [None]:
html = urlopen('https://en.wikipedia.org/wiki/Dua_Lipa')
soup = BeautifulSoup(html, 'html.parser')
wikitab_temp = soup.find('table',{'class':"infobox biography vcard"})
wikitab_temp.find_all('td', {'class':"infobox-data"})[0]#.find({'class':'bday'})

<td class="infobox-data"><span style="display:none"> (<span class="bday">1995-08-22</span>) </span>22 August 1995<span class="noprint ForceAgeToShow"> (age 27)</span><br/><div class="birthplace" style="display:inline"><a href="/wiki/London" title="London">London</a>, England</div></td>

In [None]:
%time
# change to list comprehension
temp = wikitab.find_all('td')
for i in np.arange(2, 300, 3):
    art_lst = temp[i].find_all('a')
    if len(art_lst) > 0:
        main_art = art_lst[0].text
        main_art_ref = art_lst[0].get('href')
    else: main_art = None
    feat_art = []
    if len(art_lst) > 1:
        for j in range(len(art_lst))[1:]:
            feat_art.append(art_lst[j].text)
    print(main_art, main_art_ref, feat_art)


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.53 µs
Dua Lipa /wiki/Dua_Lipa []
The Weeknd /wiki/The_Weeknd ['Ariana Grande']
None /wiki/The_Weeknd []
24kGoldn /wiki/24kGoldn ['Iann Dior']
Olivia Rodrigo /wiki/Olivia_Rodrigo []
Doja Cat /wiki/Doja_Cat ['SZA']
Silk Sonic /wiki/Silk_Sonic ['Bruno Mars', 'Anderson .Paak']
None /wiki/Silk_Sonic []
Lil Nas X /wiki/Lil_Nas_X []
Justin Bieber /wiki/Justin_Bieber ['Daniel Caesar', 'Giveon']
BTS /wiki/BTS []
The Kid Laroi /wiki/The_Kid_Laroi []
None /wiki/The_Kid_Laroi []
None /wiki/The_Kid_Laroi []
Ed Sheeran /wiki/Ed_Sheeran []
Glass Animals /wiki/Glass_Animals []
None /wiki/Glass_Animals []
Luke Combs /wiki/Luke_Combs []
Chris Brown /wiki/Chris_Brown ['Young Thug']
Masked Wolf /wiki/Masked_Wolf []
Megan Thee Stallion /wiki/Megan_Thee_Stallion []
Pop Smoke /wiki/Pop_Smoke []
Machine Gun Kelly /wiki/Machine_Gun_Kelly_(musician) ['Blackbear']
Jack Harlow /wiki/Jack_Harlow []
Billie Eilish /wiki/Billie_Eilish []
Cardi B /wiki/Cardi_B 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3f060e00-a163-4cce-a4c0-34e77cfdc670' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>