In [248]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import genius_cred as gc
from difflib import SequenceMatcher

import warnings
warnings.filterwarnings('ignore')

# Scrape Billboard Year-End Hot 100 (2011–2020)

In [74]:
# URL template for Billboard's year-end Hot 100 chart; `{}` is filled with the year.
base_url = 'https://www.billboard.com/charts/year-end/{}/hot-100-songs/'
# Chart years to scrape: 2011 through 2020 inclusive.
years = list(range(2011, 2021))

In [144]:
### 2011 and 2016 have 99 songs instead of 100

### Scrape top 100 songs by year
def get_songs(year):
    """Return the song titles from Billboard's year-end Hot 100 page for `year`.

    Parameters
    ----------
    year : int
        Chart year; it is substituted into the year-end chart URL.

    Returns
    -------
    list of str
        Titles in page order with newlines and tabs removed. At most 99
        entries for 2011 and 2016, at most 100 for any other year.

    Raises
    ------
    requests.HTTPError
        If Billboard answers with an error status, instead of silently
        parsing an error page into an empty or partial list.
    """
    base_url = 'https://www.billboard.com/charts/year-end/{}/hot-100-songs/'

    # The 2011 and 2016 charts list 99 songs instead of 100.
    limit = 99 if year in (2011, 2016) else 100

    # A timeout keeps one stalled connection from hanging the whole notebook.
    page = requests.get(base_url.format(year), timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # NOTE(review): the page presumably reuses this id for elements that are
    # not chart entries, hence the truncation to `limit` -- confirm.
    results = soup.find_all(id='title-of-a-story')

    return [tag.text.replace('\n', '').replace('\t', '') for tag in results[:limit]]

### Scrape artist names for songs
def get_artists(year):
    """Return the artist names from Billboard's year-end Hot 100 page for `year`.

    Parameters
    ----------
    year : int
        Chart year; it is substituted into the year-end chart URL.

    Returns
    -------
    list of str
        Text of every second `span.c-label` element (positions 1, 3, 5, ...)
        with newlines and tabs removed, in page order.

    Raises
    ------
    requests.HTTPError
        If Billboard answers with an error status.
    """
    base_url = 'https://www.billboard.com/charts/year-end/{}/hot-100-songs/'

    # A timeout keeps one stalled connection from hanging the whole notebook.
    page = requests.get(base_url.format(year), timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all('span', class_='c-label')

    # Artist names are taken from the odd positions; the even positions
    # presumably hold another label such as the rank -- TODO confirm.
    # Slicing cannot run past the end of the list, unlike indexing with
    # range(1, len(results) + 1, 2), which raised IndexError whenever the
    # page yielded an odd number of labels.
    return [label.text.replace('\n', '').replace('\t', '') for label in results[1::2]]

In [145]:
### Aggregating data across all years
# Collect plain Python lists first and convert once at the end; this avoids
# re-copying the arrays on every iteration and avoids concatenating an empty
# float array with string arrays.
year_col = []
song_col = []
artist_col = []  # 2011, 2016 missing 1 value each

for year in years:
    songs = get_songs(year)
    artists = get_artists(year)

    # The three columns must line up row by row; fail here, naming the year,
    # rather than later when the DataFrame is built.
    if len(songs) != len(artists):
        raise ValueError(
            f'{year}: scraped {len(songs)} songs but {len(artists)} artists'
        )

    # Take the row count from what was actually scraped rather than
    # hard-coding 99/100 here as well.
    year_col.extend([float(year)] * len(songs))
    song_col.extend(songs)
    artist_col.extend(artists)

# Same container types as before: float years (shown as 2011.0) and string arrays.
year_col = np.array(year_col, dtype=float)
song_col = np.array(song_col)
artist_col = np.array(artist_col)

In [147]:
# Assemble the three aligned columns into one table, one row per charting song.
charts = {
    'year': year_col,
    'song': song_col,
    'artist': artist_col,
}

charts_df = pd.DataFrame(data=charts)

In [148]:
# Preview the first rows of the combined chart table.
charts_df.head()

Unnamed: 0,year,song,artist
0,2011.0,Rolling In The Deep,Adele
1,2011.0,Party Rock Anthem,LMFAO Featuring Lauren Bennett & GoonRock
2,2011.0,Firework,Katy Perry
3,2011.0,E.T.,Katy Perry Featuring Kanye West
4,2011.0,Give Me Everything,"Pitbull Featuring Ne-Yo, Afrojack & Nayer"


# Collect lyrics from Genius API

In [178]:
# Base endpoint of the Genius API.
genius_url = 'https://api.genius.com/'
# Access token read from the local `genius_cred` module (imported as `gc`),
# which keeps the token value out of this notebook.
client_access_token = gc.token

In [181]:
# Smoke-test the access token against the search endpoint and show the HTTP
# status. The token goes through `params` so requests URL-encodes it, and the
# timeout stops the cell from hanging on a stalled connection.
r = requests.get(
    genius_url + 'search',
    params={'access_token': client_access_token},
    timeout=30,
)
r.status_code

200

In [254]:
def similarity(string1, string2):
    """Return the difflib similarity ratio of two strings.

    The value is `SequenceMatcher.ratio()`: a float in [0, 1], where 1.0
    means the strings are identical and 0.0 means nothing matches.
    """
    matcher = SequenceMatcher(isjunk=None, a=string1, b=string2)
    return matcher.ratio()

In [242]:
def get_url(song, artist):
    """Find the Genius page URL for `song` by `artist`.

    The song title is stripped of punctuation, its spaces are replaced with
    hyphens, and the result is sent to the Genius search endpoint. The first
    hit whose primary-artist name contains the first five characters of
    `artist` wins.

    Parameters
    ----------
    song : str
        Song title as scraped from the chart.
    artist : str
        Artist credit as scraped from the chart.

    Returns
    -------
    str or None
        The `url` field of the first matching hit, or None if no hit matches.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example a rejected
        token), instead of failing later with a KeyError on the payload.
    """
    # Same query text as before: punctuation removed, spaces hyphenated.
    query = re.sub(r'[^\w\s]', '', song).replace(' ', '-')

    # Use HTTPS so the access token is never sent in cleartext, let requests
    # URL-encode the parameters, and bound the wait with a timeout.
    response = requests.get(
        'https://api.genius.com/search',
        params={'q': query, 'access_token': client_access_token},
        timeout=30,
    )
    response.raise_for_status()
    hits = response.json()['response']['hits']

    # Only a short prefix is compared; chart credits can carry extra text
    # after the lead artist (e.g. "Featuring ...").
    artist_prefix = artist[:5]
    for hit in hits:
        result = hit['result']
        if artist_prefix in result['primary_artist']['name']:
            return result['url']

    return None

In [258]:
def get_url2(song, artist):
    """Find the Genius page URL for `song` by `artist`, with fuzzy artist matching.

    Works like `get_url`, but a hit is also accepted when
    `similarity(artist, primary_artist_name)` exceeds 0.5, in addition to the
    five-character prefix test.

    Parameters
    ----------
    song : str
        Song title as scraped from the chart.
    artist : str
        Artist credit as scraped from the chart.

    Returns
    -------
    str or None
        The `url` field of the first matching hit, or None if no hit matches.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example a rejected
        token), instead of failing later with a KeyError on the payload.
    """
    # Same query text as before: punctuation removed, spaces hyphenated.
    query = re.sub(r'[^\w\s]', '', song).replace(' ', '-')

    # Use HTTPS so the access token is never sent in cleartext, let requests
    # URL-encode the parameters, and bound the wait with a timeout.
    response = requests.get(
        'https://api.genius.com/search',
        params={'q': query, 'access_token': client_access_token},
        timeout=30,
    )
    response.raise_for_status()
    hits = response.json()['response']['hits']

    artist_prefix = artist[:5]
    for hit in hits:
        result = hit['result']
        primary_name = result['primary_artist']['name']
        # Accept either a fuzzy whole-name match or the literal prefix match.
        if similarity(artist, primary_name) > 0.5 or artist_prefix in primary_name:
            return result['url']

    return None

In [262]:
# Look up a Genius URL (or None) for every chart row, keeping row order.
lyric_urls = [
    get_url2(title, performer)
    for title, performer in zip(charts_df['song'], charts_df['artist'])
]

In [263]:
# Show only the first entries; the list holds one item per chart row, so
# displaying it whole floods the notebook output.
lyric_urls[:20]

['https://genius.com/Adele-rolling-in-the-deep-lyrics',
 'https://genius.com/Lmfao-party-rock-anthem-lyrics',
 'https://genius.com/Katy-perry-firework-lyrics',
 'https://genius.com/Katy-perry-et-remix-lyrics',
 'https://genius.com/Pitbull-give-me-everything-lyrics',
 'https://genius.com/Bruno-mars-grenade-lyrics',
 'https://genius.com/Nicki-minaj-super-bass-lyrics',
 'https://genius.com/Maroon-5-moves-like-jagger-lyrics',
 'https://genius.com/Black-eyed-peas-just-cant-get-enough-lyrics',
 'https://genius.com/Jennifer-lopez-on-the-floor-lyrics',
 'https://genius.com/Nirvana-smells-like-teen-spirit-lyrics',
 'https://genius.com/Foster-the-people-pumped-up-kicks-lyrics',
 'https://genius.com/Katy-perry-last-friday-night-tgif-lyrics',
 'https://genius.com/Bruno-mars-just-the-way-you-are-lyrics',
 'https://genius.com/Enrique-iglesias-tonight-im-lovin-you-lyrics',
 'https://genius.com/P-nk-raise-your-glass-lyrics',
 'https://genius.com/Lady-gaga-born-this-way-lyrics',
 None,
 'https://genius

In [264]:
# Attach the looked-up URLs as a new column; entries are None where the
# lookup found no matching hit.
charts_df['lyric_url'] = lyric_urls

In [269]:
# Rows for which no Genius URL was found.
missing_mask = charts_df['lyric_url'].isna()
no_urls = charts_df.loc[missing_mask]
no_urls

Unnamed: 0,year,song,artist,lyric_url
17,2011.0,F**kin' Perfect,P!nk,
31,2011.0,Blow,Ke$ha,
59,2011.0,More,Usher,
71,2011.0,Without You,David Guetta Featuring Usher,
80,2011.0,Remind Me,Brad Paisley Duet With Carrie Underwood,
...,...,...,...,...
825,2019.0,Ran$om,Lil Tecca,
840,2019.0,ME!,Taylor Swift Featuring Brendon Urie,
920,2020.0,"10,000 Hours",Dan + Shay & Justin Bieber,
969,2020.0,Hot,Young Thug Featuring Gunna,


In [270]:
# Distinct, sorted titles of the songs that still lack a URL.
np.unique(no_urls['song'])

array(['#Beautiful', '1-800-273-8255', '10,000 Hours', 'Animal',
       'Animals', 'B**** Better Have My Money', 'Backseat', 'Bad',
       'Big Bank', 'Blow', 'Bodak Yellow (Money Moves)', 'Brave',
       'Broccoli', 'Burn', 'Chains', 'Cheap Thrills', 'Clarity', 'Close',
       'Cold', 'Come Over', 'Dirt', 'Down', 'Downtown', 'F**kin Problems',
       "F**kin' Perfect", 'Ghost', 'Good Girl', 'H.O.L.Y.', 'Heaven',
       'Holy Grail', 'Home', 'Hot', 'Hot Boy', 'Hurricane',
       "I Don't F**k With You",
       "I Don't Wanna Live Forever (Fifty Shades Darker)", 'In The Dark',
       'Just Give Me A Reason', "Let's Go", 'Lights', 'Lights Down Low',
       'Love Galore', 'ME!', 'Me And My Broken Heart', 'More', 'My Hitta',
       'Ni**as in Paris', 'No', 'No Mediocre', 'P*$$y Fairy (OTW)',
       'Pillowtalk', 'Ran$om', 'Remind Me', 'Springsteen',
       'Stay The Night', 'Summer',
       'Sunflower (Spider-Man: Into The Spider-Verse)', 'Taste',
       'Tuesday', 'Wanted', 'We Are Young'