In [1]:
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Scrape Billboard Top 100

In [2]:
# extract data from billboard.com to get top 100 songs
url = 'https://www.billboard.com/charts/year-end/2020/hot-100-songs'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.content, 'html.parser')

# create list of songs 
songs_container = soup.find('div', class_="container container--xxlight-grey container--no-side-padding")

In [3]:
songs = songs_container.findAll('div', class_="ye-chart-item__title")
songs_2020 = [song.text.replace('\n', '') for song in songs_container.findAll('div', class_="ye-chart-item__title")]
songs_2020[:5]

['Blinding Lights', 'Circles', 'The Box', " Don't Start Now", 'Rockstar']

In [4]:
artists = songs_container.findAll('div', class_="ye-chart-item__artist")
artists_2020 = [artist.text.replace('\n', '') for artist in artists]
artists_2020[:5]

['The Weeknd',
 'Post Malone',
 'Roddy Ricch',
 'Dua Lipa',
 'DaBaby Featuring Roddy Ricch']

### Scrape Lyrics from Genius

In [5]:
# create function that scrapes lyrics for songs from genius.com
def scrape_lyrics(artist, song):
    
    # create url 
    punc = """!()-[]{};:'"\,<>./?@#$%^&*_~"""
    for ele in artist: 
        if ele in punc: 
            artist = artist.replace(ele, "") 
    for ele in song:
        if ele in punc:
            song = song.replace(ele, "")
    artist = artist.lower().replace(' ','-')
    song = song.lower().replace(' ', '-')
    url = 'https://genius.com/' + artist + '-' + song + '-lyrics'
    
    # scrape lyrics
    headers = {"User-Agent": 'Mozilla/5.0'}
    html_page = requests.get(url, headers=headers)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    lyrics = soup.find('p')
    return lyrics.text.replace('\n', ' ')

In [6]:
# test out function 
scrape_lyrics('Justin Bieber', "Hold On")

"[Verse 1] You know you can call me if you need someone I'll pick up the pieces if you come undone  [Pre-Chorus] Painting stars up on your ceiling 'Cause you wish that you could find some feeling, yeah, you You know you can call me if you need someone  [Chorus] I need you to hold on Heaven is a place not too far away We all know I should be the one To say we all make mistakes (We all make mistakes) Take my hand and hold on Tell me everything that you need to say 'Cause I know how it feels to be someone (Someone) Feels to be someone who loses their way  [Verse 2] You're looking for answers in a place unknown You need the connection but you can't get close (Can't get close)  [Pre-Chorus] Painting stars up on your ceiling 'Cause you wish that you could find some feeling, yeah, you You know you can call me if you need someone  [Chorus] I need you to hold on (Hold on) Heaven is a place not too far away (Away, yeah) We all know I should be the one To say we all make mistakes (We all make mis

### Get Billboard Top 100's (1970-2020)

In [7]:
# Create function that creates dataframe for Billboard Top 100 
def billboard_to_df(year):
    
    # scrape billboard top 100 
    url = 'https://www.billboard.com/charts/year-end/' + str(year) +'/hot-100-songs'
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    
    # create list of songs 
    songs_container = soup.find('div', class_="container container--xxlight-grey container--no-side-padding")
    songs = songs_container.findAll('div', class_="ye-chart-item__title")
    songs_list = [song.text.replace('\n', '') for song in songs_container.findAll('div', class_="ye-chart-item__title")]
    
    # create list of artists
    artists = songs_container.findAll('div', class_="ye-chart-item__artist")
    artists_list = [artist.text.replace('\n', '') for artist in artists]
    
    # scrape lyrics 
    lyrics_list = []
    for i in list(range(len(songs_list))):
        lyrics = scrape_lyrics(artists_list[i], songs_list[i])
        lyrics_list.append(lyrics)
    
    # create column for rank and year
    rank = list(range(len(songs_list)))
    year = [year for i in rank]
    
    # create dataframe
    df = pd.DataFrame({'year':year, 'rank':rank, 'song':songs_list, 'artist':artists_list, 'lyrics':lyrics_list})
    return df

In [8]:
# create initial dataframe for 1970's billboard top 100 
year = 1970 
df = billboard_to_df(year)

# append top billboard 1970's 
while year < 1980:
    year += 1 
    df = df.append(billboard_to_df(year))
    print(year)

1971
1972
1973
1974
1975
1976
1977
1978
1979
1980


In [16]:
# append top billboard 1980's 
while year < 1990:
    year += 1 
    df = df.append(billboard_to_df(year))
    print(year)

In [21]:
# append top billboard 1990's 
while year < 2000:
    year += 1 
    df = df.append(billboard_to_df(year))
    print(year)

1991
1992
1993
1994
1995
1996
1997
1998
1999
2000


In [31]:
# append top billboard 2000's 
while year < 2010:
    year += 1 
    df = df.append(billboard_to_df(year))
    print(year)

2009
2010


In [33]:
# append top billboard 2010's 
while year < 2020:
    year += 1 
    df = df.append(billboard_to_df(year))
    print(year)

2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [None]:
#85 87 88 90

In [40]:
df = df.append(billboard_to_df(1985))

In [42]:
df = df.append(billboard_to_df(1987))

In [44]:
df = df.append(billboard_to_df(1988))

In [46]:
df = df.append(billboard_to_df(1990))

### Preview DataFrame and Save

In [48]:
df.info()
df.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4553 entries, 0 to 89
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    4553 non-null   int64 
 1   rank    4553 non-null   int64 
 2   song    4553 non-null   object
 3   artist  4553 non-null   object
 4   lyrics  4553 non-null   object
dtypes: int64(2), object(3)
memory usage: 213.4+ KB


Unnamed: 0,year,rank,song,artist,lyrics
85,1990,85,Without You,Motley Crue,"Without you, there's no change My nights and d..."
86,1990,86,Swing The Mood,Jive Bunny & The Mastermixers,"Sorry, we didn't mean for that to happen!"
87,1990,87,Thieves In The Temple,Prince,[Chorus] Love come quick Love come in a hurry ...
88,1990,88,Mentirosa,Mellow Man Ace,"[Intro: Band] Ain't got nobody, baby...baby [..."
89,1990,89,Tic-Tac-Toe,Kyper,"Sorry, we didn't mean for that to happen!"


In [49]:
# save dataframe 
df.to_csv('billboards.csv')