# Scraping top 200 songs from the web

In this notebook, I scrape two websites — playback.fm and popvortex.com — and combine the results into a single DataFrame of 200 songs (a top-100 chart from each site).

## Scraping playback.fm

In [1]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd

In [2]:
# Page to scrape first: the 2015 top-100 chart on playback.fm.
url = "https://playback.fm/charts/top-100-songs/2015"

In [3]:
# Download the page with a GET request.
# - timeout: without it, requests waits indefinitely on a stalled connection.
# - raise_for_status(): stops the notebook on a 4xx/5xx response instead of
#   letting the following cells parse an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.status_code  # 200 status code means OK!

200

In [None]:
# Show the raw bytes of the downloaded page.
# NOTE(review): this renders the whole response body; clear the output before sharing.
response.content

In [5]:
# Parse the downloaded bytes into a navigable tree (the "soup") with the
# standard-library HTML parser.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [None]:
# Display the parsed document to check that the HTML looks like it should.
# NOTE(review): this renders the entire page; clear the output before sharing.
soup

In [None]:
# Exploration: list every <td> element carrying the class "mobile-hide".
soup.select("td.mobile-hide")

In [None]:
# Exploration: list every <a> that is a direct child of a <td> that is the
# second child of its parent (the artist links, judging by the output below).
soup.select("td:nth-child(2) > a")

In [None]:
# Print the document with indentation to make the tag nesting easier to read.
# NOTE(review): this prints the whole page; clear the output before sharing.
print(soup.prettify())

In [10]:
# Look at the first matched artist link to see its structure.
soup.select("td:nth-child(2) > a")[0]

<a class="artist" href="/artist/mark-ronson-top-songs" itemprop="byArtist">
Mark Ronson featuring Bruno Mars
</a>

In [11]:
# Extract only the text of the first artist link.
# Note that the text comes back wrapped in newline characters.
soup.select("td:nth-child(2) > a")[0].get_text()

'\nMark Ronson featuring Bruno Mars\n'

In [12]:
# Extract the text of the first <span class="song"> nested under a
# "mobile-hide" cell's link: the song title.
soup.select("td.mobile-hide > a > span.song")[0].get_text()

'Uptown Funk'

In [13]:
# Collect the artist and song text for every chart entry on the page.

# Run each selector once (the original queried the artist selector twice).
tClist = soup.select("td:nth-child(2) > a")
spanlist = soup.select("td.mobile-hide > a > span.song")
num_iter = len(tClist)

# Artists and songs are paired purely by position, so the two result lists
# must have the same length. Fail loudly here rather than hit an IndexError
# mid-loop or silently drop trailing entries.
if len(spanlist) != num_iter:
    raise ValueError(
        f"selector mismatch: {num_iter} artist tags vs {len(spanlist)} song tags"
    )

# Raw text of every matched tag, in page order.
artist = [tag.get_text() for tag in tClist]
song = [tag.get_text() for tag in spanlist]

print(artist)
print(song)


['\nMark Ronson featuring Bruno Mars\n', '\nEd Sheeran\n', '\nWiz Khalifa featuring Charlie Puth\n', '\nFetty Wap\n', '\nMaroon 5\n', '\nWalk the Moon\n', '\nTaylor Swift\n', '\nSilentó\n', '\nThe Weeknd\n', '\nThe Weeknd\n', '\nOMI\n', '\nThe Weeknd\n', '\nEllie Goulding\n', '\nHozier\n', '\nTaylor Swift featuring Kendrick Lamar\n', '\nMajor Lazer and DJ Snake featuring MØ\n', '\nJason Derulo\n', '\nTaylor Swift\n', '\nSkrillex and Diplo featuring Justin Bieber\n', '\nRachel Platten\n', '\nFetty Wap featuring Remy Boyz\n', '\nMeghan Trainor\n', '\nFifth Harmony featuring Kid Ink\n', '\nOmarion featuring Chris Brown and Jhené Aiko\n', '\nAndy Grammer\n', '\nSam Smith\n', '\nSelena Gomez featuring A$AP Rocky\n', '\nMeghan Trainor\n', '\nTaylor Swift\n', '\nDrake\n', '\nDavid Guetta featuring Nicki Minaj, Bebe Rexha, and Afrojack\n', '\nFlo Rida featuring Sage the Gemini and Lookas\n', '\nJustin Bieber\n', '\nEd Sheeran\n', '\nAdele\n', '\nShawn Mendes\n', '\nTove Lo\n', '\nNick Jonas\n'

In [14]:
# Turn the two parallel lists into a table with one column per list.
artist_song_df = pd.DataFrame(data={"artist": artist, "song": song})

In [15]:
# Preview the first rows of the scraped table.
artist_song_df.head()

Unnamed: 0,artist,song
0,\nMark Ronson featuring Bruno Mars\n,Uptown Funk
1,\nEd Sheeran\n,Thinking Out Loud
2,\nWiz Khalifa featuring Charlie Puth\n,See You Again
3,\nFetty Wap\n,Trap Queen
4,\nMaroon 5\n,Sugar


In [16]:
# Clean the artist names. The scraped text is wrapped in newlines
# ('\nEd Sheeran\n'); replacing them with spaces alone would leave
# ' Ed Sheeran ' with leading/trailing blanks, so strip afterwards.
# regex=False makes the literal replacement explicit across pandas versions.
artist_song_df['artist'] = (
    artist_song_df['artist']
    .str.replace('\n', ' ', regex=False)
    .str.strip()
)


In [17]:
# Preview the table again to check the cleaned artist column.
artist_song_df.head()

Unnamed: 0,artist,song
0,Mark Ronson featuring Bruno Mars,Uptown Funk
1,Ed Sheeran,Thinking Out Loud
2,Wiz Khalifa featuring Charlie Puth,See You Again
3,Fetty Wap,Trap Queen
4,Maroon 5,Sugar


## Scraping popvortex.com

In [18]:
# Second page to scrape: the top-100 songs chart on popvortex.com.
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [19]:
# Download the page with a GET request.
# - timeout: without it, requests waits indefinitely on a stalled connection.
# - raise_for_status(): stops the notebook on a 4xx/5xx response instead of
#   letting the following cells parse an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.status_code  # 200 status code means OK!

200

In [20]:
# response.content  # left disabled: would display the entire raw HTML of the page

In [21]:
# Parse the downloaded bytes into a navigable tree (the "soup") with the
# standard-library HTML parser.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [None]:
# Display the parsed document to check that the HTML looks like it should.
# NOTE(review): this renders the entire page; clear the output before sharing.
soup

In [None]:
# Print the document with indentation to make the tag nesting easier to read.
# NOTE(review): this prints the whole page; clear the output before sharing.
print(soup.prettify())

In [24]:
# Text of the first <cite> inside a paragraph of the chart-content block:
# the song title, judging by the output.
soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")[0].get_text()

'Unholy'

In [25]:
# Text of the first <em> inside a paragraph of the chart-content block:
# the artist name, judging by the output.
soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")[0].get_text()

'Sam Smith & Kim Petras'

In [26]:
# Collect the artist and song text for every chart entry on the page.

# Run each selector once (the original queried the <cite> selector twice).
song_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > cite")
artist_list = soup.select("div.chart-content.col-xs-12.col-sm-8 > p > em")
num_iter = len(song_list)

# Songs and artists are paired purely by position, so the two result lists
# must have the same length. Fail loudly here rather than hit an IndexError
# mid-loop or silently drop trailing entries.
if len(artist_list) != num_iter:
    raise ValueError(
        f"selector mismatch: {num_iter} song tags vs {len(artist_list)} artist tags"
    )

# Raw text of every matched tag, in page order.
artist = [tag.get_text() for tag in artist_list]
song = [tag.get_text() for tag in song_list]

print(artist)
print(song)


['Sam Smith & Kim Petras', 'Taylor Swift', 'Taylor Swift', 'Rihanna', 'Meghan Trainor', 'Timcast', 'Brett Young', 'Fleetwood Mac', 'David Guetta & Bebe Rexha', 'Tom MacDonald', 'Kane Brown & Katelyn Brown', 'HARDY & Lainey Wilson', 'Sia', 'OneRepublic', 'Jelly Roll', 'Beyoncé', 'Zach Bryan', 'P!nk', 'Coldplay', 'Louis Armstrong', 'Morgan Wallen', 'Drake & 21 Savage', 'Bailey Zimmerman', 'Morgan Wallen', 'Chris Brown', 'Lainey Wilson', 'Selena Gomez', 'Cole Swindell & Jo Dee Messina', 'Harry Styles', 'R.E.M.', 'Ed Sheeran', 'Post Malone, Mark Morrison & Sickick', 'Steve Lacy', 'Cole Swindell', 'Luke Combs', 'R.E.M.', 'CeCe Winans', 'Bobby McFerrin', 'Lee Brice', 'Ed Sheeran', 'Drake & 21 Savage', 'Elton John & Britney Spears', 'Mariah Carey', 'Quavo & Takeoff', 'Bruno Mars', 'Jordan Davis', 'JVKE', 'Bailey Zimmerman', 'Post Malone', 'Jax', 'Jordan Davis', 'Nicky Youre & Dazy', 'Colton Dixon', 'Lil Nas X', 'Luke Bryan', 'Nicki Minaj', 'Imagine Dragons', 'Taylor Swift', 'Dean Lewis', 'Bad

In [27]:
# Turn the two parallel lists into a table with one column per list.
artist_song_df_2 = pd.DataFrame(data={"artist": artist, "song": song})

In [28]:
# Preview the first rows of the second scraped table.
artist_song_df_2.head()

Unnamed: 0,artist,song
0,Sam Smith & Kim Petras,Unholy
1,Taylor Swift,Anti-Hero (feat. Bleachers)
2,Taylor Swift,Anti-Hero
3,Rihanna,Lift Me Up (From Black Panther: Wakanda Foreve...
4,Meghan Trainor,Made You Look


In [29]:
# Stack the two scraped tables vertically; ignore_index renumbers the rows
# 0..n-1 so the combined frame does not carry duplicate index labels.
music_df = pd.concat(
    [artist_song_df, artist_song_df_2],
    axis=0,
    ignore_index=True,
)

In [30]:
# Preview the first rows of the combined table.
music_df.head()

Unnamed: 0,artist,song
0,Mark Ronson featuring Bruno Mars,Uptown Funk
1,Ed Sheeran,Thinking Out Loud
2,Wiz Khalifa featuring Charlie Puth,See You Again
3,Fetty Wap,Trap Queen
4,Maroon 5,Sugar


In [31]:
# (rows, columns) of the combined table.
music_df.shape

(200, 2)

In [32]:
# Confirm the column labels of the combined table.
music_df.columns

Index(['artist', 'song'], dtype='object')

In [33]:
# Save the combined table to a CSV file in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written
# as the first, unnamed column of the file.
music_df.to_csv(path_or_buf='music_df.csv', index=True)